diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..8b5e47b47 --- /dev/null +++ b/.clang-format @@ -0,0 +1,179 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +#BreakBeforeBraces: Attach +BreakBeforeBraces: Stroustrup +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 110 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +ReferenceAlignment: Pointer +ReflowComments: true +ShortNamespaceLines: 1 +SortIncludes: false +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 39229c53d..594aeb570 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,12 +4,9 @@ on: # allows us to run workflows manually workflow_dispatch: - pull_request: - branches: - - next push: branches: - - next + - master env: GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}" @@ -20,6 +17,7 @@ env: OCCA_CUDA_ENABLED: "0" OCCA_HIP_ENABLED: "0" OCCA_OPENCL_ENABLED: "0" + NEKRS_COMPILER_FLAGS: "-O2" NEKRS_OCCA_MODE_DEFAULT: "SERIAL" NEKRS_CI: "1" @@ -81,35 +79,55 @@ jobs: - name: 'ethier default' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier run: | - NEKRS_CACHE_DIR=${{ env.NEKRS_EXAMPLES }}/ethier/custom-cache-dir ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 1 1 + NEKRS_CACHE_DIR=${{ env.NEKRS_EXAMPLES }}/ethier/custom-cache-dir ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 1 --cimode 1 - name: 'ethier subcycle' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 2 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 2 - name: 'ethier velocity and pressure projection' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 3 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 3 - name: 'ethier (block) velocity and pressure projection with subcycling' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 4 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 4 - name: 'ethier default + moving mesh' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_ethier 2 5 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_ethier 2 --cimode 5 - name: 'ethier subcycle + moving mesh + subcycling' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_ethier 2 6 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_ethier 2 --cimode 6 - name: 'ethier gmres bug' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 7 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 7 - name: 'ethier variable dt' working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 8 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 8 + + - name: 'ethier no dealiasing and subcycling' + working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 9 + + - name: 'ethier no dealiasing' + working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 10 + + - name: 'ethier Chebyshev+Jacobi' + working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 11 + + - name: 'ethier skip solving temperature' + working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethier 2 --cimode 12 + + - name: 'ethier solve single scalar only' + working-directory: ${{ env.NEKRS_EXAMPLES }}/ethier + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi ethierScalar 2 --cimode 13 lowMach: needs: install @@ -136,7 +154,7 @@ jobs: - name: 'lowMach default' working-directory: ${{ env.NEKRS_EXAMPLES }}/lowMach - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi lowMach 2 1 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi lowMach 2 --cimode 1 mv_cyl: @@ -164,11 +182,53 @@ jobs: - name: 'mv_cyl' working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 1 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 1 - name: 'mv_cyl + subcycling' working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 2 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 2 + + - name: 'mv_cyl + subcycling + elasticity solve' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 3 + + - name: 'mv_cyl + subcycling + elasticity solve (projection)' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 4 + + - name: 'mv_cyl + unaligned SYM' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 5 + + # TODO: resolve error in unaligned SYM + mesh solver + #- name: 'mv_cyl + unaligned SYM + mesh solver' + # working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + # run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl 2 --cimode 6 + + - name: 'mv_cyl, derived bc' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 1 + + - name: 'mv_cyl + subcycling, derived bc' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 2 + + - name: 'mv_cyl + subcycling + elasticity solve, derived bc' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 3 + + - name: 'mv_cyl + subcycling + elasticity solve (projection), derived bc' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 4 + + - name: 'mv_cyl + unaligned SYM, derived bc' + working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 5 + + # TODO: resolve error in unaligned SYM + mesh solver + #- name: 'mv_cyl + unaligned SYM + mesh solver, derived bc' + # working-directory: ${{ env.NEKRS_EXAMPLES }}/mv_cyl + # run: ${{ env.NEKRS_HOME }}/bin/nrsmpi mv_cyl_derived_bc 2 --cimode 6 conj_ht: @@ -196,7 +256,7 @@ jobs: - name: 'conj_ht' working-directory: ${{ env.NEKRS_EXAMPLES }}/conj_ht - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi conj_ht 2 1 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi conj_ht 2 --cimode 1 channelStress: @@ -222,10 +282,44 @@ jobs: - name: Set install dir permissions run: chmod -R 755 ${{ env.NEKRS_INSTALL_DIR }} - - name: 'channelStress' + - name: 'channelStress (no rotation)' + working-directory: ${{ env.NEKRS_EXAMPLES }}/channel + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi channel 2 --cimode 1 + + - name: 'channelStress (45 degree rotation)' working-directory: ${{ env.NEKRS_EXAMPLES }}/channel - run: ${{ env.NEKRS_HOME }}/bin/nrsmpi channel 2 1 + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi channel 2 --cimode 2 + + tractionBoundary: + needs: install + runs-on: ubuntu-18.04 + steps: + + - uses: actions/checkout@v2 + with: + clean: true + + - name: APT dependencies + run: | + sudo apt -y update + sudo apt install -y mpich libmpich-dev + + - name: Download install dir + uses: actions/download-artifact@v2 + with: + name: install-dir + path: ${{ env.NEKRS_INSTALL_DIR }} + + - name: Set install dir permissions + run: chmod -R 755 ${{ env.NEKRS_INSTALL_DIR }} + + - name: 'traction channel (no rotation)' + working-directory: ${{ env.NEKRS_EXAMPLES }}/shlChannel + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi channel 2 --cimode 1 + - name: 'traction channel (45 degree rotation)' + working-directory: ${{ env.NEKRS_EXAMPLES }}/shlChannel + run: ${{ env.NEKRS_HOME }}/bin/nrsmpi channel 2 --cimode 2 kershaw: runs-on: ubuntu-18.04 diff --git a/.gitignore b/.gitignore index 2ff96bbb3..1067bc2d8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,8 +2,8 @@ build/ CMakeFiles *.tgz .vscode/ -.clang-format *.swp # From AMGX plugin_config.cu *.swp +.cache/ diff --git a/3rd_party/AMGX/.gitignore b/3rd_party/AMGX/.gitignore index 35ede212f..195014f91 100644 --- a/3rd_party/AMGX/.gitignore +++ b/3rd_party/AMGX/.gitignore @@ -6,4 +6,5 @@ plugin_config.cu *.sublime-project *.sublime-workspace core/src/version.cu -ci/docker/ \ No newline at end of file +ci/docker/ +plugin_config.cu diff --git a/3rd_party/AMGX/CHANGELIST b/3rd_party/AMGX/CHANGELOG similarity index 94% rename from 3rd_party/AMGX/CHANGELIST rename to 3rd_party/AMGX/CHANGELOG index b87ef2436..0d122b64f 100644 --- a/3rd_party/AMGX/CHANGELIST +++ b/3rd_party/AMGX/CHANGELOG @@ -1,8 +1,8 @@ -CHANGELIST +CHANGELOG =============================================================== -v2.2.0 +v2.2.0 - 2021-04-06 --------------------------------------------------------------- @@ -27,7 +27,7 @@ Tested configurations: =============================================================== -v2.1.0 +v2.1.0 - 2020-03-20 --------------------------------------------------------------- @@ -37,7 +37,7 @@ v2.1.0 =============================================================== -v2.0.0 +v2.0.0 - 2017.10.17 --------------------------------------------------------------- diff --git a/3rd_party/AMGX/CMakeLists.txt b/3rd_party/AMGX/CMakeLists.txt index 246ba8efa..13da82968 100644 --- a/3rd_party/AMGX/CMakeLists.txt +++ b/3rd_party/AMGX/CMakeLists.txt @@ -24,20 +24,18 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -cmake_minimum_required (VERSION 2.8.10) +cmake_minimum_required (VERSION 3.18) # the project -project (AMG) +project (AMG LANGUAGES C CXX CUDA) +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" ${CMAKE_MODULE_PATH}) #disable in-place builds if(${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) MESSAGE(FATAL_ERROR "Error: In-place builds are not supported. Please create a separate build directory") endif(${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) -# set the path to extra modules. -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cuda" ${CMAKE_MODULE_PATH}) - # declare the supported configurations set(CMAKE_CONFIGURATION_TYPES "Debug;Release;Profile;RelWithTraces" CACHE STRING "Avaialble Configuration Types" FORCE) @@ -46,6 +44,8 @@ IF(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are Debug Release Profile RelWithTraces" FORCE) ENDIF(NOT CMAKE_BUILD_TYPE) +find_package(CUDAToolkit 10.0 REQUIRED) + # update/define the compilation flags. IF(WIN32) set(CMAKE_C_FLAGS "/DWIN32 /D_WINDOWS /W3 /bigobj" CACHE STRING "" FORCE) @@ -130,7 +130,6 @@ endif(DEFINED ENV{CRAY_MPICH2_DIR}) # Thrust: -set (THRUST_DIR "${PROJECT_SOURCE_DIR}/../../thrust") if (DEFINED ENV{THRUST_ROOT}) set (THRUST_DIR $ENV{THRUST_ROOT}) endif(DEFINED ENV{THRUST_ROOT}) @@ -161,56 +160,52 @@ FIND_PACKAGE(MPI) message ("This is a MPI build:" ${MPI_FOUND}) -# enable source level parallel builds in visual studio -#IF(CMAKE_GENERATOR MATCHES "Visual Studio") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /M2 /MP") -#ENDIF (CMAKE_GENERATOR MATCHES "Visual Studio") - # Enable NVTX ranges on Linux if(NOT WIN32) set(NVTXRANGE_FLAG "-DNVTX_RANGES;") endif() -#Configuration specific nvcc flags +# Configuration specific nvcc flags GET_FILENAME_COMPONENT(CMAKE_CXX_COMPILER_NAME "${CMAKE_CXX_COMPILER}" NAME) if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_NAME MATCHES "clang") - set(CUDA_NVCC_FLAGS_DEBUG "-g;-G;-std=c++14;--Werror cross-execution-space-call;${NVTXRANGE_FLAG}" CACHE STRING "Debug compiler flags") - set(CUDA_NVCC_FLAGS_RELEASE "-O3;-DNDEBUG;-std=c++14;--Werror cross-execution-space-call;${NVTXRANGE_FLAG}" CACHE STRING "Release compiler flags") - set(CUDA_NVCC_FLAGS_PROFILE "-O3;-DPROFILE;-std=c++14;${NVTXRANGE_FLAG}" CACHE STRING "Profile compiler flags") - set(CUDA_NVCC_FLAGS_RELWITHTRACES "-O3;-DNDEBUG;-DAMGX_USE_CPU_PROFILER;-std=c++14;${NVTXRANGE_FLAG}" CACHE STRING "RelWithTraces compiler flags") + set(CUDA_NVCC_FLAGS_DEBUG "-g;-G" CACHE STRING "Debug compiler flags") + set(CUDA_NVCC_FLAGS_RELEASE "-O3;-DNDEBUG" CACHE STRING "Release compiler flags") + set(CUDA_NVCC_FLAGS_PROFILE "-O3;-DPROFILE" CACHE STRING "Profile compiler flags") + set(CUDA_NVCC_FLAGS_RELWITHTRACES "-O3;-DNDEBUG;-DAMGX_USE_CPU_PROFILER" CACHE STRING "RelWithTraces compiler flags") else() - set(CUDA_NVCC_FLAGS_DEBUG "-g;-G;--Werror cross-execution-space-call;${NVTXRANGE_FLAG}" CACHE STRING "Debug compiler flags") - set(CUDA_NVCC_FLAGS_RELEASE "-O3;-DNDEBUG;--Werror cross-execution-space-call;${NVTXRANGE_FLAG}" CACHE STRING "Release compiler flags") - set(CUDA_NVCC_FLAGS_PROFILE "-O3;-DPROFILE;${NVTXRANGE_FLAG}" CACHE STRING "Profile compiler flags") - set(CUDA_NVCC_FLAGS_RELWITHTRACES "-O3;-DNDEBUG;-DAMGX_USE_CPU_PROFILER;${NVTXRANGE_FLAG}" CACHE STRING "RelWithTraces compiler flags") - if(WIN32) - set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE} -DNOMINMAX") - endif() + set(CUDA_NVCC_FLAGS_DEBUG "-g;-G" CACHE STRING "Debug compiler flags") + set(CUDA_NVCC_FLAGS_RELEASE "-O3;-DNDEBUG" CACHE STRING "Release compiler flags") + set(CUDA_NVCC_FLAGS_PROFILE "-O3;-DPROFILE" CACHE STRING "Profile compiler flags") + set(CUDA_NVCC_FLAGS_RELWITHTRACES "-O3;-DNDEBUG;-DAMGX_USE_CPU_PROFILER" CACHE STRING "RelWithTraces compiler flags") + if(WIN32) + set(CUDA_NVCC_FLAGS_RELEASE "${CUDA_NVCC_FLAGS_RELEASE} -DNOMINMAX") + endif() endif() -#keep intermediate files +# Add the build-specific flags to the NVCC flags +string(TOUPPER ${CMAKE_BUILD_TYPE} UPP_BUILD_NAME) +set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS_${UPP_BUILD_NAME}}") + +# Add errors for execution space warnings and enable NVTX ranges +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --Werror cross-execution-space-call ${NVTXRANGE_FLAG}) + +# Keep intermediate files if (AMGX_keep_intermediate) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler=-keep") endif(AMGX_keep_intermediate) -#windows/linux specific settings for C +# Windows/linux specific settings for C GET_FILENAME_COMPONENT(CMAKE_C_COMPILER_NAME "${CMAKE_C_COMPILER}" NAME) IF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" CACHE STRING "nvcc flags") ELSE(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler=-rdynamic;-Xcompiler=-fPIC;-Xcompiler=-fvisibility=default") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler=-rdynamic;-Xcompiler=-fPIC;-Xcompiler=-fvisibility=default" CACHE STRING "nvcc flags") ENDIF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) # VS: include object files in target property SOURCES # otherwise a workaround for extracting ${obj_all} is necessary below set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) -# load CUDA. -set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) -FIND_PACKAGE(CUDA) - -message("Cuda libraries: " ${CUDA_LIBRARIES}) - if(MPI_FOUND) set(VAMPIR_TRACE_INCLUDE_PATH "${MPI_INCLUDE_PATH}/vampirtrace") set(VAMPIR_TRACE_LIB_PATH "${MPI_INCLUDE_PATH}/../lib") @@ -231,44 +226,52 @@ if(MPI_FOUND) endif (DEFINED ENV{VAMPIR_TRACE_PATH}) endif(MPI_FOUND) -#if CUDA Toolkit is older than 9.0 -if(CUDA_VERSION_MAJOR LESS 9) - message(FATAL_ERROR "CUDA versions older than 9.0 are not supported") -endif() +#if compiling against CUDA Toolkit 11.x + +IF(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.0.0) -#if compiling against CUDA Toolkit 9.x -IF(CUDA_VERSION_MAJOR MATCHES 9) - SET(CUDA_ARCH "70" CACHE STRING "Target Architectures (SM35 SM52 SM60 SM70), multiple are allowed") -ENDIF(CUDA_VERSION_MAJOR MATCHES 9) + SET(CUDA_ALLOW_ARCH "70;80") + + # Use the generic cuSPARSE interfaces available from 10.1 on Linux, cusparseSpGEMM from 11.0 + SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} + -DDISABLE_MIXED_PRECISION + -DCUSPARSE_GENERIC_INTERFACES + -DCUSPARSE_USE_GENERIC_SPGEMM) #if compiling against CUDA Toolkit 10.x -IF(CUDA_VERSION_MAJOR MATCHES 10) - SET(CUDA_ARCH "70" CACHE STRING "Target Architectures (SM35 SM52 SM60 SM70), multiple are allowed") +ELSEIF(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.0.0) - IF(CUDA_VERSION_MINOR GREATER 0) - # Disable mixed precision for CUDA 10.1+ - # Use the generic cuSPARSE interfaces available from 10.1 on Linux - SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DDISABLE_MIXED_PRECISION -DCUSPARSE_GENERIC_INTERFACES) - ENDIF(CUDA_VERSION_MINOR GREATER 0) -ENDIF(CUDA_VERSION_MAJOR MATCHES 10) + SET(CUDA_ALLOW_ARCH "70") -#if compiling against CUDA Toolkit 11.x -IF(CUDA_VERSION_MAJOR MATCHES 11) - SET(CUDA_ARCH "70 80" CACHE STRING "Target Architectures (SM60 SM70 SM80), multiple are allowed") + IF(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.1.0) - # Disable mixed precision for CUDA 10.1+ - # Use the generic cuSPARSE interfaces available from 10.1 on Linux, cusparseSpGEMM from 11.0 - SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DDISABLE_MIXED_PRECISION -DCUSPARSE_GENERIC_INTERFACES -DCUSPARSE_USE_GENERIC_SPGEMM) + # Disable mixed precision for CUDA 10.1+ + SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} + -DDISABLE_MIXED_PRECISION + -DCUSPARSE_GENERIC_INTERFACES) + + ENDIF() +ENDIF() + +if(DEFINED CUDA_ARCH) + # User passed a CUDA_ARCH so check it matches + # Error if incorrect CUDA_ARCH passed + FOREACH(ARCH IN LISTS CUDA_ARCH) + message(STATUS "Checking if arch " ${ARCH} " is supported...") + IF(NOT ${ARCH} IN_LIST CUDA_ALLOW_ARCH) + message(STATUS + "Chosen CUDA_ARCH ${ARCH} not expected for current CUDA version. " + "Please choose one or more of ${CUDA_ALLOW_ARCH}.") + ENDIF() + ENDFOREACH() +ELSE() -ENDIF(CUDA_VERSION_MAJOR MATCHES 11) + # Set a default + SET(CUDA_ARCH "${CUDA_ALLOW_ARCH}" CACHE STRING "Target Architectures (SM70 SM80), multiple are allowed") -#replace ' ' with ; to match the proper cmake format -STRING(REGEX REPLACE " " ";" CUDA_ARCH ${CUDA_ARCH}) +ENDIF() -#set the compiler flags for each NV target -FOREACH(target ${CUDA_ARCH}) - SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode=arch=compute_${target},code=\\\"sm_${target},compute_${target}\\\") -ENDFOREACH(target ${CUDA_ARCH}) +# Add the CXX flags to the host set of CUDA flags +SET(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler ${CMAKE_CXX_FLAGS}) # write version set(versionFile "ReleaseVersion.txt") @@ -282,12 +285,11 @@ if(${versionFile} IS_NEWER_THAN ${versionCode}) file(APPEND ${versionCode} "const char __AMGX_BUILD_ID__ [] = " ${versionString} ";" \n}\n) endif(${versionFile} IS_NEWER_THAN ${versionCode}) - # plugin management set(pluginConf "PluginConfig.txt") # plugin configuration set(pluginCode "${CMAKE_CURRENT_SOURCE_DIR}/plugin_config.cu") # plugin code file(STRINGS ${pluginConf} pluginList) # plugin list -set(libList base core ${pluginList}) # lib list +set(libList base core ${pluginList}) # create initialization and finalization for plugins if(${pluginConf} IS_NEWER_THAN ${pluginCode}) @@ -330,72 +332,64 @@ if (MKL_ROOT_DIR) set(mkl_libs -Wl,--start-group ${MKL_ROOT_DIR}/lib/intel64/libmkl_intel_lp64.a ${MKL_ROOT_DIR}/lib/intel64/libmkl_gnu_thread.a ${MKL_ROOT_DIR}/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -fopenmp) endif(MKL_ROOT_DIR) +add_library(amgx_libs OBJECT "") + #create a list of all sources in amgx -set(src_all) # collect all sources from libs -set(target_all) # collect all targets from libs -set(tests_all) foreach( lib ${libList} ) add_subdirectory(${lib}) - get_target_property(src amgx_${lib} SOURCES) - set(src_all ${src_all} ${src}) - set(target_all ${target_all} amgx_${lib}) - FILE(GLOB_RECURSE TESTS "${CMAKE_CURRENT_SOURCE_DIR}/${lib}/tests/*.cu") - set(tests_all ${tests_all} ${TESTS}) + #FILE(GLOB_RECURSE TESTS "${CMAKE_CURRENT_SOURCE_DIR}/${lib}/tests/*.cu") endforeach(lib) -# create a list of all objects in amgx -set(obj_all) -set(cpp_all) -foreach( src ${src_all} ) - if(${src} MATCHES ${CMAKE_BINARY_DIR}) - set(obj_all ${obj_all} ${src}) - else(${src} MATCHES ${CMAKE_BINARY_DIR}) - set(cpp_all ${cpp_all} ${src}) - endif(${src} MATCHES ${CMAKE_BINARY_DIR}) -endforeach(src) +# set arch for main libs target +set_target_properties(amgx_libs PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + +target_compile_options(amgx_libs PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) # build amgx -set_source_files_properties(${obj_all} PROPERTIES GENERATED TRUE) -#CUDA_ADD_LIBRARY(amgx STATIC ${obj_all} ) # static lib -CUDA_ADD_LIBRARY(amgxsh SHARED ${obj_all} ) # shared lib -if (target_all) - # add_dependencies(amgx ${target_all}) - add_dependencies(amgxsh ${target_all}) -endif (target_all) - -find_library_local_first(cublas_library cublas "The cuBLAS CUDA library") -find_library_local_first(cusparse_library cusparse "The cusparse CUDA library") -find_library_local_first(cusolver_library cusolver "The cusolver CUDA library") +add_library(amgx STATIC $) # static lib +target_link_libraries(amgx amgx_libs) + +add_library(amgxsh SHARED $) # shared lib +target_link_libraries(amgxsh amgx_libs) + +set_target_properties(amgx PROPERTIES LINKER_LANGUAGE CUDA) +set_target_properties(amgxsh PROPERTIES LINKER_LANGUAGE CUDA) + +target_compile_options(amgx PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) +target_compile_options(amgxsh PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) IF (WIN32) - #target_link_libraries( amgx ${cublas_library} ${cusparse_library} ${cusolver_library}) - target_link_libraries( amgxsh ${cublas_library} ${cusparse_library} ${cusolver_library}) + target_link_libraries( amgx CUDA::cublas CUDA::cusparse CUDA::cusolver) + target_link_libraries( amgxsh CUDA::cublas CUDA::cusparse CUDA::cusolver) ELSE (WIN32) - find_library_local_first(nvtx nvToolsExt "nvtx CUDA library") - # target_link_libraries( amgx ${cublas_library} ${cusparse_library} ${cusolver_library} ${nvtx} m pthread) - target_link_libraries( amgxsh ${cublas_library} ${cusparse_library} ${cusolver_library} ${nvtx} m pthread) + target_link_libraries( amgx CUDA::cublas CUDA::cusparse CUDA::cusolver CUDA::nvToolsExt m pthread) + target_link_libraries( amgxsh CUDA::cublas CUDA::cusparse CUDA::cusolver CUDA::nvToolsExt m pthread) ENDIF(WIN32) if(MPI_FOUND) - # target_link_libraries( amgx ${MPI_C_LIBRARIES}) + target_link_libraries( amgx ${MPI_C_LIBRARIES}) target_link_libraries( amgxsh ${MPI_C_LIBRARIES}) endif(MPI_FOUND) #link magma if (MAGMA_ROOT_DIR) - # target_link_libraries(amgx ${magma_libs}) + target_link_libraries(amgx ${magma_libs}) target_link_libraries(amgxsh ${magma_libs}) endif(MAGMA_ROOT_DIR) #link lapack (MKL) if (MKL_ROOT_DIR) - # target_link_libraries(amgx ${mkl_libs} ) + target_link_libraries(amgx ${mkl_libs} ) target_link_libraries(amgxsh ${mkl_libs}) endif(MKL_ROOT_DIR) -#install(TARGETS amgx DESTINATION "lib") +# set arch for main libs +set_target_properties(amgx PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") +set_target_properties(amgxsh PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + +install(TARGETS amgx DESTINATION "lib") install(TARGETS amgxsh DESTINATION "lib") -export(TARGETS amgxsh FILE ${CMAKE_CURRENT_SOURCE_DIR}/amgxsh.cmake) +#export(TARGETS amgxsh FILE ${CMAKE_CURRENT_SOURCE_DIR}/amgxsh.cmake) # build examples #add_subdirectory(examples) diff --git a/3rd_party/AMGX/PluginConfig.txt b/3rd_party/AMGX/PluginConfig.txt index 4ea9f5afc..8191658ba 100644 --- a/3rd_party/AMGX/PluginConfig.txt +++ b/3rd_party/AMGX/PluginConfig.txt @@ -1,2 +1 @@ template_plugin -eigensolvers diff --git a/3rd_party/AMGX/README.md b/3rd_party/AMGX/README.md index 175fe60fb..37fab907c 100644 --- a/3rd_party/AMGX/README.md +++ b/3rd_party/AMGX/README.md @@ -60,15 +60,25 @@ cmake .... -DCUDA_ARCH="35 52 60" .... - AMGX_NO_RPATH: Boolean value. By default CMake adds -rpath flags to binaries. Setting this flag to True tell CMake to not do that - useful for controlling execution environment. - MKL_ROOT_DIR and MAGMA_ROOT_DIR: string values. MAGMA/MKL functionality is used to accelerate some of the AMGX eigensolvers. Those solvers will return error 'not supported' if AMGX was not build with MKL/MAGMA support. -CMakeLists uses FindCUDA and FindMPI module scripts to locate corresponding software +The build system now enables CUDA as a language, and employs FindCUDAToolkit and FindMPI, so refer to those scripts from your CMake installation for module-specific flags. +When building with the NVIDIA HPC SDK, please use CMake >= 3.22, +and GCC for C/CXX compilation, e.g. + +``` +cmake \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDA_ARCH="80" .. +``` + Artifacts of the build are shared and static libraries (libamgxsh.so or amgxsh.dll and libamgx.a or amgx.lib) and few binaries from 'examples' directory that give you examples of using various AMGX C API. MPI examples are built only if MPI build was enabled. - ### Running examples Sample input matrix [matrix.mtx](examples/matrix.mtx) is in the examples directory. Sample AMGX solvers configurations are located in the [core/configs](core/configs) directory in the root folder. Make sure that examples are able to find AMGX shared library - by default _-rpath_ flag is used for binaries, but you might specify path manually in the environment variable: _LD_LIBRARY_PATH_ for Linux and _PATH_ for Windows. diff --git a/3rd_party/AMGX/base/CMakeLists.txt b/3rd_party/AMGX/base/CMakeLists.txt index ca77563cc..5afaf212b 100644 --- a/3rd_party/AMGX/base/CMakeLists.txt +++ b/3rd_party/AMGX/base/CMakeLists.txt @@ -24,21 +24,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) -ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) -ENDIF (WIN32) +cmake_minimum_required (VERSION 3.13) -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) - -# select all source +# Add base source FILE(GLOB_RECURSE SRCS "src/*.cu") -CUDA_ADD_LIBRARY(amgx_base STATIC ${SRCS} ${pluginCode}) +target_sources(amgx_libs PRIVATE ${SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/../plugin_config.cu) + +target_include_directories(amgx_libs PUBLIC ${THRUST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include) install(FILES include/amgx_config.h include/amgx_c.h DESTINATION include) + diff --git a/3rd_party/AMGX/base/include/amg_level.h b/3rd_party/AMGX/base/include/amg_level.h index 9c9811eb3..c4be56ccd 100644 --- a/3rd_party/AMGX/base/include/amg_level.h +++ b/3rd_party/AMGX/base/include/amg_level.h @@ -218,7 +218,6 @@ class AMG_Level Matrix *A; Matrix *Aoriginal; VVector bc, xc, r; - int m_min_rows_latency_hiding; AMG_Class *amg; AMG_Level *next_h; diff --git a/3rd_party/AMGX/base/include/amgx_cusparse.h b/3rd_party/AMGX/base/include/amgx_cusparse.h index f22ac45da..5a127fe3d 100644 --- a/3rd_party/AMGX/base/include/amgx_cusparse.h +++ b/3rd_party/AMGX/base/include/amgx_cusparse.h @@ -250,7 +250,14 @@ class Cusparse Vector &Res); + template + static void transpose(const Matrix &A, Matrix &B); + + template + static void transpose(const Matrix &A, Matrix &B, const int nRows, const int nNz); + private: + template static void bsrmv_internal( typename TConfig::VecPrec alphaConst, const Matrix &A, @@ -326,7 +333,8 @@ class Cusparse int blockDim, const float *x, const float *beta, - float *y); + float *y, + const cudaStream_t& stream); static inline void bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, int mb, int nb, int nnzb, @@ -339,7 +347,8 @@ class Cusparse int blockDim, const double *x, const double *beta, - double *y); + double *y, + const cudaStream_t& stream); static inline void bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, int mb, int nb, int nnzb, @@ -352,7 +361,8 @@ class Cusparse int blockDim, const double *x, const double *beta, - double *y); + double *y, + const cudaStream_t& stream); // overloaded C++ wrappers for cusparse?bsrxmv // bsrxmv @@ -418,7 +428,8 @@ class Cusparse int blockDim, const cuComplex *x, const cuComplex *beta, - cuComplex *y); + cuComplex *y, + const cudaStream_t& stream); static inline void bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, int mb, int nb, int nnzb, @@ -431,7 +442,8 @@ class Cusparse int blockDim, const cuDoubleComplex *x, const cuDoubleComplex *beta, - cuDoubleComplex *y); + cuDoubleComplex *y, + const cudaStream_t& stream); static inline void bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, int mb, int nb, int nnzb, @@ -444,7 +456,8 @@ class Cusparse int blockDim, const cuDoubleComplex *x, const cuDoubleComplex *beta, - cuDoubleComplex *y); + cuDoubleComplex *y, + const cudaStream_t& stream); // overloaded C++ wrappers for cusparse?bsrxmv // bsrxmv diff --git a/3rd_party/AMGX/base/include/amgx_types/util.h b/3rd_party/AMGX/base/include/amgx_types/util.h index 859c11d98..c236d6a15 100644 --- a/3rd_party/AMGX/base/include/amgx_types/util.h +++ b/3rd_party/AMGX/base/include/amgx_types/util.h @@ -117,6 +117,7 @@ struct util ::type > static __host__ __device__ __inline__ float conjugate(const float &val) {return val;}; static __host__ __device__ __inline__ void invert_inplace(float &val) {val = -val;}; static __host__ __device__ __inline__ void conjugate_inplace(float &val) {}; + static __host__ __device__ __inline__ void divide_by_integer(float& val, int64_t &denom) {val /= static_cast(denom);}; static __host__ __device__ __inline__ float abs (const float &val) { @@ -168,6 +169,7 @@ struct util ::type> static __host__ __device__ __inline__ double conjugate(const double &val) {return val;}; static __host__ __device__ __inline__ void invert_inplace(double &val) {val = -val;}; static __host__ __device__ __inline__ void conjugate_inplace(double &val) {}; + static __host__ __device__ __inline__ void divide_by_integer(double& val, int64_t &denom) {val /= static_cast(denom);}; static __host__ __device__ __inline__ double abs (const double &val) { @@ -220,6 +222,12 @@ struct util ::type > static __host__ __device__ __inline__ cuComplex conjugate(const cuComplex &val) {return make_cuComplex(cuCrealf(val), -cuCimagf(val));}; static __host__ __device__ __inline__ void invert_inplace(cuComplex &val) {val = make_cuComplex(-cuCrealf(val), -cuCimagf(val));}; static __host__ __device__ __inline__ void conjugate_inplace(cuComplex &val) {val = make_cuComplex(cuCrealf(val), -cuCimagf(val));}; + static __host__ __device__ __inline__ void divide_by_integer(cuComplex& val, int64_t &denom) + { + float den = static_cast(denom); + val.x /= den; + val.y /= den; + }; static __host__ __device__ __inline__ float abs (const cuComplex &val) { @@ -294,6 +302,12 @@ struct util ::type> static __host__ __device__ __inline__ cuDoubleComplex conjugate(const cuDoubleComplex &val) {return make_cuDoubleComplex(cuCreal(val), -cuCimag(val));}; static __host__ __device__ __inline__ void invert_inplace(cuDoubleComplex &val) {val = make_cuDoubleComplex(-cuCreal(val), -cuCimag(val));}; static __host__ __device__ __inline__ void conjugate_inplace(cuDoubleComplex &val) {val = make_cuDoubleComplex(cuCreal(val), -cuCimag(val));}; + static __host__ __device__ __inline__ void divide_by_integer(cuDoubleComplex& val, int64_t &denom) + { + double den = static_cast(denom); + val.x /= den; + val.y /= den; + }; static __host__ __device__ __inline__ double abs (const cuDoubleComplex &val) { diff --git a/3rd_party/AMGX/base/include/distributed/comms_mpi_hostbuffer_stream.h b/3rd_party/AMGX/base/include/distributed/comms_mpi_hostbuffer_stream.h index 354ecf264..e0fba5687 100644 --- a/3rd_party/AMGX/base/include/distributed/comms_mpi_hostbuffer_stream.h +++ b/3rd_party/AMGX/base/include/distributed/comms_mpi_hostbuffer_stream.h @@ -298,7 +298,7 @@ class CommsMPIHostBufferStream : public CommsMPI void recv_vec_wait_all(T &b); template - void all_gather_templated(T &my_data, T2 &gathered_data, int num_parts); + void all_gather_templated(const T &my_data, T2 &gathered_data, int num_parts); template void all_gather_v_templated(T &my_data, int num_elems, T2 &gathered_data, int num_parts); @@ -501,10 +501,17 @@ class CommsMPIHostBufferStream : public CommsMPI void get_hostname(std::string &my_hostname); void exchange_hostnames(std::string &my_hostname, std::vector &hostnames, int num_parts ); - void all_gather(IndexType_h &my_data, HIVector &gathered_data, int num_parts); + void all_gather(const IndexType_h &my_data, HIVector &gathered_data, int num_parts); + void all_gather(const int64_t &my_data, HI64Vector &gathered_data, int num_parts); void all_gather_v(HIVector &my_data, HIVector &gathered_data, int num_parts); void all_reduce_max(IndexType_h &my_data, IndexType_h &result_data); + void all_gather_v(HDVector& data, int num_elems, HDVector& gathered_data, HIVector counts, HIVector displs); + void all_gather_v(HFVector& data, int num_elems, HFVector& gathered_data, HIVector counts, HIVector displs); + void all_gather_v(HCVector& data, int num_elems, HCVector& gathered_data, HIVector counts, HIVector displs); + void all_gather_v(HZVector& data, int num_elems, HZVector& gathered_data, HIVector counts, HIVector displs); + void all_gather_v(HIVector& data, int num_elems, HIVector& gathered_data, HIVector counts, HIVector displs); + #ifdef AMGX_WITH_MPI const MPI_Comm &get_mpi_comm() const {return mpi_comm;} #endif diff --git a/3rd_party/AMGX/base/include/distributed/distributed_comms.h b/3rd_party/AMGX/base/include/distributed/distributed_comms.h index 789c48037..cdcdcc3bf 100644 --- a/3rd_party/AMGX/base/include/distributed/distributed_comms.h +++ b/3rd_party/AMGX/base/include/distributed/distributed_comms.h @@ -333,11 +333,17 @@ class DistributedComms virtual void get_hostname(std::string &my_hostname) = 0; virtual void exchange_hostnames(std::string &my_hostname, std::vector &hostnames, int num_parts ) = 0; - virtual void all_gather(IndexType_h &my_data, HIVector &gathered_data, int num_parts) = 0; + virtual void all_gather(const IndexType_h &my_data, HIVector &gathered_data, int num_parts) = 0; + virtual void all_gather(const int64_t &my_data, HI64Vector &gathered_data, int num_parts) = 0; virtual void all_gather_v(HIVector &my_data, HIVector &gathered_data, int num_parts) = 0; virtual void all_reduce_max(IndexType_h &my_data, IndexType_h &result_data) = 0; + virtual void all_gather_v(HDVector& data, int num_elems, HDVector& gathered_data, HIVector counts, HIVector displs) = 0; + virtual void all_gather_v(HFVector& data, int num_elems, HFVector& gathered_data, HIVector counts, HIVector displs) = 0; + virtual void all_gather_v(HCVector& data, int num_elems, HCVector& gathered_data, HIVector counts, HIVector displs) = 0; + virtual void all_gather_v(HZVector& data, int num_elems, HZVector& gathered_data, HIVector counts, HIVector displs) = 0; + virtual void all_gather_v(HIVector& data, int num_elems, HIVector& gathered_data, HIVector counts, HIVector displs) = 0; // Increment the reference counter. diff --git a/3rd_party/AMGX/base/include/distributed/distributed_manager.h b/3rd_party/AMGX/base/include/distributed/distributed_manager.h index 5a7ddff77..720624e89 100644 --- a/3rd_party/AMGX/base/include/distributed/distributed_manager.h +++ b/3rd_party/AMGX/base/include/distributed/distributed_manager.h @@ -884,6 +884,7 @@ template class DistributedManagerBase halo_ranges_h = a.halo_ranges; part_offsets = a.part_offsets; part_offsets_h = a.part_offsets_h; + num_rows_per_part = a.num_rows_per_part; for (int i = 0; i < B2L_maps.size(); i++) { @@ -931,6 +932,7 @@ template class DistributedManagerBase halo_ranges_h.swap(a.halo_ranges_h); part_offsets.swap(a.part_offsets); part_offsets_h.swap(a.part_offsets_h); + num_rows_per_part.swap(a.num_rows_per_part); temp = _num_interior_nodes; _num_interior_nodes = a.num_interior_nodes(); temp = _num_boundary_nodes; @@ -1668,21 +1670,21 @@ template class DistributedManagerBase } protected: int64_t _base_index; //LEVEL 0 - the index of the first node owned by this partition - INDEX_TYPE _index_range; //LEVEL 0 - the number of fine nodes owned by this partition - INDEX_TYPE _global_id; //LEVEL 0 - ID of this node (partition) - INDEX_TYPE _num_partitions; //LEVEL 0 - Number of partitions (partition) - INDEX_TYPE _num_halo_rows; //LEVEL 0 - total number of rows in the halo section of the matrix - INDEX_TYPE _num_halo_rings; //LEVEL 0 - number of halo rings + INDEX_TYPE _index_range = 0; //LEVEL 0 - the number of fine nodes owned by this partition + INDEX_TYPE _global_id = 0; //LEVEL 0 - ID of this node (partition) + INDEX_TYPE _num_partitions = 0; //LEVEL 0 - Number of partitions (partition) + INDEX_TYPE _num_halo_rows = 0; //LEVEL 0 - total number of rows in the halo section of the matrix + INDEX_TYPE _num_halo_rings = 0; //LEVEL 0 - number of halo rings bool m_is_root_partition; bool m_is_glued; bool m_is_fine_level_glued; - INDEX_TYPE m_my_destination_part; - INDEX_TYPE m_num_parts_to_consolidate; - INDEX_TYPE m_cons_interior_offset; - INDEX_TYPE m_cons_interior_size; - INDEX_TYPE m_cons_bndry_offset; - INDEX_TYPE m_cons_bndry_size; + INDEX_TYPE m_my_destination_part = 0; + INDEX_TYPE m_num_parts_to_consolidate = 0; + INDEX_TYPE m_cons_interior_offset = 0; + INDEX_TYPE m_cons_interior_size = 0; + INDEX_TYPE m_cons_bndry_offset = 0; + INDEX_TYPE m_cons_bndry_size = 0; std::vector m_consolidationArrayOffsets; Vector m_destination_partitions; @@ -1697,14 +1699,14 @@ template class DistributedManagerBase INDEX_TYPE m_my_fine_level_destination_part; //cached sizes for different views of the matrix (set in Matrix::set_initialized(1)) - INDEX_TYPE _num_rows_interior; - INDEX_TYPE _num_nz_interior; - INDEX_TYPE _num_rows_owned; - INDEX_TYPE _num_nz_owned; - INDEX_TYPE _num_rows_full; - INDEX_TYPE _num_nz_full; - INDEX_TYPE _num_rows_all; - INDEX_TYPE _num_nz_all; + INDEX_TYPE _num_rows_interior = 0; + INDEX_TYPE _num_nz_interior = 0; + INDEX_TYPE _num_rows_owned = 0; + INDEX_TYPE _num_nz_owned = 0; + INDEX_TYPE _num_rows_full = 0; + INDEX_TYPE _num_nz_full = 0; + INDEX_TYPE _num_rows_all = 0; + INDEX_TYPE _num_nz_all = 0; bool m_fixed_view_size; //Containers for Level 0 API: @@ -1712,6 +1714,8 @@ template class DistributedManagerBase std::vector_L2H_maps; std::vector > _B2L_rings; + I64Vector_h num_rows_per_part {}; + public: I64Vector_h &part_offsets_h; @@ -1755,7 +1759,33 @@ template class DistributedManagerBase inline cudaStream_t& get_bdy_stream() { return m_bdy_stream; } inline cudaEvent_t& get_comm_event() { return comm_event; } - int64_t num_rows_global; + int64_t num_rows_global = 0; + + const I64Vector_h& getNumRowsPerPart() + { + if(_comms == nullptr) + { + FatalError("Calling getNumRowsPerPart with no communicator", AMGX_ERR_INTERNAL); + } + + if(_num_rows_owned <= 0) + { + FatalError("_num_rows_owned <= 0 when determining num rows per part", AMGX_ERR_INTERNAL); + } + + if(_num_partitions <= 0) + { + _num_partitions = _comms->get_num_partitions(); + } + + // If necessary, populate the number of rows per partition + if(num_rows_per_part.size() == 0) + { + _comms->all_gather(_num_rows_owned, num_rows_per_part, _num_partitions); + } + + return num_rows_per_part; + } const IVector &getRowsListForView(ViewType type) { diff --git a/3rd_party/AMGX/base/include/getvalue.h b/3rd_party/AMGX/base/include/getvalue.h index 83c017349..6212da77a 100644 --- a/3rd_party/AMGX/base/include/getvalue.h +++ b/3rd_party/AMGX/base/include/getvalue.h @@ -70,6 +70,10 @@ inline NormType getValue(const char *name) { return L1; } + else if (strncmp(name, "L1_SCALED", 100) == 0) + { + return L1_SCALED; + } else if (strncmp(name, "L2", 100) == 0) { return L2; diff --git a/3rd_party/AMGX/base/include/hash_containers_sm35.inl b/3rd_party/AMGX/base/include/hash_containers_sm35.inl index 34b33952c..d971bad1e 100644 --- a/3rd_party/AMGX/base/include/hash_containers_sm35.inl +++ b/3rd_party/AMGX/base/include/hash_containers_sm35.inl @@ -1,4 +1,4 @@ -/* Copyright (c) 2011-2017, NVIDIA CORPORATION. All rights reserved. +/* Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -49,7 +49,7 @@ class Hash_set // The size of the table (occupancy). int m_smem_count, m_gmem_count; // The keys stored in the hash table. - volatile Key_type *m_smem_keys, *m_gmem_keys; + Key_type *m_smem_keys, *m_gmem_keys; // The size of the global memory buffer. const int m_gmem_size; // Is it ok? @@ -57,22 +57,19 @@ class Hash_set public: // Constructor. - __device__ __forceinline__ Hash_set( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, int gmem_size ) : + __device__ __forceinline__ Hash_set( Key_type *smem_keys, Key_type *gmem_keys, int gmem_size ) : m_smem_count(0), m_gmem_count(1), m_smem_keys (smem_keys), m_gmem_keys (gmem_keys), m_gmem_size (gmem_size), m_fail (false) - {} // Clear the table. __device__ __forceinline__ void clear( bool skip_gmem = false ); // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). __device__ __forceinline__ int compute_size(); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size_with_duplicates(); // Does the set contain those values? __device__ __forceinline__ bool contains( Key_type key ) const; // Find an index. @@ -146,45 +143,6 @@ int Hash_set::compute_size() // ==================================================================================================================== -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size_with_duplicates() -{ - int lane_id = utils::lane_id(); - // Count the number of keys in SMEM. - int sum = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - sum += __popc( utils::ballot( key != -1 ) ); - } - - // Is there any key in GMEM. If not, just quit. - m_gmem_count = utils::any(m_gmem_count > 0); - - if ( !m_gmem_count ) - { - return sum; - } - - // Count the number of keys in GMEM. -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - sum += __popc( utils::ballot( key != -1, utils::activemask() ) ); - } - - return sum; -} - -// ==================================================================================================================== - template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> __device__ __forceinline__ bool Hash_set::contains( Key_type key ) const @@ -218,7 +176,6 @@ bool Hash_set::contains( Key_type } } - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -229,7 +186,7 @@ bool Hash_set::contains( Key_type } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -283,7 +240,6 @@ int Hash_set::find_index( Key_typ } } - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -294,7 +250,7 @@ int Hash_set::find_index( Key_typ } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -317,83 +273,71 @@ template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > __device__ __forceinline__ void Hash_set::insert( Key_type key, int *status ) { - bool done = key == -1; -#pragma unroll + bool active = key != -1; + Key_type winning_key; + int active_mask; +#pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) + active_mask = utils::ballot( active ); + + if ( active_mask == 0 ) { return; } - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) + if ( active ) { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_smem_keys[hash], -1, key); - if ( candidate ) + if ( winning_key == -1 ) { - m_smem_keys[hash] = key; + winning_key = key; + m_smem_count++; } - if ( candidate && key == m_smem_keys[hash] ) // More than one candidate may have written to that slot. + + if ( key == winning_key ) { - m_smem_count++; - done = true; + active = false; } } } - - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) + active_mask = utils::ballot( active ); + + if ( active_mask == 0 ) { return; } - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_gmem_keys[hash], -1, key); - if ( candidate ) + if ( winning_key == -1 ) { - m_gmem_keys[hash] = key; + winning_key = key; + m_gmem_count++; } - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if ( key == winning_key ) { - m_gmem_count++; - done = true; + active = false; } } } - if ( utils::all(done) ) + if ( utils::ballot( active ) == 0 ) { return; } @@ -422,7 +366,7 @@ void Hash_set::load( int count, c Key_type key = keys[offset]; int idx = pos [offset]; // Where to store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -451,7 +395,7 @@ void Hash_set::load_index( int co Key_type key = keys[offset]; int idx = pos [offset]; // Store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -676,11 +620,9 @@ class Hash_map { protected: // The keys stored in the map. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // Vote buffer for values. - volatile Word *m_smem_vote; - // Registers to store values. - T m_regs_vals[4]; + Key_type *m_smem_keys, *m_gmem_keys; + // Shared memory values + T *m_smem_vals = NULL; // The values stored in the map. T *m_gmem_vals; // The size of the global memory buffer. @@ -689,12 +631,11 @@ class Hash_map bool m_any_gmem; public: - // Constructor. __device__ __forceinline__ - Hash_map( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, volatile Word *smem_vote, T *gmem_vals, int gmem_size ) : + Hash_map( Key_type *smem_keys, Key_type *gmem_keys, T *smem_vals, T *gmem_vals, int gmem_size ) : m_smem_keys(smem_keys), m_gmem_keys(gmem_keys), - m_smem_vote(smem_vote), + m_smem_vals(smem_vals), m_gmem_vals(gmem_vals), m_gmem_size(gmem_size), m_any_gmem (true) @@ -702,12 +643,8 @@ class Hash_map // Clear the table. It doesn't clear GMEM values. __device__ __forceinline__ void clear(); - // Clear the table. It also clears GMEM values (set them to 0). - __device__ __forceinline__ void clear_all(); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert( Key_type key, T a_value, T b_value, int *status ); // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert_with_duplicates( Key_type key, T val, int *status ); + __device__ __forceinline__ void insert( Key_type key, T val, int *status ); // Load a set. __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); // Store the map. @@ -720,25 +657,6 @@ class Hash_map __device__ __forceinline__ void store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ); // Update a value in the table but do not insert if it doesn't exist. __device__ __forceinline__ bool update( Key_type key, T value ); - - protected: - // Get the selected item in the register buffer. - __device__ __forceinline__ int get_selected( int hash ) const - { - return static_cast(m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE]); - } - - // Is it the selected item in the register buffer. - __device__ __forceinline__ bool is_selected( int hash, int lane_id ) const - { - return m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] == reinterpret_cast(lane_id); - } - - // Push my ID in the register buffer. - __device__ __forceinline__ void try_selection( int hash, int lane_id ) - { - m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] = reinterpret_cast(lane_id); - } }; // ==================================================================================================================== @@ -754,13 +672,7 @@ void Hash_map::clear() for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) { m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); + m_smem_vals[i_step * WARP_SIZE + lane_id] = amgx::types::util::get_zero(); } if ( !m_any_gmem ) @@ -773,43 +685,6 @@ void Hash_map::clear() for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) { m_gmem_keys[offset] = -1; - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear_all() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; m_gmem_vals[offset] = amgx::types::util::get_zero(); } @@ -820,241 +695,73 @@ void Hash_map::clear_all() template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > __device__ __forceinline__ -void Hash_map::insert( Key_type key, T a_value, T b_value, int *status ) +void Hash_map::insert( Key_type key, T val, int *status ) { - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); + const short lane_id = utils::lane_id(); + bool active = key != -1; + Key_type winning_key = -1; + int active_mask; - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - done = true; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; #pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( b_value, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + a_value * other_val; - } - } - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll + active_mask = utils::ballot( active ); - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) + if ( active_mask == 0 ) { return; } - m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] = m_gmem_vals[hash] + a_value * b_value; - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } + winning_key = utils::atomic_CAS(&m_smem_keys[hash], -1, key); + winning_key = (winning_key == -1) ? key : winning_key; - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if (key == winning_key) { - m_gmem_vals[hash] = a_value * b_value; - done = true; + utils::atomic_add(&m_smem_vals[hash], val); + active = false; } } } - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert_with_duplicates( Key_type key, T val, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll + #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) - { - break; - } - - bool candidate = false; - bool maybe_in_conflict = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; // Is it really done??? - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; - } - } - - // Fix conflicts. - bool in_conflict = maybe_in_conflict && !this->is_selected(hash, lane_id); - - while ( utils::any( in_conflict ) ) - { - int winner = in_conflict ? this->get_selected(hash) : WARP_SIZE; - T other_val = utils::shfl( val, winner ); - - if ( in_conflict ) - { - this->try_selection(hash, lane_id); - } - - if ( in_conflict && this->is_selected(hash, lane_id) ) - { - val = val + other_val; - in_conflict = false; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll + active_mask = utils::ballot( active ); - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) + if ( active_mask == 0 ) { return; } m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); - if ( stored_key == key ) - { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_gmem_keys[hash], -1, key); + winning_key = (winning_key == -1) ? key : winning_key; - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if (key == winning_key) { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; + utils::atomic_add(&m_gmem_vals[hash], val); + active = false; } } } - if ( status == NULL || utils::all(done) ) + if (status == NULL ) { return; } @@ -1065,6 +772,7 @@ void Hash_map::insert_with_dup } } + // ==================================================================================================================== template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > @@ -1079,7 +787,7 @@ void Hash_map::load( int count Key_type key = keys[offset]; int idx = pos [offset]; // Where to store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -1123,7 +831,7 @@ void Hash_map::store( int coun if ( key != -1 ) { - vals[dst_offset] = m_regs_vals[i_step]; + vals[dst_offset] = m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1185,7 +893,7 @@ void Hash_map::store( int coun if ( key != -1 ) { keys[dst_offset] = key; - vals[dst_offset] = m_regs_vals[i_step]; + vals[dst_offset] = m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1248,7 +956,7 @@ void Hash_map::store_map_keys_ if ( key != -1 ) { keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_regs_vals[i_step]; + vals[dst_offset] = alpha * m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1309,7 +1017,7 @@ void Hash_map::store_keys_scal if ( key != -1 ) { keys[dst_offset] = key; - vals[dst_offset] = alpha * m_regs_vals[i_step]; + vals[dst_offset] = alpha * m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1354,14 +1062,13 @@ bool Hash_map::update( Key_typ { const int lane_id = utils::lane_id(); bool done = key == -1, found = false; - m_smem_vote[lane_id].b32 = 0x20202020; #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { if ( i_hash > 0 && utils::all(done) ) { - break; + return found; } unsigned ukey = reinterpret_cast( key ); @@ -1373,7 +1080,7 @@ bool Hash_map::update( Key_typ if ( stored_key == key ) { - this->try_selection( hash, lane_id ); + utils::atomic_add(&m_smem_vals[hash], val); found = true; } @@ -1381,22 +1088,6 @@ bool Hash_map::update( Key_typ } } - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] += other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -1407,7 +1098,7 @@ bool Hash_map::update( Key_typ } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -1415,7 +1106,7 @@ bool Hash_map::update( Key_typ if ( stored_key == key ) { - m_gmem_vals[hash] += val; + utils::atomic_add(&m_gmem_vals[hash], val); found = true; } @@ -1427,4 +1118,3 @@ bool Hash_map::update( Key_typ } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/3rd_party/AMGX/base/include/hash_containers_sm70.inl b/3rd_party/AMGX/base/include/hash_containers_sm70.inl index 96333ab23..d971bad1e 100644 --- a/3rd_party/AMGX/base/include/hash_containers_sm70.inl +++ b/3rd_party/AMGX/base/include/hash_containers_sm70.inl @@ -49,7 +49,7 @@ class Hash_set // The size of the table (occupancy). int m_smem_count, m_gmem_count; // The keys stored in the hash table. - volatile Key_type *m_smem_keys, *m_gmem_keys; + Key_type *m_smem_keys, *m_gmem_keys; // The size of the global memory buffer. const int m_gmem_size; // Is it ok? @@ -57,22 +57,19 @@ class Hash_set public: // Constructor. - __device__ __forceinline__ Hash_set( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, int gmem_size ) : + __device__ __forceinline__ Hash_set( Key_type *smem_keys, Key_type *gmem_keys, int gmem_size ) : m_smem_count(0), m_gmem_count(1), m_smem_keys (smem_keys), m_gmem_keys (gmem_keys), m_gmem_size (gmem_size), m_fail (false) - {} // Clear the table. __device__ __forceinline__ void clear( bool skip_gmem = false ); // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). __device__ __forceinline__ int compute_size(); - // Compute the size of the table. Only thread with lane_id==0 gives the correct result (no broadcast of the value). - __device__ __forceinline__ int compute_size_with_duplicates(); // Does the set contain those values? __device__ __forceinline__ bool contains( Key_type key ) const; // Find an index. @@ -146,45 +143,6 @@ int Hash_set::compute_size() // ==================================================================================================================== -template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> -__device__ __forceinline__ -int Hash_set::compute_size_with_duplicates() -{ - int lane_id = utils::lane_id(); - // Count the number of keys in SMEM. - int sum = 0; - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - const int offset = i_step * WARP_SIZE + lane_id; - Key_type key = m_smem_keys[offset]; - sum += __popc( utils::ballot( key != -1 ) ); - } - - // Is there any key in GMEM. If not, just quit. - m_gmem_count = utils::any(m_gmem_count > 0); - - if ( !m_gmem_count ) - { - return sum; - } - - // Count the number of keys in GMEM. -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - Key_type key = m_gmem_keys[offset]; - sum += __popc( utils::ballot( key != -1, utils::activemask() ) ); - } - - return sum; -} - -// ==================================================================================================================== - template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE> __device__ __forceinline__ bool Hash_set::contains( Key_type key ) const @@ -218,7 +176,6 @@ bool Hash_set::contains( Key_type } } - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -229,7 +186,7 @@ bool Hash_set::contains( Key_type } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -283,7 +240,6 @@ int Hash_set::find_index( Key_typ } } - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -294,7 +250,7 @@ int Hash_set::find_index( Key_typ } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -317,83 +273,71 @@ template< typename Key_type, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > __device__ __forceinline__ void Hash_set::insert( Key_type key, int *status ) { - bool done = key == -1; -#pragma unroll + bool active = key != -1; + Key_type winning_key; + int active_mask; +#pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) + active_mask = utils::ballot( active ); + + if ( active_mask == 0 ) { return; } - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) + if ( active ) { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_smem_keys[hash], -1, key); - if ( candidate ) + if ( winning_key == -1 ) { - m_smem_keys[hash] = key; + winning_key = key; + m_smem_count++; } - if ( candidate && key == m_smem_keys[hash] ) // More than one candidate may have written to that slot. + + if ( key == winning_key ) { - m_smem_count++; - done = true; + active = false; } } } - - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) + active_mask = utils::ballot( active ); + + if ( active_mask == 0 ) { return; } - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_gmem_keys[hash], -1, key); - if ( candidate ) + if ( winning_key == -1 ) { - m_gmem_keys[hash] = key; + winning_key = key; + m_gmem_count++; } - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if ( key == winning_key ) { - m_gmem_count++; - done = true; + active = false; } } } - if ( utils::all(done) ) + if ( utils::ballot( active ) == 0 ) { return; } @@ -422,7 +366,7 @@ void Hash_set::load( int count, c Key_type key = keys[offset]; int idx = pos [offset]; // Where to store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -451,7 +395,7 @@ void Hash_set::load_index( int co Key_type key = keys[offset]; int idx = pos [offset]; // Store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -676,11 +620,9 @@ class Hash_map { protected: // The keys stored in the map. - volatile Key_type *m_smem_keys, *m_gmem_keys; - // Vote buffer for values. - volatile Word *m_smem_vote; - // Registers to store values. - T m_regs_vals[4]; + Key_type *m_smem_keys, *m_gmem_keys; + // Shared memory values + T *m_smem_vals = NULL; // The values stored in the map. T *m_gmem_vals; // The size of the global memory buffer. @@ -689,12 +631,11 @@ class Hash_map bool m_any_gmem; public: - // Constructor. __device__ __forceinline__ - Hash_map( volatile Key_type *smem_keys, volatile Key_type *gmem_keys, volatile Word *smem_vote, T *gmem_vals, int gmem_size ) : + Hash_map( Key_type *smem_keys, Key_type *gmem_keys, T *smem_vals, T *gmem_vals, int gmem_size ) : m_smem_keys(smem_keys), m_gmem_keys(gmem_keys), - m_smem_vote(smem_vote), + m_smem_vals(smem_vals), m_gmem_vals(gmem_vals), m_gmem_size(gmem_size), m_any_gmem (true) @@ -702,12 +643,8 @@ class Hash_map // Clear the table. It doesn't clear GMEM values. __device__ __forceinline__ void clear(); - // Clear the table. It also clears GMEM values (set them to 0). - __device__ __forceinline__ void clear_all(); - // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert( Key_type key, T a_value, T b_value, int *status ); // Insert a key/value inside the hash table. - __device__ __forceinline__ void insert_with_duplicates( Key_type key, T val, int *status ); + __device__ __forceinline__ void insert( Key_type key, T val, int *status ); // Load a set. __device__ __forceinline__ void load( int count, const Key_type *keys, const int *pos ); // Store the map. @@ -720,25 +657,6 @@ class Hash_map __device__ __forceinline__ void store_keys_scale_values( int count, Key_type *keys, T alpha, T *vals ); // Update a value in the table but do not insert if it doesn't exist. __device__ __forceinline__ bool update( Key_type key, T value ); - - protected: - // Get the selected item in the register buffer. - __device__ __forceinline__ int get_selected( int hash ) const - { - return static_cast(m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE]); - } - - // Is it the selected item in the register buffer. - __device__ __forceinline__ bool is_selected( int hash, int lane_id ) const - { - return m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] == reinterpret_cast(lane_id); - } - - // Push my ID in the register buffer. - __device__ __forceinline__ void try_selection( int hash, int lane_id ) - { - m_smem_vote[hash % WARP_SIZE].b8[hash / WARP_SIZE] = reinterpret_cast(lane_id); - } }; // ==================================================================================================================== @@ -754,13 +672,7 @@ void Hash_map::clear() for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) { m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); + m_smem_vals[i_step * WARP_SIZE + lane_id] = amgx::types::util::get_zero(); } if ( !m_any_gmem ) @@ -773,43 +685,6 @@ void Hash_map::clear() for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) { m_gmem_keys[offset] = -1; - } - - m_any_gmem = false; -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::clear_all() -{ - int lane_id = utils::lane_id(); - const int NUM_STEPS = SMEM_SIZE / WARP_SIZE; -#pragma unroll - - for ( int i_step = 0 ; i_step < NUM_STEPS ; ++i_step ) - { - m_smem_keys[i_step * WARP_SIZE + lane_id] = -1; - } - -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - m_regs_vals[i_regs] = amgx::types::util::get_zero(); - } - - if ( !m_any_gmem ) - { - return; - } - -#pragma unroll 4 - - for ( int offset = lane_id ; offset < m_gmem_size ; offset += WARP_SIZE ) - { - m_gmem_keys[offset] = -1; m_gmem_vals[offset] = amgx::types::util::get_zero(); } @@ -820,241 +695,73 @@ void Hash_map::clear_all() template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > __device__ __forceinline__ -void Hash_map::insert( Key_type key, T a_value, T b_value, int *status ) +void Hash_map::insert( Key_type key, T val, int *status ) { - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll - - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( i_hash > 0 && utils::all(done) ) - { - break; - } - - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); + const short lane_id = utils::lane_id(); + bool active = key != -1; + Key_type winning_key = -1; + int active_mask; - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - done = true; - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - done = true; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; #pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) + for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( b_value, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + a_value * other_val; - } - } - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll + active_mask = utils::ballot( active ); - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) + if ( active_mask == 0 ) { return; } - m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; - - if ( stored_key == key ) - { - m_gmem_vals[hash] = m_gmem_vals[hash] + a_value * b_value; - done = true; - } + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - candidate = stored_key == -1; - - if ( candidate ) - { - m_gmem_keys[hash] = key; - } + winning_key = utils::atomic_CAS(&m_smem_keys[hash], -1, key); + winning_key = (winning_key == -1) ? key : winning_key; - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if (key == winning_key) { - m_gmem_vals[hash] = a_value * b_value; - done = true; + utils::atomic_add(&m_smem_vals[hash], val); + active = false; } } } - if ( status == NULL || utils::all(done) ) - { - return; - } - - if ( lane_id == 0 ) - { - status[0] = 1; - } -} - -// ==================================================================================================================== - -template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > -__device__ __forceinline__ -void Hash_map::insert_with_duplicates( Key_type key, T val, int *status ) -{ - const int lane_id = utils::lane_id(); - bool done = key == -1; - m_smem_vote[lane_id].b32 = 0x20202020; -#pragma unroll + #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { - if ( utils::all(done) ) - { - break; - } - - bool candidate = false; - bool maybe_in_conflict = false; - unsigned ukey = reinterpret_cast( key ); - int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (SMEM_SIZE - 1); - - if ( !done ) - { - Key_type stored_key = m_smem_keys[hash]; - - if ( stored_key == key ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; // Is it really done??? - } - - candidate = stored_key == -1; - - if ( candidate ) - { - m_smem_keys[hash] = key; - } - - if ( candidate && key == m_smem_keys[hash] ) - { - this->try_selection( hash, lane_id ); - maybe_in_conflict = true; - done = true; - } - } - - // Fix conflicts. - bool in_conflict = maybe_in_conflict && !this->is_selected(hash, lane_id); - - while ( utils::any( in_conflict ) ) - { - int winner = in_conflict ? this->get_selected(hash) : WARP_SIZE; - T other_val = utils::shfl( val, winner ); - - if ( in_conflict ) - { - this->try_selection(hash, lane_id); - } - - if ( in_conflict && this->is_selected(hash, lane_id) ) - { - val = val + other_val; - in_conflict = false; - } - } - } - - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] = m_regs_vals[i_regs] + other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); -#pragma unroll + active_mask = utils::ballot( active ); - for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) - { - if ( utils::all(done) ) + if ( active_mask == 0 ) { return; } m_any_gmem = true; - bool candidate = false; - unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); - if ( !done ) + if ( active ) { - Key_type stored_key = m_gmem_keys[hash]; + unsigned ukey = reinterpret_cast( key ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); - if ( stored_key == key ) - { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; - } - candidate = stored_key == -1; + winning_key = utils::atomic_CAS(&m_gmem_keys[hash], -1, key); + winning_key = (winning_key == -1) ? key : winning_key; - if ( candidate ) - { - m_gmem_keys[hash] = key; - } - if ( candidate && key == m_gmem_keys[hash] ) // More than one candidate may have written to that slot. + if (key == winning_key) { - utils::atomic_add( &m_gmem_vals[hash], val ); - done = true; + utils::atomic_add(&m_gmem_vals[hash], val); + active = false; } } } - if ( status == NULL || utils::all(done) ) + if (status == NULL ) { return; } @@ -1065,6 +772,7 @@ void Hash_map::insert_with_dup } } + // ==================================================================================================================== template< typename Key_type, typename T, int SMEM_SIZE, int NUM_HASH_FCTS, int WARP_SIZE > @@ -1079,7 +787,7 @@ void Hash_map::load( int count Key_type key = keys[offset]; int idx = pos [offset]; // Where to store the item. - volatile Key_type *ptr = m_smem_keys; + Key_type *ptr = m_smem_keys; if ( idx >= SMEM_SIZE ) { @@ -1123,7 +831,7 @@ void Hash_map::store( int coun if ( key != -1 ) { - vals[dst_offset] = m_regs_vals[i_step]; + vals[dst_offset] = m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1185,7 +893,7 @@ void Hash_map::store( int coun if ( key != -1 ) { keys[dst_offset] = key; - vals[dst_offset] = m_regs_vals[i_step]; + vals[dst_offset] = m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1248,7 +956,7 @@ void Hash_map::store_map_keys_ if ( key != -1 ) { keys[dst_offset] = map[key]; - vals[dst_offset] = alpha * m_regs_vals[i_step]; + vals[dst_offset] = alpha * m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1309,7 +1017,7 @@ void Hash_map::store_keys_scal if ( key != -1 ) { keys[dst_offset] = key; - vals[dst_offset] = alpha * m_regs_vals[i_step]; + vals[dst_offset] = alpha * m_smem_vals[offset]; } warp_offset += __popc( poll ); @@ -1354,14 +1062,13 @@ bool Hash_map::update( Key_typ { const int lane_id = utils::lane_id(); bool done = key == -1, found = false; - m_smem_vote[lane_id].b32 = 0x20202020; #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) { if ( i_hash > 0 && utils::all(done) ) { - break; + return found; } unsigned ukey = reinterpret_cast( key ); @@ -1373,7 +1080,7 @@ bool Hash_map::update( Key_typ if ( stored_key == key ) { - this->try_selection( hash, lane_id ); + utils::atomic_add(&m_smem_vals[hash], val); found = true; } @@ -1381,22 +1088,6 @@ bool Hash_map::update( Key_typ } } - Word my_vote; - my_vote.b32 = m_smem_vote[lane_id].b32; -#pragma unroll - - for ( int i_regs = 0 ; i_regs < 4 ; ++i_regs ) - { - int my_src = my_vote.b8[i_regs]; - T other_val = utils::shfl( val, my_src ); - - if ( my_src != WARP_SIZE ) - { - m_regs_vals[i_regs] += other_val; - } - } - - const int num_bits = utils::bfind( m_gmem_size ); #pragma unroll for ( int i_hash = 0 ; i_hash < NUM_HASH_FCTS ; ++i_hash ) @@ -1407,7 +1098,7 @@ bool Hash_map::update( Key_typ } unsigned ukey = reinterpret_cast( key ); - int hash = utils::bfe( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash], num_bits ); + int hash = ( (ukey ^ c_hash_keys[i_hash]) + c_hash_keys[NUM_HASH_FCTS + i_hash] ) & (m_gmem_size - 1); if ( !done ) { @@ -1415,7 +1106,7 @@ bool Hash_map::update( Key_typ if ( stored_key == key ) { - m_gmem_vals[hash] += val; + utils::atomic_add(&m_gmem_vals[hash], val); found = true; } @@ -1427,4 +1118,3 @@ bool Hash_map::update( Key_typ } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - diff --git a/3rd_party/AMGX/base/include/matrix.h b/3rd_party/AMGX/base/include/matrix.h index ce5598d88..ee155be5e 100644 --- a/3rd_party/AMGX/base/include/matrix.h +++ b/3rd_party/AMGX/base/include/matrix.h @@ -256,7 +256,7 @@ class MatrixBase : public AuxData, public Operator if (cuMatDescr != NULL) { - cusparseCheckError(cusparseDestroyMatDescr(cuMatDescr)); + cusparseDestroyMatDescr(cuMatDescr); cuMatDescr = NULL; } @@ -845,6 +845,8 @@ class MatrixBase : public AuxData, public Operator inline Resources *getResources() const { return m_resources; } inline void setResources(Resources *resources) { m_resources = resources; } + bool isLatencyHidingEnabled(AMG_Config& cfg); + IVector m_larger_color_offsets; //size: num_rows IVector m_smaller_color_offsets; //size: num_rows, IVector m_values_permutation_vector; diff --git a/3rd_party/AMGX/base/include/norm.h b/3rd_party/AMGX/base/include/norm.h index a584bbfaf..c00d26fed 100644 --- a/3rd_party/AMGX/base/include/norm.h +++ b/3rd_party/AMGX/base/include/norm.h @@ -41,10 +41,13 @@ namespace amgx * Returns the norm of a vector *********************************************************/ template -typename types::PODTypes::type get_norm(const MatrixType &A, const VectorType &r, const NormType norm_type); +typename types::PODTypes::type get_norm(const MatrixType &A, const VectorType &r, const NormType norm_type, typename types::PODTypes::type norm_factor = 1.0); template -void get_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm); +void get_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm, typename types::PODTypes::type norm_factor = 1.0); + +template +void compute_norm_factor(MatrixType &A, VectorType &b, VectorType &x, const NormType normType, typename types::PODTypes::type &normFactor); } // namespace amgx diff --git a/3rd_party/AMGX/base/include/sm_utils.inl b/3rd_party/AMGX/base/include/sm_utils.inl index 7b71854b6..aefd187f4 100644 --- a/3rd_party/AMGX/base/include/sm_utils.inl +++ b/3rd_party/AMGX/base/include/sm_utils.inl @@ -87,6 +87,17 @@ static __device__ __forceinline__ void atomic_add( cuDoubleComplex *address, cuD atomic_add((double *)((char *)(address) + sizeof(double)), cuCimag(value)); } +static __device__ __forceinline__ int64_t atomic_CAS(int64_t* address, int64_t compare, int64_t val) +{ + return (int64_t)atomicCAS((unsigned long long *)address, (unsigned long long)compare, (unsigned long long)val); +} + +static __device__ __forceinline__ int atomic_CAS(int* address, int compare, int val) +{ + return atomicCAS(address, compare, val); +} + + // ==================================================================================================================== // Bit tools. // ==================================================================================================================== @@ -343,9 +354,7 @@ static __device__ __forceinline__ float shfl( float r, int lane, int bound = war static __device__ __forceinline__ double shfl( double r, int lane, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { #if CUDART_VERSION >= 9000 - int hi = __shfl_sync(mask, __double2hiint(r), lane, bound ); - int lo = __shfl_sync(mask, __double2loint(r), lane, bound ); - return __hiloint2double( hi, lo ); + return __shfl_sync(mask, r, lane, bound ); #else int hi = __shfl( __double2hiint(r), lane, bound ); int lo = __shfl( __double2loint(r), lane, bound ); @@ -395,9 +404,7 @@ static __device__ __forceinline__ float shfl_xor( float r, int lane_mask, int bo static __device__ __forceinline__ double shfl_xor( double r, int lane_mask, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { #if CUDART_VERSION >= 9000 - int hi = __shfl_xor_sync( mask, __double2hiint(r), lane_mask, bound ); - int lo = __shfl_xor_sync( mask, __double2loint(r), lane_mask, bound ); - return __hiloint2double( hi, lo ); + return __shfl_xor_sync( mask, r, lane_mask, bound ); #else int hi = __shfl_xor( __double2hiint(r), lane_mask, bound ); int lo = __shfl_xor( __double2loint(r), lane_mask, bound ); @@ -446,9 +453,7 @@ static __device__ __forceinline__ float shfl_down( float r, int offset, int boun static __device__ __forceinline__ double shfl_down( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { #if CUDART_VERSION >= 9000 - int hi = __shfl_down_sync( mask, __double2hiint(r), offset, bound ); - int lo = __shfl_down_sync( mask, __double2loint(r), offset, bound ); - return __hiloint2double( hi, lo ); + return __shfl_down_sync( mask, r, offset, bound ); #else int hi = __shfl_down( __double2hiint(r), offset, bound ); int lo = __shfl_down( __double2loint(r), offset, bound ); @@ -498,9 +503,7 @@ static __device__ __forceinline__ float shfl_up( float r, int offset, int bound static __device__ __forceinline__ double shfl_up( double r, int offset, int bound = warpSize, unsigned int mask = DEFAULT_MASK ) { #if CUDART_VERSION >= 9000 - int hi = __shfl_up_sync( mask, __double2hiint(r), offset, bound ); - int lo = __shfl_up_sync( mask, __double2loint(r), offset, bound ); - return __hiloint2double( hi, lo ); + return __shfl_up_sync( mask, r, offset, bound ); #else int hi = __shfl_up( __double2hiint(r), offset, bound ); int lo = __shfl_up( __double2loint(r), offset, bound ); diff --git a/3rd_party/AMGX/base/include/solvers/solver.h b/3rd_party/AMGX/base/include/solvers/solver.h index 381d2af57..add41b5de 100644 --- a/3rd_party/AMGX/base/include/solvers/solver.h +++ b/3rd_party/AMGX/base/include/solvers/solver.h @@ -120,6 +120,7 @@ class Solver : public AuxData inline bool compute_norm_and_converged() { compute_norm(); + return converged(); } @@ -252,6 +253,7 @@ class Solver : public AuxData std::vector m_res_history; PODVector_h m_nrm; PODVector_h m_nrm_ini; + PODValueB m_norm_factor; bool m_use_scalar_norm; // Convergence object. To decide convergence. diff --git a/3rd_party/AMGX/base/include/types.h b/3rd_party/AMGX/base/include/types.h index 6a12045b2..03e4407c2 100644 --- a/3rd_party/AMGX/base/include/types.h +++ b/3rd_party/AMGX/base/include/types.h @@ -36,7 +36,7 @@ namespace amgx enum ASSIGNMENTS {COARSE = -1, FINE = -2, STRONG_FINE = -3, UNASSIGNED = -4}; // NormType -enum NormType {L1, L2, LMAX}; +enum NormType {L1, L1_SCALED, L2, LMAX}; inline const char *getString(NormType p) { switch (p) @@ -44,6 +44,9 @@ inline const char *getString(NormType p) case L1: return "L1"; + case L1_SCALED: + return "L1_SCALED"; + case L2: return "L2"; diff --git a/3rd_party/AMGX/base/src/amg_level.cu b/3rd_party/AMGX/base/src/amg_level.cu index 113e6af16..e8a9b4bdb 100644 --- a/3rd_party/AMGX/base/src/amg_level.cu +++ b/3rd_party/AMGX/base/src/amg_level.cu @@ -93,12 +93,11 @@ void AMG_Level::setup() if (separation_interior & INTERIOR == 0) { FatalError("Interior separation must include interior nodes", AMGX_ERR_CONFIGURATION); } - m_min_rows_latency_hiding = amg->m_cfg->AMG_Config::getParameter("min_rows_latency_hiding", "default"); this->getA().setExteriorView(separation_exterior); int offset, size; this->getA().getOffsetAndSizeForView(separation_exterior, &offset, &size); - if (m_min_rows_latency_hiding < 0 || size < m_min_rows_latency_hiding) + if (!this->getA().isLatencyHidingEnabled(*this->amg->m_cfg)) { this->getA().setInteriorView(separation_exterior); } diff --git a/3rd_party/AMGX/base/src/amgx_cusparse.cu b/3rd_party/AMGX/base/src/amgx_cusparse.cu index df4b728e1..2e59c2992 100644 --- a/3rd_party/AMGX/base/src/amgx_cusparse.cu +++ b/3rd_party/AMGX/base/src/amgx_cusparse.cu @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -546,6 +547,17 @@ void Cusparse::bsrmv( const int color, } } +__global__ void offset_by_col_off(int nrows, int* rows, const int* bsrRowPtr) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if(i >= nrows+1) + { + return; + } + + rows[i] = bsrRowPtr[i] - bsrRowPtr[0]; +} + template< class TConfig > void Cusparse::bsrmv_internal( const typename TConfig::VecPrec alphaConst, const Matrix &A, @@ -556,8 +568,8 @@ void Cusparse::bsrmv_internal( const typename TConfig::VecPrec alphaConst, const cudaStream_t &stream) { typedef typename TConfig::VecPrec ValueTypeB; - int offset, size, nnz; - A.getOffsetAndSizeForView(view, &offset, &size); + int row_off, nrows, nnz; + A.getOffsetAndSizeForView(view, &row_off, &nrows); A.getNnzForView(view, &nnz); cusparseDirection_t direction = CUSPARSE_DIRECTION_COLUMN; @@ -571,18 +583,16 @@ void Cusparse::bsrmv_internal( const typename TConfig::VecPrec alphaConst, if (has_offdiag ) { - cusparseSetStream(Cusparse::get_instance().m_handle, stream); bsrmv( Cusparse::get_instance().m_handle, direction, CUSPARSE_OPERATION_NON_TRANSPOSE, - size, A.get_num_cols(), nnz, &alphaConst, + nrows, A.get_num_cols(), nnz, &alphaConst, A.cuMatDescr, A.values.raw(), - A.m_seq_offsets.raw() + offset, - A.row_offsets.raw() + offset, A.col_indices.raw(), + A.m_seq_offsets.raw() + row_off, + A.row_offsets.raw() + row_off, A.col_indices.raw(), A.get_block_dimx(), x.raw(), &betaConst, - y.raw() + offset * A.get_block_dimx() ); - // Reset to default stream - cusparseSetStream(Cusparse::get_instance().m_handle, 0); + y.raw() + row_off * A.get_block_dimx(), + stream); } if (A.hasProps(DIAG)) @@ -598,18 +608,16 @@ void Cusparse::bsrmv_internal( const typename TConfig::VecPrec alphaConst, beta = types::util::get_one(); } - cusparseSetStream(Cusparse::get_instance().m_handle, stream); bsrmv( Cusparse::get_instance().m_handle, direction, CUSPARSE_OPERATION_NON_TRANSPOSE, - size, A.get_num_cols(), A.get_num_rows(), &alphaConst, + nrows, A.get_num_cols(), A.get_num_rows(), &alphaConst, A.cuMatDescr, A.values.raw() + A.diagOffset()*A.get_block_size(), A.m_seq_offsets.raw(), - A.m_seq_offsets.raw() + offset, A.m_seq_offsets.raw(), + A.m_seq_offsets.raw() + row_off, A.m_seq_offsets.raw(), A.get_block_dimx(), x.raw(), &beta, - y.raw() + offset * A.get_block_dimx() ); - // Reset to default stream - cusparseSetStream(Cusparse::get_instance().m_handle, 0); + y.raw() + row_off * A.get_block_dimx(), + stream); } } @@ -659,7 +667,7 @@ void Cusparse::bsrmv_internal_with_mask( const typename TConfig::VecPrec alphaCo { cusparseSetStream(Cusparse::get_instance().m_handle, stream); bsrxmv_internal( Cusparse::get_instance().m_handle, direction, CUSPARSE_OPERATION_NON_TRANSPOSE, nrows, - A.get_num_rows(), A.get_num_cols(), nnz, &alphaConst, + nrows, A.get_num_cols(), nnz, &alphaConst, A.cuMatDescr, A.values.raw(), A.manager->getRowsListForView(view).raw(), @@ -707,8 +715,8 @@ void Cusparse::bsrmv_internal_with_mask_restriction( const typename TConfig::Vec direction = CUSPARSE_DIRECTION_ROW; } - int offset, nrows, nnz; - R.getFixedSizesForView(view, &offset, &nrows, &nnz); + int row_off, nrows, nnz; + R.getFixedSizesForView(view, &row_off, &nrows, &nnz); bool has_offdiag = nnz != 0; typedef typename Matrix::index_type index_type; @@ -720,18 +728,16 @@ void Cusparse::bsrmv_internal_with_mask_restriction( const typename TConfig::Vec if (has_offdiag) { - cusparseSetStream(Cusparse::get_instance().m_handle, stream); bsrmv( Cusparse::get_instance().m_handle, direction, CUSPARSE_OPERATION_NON_TRANSPOSE, nrows, R.get_num_cols(), nnz, &alphaConst, R.cuMatDescr, R.values.raw(), - R.m_seq_offsets.raw() + offset, - R.row_offsets.raw() + offset, R.col_indices.raw(), + R.m_seq_offsets.raw() + row_off, + R.row_offsets.raw() + row_off, R.col_indices.raw(), R.get_block_dimx(), x.raw(), &betaConst, - y.raw() + offset * R.get_block_dimx() ); - // Reset to default stream - cusparseSetStream(Cusparse::get_instance().m_handle, 0); + y.raw() + row_off * R.get_block_dimx(), + stream); } if (R.hasProps(DIAG)) @@ -753,23 +759,20 @@ void Cusparse::bsrmv_internal( const typename TConfig::VecPrec alphaConst, const cudaStream_t &stream) { typedef typename TConfig::VecPrec ValueType; - int offset, size; - A.getOffsetAndSizeForView(view, &offset, &size); + int row_off, nrows, nnz; + A.getFixedSizesForView(view, &row_off, &nrows, &nnz); cusparseDirection_t direction = A.getBlockFormat() == ROW_MAJOR ? CUSPARSE_DIRECTION_ROW : CUSPARSE_DIRECTION_COLUMN; - cusparseSetStream(Cusparse::get_instance().m_handle, stream); bsrmv( Cusparse::get_instance().m_handle, direction, CUSPARSE_OPERATION_NON_TRANSPOSE, - size, A.get_num_cols(), A.get_num_nz(), &alphaConst, + nrows, A.get_num_cols(), nnz, &alphaConst, A.cuMatDescr, E.raw(), A.m_seq_offsets.raw(), - A.m_seq_offsets.raw() + offset, A.m_seq_offsets.raw(), + A.m_seq_offsets.raw() + row_off, A.m_seq_offsets.raw(), A.get_block_dimx(), x.raw(), &betaConst, - y.raw() + offset * A.get_block_dimx() ); - - // Reset to default stream - cusparseSetStream(Cusparse::get_instance().m_handle, 0); + y.raw() + row_off * A.get_block_dimx(), + stream); } @@ -994,22 +997,40 @@ template inline void generic_SpMV(cusparseHandle_t handle, cusparseOperation_t trans, int mb, int nb, int nnzb, const MatType *alpha, - const MatType *vals, + const MatType *val, const IndType *rowPtr, const IndType *colInd, const VecType *x, const VecType *beta, VecType *y, cudaDataType matType, - cudaDataType vecType) + cudaDataType vecType, + const cudaStream_t& stream) { + int col_off; + cudaMemcpyAsync(&col_off, &rowPtr[0], sizeof(int), cudaMemcpyDefault, stream); + cudaStreamSynchronize(stream); + + IndType* rows = const_cast(rowPtr); + IndType* cols = const_cast(colInd) + col_off; + MatType* vals = const_cast(val) + col_off; + + if(col_off > 0) + { + amgx::memory::cudaMalloc((void**)&rows, sizeof(IndType)*(mb+1)); + + constexpr int nthreads = 128; + const int nblocks = (mb + 1) / nthreads + 1; + offset_by_col_off<<>>(mb, rows, rowPtr); + } + cusparseSpMatDescr_t matA_descr; cusparseDnVecDescr_t vecX_descr; cusparseDnVecDescr_t vecY_descr; cusparseCheckError(cusparseCreateDnVec(&vecX_descr, nb, const_cast(x), vecType)); cusparseCheckError(cusparseCreateDnVec(&vecY_descr, mb, const_cast(y), vecType)); cusparseCheckError( - cusparseCreateCsr(&matA_descr, mb, nb, nnzb, const_cast(rowPtr), const_cast(colInd), + cusparseCreateCsr(&matA_descr, mb, nb, nnzb, const_cast(rows), const_cast(cols), const_cast(vals), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, matType)); size_t bufferSize = 0; @@ -1031,6 +1052,11 @@ inline void generic_SpMV(cusparseHandle_t handle, cusparseOperation_t trans, { amgx::memory::cudaFreeAsync(dBuffer); } + + if(col_off > 0) + { + amgx::memory::cudaFreeAsync(rows); + } } #endif @@ -1045,12 +1071,16 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const float *x, const float *beta, - float *y) + float *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + if (blockDim == 1) { #ifdef CUSPARSE_GENERIC_INTERFACES - generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_R_32F, CUDA_R_32F); + generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_R_32F, CUDA_R_32F, stream); #else cusparseCheckError(cusparseScsrmv(handle, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, x, beta, y)); #endif @@ -1059,6 +1089,9 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c { cusparseCheckError(cusparseSbsrmv(handle, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y)); } + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, @@ -1072,12 +1105,17 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const double *x, const double *beta, - double *y) + double *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + if (blockDim == 1) { #ifdef CUSPARSE_GENERIC_INTERFACES - generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_R_64F, CUDA_R_64F); + generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_R_64F, CUDA_R_64F, stream); + #else cusparseCheckError(cusparseDcsrmv(handle, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, x, beta, y)); #endif @@ -1086,6 +1124,9 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c { cusparseCheckError(cusparseDbsrmv(handle, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y)); } + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, @@ -1099,14 +1140,21 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const double *x, const double *beta, - double *y) + double *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + #ifndef DISABLE_MIXED_PRECISION const double *d_bsrVal = reinterpret_cast(const_cast(bsrVal)); // this works due to private API call in the matrix initialization which sets cusparse matrix description in the half precision mode cusparseCheckError(cusparseDbsrxmv(handle, dir, trans, mb, mb, nb, nnzb, alpha, descr, d_bsrVal, bsrMaskPtr, bsrRowPtr, bsrRowPtr + 1, bsrColInd, blockDim, x, beta, y)); #else FatalError("Mixed precision modes not currently supported for CUDA 10.1 or later.", AMGX_ERR_NOT_IMPLEMENTED); #endif + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } // Custom implementation of matrix-vector product to replace the original bsrxmv, @@ -1187,7 +1235,7 @@ inline void Xcsrxmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseO constexpr int nthreads = 128; constexpr int unroll_factor = 16; - int nblocks = sizeOfMask / nthreads; + int nblocks = sizeOfMask / nthreads + 1; csrxmv<<>>(sizeOfMask, *alpha, bsrVal, bsrMaskPtr, bsrRowPtr, bsrColInd, x, *beta, y); } @@ -1304,12 +1352,16 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const cuComplex *x, const cuComplex *beta, - cuComplex *y) + cuComplex *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + if (blockDim == 1) { #ifdef CUSPARSE_GENERIC_INTERFACES - generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_C_32F, CUDA_C_32F); + generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_C_32F, CUDA_C_32F, stream); #else cusparseCheckError(cusparseCcsrmv(handle, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, x, beta, y)); #endif @@ -1318,6 +1370,9 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c { cusparseCheckError(cusparseCbsrmv(handle, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y)); } + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, @@ -1331,12 +1386,16 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const cuDoubleComplex *x, const cuDoubleComplex *beta, - cuDoubleComplex *y) + cuDoubleComplex *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + if (blockDim == 1) { #ifdef CUSPARSE_GENERIC_INTERFACES - generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_C_64F, CUDA_C_64F); + generic_SpMV(handle, trans, mb, nb, nnzb, alpha, bsrVal, bsrRowPtr, bsrColInd, x, beta, y, CUDA_C_64F, CUDA_C_64F, stream); #else cusparseCheckError(cusparseZcsrmv(handle, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, x, beta, y)); #endif @@ -1345,6 +1404,9 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c { cusparseCheckError(cusparseZbsrmv(handle, dir, trans, mb, nb, nnzb, alpha, descr, bsrVal, bsrRowPtr, bsrColInd, blockDim, x, beta, y)); } + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, cusparseOperation_t trans, @@ -1358,14 +1420,21 @@ inline void Cusparse::bsrmv( cusparseHandle_t handle, cusparseDirection_t dir, c int blockDim, const cuDoubleComplex *x, const cuDoubleComplex *beta, - cuDoubleComplex *y) + cuDoubleComplex *y, + const cudaStream_t& stream) { + // Run cuSparse on selected stream + cusparseSetStream(handle, stream); + #ifndef DISABLE_MIXED_PRECISION const cuDoubleComplex *d_bsrVal = reinterpret_cast(const_cast(bsrVal)); cusparseCheckError(cusparseZbsrxmv(handle, dir, trans, mb, mb, nb, nnzb, alpha, descr, d_bsrVal, bsrMaskPtr, bsrRowPtr, bsrRowPtr + 1, bsrColInd, blockDim, x, beta, y)); #else FatalError("Mixed precision modes not currently supported for CUDA 10.1 or later.", AMGX_ERR_NOT_IMPLEMENTED); #endif + + // Reset cuSparse to default stream + cusparseSetStream(handle, 0); } @@ -1651,6 +1720,64 @@ void Cusparse::csrmm(typename TConfig::VecPrec alpha, Res.dirtybit = 1; } +template +void transpose_internal(cusparseHandle_t handle, int nRows, int nCols, int nNz, const T* Avals, const int* Arows, const int* Acols, T* Bvals, int* Brows, int* Bcols, cudaDataType valType) +{ + size_t bufferSize; + cusparseCheckError(cusparseCsr2cscEx2_bufferSize( + handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, valType, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG2, &bufferSize)); + + void *buffer = nullptr; + if (bufferSize > 0) + { + amgx::memory::cudaMalloc(&buffer, bufferSize); + } + + cusparseCheckError(cusparseCsr2cscEx2( + handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, valType, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG2, buffer)); + + if(bufferSize > 0) + { + amgx::memory::cudaFreeAsync(buffer); + } +} + +void transpose_internal(cusparseHandle_t handle, int nRows, int nCols, int nNz, const float* Avals, const int* Arows, const int* Acols, float* Bvals, int* Brows, int* Bcols) +{ + transpose_internal(handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, CUDA_R_32F); +} +void transpose_internal(cusparseHandle_t handle, int nRows, int nCols, int nNz, const double* Avals, const int* Arows, const int* Acols, double* Bvals, int* Brows, int* Bcols) +{ + transpose_internal(handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, CUDA_R_64F); +} +void transpose_internal(cusparseHandle_t handle, int nRows, int nCols, int nNz, const cuComplex* Avals, const int* Arows, const int* Acols, cuComplex* Bvals, int* Brows, int* Bcols) +{ + transpose_internal(handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, CUDA_C_32F); +} +void transpose_internal(cusparseHandle_t handle, int nRows, int nCols, int nNz, const cuDoubleComplex* Avals, const int* Arows, const int* Acols, cuDoubleComplex* Bvals, int* Brows, int* Bcols) +{ + transpose_internal(handle, nRows, nCols, nNz, Avals, Arows, Acols, Bvals, Brows, Bcols, CUDA_C_64F); +} + +template +void Cusparse::transpose(const Matrix& A, Matrix& B, const int nRows, const int nNz) +{ + cusparseHandle_t handle = Cusparse::get_instance().m_handle; + transpose_internal(handle, nRows, A.get_num_cols(), nNz, + A.values.raw(), A.row_offsets.raw(), A.col_indices.raw(), + B.values.raw(), B.row_offsets.raw(), B.col_indices.raw()); +} + +template +void Cusparse::transpose(const Matrix& A, Matrix& B) +{ + cusparseHandle_t handle = Cusparse::get_instance().m_handle; + transpose_internal(handle, A.get_num_rows(), A.get_num_cols(), A.get_num_nz(), + A.values.raw(), A.row_offsets.raw(), A.col_indices.raw(), + B.values.raw(), B.row_offsets.raw(), B.col_indices.raw()); +} //#define AMGX_CASE_LINE(CASE) template class Cusparse::Type>; // AMGX_FORALL_BUILDS(AMGX_CASE_LINE) @@ -1705,6 +1832,13 @@ AMGX_FORALL_BUILDS(AMGX_CASE_LINE) AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #undef AMGX_CASE_LINE +#define AMGX_CASE_LINE(CASE) \ + template void Cusparse::transpose(const Matrix::Type>& A, Matrix::Type>& B); \ + template void Cusparse::transpose(const Matrix::Type>& A, Matrix::Type>& B, const int nRows, const int nNz); +AMGX_FORALL_BUILDS(AMGX_CASE_LINE) +AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) +#undef AMGX_CASE_LINE + #ifndef DISABLE_MIXED_PRECISION #define AMGX_CASE_LINE(CASE) template struct CusparseMatPrec::Type>; AMGX_FORALL_BUILDS(AMGX_CASE_LINE) diff --git a/3rd_party/AMGX/base/src/classical/selectors/selector.cu b/3rd_party/AMGX/base/src/classical/selectors/selector.cu index 30010ea86..b89ee3c83 100644 --- a/3rd_party/AMGX/base/src/classical/selectors/selector.cu +++ b/3rd_party/AMGX/base/src/classical/selectors/selector.cu @@ -431,7 +431,7 @@ compute_c_hat_kernel( int A_num_rows, // Shared memory to vote. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -592,7 +592,7 @@ compute_c_hat_kernel( int A_num_rows, // Shared memory to vote. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -768,7 +768,7 @@ void __global__ createCfMapGlobal(const IndexType *cf_map, int64_t *cf_map_glob #include -enum { WARP_SIZE = 32, GRID_SIZE = 128, SMEM_SIZE = 128 }; +enum { WARP_SIZE = 32, GRID_SIZE = 1024, SMEM_SIZE = 128 }; template< typename Value_type, int CTA_SIZE, int WARP_SIZE > __global__ __launch_bounds__( CTA_SIZE ) @@ -946,18 +946,17 @@ void Selector >::cr const ValueType *Avalues = A.values.raw(); const IndexType Anum_rows = (int) A.get_num_rows(); int *cf_map_ptr = cf_map.raw(); - Hash_Workspace exp_wk; + Hash_Workspace exp_wk(true, GRID_SIZE); IntVector C_hat_start( A.get_num_rows() + 1, 0 ), C_hat_end( A.get_num_rows() + 1, 0 ); { const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - int work_offset = GRID_SIZE * NUM_WARPS; - cudaMemcpy( exp_wk.get_work_queue(), &work_offset, sizeof(int), cudaMemcpyHostToDevice ); int avg_nz_per_row = A.get_num_nz() / A.get_num_rows(); + int grid_size = A.get_num_rows()/NUM_WARPS + 1; if ( avg_nz_per_row < 16 ) { - amgx::classical::selector::estimate_c_hat_size_kernel< 8, CTA_SIZE, WARP_SIZE> <<< 2048, CTA_SIZE>>>( + amgx::classical::selector::estimate_c_hat_size_kernel< 8, CTA_SIZE, WARP_SIZE> <<< grid_size, CTA_SIZE>>>( A.get_num_rows(), A.row_offsets.raw(), A.col_indices.raw(), @@ -967,7 +966,7 @@ void Selector >::cr } else { - amgx::classical::selector::estimate_c_hat_size_kernel <<< 2048, CTA_SIZE>>>( + amgx::classical::selector::estimate_c_hat_size_kernel <<< grid_size, CTA_SIZE>>>( A.get_num_rows(), A.row_offsets.raw(), A.col_indices.raw(), @@ -1075,7 +1074,7 @@ void Selector >::cr } // count the number of non-zeros in the interpolation matrix - int numBlocks = min( 4096, (int) (A.get_num_rows() + blockSize - 1) / blockSize ); + int numBlocks = (int) (A.get_num_rows() + blockSize - 1) / blockSize; IntVector nonZeroOffsets(S2_num_rows + 1); IntVector nonZerosPerRow(S2_num_rows); // Updating the number of nonZeros for your own coarse rows @@ -1110,9 +1109,12 @@ void Selector >::cr // Run the computation. typedef typename MatPrecisionMap::Type Value_type; + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; if (!A.is_matrix_distributed()) { - fillS2ColIndices <<< GRID_SIZE, CTA_SIZE>>>( + int grid_size = (A.get_num_rows() / NUM_WARPS) + 1; + + fillS2ColIndices <<< grid_size, CTA_SIZE>>>( A.get_num_rows(), cf_map.raw(), C_hat.raw(), @@ -1129,14 +1131,17 @@ void Selector >::cr int num_owned_fine_pts = A.get_num_rows(); int my_rank = A.manager->global_id(); const int cta_size = 128; - const int grid_size = std::min( 4096, (num_owned_fine_pts + cta_size - 1) / cta_size); + int grid_size = (num_owned_fine_pts + cta_size - 1) / cta_size; createCfMapGlobal <<< grid_size, cta_size>>>(cf_map.raw(), cf_map_global.raw(), S2.manager->part_offsets_h[my_rank], num_owned_fine_pts); cudaCheckError(); + // Exchange the cf_map_global so that we know the coarse global id of halo nodes cf_map_global.dirtybit = 1; A.manager->exchange_halo_2ring(cf_map_global, cf_map_global.tag); I64Vector_d S2_col_indices_global(S2.col_indices.size()); - fillS2ColIndices <<< GRID_SIZE, CTA_SIZE>>>( + + grid_size = (A.get_num_rows() / NUM_WARPS) + 1; + fillS2ColIndices <<< grid_size, CTA_SIZE>>>( A.get_num_rows(), cf_map.raw(), C_hat.raw(), diff --git a/3rd_party/AMGX/base/src/csr_multiply_sm35.cu b/3rd_party/AMGX/base/src/csr_multiply_sm35.cu index 6f8f25f7f..8291e9b5d 100644 --- a/3rd_party/AMGX/base/src/csr_multiply_sm35.cu +++ b/3rd_party/AMGX/base/src/csr_multiply_sm35.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2013-2017, NVIDIA CORPORATION. All rights reserved. +/* Copyright (c) 2013-2022, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,7 +91,7 @@ count_non_zeroes_kernel( const int A_num_rows, { const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -371,7 +371,7 @@ count_non_zeroes_kernel( const int A_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -459,7 +459,7 @@ __device__ __forceinline__ void sparse_add_process_row_values(int row_id, const value = utils::Ld::load( &vals[col_it] ); } - map.insert_with_duplicates( col_id, value, wk_status ); + map.insert( col_id, value, wk_status ); } } @@ -543,7 +543,7 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -613,7 +613,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, // Tables to broadcast values. __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -931,7 +931,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -987,20 +987,20 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { - const int NUM_WARPS = CTA_SIZE / 32; + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], + Hash_map map(&s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); @@ -1090,7 +1090,7 @@ compute_values_kernel( const int A_num_rows, } } - map.insert( b_col_id, uniform_a_value, b_value, wk_status ); + map.insert( b_col_id, uniform_a_value * b_value, wk_status ); } } } @@ -1147,7 +1147,7 @@ compute_values_kernel( const int A_num_rows, // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1157,9 +1157,9 @@ compute_values_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], + Hash_map map(&s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); @@ -1174,7 +1174,7 @@ compute_values_kernel( const int A_num_rows, } // Clear the map. - map.clear_all(); + map.clear(); // Load the range of the row. int a_col_tmp = -1; @@ -1253,7 +1253,7 @@ compute_values_kernel( const int A_num_rows, } } - map.insert_with_duplicates( b_col_id, uniform_a_value * b_value, wk_status ); + map.insert( b_col_id, uniform_a_value * b_value, wk_status ); } } } @@ -1306,7 +1306,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1315,7 +1315,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, // Create local storage for the set. Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[rap_int_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[rap_int_row_id * gmem_size], gmem_size ); @@ -1323,7 +1323,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, for ( ; rap_int_row_id < RAP_int_num_rows ; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) { // Clear the map. - map.clear_all(); + map.clear(); // --------------------------------- // First process RAP_int // --------------------------------- @@ -1383,7 +1383,7 @@ namespace amgx /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -enum { WARP_SIZE = 32, GRID_SIZE = 128, SMEM_SIZE = 128 }; +enum { WARP_SIZE = 32, SMEM_SIZE = 128 }; // ==================================================================================================================== @@ -1397,6 +1397,7 @@ CSR_Multiply_Sm35 >::CSR_Multiply_Sm35( boo template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::count_non_zeroes( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) { + const int GRID_SIZE = 128; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset work queue. @@ -1510,6 +1511,7 @@ template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::count_non_zeroes_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids) { + const int GRID_SIZE = 128; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset work queue. @@ -1596,6 +1598,7 @@ template< int CTA_SIZE, bool COUNT_ONLY, typename Diag_traits, typename Matrix > static void count_non_zeroes_ilu1_dispatch( const Matrix &A, Matrix &B, int num_threads_per_row_count, int gmem_size, int *keys, int *work_queue, int *status ) { + const int GRID_SIZE = 128; switch ( num_threads_per_row_count ) { case 2: @@ -1676,6 +1679,7 @@ count_non_zeroes_ilu1_dispatch( const Matrix &A, Matrix &B, int num_threads_per_ template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::count_non_zeroes_ilu1( const Matrix_d &A, Matrix_d &B ) { + const int GRID_SIZE = 128; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset work queue. @@ -1722,9 +1726,9 @@ void CSR_Multiply_Sm35 >::compute_offsets( template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::compute_sparsity( const Matrix_d &A, const Matrix_d &B, Matrix_d &C ) { + const int GRID_SIZE = 128; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // std::cerr << "CSR_Multiply_Sm35 >::compute_sparsity" << std::endl; // Reset the work queue. int work_offset = GRID_SIZE * NUM_WARPS; CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); @@ -1836,6 +1840,7 @@ void CSR_Multiply_Sm35 >::compute_sparsity( template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ) { + const int GRID_SIZE = 128; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset work queue. @@ -1871,6 +1876,7 @@ void CSR_Multiply_Sm35 >::compute_sparsity_ template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::compute_values( const Matrix_d &A, const Matrix_d &B, Matrix_d &C, int num_threads, IVector *Aq1, IVector *Bq1, IVector *Aq2, IVector *Bq2 ) { + const int GRID_SIZE = 128; const int CTA_SIZE = 128; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset the work queue. @@ -2009,6 +2015,7 @@ void CSR_Multiply_Sm35 >::compute_values( c template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm35 >::compute_values_RAP_sparse_add( Matrix_d &RAP, const Matrix_d &RAP_int, std::vector &RAP_ext_row_offsets, std::vector &RAP_ext_col_indices, std::vector &RAP_ext_values, std::vector &RAP_ext_row_ids, int num_threads) { + const int GRID_SIZE = 128; const int CTA_SIZE = 128; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset the work queue. diff --git a/3rd_party/AMGX/base/src/csr_multiply_sm70.cu b/3rd_party/AMGX/base/src/csr_multiply_sm70.cu index b8be9ef1a..152791ac8 100644 --- a/3rd_party/AMGX/base/src/csr_multiply_sm70.cu +++ b/3rd_party/AMGX/base/src/csr_multiply_sm70.cu @@ -67,6 +67,7 @@ __device__ __forceinline__ int get_work( int *queue, int warp_id ) return utils::shfl( offset, 0 ); } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template< int CTA_SIZE, int SMEM_SIZE, int WARP_SIZE, bool COUNT_ONLY > @@ -90,7 +91,7 @@ count_non_zeroes_kernel( const int A_num_rows, { const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -149,6 +150,7 @@ count_non_zeroes_kernel( const int A_num_rows, if (Aq2 != NULL) { b_row_id = Aq2[b_row_id]; + } if (Bq1 != NULL) @@ -267,7 +269,6 @@ count_non_zeroes_kernel( const int A_num_rows, // Create local storage for the set. Hash_set set( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], gmem_size ); - // Loop over rows of A. for ( ; a_row_id < A_num_rows ; a_row_id = get_work( wk_work_queue, warp_id ) ) { int c_row_id = a_row_id; @@ -370,7 +371,7 @@ count_non_zeroes_kernel( const int A_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -458,7 +459,7 @@ __device__ __forceinline__ void sparse_add_process_row_values(int row_id, const value = utils::Ld::load( &vals[col_it] ); } - map.insert_with_duplicates( col_id, value, wk_status ); + map.insert( col_id, value, wk_status ); } } @@ -542,7 +543,7 @@ count_non_zeroes_RAP_ext_kernel( const int RAP_int_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -612,7 +613,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, // Tables to broadcast values. __shared__ volatile int s_b_rows[CTA_SIZE], s_b_colors[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -930,7 +931,7 @@ count_non_zeroes_ilu1_kernel( const int A_num_rows, // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -986,20 +987,20 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { - const int NUM_WARPS = CTA_SIZE / 32; + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], + Hash_map map(&s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); @@ -1089,7 +1090,7 @@ compute_values_kernel( const int A_num_rows, } } - map.insert( b_col_id, uniform_a_value, b_value, wk_status ); + map.insert( b_col_id, uniform_a_value * b_value, wk_status ); } } } @@ -1114,6 +1115,7 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1141,12 +1143,13 @@ compute_values_kernel( const int A_num_rows, int *wk_work_queue, int *wk_status ) { + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1156,9 +1159,9 @@ compute_values_kernel( const int A_num_rows, // First threads load the row IDs of A needed by the CTA... int a_row_id = blockIdx.x * NUM_WARPS + warp_id; // Create local storage for the set. - Hash_map map( &s_keys[warp_id * SMEM_SIZE], + Hash_map map(&s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); @@ -1173,7 +1176,7 @@ compute_values_kernel( const int A_num_rows, } // Clear the map. - map.clear_all(); + map.clear(); // Load the range of the row. int a_col_tmp = -1; @@ -1252,7 +1255,7 @@ compute_values_kernel( const int A_num_rows, } } - map.insert_with_duplicates( b_col_id, uniform_a_value * b_value, wk_status ); + map.insert( b_col_id, uniform_a_value * b_value, wk_status ); } } } @@ -1277,6 +1280,7 @@ compute_values_kernel( const int A_num_rows, map.store( count, &C_cols[c_col_it], &C_vals[c_col_it] ); } + } @@ -1305,7 +1309,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, // The hash keys stored in shared memory. __shared__ /*volatile*/ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1314,7 +1318,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, // Create local storage for the set. Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[rap_int_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[rap_int_row_id * gmem_size], gmem_size ); @@ -1322,7 +1326,7 @@ compute_values_RAP_ext_kernel( const int RAP_int_num_rows, for ( ; rap_int_row_id < RAP_int_num_rows ; rap_int_row_id = get_work( wk_work_queue, warp_id ) ) { // Clear the map. - map.clear_all(); + map.clear(); // --------------------------------- // First process RAP_int // --------------------------------- @@ -1402,6 +1406,7 @@ void CSR_Multiply_Sm70 >::count_non_zeroes( // Reset work queue. int work_offset = GRID_SIZE * NUM_WARPS; CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); + // Compute non-zero elements. switch ( this->m_num_threads_per_row_count ) @@ -1503,6 +1508,7 @@ void CSR_Multiply_Sm70 >::count_non_zeroes( cudaCheckError(); //CUDA_SAFE_CALL( cudaGetLastError() ); + } @@ -1730,7 +1736,6 @@ void CSR_Multiply_Sm70 >::compute_sparsity( const int GRID_SIZE = 1024; const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; - // std::cerr << "CSR_Multiply_Sm70 >::compute_sparsity" << std::endl; // Reset the work queue. int work_offset = GRID_SIZE * NUM_WARPS; CUDA_SAFE_CALL( cudaMemcpy( this->m_work_queue, &work_offset, sizeof(int), cudaMemcpyHostToDevice ) ); @@ -1843,7 +1848,6 @@ template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void CSR_Multiply_Sm70 >::compute_sparsity_ilu1( const Matrix_d &A, Matrix_d &B ) { const int GRID_SIZE = 1024; - const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Reset work queue. @@ -1893,6 +1897,8 @@ void CSR_Multiply_Sm70 >::compute_values( c status = this->m_status; } + + switch ( num_threads ) { case 2: diff --git a/3rd_party/AMGX/base/src/distributed/comms_mpi_hostbuffer_stream.cu b/3rd_party/AMGX/base/src/distributed/comms_mpi_hostbuffer_stream.cu index 5dfc6f43a..4fc6ae964 100644 --- a/3rd_party/AMGX/base/src/distributed/comms_mpi_hostbuffer_stream.cu +++ b/3rd_party/AMGX/base/src/distributed/comms_mpi_hostbuffer_stream.cu @@ -1516,7 +1516,10 @@ void CommsMPIHostBufferStream::exchange_hostnames(std::string &my_host } template -void CommsMPIHostBufferStream::all_gather(IndexType_h &my_data, HIVector &gathered_data, int num_parts) { all_gather_templated(my_data, gathered_data, num_parts); } +void CommsMPIHostBufferStream::all_gather(const IndexType_h &my_data, HIVector &gathered_data, int num_parts) { all_gather_templated(my_data, gathered_data, num_parts); } + +template +void CommsMPIHostBufferStream::all_gather(const int64_t &my_data, HI64Vector &gathered_data, int num_parts) { all_gather_templated(my_data, gathered_data, num_parts); } template void CommsMPIHostBufferStream::all_gather_v(HIVector &my_data, HIVector &gathered_data, int num_parts) { all_gather_v_templated(my_data[0], my_data.size(), gathered_data, num_parts); } @@ -1534,7 +1537,7 @@ void CommsMPIHostBufferStream::all_reduce_max(IndexType_h &my_data, In template template -void CommsMPIHostBufferStream::all_gather_templated(T &my_data, T2 &gathered_data, int num_parts) +void CommsMPIHostBufferStream::all_gather_templated(const T &my_data, T2 &gathered_data, int num_parts) { #ifdef AMGX_WITH_MPI gathered_data.resize(num_parts); @@ -1560,6 +1563,57 @@ void CommsMPIHostBufferStream::all_gather_v_templated(T &my_data, int #endif } +template +void CommsMPIHostBufferStream::all_gather_v(HDVector& data, int num_elems, HDVector& gathered_data, HIVector counts, HIVector displs) +{ +#ifdef AMGX_WITH_MPI + MPI_Allgatherv(data.raw(), num_elems, MPI_DOUBLE, gathered_data.raw(), counts.raw(), displs.raw(), MPI_DOUBLE, mpi_comm); +#else + FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); +#endif +} + +template +void CommsMPIHostBufferStream::all_gather_v(HFVector& data, int num_elems, HFVector& gathered_data, HIVector counts, HIVector displs) +{ +#ifdef AMGX_WITH_MPI + MPI_Allgatherv(data.raw(), num_elems, MPI_FLOAT, gathered_data.raw(), counts.raw(), displs.raw(), MPI_FLOAT, mpi_comm); +#else + FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); +#endif +} + +template +void CommsMPIHostBufferStream::all_gather_v(HCVector& data, int num_elems, HCVector& gathered_data, HIVector counts, HIVector displs) +{ +#ifdef AMGX_WITH_MPI + FatalError("AllgatherV with complex data.", AMGX_ERR_NOT_IMPLEMENTED); +#else + FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); +#endif +} + +template +void CommsMPIHostBufferStream::all_gather_v(HZVector& data, int num_elems, HZVector& gathered_data, HIVector counts, HIVector displs) +{ +#ifdef AMGX_WITH_MPI + FatalError("AllgatherV with complex data.", AMGX_ERR_NOT_IMPLEMENTED); +#else + FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); +#endif +} + +template +void CommsMPIHostBufferStream::all_gather_v(HIVector& data, int num_elems, HIVector& gathered_data, HIVector counts, HIVector displs) +{ +#ifdef AMGX_WITH_MPI + MPI_Allgatherv(data.raw(), num_elems, MPI_INT, gathered_data.raw(), counts.raw(), displs.raw(), MPI_INT, mpi_comm); +#else + FatalError("MPI Comms module requires compiling with MPI", AMGX_ERR_NOT_IMPLEMENTED); +#endif +} + + /**************************************** * Explict instantiations ***************************************/ diff --git a/3rd_party/AMGX/base/src/distributed/distributed_io.cu b/3rd_party/AMGX/base/src/distributed/distributed_io.cu index ea1394381..0d58954fd 100644 --- a/3rd_party/AMGX/base/src/distributed/distributed_io.cu +++ b/3rd_party/AMGX/base/src/distributed/distributed_io.cu @@ -406,6 +406,7 @@ AMGX_ERROR DistributedRead >::apply(const Vector &v, Vector &res, ViewType view) { Vector &v_ = const_cast&>(v); - multiply(*this, v_, res); + multiply(*this, v_, res, view); } @@ -772,9 +772,11 @@ MatrixBase::setupMatrix(Solver *outer_solver, AMG_Config &cf if (m_separation_interior > m_separation_exterior) { FatalError("Interior separation cannot be wider than the exterior separation", AMGX_ERR_CONFIGURATION); } - int min_rows_latency_hiding = cfg.getParameter("min_rows_latency_hiding", "default"); - - if (min_rows_latency_hiding < 0 || this->get_num_rows() < min_rows_latency_hiding) { m_separation_interior = m_separation_exterior; } + // If latency hiding is disabled, the interior is overwritten + if(!isLatencyHidingEnabled(cfg)) + { + m_separation_interior = m_separation_exterior; + } bool is_coloring_needed = outer_solver->isColoringNeeded(); @@ -819,6 +821,33 @@ MatrixBase::setupMatrix(Solver *outer_solver, AMG_Config &cf m_is_matrix_setup = true; } +template +bool MatrixBase::isLatencyHidingEnabled(AMG_Config &cfg) +{ + const int min_rows_latency_hiding = + cfg.getParameter("min_rows_latency_hiding", "default"); + + // Test all partitions to check if they all fall below the threshold + if (!is_matrix_singleGPU() && min_rows_latency_hiding >= 0) + { + const auto& nrows_per_part = manager->getNumRowsPerPart(); + + // Look at all partitions to check whether the number of rows falls + // below the user defined minimum + for(auto& nrpp : nrows_per_part) + { + // If any partitions still have a large enough set of rows, + // continue latency hiding + if(nrpp >= min_rows_latency_hiding) + { + return true; + } + } + } + + return false; +} + template void MatrixBase::reorderColumnsByColor(bool insert_diagonal) @@ -1044,7 +1073,6 @@ Matrix >::computeDiag this->setView(oldView); } - template void Matrix >::computeDiagonal() { diff --git a/3rd_party/AMGX/base/src/multiply.cu b/3rd_party/AMGX/base/src/multiply.cu index cc4b33a58..d33de5e62 100644 --- a/3rd_party/AMGX/base/src/multiply.cu +++ b/3rd_party/AMGX/base/src/multiply.cu @@ -117,7 +117,7 @@ void multiply(Matrix &A, Vector &B, Vector &C, ViewTy typedef Matrix TMatrix; typedef Vector TVector; - bool latencyHiding = (A.getViewInterior() != A.getViewExterior() && !A.is_matrix_singleGPU() && B.dirtybit != 0); + bool latencyHiding = (view == A.getViewExterior() && A.getViewInterior() != A.getViewExterior() && !A.is_matrix_singleGPU() && B.dirtybit != 0); if (latencyHiding) { @@ -135,12 +135,12 @@ void multiply(Matrix &A, Vector &B, Vector &C, ViewTy } else { - if (!A.is_matrix_singleGPU() && B.dirtybit != 0) + if (view != INTERIOR && !A.is_matrix_singleGPU() && B.dirtybit != 0) { A.manager->exchange_halo_v2(B, B.tag); } - multiply_block_size(A, B, C, A.getViewExterior()); + multiply_block_size(A, B, C, view); } C.dirtybit = 1; diff --git a/3rd_party/AMGX/base/src/norm.cu b/3rd_party/AMGX/base/src/norm.cu index 6b0fc9a44..99b1f2ff8 100644 --- a/3rd_party/AMGX/base/src/norm.cu +++ b/3rd_party/AMGX/base/src/norm.cu @@ -43,7 +43,9 @@ #include #include "strided_reduction.h" +#include "amgx_timer.h" #include "amgx_types/util.h" +#include "thrust_wrapper.h" namespace amgx { @@ -53,54 +55,58 @@ namespace amgx *********************************************************/ template -typename types::PODTypes::type get_norm(const MatrixType &A, const VectorType &r, const NormType norm_type) +typename types::PODTypes::type get_norm(const MatrixType &A, const VectorType &r, const NormType norm_type, typename types::PODTypes::type norm_factor) { typedef typename types::PODTypes::type value_type; value_type nrm; int offset, size; A.getOffsetAndSizeForView(OWNED, &offset, &size); - switch (norm_type) + if (norm_type == L1 || norm_type == L1_SCALED) { - case L1: - nrm = nrm1(r, offset, size); + nrm = nrm1(r, offset, size); - if (A.is_matrix_distributed()) - { - A.getManager()->global_reduce_sum(&nrm); - } - - return nrm; - - case L2: - nrm = nrm2(r, offset, size); + if (A.is_matrix_distributed()) + { + A.getManager()->global_reduce_sum(&nrm); + } - if (A.is_matrix_distributed()) - { - nrm = nrm * nrm; - A.getManager()->global_reduce_sum(&nrm); - nrm = sqrt(nrm); - } + return (norm_type == L1_SCALED) ? nrm / norm_factor : nrm; + } + else if (norm_type == L2) + { + nrm = nrm2(r, offset, size); - return nrm; + if (A.is_matrix_distributed()) + { + nrm = nrm * nrm; + A.getManager()->global_reduce_sum(&nrm); + nrm = sqrt(nrm); + } + + return nrm; + } + else if (norm_type == LMAX) + { + nrm = nrmmax(r, offset, size); - case LMAX: - nrm = nrmmax(r, offset, size); + if (A.is_matrix_distributed()) + { + typedef TemplateConfig::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> hvector_type; + typedef Vector HVector; + //collect values from all neighbors, and do the "reduction" part + std::vector values(0); + HVector my_nrm(1); + my_nrm[0] = nrm; + A.getManager()->getComms()->global_reduce(values, my_nrm, A, 3); - if (A.is_matrix_distributed()) + for (int j = 0; j < values.size(); j++) { - typedef TemplateConfig::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> hvector_type; - typedef Vector HVector; - //collect values from all neighbors, and do the "reduction" part - std::vector values(0); - HVector my_nrm(1); - my_nrm[0] = nrm; - A.getManager()->getComms()->global_reduce(values, my_nrm, A, 3); - - for (int j = 0; j < values.size(); j++) { nrm = (nrm > values[j][0] ? nrm : values[j][0]); } + nrm = (nrm > values[j][0] ? nrm : values[j][0]); } + } - return nrm; + return nrm; } return -1; @@ -110,9 +116,9 @@ template class Norm_1x1; template -void get_1x1_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm) +void get_1x1_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm, typename types::PODTypes::type norm_factor) { - Norm_1x1::get_1x1_norm(A, r, block_size, norm_type, block_nrm); + Norm_1x1::get_1x1_norm(A, r, block_size, norm_type, block_nrm, norm_factor); } template @@ -125,7 +131,7 @@ class Norm_1x1< Vector::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> hvector_type; // TConfig host with pod-values for ValueTypeB typedef Vector HVector; //vectors for saving norms from allgather - static void get_1x1_norm(const MatrixType &A, const Vector_h &r, const int block_size, const NormType norm_type, PODHostVec &block_nrm) + static void get_1x1_norm(const MatrixType &A, const Vector_h &r, const int block_size, const NormType norm_type, PODHostVec &block_nrm, typename types::PODTypes::type norm_factor) { //collect values from all neighbors, and do the "reduction" part std::vector values(0); @@ -134,53 +140,63 @@ class Norm_1x1< VectorgetComms()->global_reduce(values, block_nrm, A, 4); - block_nrm[0] = 0; + block_nrm[0] = nrm1(r, offset, size); - for (int j = 0; j < values.size(); j++) { sum += values[j][0]; } + if (A.is_matrix_distributed()) + { + A.getManager()->getComms()->global_reduce(values, block_nrm, A, 4); + block_nrm[0] = 0; - block_nrm[0] = sum; + for (int j = 0; j < values.size(); j++) + { + sum += values[j][0]; } - break; - - case L2: - block_nrm[0] = nrm2(r, offset, size); + block_nrm[0] = sum; + } - if (A.is_matrix_distributed()) - { - block_nrm[0] *= block_nrm[0]; - A.getManager()->getComms()->global_reduce(values, block_nrm, A, 5); - block_nrm[0] = 0; + if (norm_type == L1_SCALED) + { + block_nrm[0] /= norm_factor; + } + } + else if (norm_type == L2) + { + block_nrm[0] = nrm2(r, offset, size); - for (int j = 0; j < values.size(); j++) { sum += values[j][0]; } + if (A.is_matrix_distributed()) + { + block_nrm[0] *= block_nrm[0]; + A.getManager()->getComms()->global_reduce(values, block_nrm, A, 5); + block_nrm[0] = 0; - block_nrm[0] = sqrt(sum); + for (int j = 0; j < values.size(); j++) + { + sum += values[j][0]; } - break; + block_nrm[0] = sqrt(sum); + } + } + else if (norm_type == LMAX) + { + block_nrm[0] = nrmmax(r, offset, size); - case LMAX: - block_nrm[0] = nrmmax(r, offset, size); + if (A.is_matrix_distributed()) + { + A.getManager()->getComms()->global_reduce(values, block_nrm, A, 6); - if (A.is_matrix_distributed()) + for (int j = 0; j < values.size(); j++) { - A.getManager()->getComms()->global_reduce(values, block_nrm, A, 6); - - for (int j = 0; j < values.size(); j++) { block_nrm[0] = (block_nrm[0] > values[j][0] ? block_nrm[0] : values[j][0]); } + block_nrm[0] = (block_nrm[0] > values[j][0] ? block_nrm[0] : values[j][0]); } - - break; - - default: - FatalError("Normtype is not supported in get_1x1_norm", AMGX_ERR_NOT_IMPLEMENTED); + } + } + else + { + FatalError("Normtype is not supported in get_1x1_norm", AMGX_ERR_NOT_IMPLEMENTED); } }; }; @@ -189,9 +205,9 @@ template class Norm_Square; template -void get_sq_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm) +void get_sq_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm, typename types::PODTypes::type norm_factor) { - Norm_Square::get_sq_norm(A, r, block_size, norm_type, block_nrm); + Norm_Square::get_sq_norm(A, r, block_size, norm_type, block_nrm, norm_factor); } template @@ -203,14 +219,14 @@ class Norm_Square::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> hvector_type; // TConfig host with pod-values for ValueTypeB typedef Vector HVector; //vectors for saving norms from allgather - static void get_sq_norm(const MatrixType &A, const Vector_h &r, const int block_size, const NormType norm_type, HVector &block_nrm) + static void get_sq_norm(const MatrixType &A, const Vector_h &r, const int block_size, const NormType norm_type, HVector &block_nrm, typename types::PODTypes::type norm_factor) { int bsize = block_nrm.size(); int offset, size; A.getOffsetAndSizeForView(OWNED, &offset, &size); std::vector norm(block_size, 0.l); - if (norm_type == L1) + if (norm_type == L1 || norm_type == L1_SCALED) { if ( (size * r.get_block_size()) % bsize != 0) { @@ -251,7 +267,7 @@ class Norm_Square::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> hvector_type; // TConfig host with pod-values for ValueTypeB typedef Vector HVector; //vectors for saving norms from allgather - static void get_sq_norm(const MatrixType &A, const Vector_d &r, const int block_size, const NormType norm_type, HVector &block_nrm) + static void get_sq_norm(const MatrixType &A, const Vector_d &r, const int block_size, const NormType norm_type, HVector &block_nrm, typename types::PODTypes::type norm_factor) { int bsize = block_nrm.size(); int offset, size; @@ -333,7 +349,7 @@ class Norm_Square -void get_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm) +void get_norm(const MatrixType &A, const VectorType &r, const int block_size, const NormType norm_type, PlainVectorType &block_nrm, typename types::PODTypes::type norm_factor) { if (block_size == 1) { - get_1x1_norm(A, r, block_size, norm_type, block_nrm); + get_1x1_norm(A, r, block_size, norm_type, block_nrm, norm_factor); } else { - get_sq_norm(A, r, block_size, norm_type, block_nrm); + get_sq_norm(A, r, block_size, norm_type, block_nrm, norm_factor); + } +} + +template +class Norm_Factor; + +template +void compute_norm_factor(MatrixType &A, VectorType &b, VectorType &x, const NormType normType, typename types::PODTypes::type &normFactor) +{ + if(normType == L1_SCALED) + { + Norm_Factor::compute_norm_factor(A, b, x, normFactor); + } +} + +template +class Norm_Factor >, MatrixType> +{ + public: + typedef Vector > Vector_h; + typedef typename Vector_h::value_type ValueTypeVec; + typedef typename types::PODTypes::type ValueTypeNorm; + + static void compute_norm_factor(MatrixType &A, Vector_h &b, Vector_h &x, ValueTypeNorm& normFactor) + { + FatalError("L1 scaled norm not supported with host execution.", AMGX_ERR_NOT_IMPLEMENTED); + } +}; + +template +__global__ void scaled_norm_factor_calc( + int nRows, ValueTypeMat *Avals, IndexTypeVec *Arows, ValueTypeVec *Ax, ValueTypeVec *b, ValueTypeVec xAvg, ValueTypeNorm *localNormFactor) +{ + int r = threadIdx.x + blockIdx.x * blockDim.x; + + __shared__ ValueTypeNorm normFactor_s; + if(threadIdx.x == 0) + { + normFactor_s = amgx::types::util::get_zero(); + } + + ValueTypeNorm normFactor = 0.0; + + if (r < nRows) + { + ValueTypeMat Arow_sum = amgx::types::util::get_zero(); + + // Read in the row +#pragma unroll + for (int i = Arows[r]; i < Arows[r + 1]; ++i) + { + Arow_sum = Arow_sum + Avals[i]; + } + + normFactor = + types::util::abs(Ax[r] - Arow_sum * xAvg) + + types::util::abs(b[r] - Arow_sum * xAvg); + } + + // Ensure normFactor_s is initialised + __syncthreads(); + + // Warp-local reduction to lane 0 + for(int i = warpSize/2; i > 0; i /= 2) + { + normFactor += utils::shfl_down(normFactor, i); + } + + // Fast shared atomic add by lane 0 of each warp + int laneId = threadIdx.x % warpSize; + if(laneId == 0) + { + utils::atomic_add(&normFactor_s, normFactor); + } + + // Ensure normFactor_s is final + __syncthreads(); + + // Final output of normFactor by first thread of each block + if(threadIdx.x == 0) + { + utils::atomic_add(localNormFactor, normFactor_s); } } -#define AMGX_CASE_LINE(CASE) template typename types::PODTypes< typename Vector::Type>::value_type>::type get_norm(const Matrix::Type>& A, const Vector::Type>& r, const NormType norm_type); +template +class Norm_Factor >, MatrixType> +{ + public: + typedef Vector > Vector_d; + typedef typename Vector_d::value_type ValueTypeVec; + typedef typename Vector_d::index_type IndexTypeVec; + typedef typename types::PODTypes::type ValueTypeNorm; + typedef TemplateConfig::vec_prec, MatrixType::TConfig::matPrec, MatrixType::TConfig::indPrec> NormVectorType; + typedef Vector NVector_d; + + static void compute_norm_factor(MatrixType &A, Vector_d &b, Vector_d &x, ValueTypeNorm &normFactor) + { + if (A.get_block_dimx() != 1 || A.get_block_dimy() != 1) + { + FatalError("L1 scaled norm only supported with scalar matrices", AMGX_ERR_NOT_IMPLEMENTED); + } + + // Calculate Ax + int offset, nRows; + A.getOffsetAndSizeForView(OWNED, &offset, &nRows); + + Vector_d Ax(nRows); + A.apply(x, Ax); + + // Calculate global average x + ValueTypeVec xAvg = thrust::reduce(x.begin(), x.begin() + nRows, amgx::types::util::get_zero()); + A.manager->global_reduce_sum(&xAvg); + amgx::types::util::divide_by_integer(xAvg, A.manager->num_rows_global); + + // Make a copy of b + Vector_d bTmp(b); + + // Calculate row sums then the local norm factors + constexpr int nThreads = 128; + constexpr int warpSize = 32; + const int nBlocks = nRows/nThreads + 1; + NVector_d localNormFactor(1, amgx::types::util::get_zero()); + scaled_norm_factor_calc<<>>( + nRows, + A.values.raw(), + A.row_offsets.raw(), + Ax.raw(), + bTmp.raw(), + xAvg, + localNormFactor.raw()); + + // Fetch the normFactor result and reduce across all ranks + normFactor = localNormFactor[0]; + A.manager->global_reduce_sum(&normFactor); + + // Print the norm factor + std::stringstream info; + info.precision(12); + info << "\tAmgX Scaled Norm Factor: " << std::scientific << normFactor << "\n"; + amgx_output(info.str().c_str(), info.str().length()); + } +}; + +#define AMGX_CASE_LINE(CASE) template typename types::PODTypes< typename Vector::Type>::value_type>::type get_norm(const Matrix::Type>& A, const Vector::Type>& r, const NormType norm_type, typename types::PODTypes::Type>::value_type>::type norm_factor); AMGX_FORALL_BUILDS(AMGX_CASE_LINE) AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #undef AMGX_CASE_LINE -#define AMGX_CASE_LINE(CASE) template typename types::PODTypes< typename Vector::Type>::value_type>::type get_norm(const Operator::Type>& A, const Vector::Type>& r, const NormType norm_type); +#define AMGX_CASE_LINE(CASE) template typename types::PODTypes< typename Vector::Type>::value_type>::type get_norm(const Operator::Type>& A, const Vector::Type>& r, const NormType norm_type, typename types::PODTypes::Type>::value_type>::type norm_factor); AMGX_FORALL_BUILDS(AMGX_CASE_LINE) AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #undef AMGX_CASE_LINE @@ -429,7 +594,8 @@ AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #define AMGX_CASE_LINE(CASE) \ typedef typename Vector< TemplateMode::Type >::value_type ValueTypeMB##CASE ;\ typedef TemplateMode::Type::template setMemSpace::Type::template setVecPrec< types::PODTypes< ValueTypeMB##CASE >::vec_prec >::Type CurTConfigMB_h##CASE ;\ - template void get_norm(const Matrix::Type>& A, const Vector::Type>& r, const int block_size, const NormType norm_type, Vector< CurTConfigMB_h##CASE >& block_nrm); + template void get_norm(const Matrix::Type>& A, const Vector::Type>& r, const int block_size, const NormType norm_type, Vector< CurTConfigMB_h##CASE >& block_nrm, typename types::PODTypes::Type>::value_type>::type norm_factor); \ + template void compute_norm_factor(Matrix::Type> &A, Vector::Type> &b, Vector::Type> &x, const NormType normType, typename types::PODTypes::Type>::value_type>::type &normFactor); AMGX_FORALL_BUILDS(AMGX_CASE_LINE) AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #undef AMGX_CASE_LINE @@ -437,7 +603,7 @@ AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #define AMGX_CASE_LINE(CASE) \ typedef typename Vector< TemplateMode::Type >::value_type ValueTypeOB##CASE ;\ typedef TemplateMode::Type::template setMemSpace::Type::template setVecPrec< types::PODTypes< ValueTypeOB##CASE >::vec_prec >::Type CurTConfigOB_h##CASE ;\ - template void get_norm(const Operator::Type>& A, const Vector::Type>& r, const int block_size, const NormType norm_type, Vector< CurTConfigOB_h##CASE >& block_nrm); + template void get_norm(const Operator::Type>& A, const Vector::Type>& r, const int block_size, const NormType norm_type, Vector< CurTConfigOB_h##CASE >& block_nrm, typename types::PODTypes::Type>::value_type>::type norm_factor); AMGX_FORALL_BUILDS(AMGX_CASE_LINE) AMGX_FORCOMPLEX_BUILDS(AMGX_CASE_LINE) #undef AMGX_CASE_LINE diff --git a/3rd_party/AMGX/base/src/solvers/solver.cu b/3rd_party/AMGX/base/src/solvers/solver.cu index b52e33502..385b809b7 100644 --- a/3rd_party/AMGX/base/src/solvers/solver.cu +++ b/3rd_party/AMGX/base/src/solvers/solver.cu @@ -52,6 +52,7 @@ Solver::Solver(AMG_Config &cfg, const std::string &cfg_scope, m_r(NULL), m_num_iters(0), m_curr_iter(0), m_ref_count(1), tag(0), m_solver_name("SolverNameNotSet"), m_skip_glued_setup(false), m_tmng(tmng) { + m_norm_factor = types::util::get_one(); m_verbosity_level = cfg.getParameter("verbosity_level", cfg_scope); m_print_vis_data = cfg.getParameter("print_vis_data", cfg_scope) != 0; m_monitor_residual = cfg.getParameter("monitor_residual", cfg_scope) != 0; @@ -213,12 +214,13 @@ void Solver::compute_residual(const VVector &b, VVector &x, m_A->apply(x, r); axpby(b, r, r, types::util::get_one(), types::util::get_minus_one(), offset, size); } + template void Solver::compute_norm() { AMGX_CPU_PROFILER( "Solver::compute_norm " ); get_norm(*m_A, *m_r, (m_use_scalar_norm ? 1 : m_A->get_block_dimy()), - m_norm_type, m_nrm); + m_norm_type, m_nrm, m_norm_factor); } template @@ -226,7 +228,7 @@ void Solver::compute_norm(const VVector &v, PODVector_h &nrm) const { AMGX_CPU_PROFILER( "Solver::compute_norm_vh " ); get_norm(*m_A, v, (m_use_scalar_norm ? 1 : m_A->get_block_dimy()), - m_norm_type, nrm); + m_norm_type, nrm, m_norm_factor); } template @@ -707,6 +709,10 @@ AMGX_STATUS Solver::solve(Vector &b, Vector &x, assert(static_cast(m_nrm_ini.size()) >= bsize); } + // Only happens if L1 scaled norm is utilised + Matrix *m_A = dynamic_cast*>(this->m_A); + compute_norm_factor(*m_A, b, x, m_norm_type, m_norm_factor); + compute_norm(); last_nrm = m_nrm_ini = m_nrm; } diff --git a/3rd_party/AMGX/base/src/transpose.cu b/3rd_party/AMGX/base/src/transpose.cu index af45379f3..e336f2a2f 100644 --- a/3rd_party/AMGX/base/src/transpose.cu +++ b/3rd_party/AMGX/base/src/transpose.cu @@ -77,10 +77,16 @@ void transpose(const Matrix &A, Matrix &B) else { B.addProps(CSR); + B.set_allow_recompute_diag(false); + +#ifdef ENABLE_CUSPARSE_TRANSPOSE + Cusparse::transpose(A, B); +#else MatrixCusp wA((Matrix *) &A); MatrixCusp wB(&B); - B.set_allow_recompute_diag(false); cusp::transpose(wA, wB); +#endif + B.set_allow_recompute_diag(true); cudaCheckError(); B.computeDiagonal(); @@ -105,13 +111,21 @@ void transpose(const Matrix &A, Matrix &B, int num_rows) } B.addProps(CSR); + B.set_allow_recompute_diag(false); + +#if ENABLE_CUSPARSE_TRANSPOSE + int num_nz = A.row_offsets[num_rows]; + B.resize(A.get_num_cols(), num_rows, num_nz); + Cusparse::transpose(A, B, num_rows, num_nz); +#else MatrixCusp wA((Matrix *) &A); MatrixCusp wB(&B); - B.set_allow_recompute_diag(false); + // operate on wA / wB typedef typename Matrix::index_type IndexType; typedef typename Matrix::value_type ValueType; typedef typename Matrix::memory_space MemorySpace; + int num_entries = A.row_offsets[num_rows]; int num_cols = A.get_num_cols(); // resize matrix @@ -133,6 +147,7 @@ void transpose(const Matrix &A, Matrix &B, int num_rows) cusp::detail::sort_by_row(wB_row_indices, wB.column_indices, wB.values); cusp::detail::indices_to_offsets(wB_row_indices, wB.row_offsets); } +#endif B.set_allow_recompute_diag(true); cudaCheckError(); diff --git a/3rd_party/AMGX/core/CMakeLists.txt b/3rd_party/AMGX/core/CMakeLists.txt index 9e52d6028..47e3b1940 100644 --- a/3rd_party/AMGX/core/CMakeLists.txt +++ b/3rd_party/AMGX/core/CMakeLists.txt @@ -24,63 +24,51 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) -ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) -ENDIF (WIN32) - -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../base/include) +cmake_minimum_required (VERSION 3.18) #select all sources FILE(GLOB_RECURSE SRCS "src/*.cu") -CUDA_ADD_LIBRARY(amgx_core STATIC ${SRCS}) -if(${AMGX_PUBLIC_RELEASE} MATCHES "FALSE") - install(TARGETS amgx_core DESTINATION "lib/sublibs") -endif(${AMGX_PUBLIC_RELEASE} MATCHES "FALSE") +target_sources(amgx_libs PRIVATE ${SRCS}) +target_include_directories(amgx_libs PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/../base/include) #copy configs to build directory add_custom_target(copy_configs_core ALL) add_custom_command(TARGET copy_configs_core COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/configs" "${CMAKE_BINARY_DIR}/configs/core") -if(${AMGX_PUBLIC_RELEASE} MATCHES "TRUE") - install(FILES - "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_GS.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_MULTI_PAIRWISE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CG_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_CG_CYCLE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_CGF_CYCLE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_F_CYCLE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_V_CYCLE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_W_CYCLE.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_AGGREGATION_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_AGGREGATION_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_CLASSICAL_AGGRESSIVE_HMIS.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_CLASSICAL_AGGRESSIVE_PMIS.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_NOPREC.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/GMRES_AMG_D2.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/IDR_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/IDRMSYNC_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_AGGREGATION_W_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_CLASSICAL_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_NOPREC.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_AGGREGATION_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_F_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_V_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_W_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_DILU.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_NOPREC.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_F_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_V_JACOBI.json" - "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_W_JACOBI.json" - DESTINATION "lib/configs") -else(${AMGX_PUBLIC_RELEASE} MATCHES "TRUE") - install(DIRECTORY "${CMAKE_BINARY_DIR}/configs/core" DESTINATION "lib/configs") -endif(${AMGX_PUBLIC_RELEASE} MATCHES "TRUE") +install(FILES + "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_GS.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/AGGREGATION_MULTI_PAIRWISE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CG_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_CG_CYCLE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_CGF_CYCLE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_F_CYCLE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_V_CYCLE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/CLASSICAL_W_CYCLE.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_AGGREGATION_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_AGGREGATION_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_CLASSICAL_AGGRESSIVE_HMIS.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_CLASSICAL_AGGRESSIVE_PMIS.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/FGMRES_NOPREC.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/GMRES_AMG_D2.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/IDR_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/IDRMSYNC_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_AGGREGATION_W_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_CLASSICAL_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PBICGSTAB_NOPREC.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_AGGREGATION_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_F_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_V_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_CLASSICAL_W_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_DILU.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCG_NOPREC.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_F_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_V_JACOBI.json" + "${CMAKE_CURRENT_SOURCE_DIR}/configs/PCGF_CLASSICAL_W_JACOBI.json" + DESTINATION "lib/configs") diff --git a/3rd_party/AMGX/core/include/solvers/dense_lu_solver.h b/3rd_party/AMGX/core/include/solvers/dense_lu_solver.h index 78bc1a96e..b7438a4b0 100644 --- a/3rd_party/AMGX/core/include/solvers/dense_lu_solver.h +++ b/3rd_party/AMGX/core/include/solvers/dense_lu_solver.h @@ -88,8 +88,15 @@ class DenseLUSolver > typedef Solver > Base; typedef TemplateConfig Config_d; + typedef TemplateConfig Config_h; typedef Matrix Matrix_d; + typedef Matrix Matrix_h; typedef Vector Vector_d; + typedef Vector Vector_h; + typedef typename Matrix_d::IVector IVector_d; + typedef typename Matrix_h::IVector IVector_h; + typedef typename Matrix_d::MVector MVector_d; + typedef typename Matrix_h::MVector MVector_h; typedef typename MatPrecisionMap::Type Matrix_data; typedef typename VecPrecisionMap::Type Vector_data; @@ -112,10 +119,20 @@ class DenseLUSolver > cusolverDnHandle_t m_cuds_handle; cublasHandle_t m_cublas_handle; int m_num_rows, m_num_cols, m_lda; + int m_nnz_global; Matrix_data *m_dense_A; // store sparse as dense int *m_ipiv; // The pivot sequence from getrf() int *m_cuds_info; // host pointer for debug info from getrf() Matrix_data *m_trf_wspace; // workspace for trf/trs + bool m_enable_exact_solve = false; + + // Cached in the case of an exact coarse solve + IVector_h nz_all; + IVector_h nz_displs; + IVector_h row_all; + IVector_h row_displs; + IVector_d Acols_global; + IVector_d Arows_global; void csr_to_dense(); // Pack a CSR matrix to a dense matrix void cudense_getrf(); // LU decomposition diff --git a/3rd_party/AMGX/core/src/aggregation/aggregation_amg_level.cu b/3rd_party/AMGX/core/src/aggregation/aggregation_amg_level.cu index 7d6970054..7800ab7fa 100644 --- a/3rd_party/AMGX/core/src/aggregation/aggregation_amg_level.cu +++ b/3rd_party/AMGX/core/src/aggregation/aggregation_amg_level.cu @@ -1566,7 +1566,7 @@ void Aggregation_AMG_Level_Base::prepareNextLevelMatrix_none(const Mat Ac.manager->inverse_renumbering.resize(c_size); //get coarse -> fine renumbering int num_blocks = min(4096, (c_size + 127) / 128); - coarse_to_global <<< num_blocks, 128>>>(this->m_aggregates.raw(), this->m_aggregates_fine_idx.raw(), Ac.manager->inverse_renumbering.raw(), f_size, -1 * A.manager->base_index()); + coarse_to_global <<< num_blocks, 128>>>(this->m_aggregates.raw(), this->m_aggregates_fine_idx.raw(), Ac.manager->inverse_renumbering.raw(), f_size, 0); cudaCheckError(); Ac.manager->set_num_halo_rows(Ac.manager->halo_offsets[Ac.manager->halo_offsets.size() - 1] - c_size); Ac.set_initialized(1); diff --git a/3rd_party/AMGX/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu b/3rd_party/AMGX/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu index c3e7f91cf..17daf54d9 100644 --- a/3rd_party/AMGX/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu +++ b/3rd_party/AMGX/core/src/aggregation/coarseAgenerators/low_deg_coarse_A_generator.cu @@ -195,7 +195,7 @@ compute_sparsity_kernel( const int R_num_rows, // same as num_aggregates. // Store the results. if ( COUNT_ONLY ) { - int count = set.compute_size_with_duplicates(); + int count = set.compute_size(); if ( lane_id == 0 ) { @@ -240,9 +240,9 @@ void fill_A_kernel_1x1( const int R_num_rows, const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The hash values stored in shared memory. - __shared__ volatile Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -254,7 +254,7 @@ void fill_A_kernel_1x1( const int R_num_rows, // Create local storage for the set. Hash_map map( &s_keys[warp_id * SMEM_SIZE ], &g_keys[r_row_id * gmem_size ], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[r_row_id * gmem_size ], gmem_size ); // Loop over rows of A. for ( ; r_row_id < R_num_rows ; r_row_id = get_work( wk_work_queue, warp_id ) ) @@ -335,7 +335,7 @@ void fill_A_kernel_1x1( const int R_num_rows, a_agg_id = -1; } - map.insert_with_duplicates( a_agg_id, a_value, NULL ); // It won't insert. Only update. + map.insert( a_agg_id, a_value, NULL ); // It won't insert. Only update. } } } @@ -391,7 +391,7 @@ void fill_A_kernel_4x4( const int R_num_rows, // same as num_aggregates. { const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -569,7 +569,7 @@ void fill_A_kernel_NxN( const int R_num_rows, // same as num_aggregates. const int T_WARP = FORCE_DETERMINISM ? 1 : WARP_SIZE / NxN; const int NUM_ITEMS_PER_WARP = T_WARP == 0 ? 1 : T_WARP; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -743,7 +743,7 @@ void fill_A_kernel_NxN_large( const int R_num_rows, // same as num_aggregates. // Number of items per warp. Let's be chill here and take 1 per warp for large blocks const int NUM_ITEMS_PER_WARP = 1; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); diff --git a/3rd_party/AMGX/core/src/classical/classical_amg_level.cu b/3rd_party/AMGX/core/src/classical/classical_amg_level.cu index f094aa18d..4c5259098 100644 --- a/3rd_party/AMGX/core/src/classical/classical_amg_level.cu +++ b/3rd_party/AMGX/core/src/classical/classical_amg_level.cu @@ -455,7 +455,7 @@ void Classical_AMG_Level_Base::computeProlongationOperator() Truncate::truncateByMaxElements(P, this->max_elmts); } - if (this->m_min_rows_latency_hiding < 0 || P.get_num_rows() < this->m_min_rows_latency_hiding) + if (!P.isLatencyHidingEnabled(*this->amg->m_cfg)) { // This will cause bsrmv_with_mask to not do latency hiding P.setInteriorView(OWNED); @@ -477,7 +477,7 @@ void Classical_AMG_Level_Base::computeRestrictionOperator() P.setView(OWNED); transpose(P, R, P.get_num_rows()); - if (this->m_min_rows_latency_hiding < 0 || R.get_num_rows() < this->m_min_rows_latency_hiding) + if (!R.isLatencyHidingEnabled(*this->amg->m_cfg)) { // This will cause bsrmv_with_mask_restriction to not do latency hiding R.setInteriorView(OWNED); diff --git a/3rd_party/AMGX/core/src/classical/interpolators/distance2.cu b/3rd_party/AMGX/core/src/classical/interpolators/distance2.cu index c38ad0842..437c16611 100644 --- a/3rd_party/AMGX/core/src/classical/interpolators/distance2.cu +++ b/3rd_party/AMGX/core/src/classical/interpolators/distance2.cu @@ -775,6 +775,9 @@ estimate_c_hat_size_kernel( const int A_num_rows, const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // A shared location where threads propose a row of B to load. __shared__ volatile int s_b_row_ids[CTA_SIZE]; + s_b_row_ids[threadIdx.x] = 0; + __syncthreads(); + // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -836,7 +839,9 @@ estimate_c_hat_size_kernel( const int A_num_rows, } // For each warp, we have up to 32 rows of B to proceed. - for ( int k = 0, num_rows = __popc(vote) ; k < num_rows ; k += NUM_LOADED_ROWS ) + + int num_rows = __popc(vote); + for ( int k = 0; k < num_rows ; k += NUM_LOADED_ROWS ) { int local_k = k + lane_id_div_num_threads; // Is it an active thread. @@ -906,11 +911,12 @@ compute_c_hat_kernel( int A_num_rows, int *wk_work_queue, int *wk_status ) { + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // Shared memory to vote. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -1044,6 +1050,7 @@ compute_c_hat_kernel( int A_num_rows, C_hat_end[a_row_id] = c_col_it + count; } } + } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1070,7 +1077,7 @@ compute_c_hat_kernel( int A_num_rows, // Shared memory to vote. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); @@ -1221,6 +1228,7 @@ compute_c_hat_kernel( int A_num_rows, C_hat_end[a_row_id] = c_col_it + count; } } + } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1245,9 +1253,10 @@ compute_inner_sum_kernel( const int A_num_rows, int *g_keys, int *wk_work_queue ) { + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // A shared location where threads propose a row of B to load. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. @@ -1411,6 +1420,7 @@ compute_inner_sum_kernel( const int A_num_rows, inner_sum_offset += num_rows; } } + } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1435,10 +1445,11 @@ compute_inner_sum_kernel( const int A_num_rows, int *g_keys, int *wk_work_queue ) { + const int NUM_WARPS = CTA_SIZE / WARP_SIZE; const int NUM_LOADED_ROWS = WARP_SIZE / NUM_THREADS_PER_ROW; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // A shared location where threads propose a row of B to load. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. @@ -1617,6 +1628,7 @@ compute_inner_sum_kernel( const int A_num_rows, inner_sum_offset += num_rows; } } + } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1645,19 +1657,16 @@ compute_interp_weight_kernel( const int A_num_rows, Value_type *g_vals, int *wk_work_queue ) { + const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. - __shared__ volatile int s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ int s_keys[NUM_WARPS * SMEM_SIZE]; // A shared location where threads propose a row of B to load. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads propose a value. __shared__ volatile Value_type s_aki[NUM_WARPS]; // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile distance2_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else - __shared__ volatile distance2_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#endif + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -1667,13 +1676,13 @@ compute_interp_weight_kernel( const int A_num_rows, #if __CUDA_ARCH__ >= 700 distance2_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #else distance2_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif @@ -1865,6 +1874,7 @@ compute_interp_weight_kernel( const int A_num_rows, } } + } // namespace distance2 /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/3rd_party/AMGX/core/src/classical/interpolators/multipass.cu b/3rd_party/AMGX/core/src/classical/interpolators/multipass.cu index 3663e33f6..02f3e2796 100644 --- a/3rd_party/AMGX/core/src/classical/interpolators/multipass.cu +++ b/3rd_party/AMGX/core/src/classical/interpolators/multipass.cu @@ -506,7 +506,7 @@ compute_c_hat_kernel( int A_num_rows, // Shared memory to vote. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // The hash keys stored in shared memory. - __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ KeyType s_keys[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id( ); const int lane_id = utils::lane_id( ); @@ -819,17 +819,13 @@ compute_interp_weight_kernel( const int A_num_rows, { const int NUM_WARPS = CTA_SIZE / 32; // The hash keys stored in shared memory. - __shared__ volatile KeyType s_keys[NUM_WARPS * SMEM_SIZE]; + __shared__ KeyType s_keys[NUM_WARPS * SMEM_SIZE]; // A shared location where threads propose a row of B to load. __shared__ volatile int s_b_row_ids[CTA_SIZE]; // A shared location where threads store a value of B to load. __shared__ volatile Value_type s_b_values[CTA_SIZE]; // The hash values stored in shared memory. -#if __CUDA_ARCH__ >= 700 - __shared__ volatile multipass_sm70::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#else - __shared__ volatile multipass_sm35::Word s_vote[NUM_WARPS * SMEM_SIZE / 4]; -#endif + __shared__ Value_type s_vals[NUM_WARPS * SMEM_SIZE]; // The coordinates of the thread inside the CTA/warp. const int warp_id = utils::warp_id(); const int lane_id = utils::lane_id(); @@ -839,13 +835,13 @@ compute_interp_weight_kernel( const int A_num_rows, #if __CUDA_ARCH__ >= 700 multipass_sm70::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #else multipass_sm35::Hash_map map( &s_keys[warp_id * SMEM_SIZE], &g_keys[a_row_id * gmem_size], - &s_vote[warp_id * SMEM_SIZE / 4], + &s_vals[warp_id * SMEM_SIZE], &g_vals[a_row_id * gmem_size], gmem_size ); #endif @@ -1033,7 +1029,7 @@ template >::~Multipass_Interpolator() {} -enum { WARP_SIZE = 32, GRID_SIZE = 128, SMEM_SIZE = 128 }; +enum { WARP_SIZE = 32, GRID_SIZE = 1024, SMEM_SIZE = 128 }; struct is_less_than_zero @@ -1195,7 +1191,7 @@ void Multipass_Interpolator exp_wk; + Hash_Workspace exp_wk(true, GRID_SIZE); { const int CTA_SIZE = 256; const int NUM_WARPS = CTA_SIZE / WARP_SIZE; diff --git a/3rd_party/AMGX/core/src/classical/strength/affinity.cu b/3rd_party/AMGX/core/src/classical/strength/affinity.cu index 8daf8eeba..64c8b7fda 100644 --- a/3rd_party/AMGX/core/src/classical/strength/affinity.cu +++ b/3rd_party/AMGX/core/src/classical/strength/affinity.cu @@ -485,7 +485,7 @@ computeStrongConnectionsAndWeights_1x1(Matrix_d &A, // choose a blocksize. Use 1 warp per row const int blockSize = 256; const int numWarps = blockSize / 32; - const int numBlocks = min( 4096, (int) (A.get_num_rows() + numWarps - 1) / numWarps ); + const int numBlocks = (int) (A.get_num_rows() + numWarps - 1) / numWarps; if (A.get_num_rows() > 0) { diff --git a/3rd_party/AMGX/core/src/core.cu b/3rd_party/AMGX/core/src/core.cu index f7b32df21..c478e73a9 100644 --- a/3rd_party/AMGX/core/src/core.cu +++ b/3rd_party/AMGX/core/src/core.cu @@ -368,6 +368,7 @@ inline void registerParameters() AMG_Config::registerParameter("separation_interior", "separation for latency hiding and coloring/smoothing ", INTERIOR, viewtype_values); AMG_Config::registerParameter("separation_exterior", "limit of calculations for coloring/smoothing ", OWNED, viewtype_values); AMG_Config::registerParameter("min_rows_latency_hiding", "number of rows at which to disable latency hiding, negative value means latency hiding is completely disabled", -1); + AMG_Config::registerParameter("exact_coarse_solve", "flag that changes the dense LU coarse solve to solve the exact global problem for Classical AMG preconditioning <0=disable|1=enable>", 0, bool_flag_values); AMG_Config::registerParameter("matrix_halo_exchange", "0 - No halo exchange on lower levels, 1 - just diagonal values, 2 - full", 0); std::vector coloring_values; coloring_values.push_back(FIRST); diff --git a/3rd_party/AMGX/core/src/eigensolvers/qr.cu b/3rd_party/AMGX/core/src/eigensolvers/qr.cu index 67b7b6485..244f3fe21 100644 --- a/3rd_party/AMGX/core/src/eigensolvers/qr.cu +++ b/3rd_party/AMGX/core/src/eigensolvers/qr.cu @@ -338,7 +338,6 @@ vstack(Vector &dst, const Vector &top, const Vector & int height = top.get_num_rows(); cudaMemcpy2D(dst.raw(), dpitch, top.raw(), spitch, width, height, cudaMemcpyDeviceToDevice); - int offset = top.get_num_cols(); cudaMemcpy2D(dst.raw() + top.get_num_cols(), dpitch, bottom.raw(), spitch, width, height, cudaMemcpyDeviceToDevice); } diff --git a/3rd_party/AMGX/core/src/energymin/energymin_amg_level.cu b/3rd_party/AMGX/core/src/energymin/energymin_amg_level.cu index 95080c8e4..35117610a 100644 --- a/3rd_party/AMGX/core/src/energymin/energymin_amg_level.cu +++ b/3rd_party/AMGX/core/src/energymin/energymin_amg_level.cu @@ -242,7 +242,7 @@ void Energymin_AMG_Level_Base R.setView(OWNED); transpose(R, P, R.get_num_rows()); - if (this->m_min_rows_latency_hiding < 0 || P.get_num_rows() < this->m_min_rows_latency_hiding) + if (!P.isLatencyHidingEnabled(*this->amg->m_cfg)) { // This will cause bsrmv to not do latency hiding P.setInteriorView(OWNED); diff --git a/3rd_party/AMGX/core/src/solvers/dense_lu_solver.cu b/3rd_party/AMGX/core/src/solvers/dense_lu_solver.cu index fefbd822f..ef4a6110e 100644 --- a/3rd_party/AMGX/core/src/solvers/dense_lu_solver.cu +++ b/3rd_party/AMGX/core/src/solvers/dense_lu_solver.cu @@ -686,6 +686,11 @@ DenseLUSolver(AMG_Config &cfg, // Make sure we don't run more than 1 iteration. this->set_max_iters(1); allocMem(m_cuds_info, sizeof(int), false); + + // Determine if the scalable coarse solve optimisation is enabled + m_enable_exact_solve = ( + cfg.getParameter( "algorithm", cfg_scope) == CLASSICAL && + cfg.getParameter( "exact_coarse_solve", cfg_scope) == 1); } @@ -729,12 +734,40 @@ template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void DenseLUSolver >::solve_finalize(Vector_d &, Vector_d &) {} +// Offset the local row offsets to global row offsets +template +__global__ void local_row_offsets_to_global( + int num_rows, int offset, + IndexType* local_Arows) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + if(i >= num_rows) { return; } + + local_Arows[i] += offset; +} + +// Offset local packed column indices to global unpacked indices +template +__global__ void local_col_indices_to_global( + int nnz, int num_rows, int offset, IndexType* local_Acols, L2GType* l2g) +{ + int i = threadIdx.x + blockIdx.x*blockDim.x; + if(i >= nnz) { return; } + + if(local_Acols[i] >= num_rows) + { + local_Acols[i] = l2g[local_Acols[i] - num_rows]; + } + else + { + local_Acols[i] += offset; + } +} template< AMGX_VecPrecision V, AMGX_MatPrecision M, AMGX_IndPrecision I > void DenseLUSolver >:: solver_setup(bool reuse_matrix_structure) { - // This is probably not much. Matrix_d *A = dynamic_cast(Base::m_A); if (!A) @@ -745,21 +778,144 @@ solver_setup(bool reuse_matrix_structure) ViewType oldView = A->currentView(); A->setViewExterior(); - m_num_rows = A->get_num_rows() * A->get_block_dimx(); - // don't use A->get_num_cols() because A is rectangular. - // Only the diagonal block owned by this rank is factored. - m_num_cols = A->get_num_rows() * A->get_block_dimy(); - m_lda = m_num_rows; // col-major - // Allocate mem for cudense pivoting sequence. - allocMem(m_ipiv, m_num_rows, false); - // Allocate memory to store the dense A and initialize to zero. - allocMem(m_dense_A, m_num_cols * m_lda, true); - csr_to_dense(); // copy sparse A to dense_A + + if(A->is_matrix_distributed() && m_enable_exact_solve) + { +#ifdef AMGX_WITH_MPI + int rank = A->manager->global_id(); + int nranks = A->manager->get_num_partitions(); + MPI_Comm comm = A->manager->getComms()->get_mpi_comm(); + + int offset, num_rows, nnz; + A->getOffsetAndSizeForView(OWNED, &offset, &num_rows); + A->getNnzForView(OWNED, &nnz); + + m_num_rows = A->manager->num_rows_global * A->get_block_dimx(); + m_num_cols = A->manager->num_rows_global * A->get_block_dimy(); + m_lda = m_num_rows; // col-major + + // Allocate mem for cudense pivoting sequence. + allocMem(m_ipiv, m_num_rows, false); + + // Allocate memory to store the dense A and initialize to zero. + allocMem(m_dense_A, m_num_cols * m_num_rows, true); + + // Much of the data can be reused if we are performing a resetup + if (!reuse_matrix_structure) + { + // Gather the number of non zeros on each rank + A->manager->getComms()->all_gather(nnz, nz_all, nranks); + + // Gather the number of rows on each rank + A->manager->getComms()->all_gather(num_rows, row_all, nranks); + + // Get the number of non zeros on all ranks + m_nnz_global = thrust::reduce(nz_all.begin(), nz_all.end()); + + // Turn the non-zero counts into displacements + nz_displs.resize(nranks); + thrust::exclusive_scan(nz_all.begin(), nz_all.end(), nz_displs.begin()); + + // Turn the number of rows into displacements + row_displs.resize(nranks); + thrust::exclusive_scan(row_all.begin(), row_all.end(), row_displs.begin()); + + IVector_d local_Acols_d(nnz); + IVector_d local_Arows_d(num_rows); + + thrust::copy(A->col_indices.begin(), A->col_indices.begin() + nnz, local_Acols_d.begin()); + thrust::copy(A->row_offsets.begin(), A->row_offsets.begin() + num_rows, local_Arows_d.begin()); + + // XXX Local to global map is the current limiting factor to enabling this + // code for the aggregation based path. It's not clear whether there is + // a structure that provides the same inverse mapping with aggregation. + // Note that at one point inverse_renumbering was tested for aggregation + // but didn't appear to work in all cases. + + // Convert the local column indices and row offsets to the global index space + constexpr int nthreads = 128; + int nblocks = nnz / nthreads + 1; + local_col_indices_to_global<<>>(nnz, num_rows, row_displs[rank], local_Acols_d.raw(), A->manager->local_to_global_map.raw()); + + nblocks = num_rows / nthreads + 1; + local_row_offsets_to_global<<>>(num_rows, nz_displs[rank], local_Arows_d.raw()); + + // Copy the transformed indices to the host + IVector_h local_Acols_h(nnz); + IVector_h local_Arows_h(num_rows); + thrust::copy(local_Acols_d.begin(), local_Acols_d.end(), local_Acols_h.begin()); + thrust::copy(local_Arows_d.begin(), local_Arows_d.end(), local_Arows_h.begin()); + + // Gather the local matrix structure redundantly to every rank + IVector_h Acols_global_h(m_nnz_global); + A->manager->getComms()->all_gather_v(local_Acols_h, nnz, Acols_global_h, nz_all, nz_displs); + + // Note: Copy the local data to global without guard value + IVector_h Arows_global_h(m_num_rows + 1); + A->manager->getComms()->all_gather_v(local_Arows_h, num_rows, Arows_global_h, row_all, row_displs); + + // Manually set the guard value on the global matrix + Arows_global_h[m_num_rows] = m_nnz_global; + Acols_global.resize(m_nnz_global); + Arows_global.resize(m_num_rows + 1); + thrust::copy(Acols_global_h.begin(), Acols_global_h.end(), Acols_global.begin()); + thrust::copy(Arows_global_h.begin(), Arows_global_h.end(), Arows_global.begin()); + } + + // Fetch to the host a copy of the local sparse matrix + MVector_h local_Avals_h(nnz); + thrust::copy(A->values.begin(), A->values.begin() + nnz, local_Avals_h.begin()); + + // Gather the matrix values to all ranks + MVector_h Avals_global_h(m_nnz_global); + A->manager->getComms()->all_gather_v(local_Avals_h, nnz, Avals_global_h, nz_all, nz_displs); + + allocMem(m_dense_A, m_num_cols * m_lda, true); + + MVector_d Avals_global(m_nnz_global); + thrust::copy(Avals_global_h.begin(), Avals_global_h.end(), Avals_global.begin()); + + const int block_size = 256; + const int num_warps = block_size / WARP_SIZE; + const int grid_size = std::min(4096, (A->get_num_rows() + num_warps - 1) / num_warps); + cudaStream_t stream = thrust::global_thread_handle::get_stream(); + csr_to_dense_kernel<<>>( + m_num_rows, + m_num_cols, + A->get_block_dimx(), + A->get_block_dimy(), + Arows_global.raw(), + Acols_global.raw(), + Avals_global.raw(), + A->hasProps(DIAG) ? A->diag.raw() : NULL, + m_dense_A, + m_lda); + + cudaStreamSynchronize(stream); + cudaCheckError(); +#endif + } + else + { + ViewType oldView = A->currentView(); + A->setViewExterior(); + m_num_rows = A->get_num_rows() * A->get_block_dimx(); + // don't use A->get_num_cols() because A is rectangular. + // Only the diagonal block owned by this rank is factored. + m_num_cols = A->get_num_rows() * A->get_block_dimy(); + m_lda = m_num_rows; // col-major + // Allocate mem for cudense pivoting sequence. + allocMem(m_ipiv, m_num_rows, false); + // Allocate memory to store the dense A and initialize to zero. + allocMem(m_dense_A, m_num_cols * m_lda, true); + csr_to_dense(); // copy sparse A to dense_A + } + cudense_getrf(); // do LU factor A->setView(oldView); } -// There is one subtle point here: +// There is one subtle point here (for inexact solve): // We only do LU on the diagonal blocks associated with each rank. // Halo is used to update the right-hand-side (RHS) vector. // For multi GPU cases, this is essentially block Jacobi. Since the block size @@ -774,24 +930,74 @@ solve_iteration(Vector_d &rhs, ViewType oldView = A->currentView(); A->setViewExterior(); - if ((!A->is_matrix_singleGPU()) && (!xIsZero)) + if(A->is_matrix_distributed() && m_enable_exact_solve) { - // Modify rhs to include contribution from halo nodes - // i.e. new_rhs = b - A_halo*x; - // Note: dense_lu solver doesn't support latency hiding - A->manager->exchange_halo_async(x, x.tag); - A->manager->exchange_halo_wait(x, x.tag); - Vector_d new_rhs(rhs.size()); - distributed_rhs_mod(x, rhs, new_rhs); - thrust::copy(new_rhs.begin(), new_rhs.begin() + m_num_rows, x.begin()); - cudaCheckError(); +#ifdef AMGX_WITH_MPI + int offset, num_rows; + A->getOffsetAndSizeForView(OWNED, &offset, &num_rows); + + int rank = A->manager->global_id(); + int nranks = A->manager->get_num_partitions(); + MPI_Comm comm = A->manager->getComms()->get_mpi_comm(); + + // Make host copy of the RHS + MVector_h rhs_local_h(num_rows); + thrust::copy(rhs.begin(), rhs.begin() + num_rows, rhs_local_h.begin()); + + // Gather the local RHS from all ranks to global vectors on all ranks + MVector_h rhs_global_h(m_num_rows); + A->manager->getComms()->all_gather_v(rhs_local_h, num_rows, rhs_global_h, row_all, row_displs); + + //Solve L*X = RHS + MVector_d x_global(m_num_rows); + thrust::copy(rhs_global_h.begin(), rhs_global_h.end(), x_global.begin()); + cusolverStatus_t status = + cusolverDnXgetrs(m_cuds_handle, + CUBLAS_OP_N, + m_num_rows, + 1, + m_dense_A, + m_lda, + m_ipiv, + x_global.raw(), + m_num_rows, + m_cuds_info); + + // Copy the local portion of the solution back into x + thrust::copy(x_global.begin() + row_displs[rank], x_global.begin() + row_displs[rank] + num_rows, x.begin()); + + if (status != CUSOLVER_STATUS_SUCCESS) + { + FatalError("cuSolver trsv failed to solve Lx=rhs", AMGX_ERR_INTERNAL); + } +#endif } else { - x.copy(rhs); + Matrix_d *A = dynamic_cast(Base::m_A); + ViewType oldView = A->currentView(); + A->setViewExterior(); + + if ((!A->is_matrix_singleGPU()) && (!xIsZero)) + { + // Modify rhs to include contribution from halo nodes + // i.e. new_rhs = b - A_halo*x; + // Note: dense_lu solver doesn't support latency hiding + A->manager->exchange_halo_async(x, x.tag); + A->manager->exchange_halo_wait(x, x.tag); + Vector_d new_rhs(rhs.size()); + distributed_rhs_mod(x, rhs, new_rhs); + thrust::copy(new_rhs.begin(), new_rhs.begin() + m_num_rows, x.begin()); + cudaCheckError(); + } + else + { + x.copy(rhs); + } + + cudense_getrs(x); // triangular solves } - cudense_getrs(x); // triangular solves //Speculative send of x vector x.dirtybit = 1; A->setView(oldView); diff --git a/3rd_party/AMGX/eigen_examples/CMakeLists.txt b/3rd_party/AMGX/eigen_examples/CMakeLists.txt index 7302acfc8..4b29cdfc2 100644 --- a/3rd_party/AMGX/eigen_examples/CMakeLists.txt +++ b/3rd_party/AMGX/eigen_examples/CMakeLists.txt @@ -24,28 +24,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) -ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) -ENDIF (WIN32) +cmake_minimum_required (VERSION 3.18) -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../base/include" "${CUDA_TOOLKIT_ROOT_DIR}/include") -if(WIN32) - if(MSVC) - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64") - endif(MSVC) -else(WIN32) - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") -endif(WIN32) +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../base/include" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") GET_FILENAME_COMPONENT(CMAKE_C_COMPILER_NAME "${CMAKE_C_COMPILER}" NAME) IF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(libs_all ${cusparse_library} ${cusolver_library}) - set(dyn_libs amgxsh cudart.lib) + set(libs_all CUDA::cusparse CUDA::cusolver) + set(dyn_libs amgxsh CUDA::cudart CUDA::cublas) ELSE(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(libs_all ${cusparse_library} ${cusolver_library} rt dl) - set(dyn_libs amgxsh rt dl cudart) + set(libs_all rt dl CUDA::cusparse CUDA::cusolver) + set(dyn_libs amgxsh rt dl CUDA::cudart CUDA::cublas) ENDIF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) ADD_EXECUTABLE(eigensolver eigensolver.c) @@ -63,6 +52,7 @@ ENDIF(WIN32) if(MPI_FOUND) ADD_EXECUTABLE(eigensolver_mpi eigensolver_mpi.c) + IF(WIN32) IF(MSVC) set_source_files_properties( eigensolver_mpi.c PROPERTIES LANGUAGE CXX) @@ -74,7 +64,7 @@ if(MPI_FOUND) SET_SOURCE_FILES_PROPERTIES( eigensolver_mpi.c PROPERTIES COMPILE_FLAGS -std=c99 ) set_target_properties ( eigensolver_mpi PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) set_target_properties ( eigensolver_mpi PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries(eigensolver_mpi cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES}) + target_link_libraries(eigensolver_mpi amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) ENDIF(WIN32) if(MPI_COMPILE_FLAGS) diff --git a/3rd_party/AMGX/eigensolvers/CMakeLists.txt b/3rd_party/AMGX/eigensolvers/CMakeLists.txt index de6c36538..d22cea9d3 100644 --- a/3rd_party/AMGX/eigensolvers/CMakeLists.txt +++ b/3rd_party/AMGX/eigensolvers/CMakeLists.txt @@ -24,26 +24,31 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) -ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) -ENDIF (WIN32) +cmake_minimum_required (VERSION 3.18) -FIND_PACKAGE(CUDA) - -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" ${CMAKE_MODULE_PATH}) - -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../base/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../core/include) +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/../cmake" ${CMAKE_MODULE_PATH}) # Select all sources FILE(GLOB_RECURSE SRCS "src/*.cu") -# Generic nvcc flags +target_sources(amgx_libs PRIVATE ${SRCS}) + +add_library(amgx_eigensolvers STATIC ${SRCS}) + +set(AMGX_INCLUDES + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/../base/include + ${CMAKE_CURRENT_SOURCE_DIR}/../core/include) + +target_include_directories(amgx_eigensolvers PUBLIC + ${THRUST_DIR} + ${AMGX_INCLUDES}) + +find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + +target_link_libraries(amgx_eigensolvers amgxsh CUDA::cudart CUDA::cublas) + +# set arch for eigensolvers lib target +set_target_properties(amgx_eigensolvers PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") -CUDA_ADD_LIBRARY(amgx_eigensolvers STATIC ${SRCS}) -target_link_libraries(amgx_eigensolvers amgx_base amgx_core) -#install(TARGETS amgx_eigensolvers DESTINATION "lib/sublibs") +install(TARGETS amgx_eigensolvers DESTINATION "lib/sublibs") diff --git a/3rd_party/AMGX/examples/CMakeLists.txt b/3rd_party/AMGX/examples/CMakeLists.txt index 8a8980988..b0cbf844b 100644 --- a/3rd_party/AMGX/examples/CMakeLists.txt +++ b/3rd_party/AMGX/examples/CMakeLists.txt @@ -24,36 +24,23 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) -ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) -ENDIF (WIN32) - -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../base/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../core/include) - -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../base/include" "${CUDA_TOOLKIT_ROOT_DIR}/include") -if(WIN32) - if(MSVC) - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib/x64") - endif(MSVC) -else(WIN32) - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") -endif(WIN32) +cmake_minimum_required (VERSION 3.18) + +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../base/include" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") GET_FILENAME_COMPONENT(CMAKE_C_COMPILER_NAME "${CMAKE_C_COMPILER}" NAME) IF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(libs_all ${cusparse_library} ${cusolver_library}) - set(dyn_libs amgxsh cudart.lib) + set(libs_all CUDA::cusparse CUDA::cusolver) + set(dyn_libs amgxsh CUDA::cudart_static CUDA::cublas) ELSE(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) - set(libs_all ${cusparse_library} ${cusolver_library} rt dl) - set(dyn_libs amgxsh rt dl cudart) + set(libs_all CUDA::cusparse CUDA::cusolver rt dl) + set(dyn_libs amgxsh rt dl CUDA::cudart_static CUDA::cublas) ENDIF(CMAKE_C_COMPILER_NAME MATCHES cl AND NOT CMAKE_C_COMPILER_NAME MATCHES clang) ADD_EXECUTABLE(amgx_capi amgx_capi.c) +set(AMGX_INCLUDES ${THRUST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../base/include ${CMAKE_CURRENT_SOURCE_DIR}/../core/include) +target_include_directories( amgx_capi PUBLIC ${AMGX_INCLUDES}) target_link_libraries ( amgx_capi ${dyn_libs} ) IF(WIN32) @@ -65,74 +52,71 @@ ELSE(WIN32) SET_SOURCE_FILES_PROPERTIES( amgx_capi.c PROPERTIES COMPILE_FLAGS -std=c99 ) ENDIF(WIN32) - -CUDA_ADD_EXECUTABLE(generate_poisson generate_poisson.cu OPTIONS "-Xcompiler=-I$ENV{METIS_INSTALL_PATH}/include") +add_executable(generate_poisson generate_poisson.cu) +target_compile_options(generate_poisson PUBLIC "-Xcompiler=-I$ENV{METIS_INSTALL_PATH}/include") target_link_libraries(generate_poisson amgx ${libs_all} ) set_target_properties(generate_poisson PROPERTIES LINK_FLAGS "") +target_include_directories(generate_poisson PUBLIC ${AMGX_INCLUDES}) + +set_target_properties(generate_poisson PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") -CUDA_ADD_EXECUTABLE(generate_poisson7_dist_renum generate_poisson7_dist_renum.cu OPTIONS "-Xcompiler=-I$ENV{METIS_INSTALL_PATH}/include") +add_executable(generate_poisson7_dist_renum generate_poisson7_dist_renum.cu) +target_compile_options(generate_poisson7_dist_renum PUBLIC "-Xcompiler=-I$ENV{METIS_INSTALL_PATH}/include") target_link_libraries(generate_poisson7_dist_renum amgx ${libs_all} ) set_target_properties(generate_poisson7_dist_renum PROPERTIES LINK_FLAGS "") +target_include_directories(generate_poisson7_dist_renum PUBLIC ${AMGX_INCLUDES}) + +set_target_properties(generate_poisson7_dist_renum PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") if(MPI_FOUND) ADD_EXECUTABLE(amgx_mpi_capi amgx_mpi_capi.c ) ADD_EXECUTABLE(amgx_mpi_capi_agg amgx_mpi_capi_agg.c ) ADD_EXECUTABLE(amgx_mpi_capi_cla amgx_mpi_capi_cla.c ) ADD_EXECUTABLE(amgx_mpi_poisson7 amgx_mpi_poisson7.c ) -# ADD_EXECUTABLE(amgx_spmv_example amgx_spmv_test.c ) IF(WIN32) IF(MSVC) set_source_files_properties( amgx_mpi_capi.c PROPERTIES LANGUAGE CXX) - set_target_properties ( amgx_mpi_capi PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) - set_target_properties ( amgx_mpi_capi PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + set_target_properties(amgx_mpi_capi PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) + set_target_properties(amgx_mpi_capi PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") target_link_libraries(amgx_mpi_capi ${MPI_C_LIBRARIES} ${dyn_libs}) - set_source_files_properties( amgx_mpi_capi_agg.c PROPERTIES LANGUAGE CXX) - set_target_properties ( amgx_mpi_capi_agg PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) - set_target_properties ( amgx_mpi_capi_agg PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + set_source_files_properties(amgx_mpi_capi_agg.c PROPERTIES LANGUAGE CXX) + set_target_properties(amgx_mpi_capi_agg PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) + set_target_properties(amgx_mpi_capi_agg PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") target_link_libraries(amgx_mpi_capi_agg ${MPI_C_LIBRARIES} ${dyn_libs}) - set_source_files_properties( amgx_mpi_capi_cla.c PROPERTIES LANGUAGE CXX) - set_target_properties ( amgx_mpi_capi_cla PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) - set_target_properties ( amgx_mpi_capi_cla PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + set_source_files_properties(amgx_mpi_capi_cla.c PROPERTIES LANGUAGE CXX) + set_target_properties(amgx_mpi_capi_cla PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) + set_target_properties(amgx_mpi_capi_cla PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") target_link_libraries(amgx_mpi_capi_cla ${MPI_C_LIBRARIES} ${dyn_libs}) - set_source_files_properties( amgx_mpi_poisson7.c PROPERTIES LANGUAGE CXX) - set_target_properties ( amgx_mpi_poisson7 PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) - set_target_properties ( amgx_mpi_poisson7 PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries( amgx_mpi_poisson7 ${MPI_C_LIBRARIES} ${dyn_libs}) - - #set_source_files_properties( amgx_spmv_example.c PROPERTIES LANGUAGE CXX) - #set_target_properties ( amgx_spmv_example PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) - #set_target_properties ( amgx_spmv_example PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - #target_link_libraries( amgx_spmv_example ${MPI_C_LIBRARIES} ${dyn_libs}) + set_source_files_properties(amgx_mpi_poisson7.c PROPERTIES LANGUAGE CXX) + set_target_properties(amgx_mpi_poisson7 PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS}" LINKER_LANGUAGE CXX ) + set_target_properties(amgx_mpi_poisson7 PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") + target_link_libraries(amgx_mpi_poisson7 ${MPI_C_LIBRARIES} ${dyn_libs}) ENDIF(MSVC) ELSE(WIN32) + SET_SOURCE_FILES_PROPERTIES( amgx_mpi_capi.c PROPERTIES COMPILE_FLAGS -std=c99 ) set_target_properties ( amgx_mpi_capi PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) set_target_properties ( amgx_mpi_capi PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries(amgx_mpi_capi cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES}) + target_link_libraries(amgx_mpi_capi amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) SET_SOURCE_FILES_PROPERTIES( amgx_mpi_capi_agg.c PROPERTIES COMPILE_FLAGS -std=c99 ) set_target_properties ( amgx_mpi_capi_agg PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) set_target_properties ( amgx_mpi_capi_agg PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries(amgx_mpi_capi_agg cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES}) + target_link_libraries(amgx_mpi_capi_agg amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) SET_SOURCE_FILES_PROPERTIES( amgx_mpi_capi_cla.c PROPERTIES COMPILE_FLAGS -std=c99 ) set_target_properties ( amgx_mpi_capi_cla PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) set_target_properties ( amgx_mpi_capi_cla PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries(amgx_mpi_capi_cla cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES}) + target_link_libraries(amgx_mpi_capi_cla amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) SET_SOURCE_FILES_PROPERTIES( amgx_mpi_poisson7.c PROPERTIES COMPILE_FLAGS -std=c99 ) set_target_properties ( amgx_mpi_poisson7 PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) set_target_properties ( amgx_mpi_poisson7 PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") - target_link_libraries( amgx_mpi_poisson7 cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) - -# SET_SOURCE_FILES_PROPERTIES( amgx_spmv_test.c PROPERTIES COMPILE_FLAGS -std=c99 ) -# set_target_properties ( amgx_spmv_example PROPERTIES COMPILE_FLAGS "${OpenMP_C_FLAGS} -pthread" ) -# set_target_properties ( amgx_spmv_example PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}") -# target_link_libraries( amgx_spmv_example cudart amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) + target_link_libraries( amgx_mpi_poisson7 amgxsh ${omp_lib} ${MPI_C_LIBRARIES} ${dyn_libs}) ENDIF(WIN32) @@ -143,6 +127,16 @@ if(MPI_FOUND) set_target_properties(amgx_mpi_poisson7 PROPERTIES COMPILE_FLAGS "${MPI_COMPILE_FLAGS}") endif(MPI_COMPILE_FLAGS) + target_include_directories(amgx_mpi_capi PUBLIC ${AMGX_INCLUDES}) + target_include_directories(amgx_mpi_capi_agg PUBLIC ${AMGX_INCLUDES}) + target_include_directories(amgx_mpi_capi_cla PUBLIC ${AMGX_INCLUDES}) + target_include_directories(amgx_mpi_poisson7 PUBLIC ${AMGX_INCLUDES}) + + set_target_properties(amgx_mpi_capi PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + set_target_properties(amgx_mpi_capi_agg PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + set_target_properties(amgx_mpi_capi_cla PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + set_target_properties(amgx_mpi_poisson7 PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + endif(MPI_FOUND) install(TARGETS amgx_capi DESTINATION "lib/examples") diff --git a/3rd_party/AMGX/examples/amgx_mpi_poisson7.c b/3rd_party/AMGX/examples/amgx_mpi_poisson7.c index e93c10027..df3874a33 100644 --- a/3rd_party/AMGX/examples/amgx_mpi_poisson7.c +++ b/3rd_party/AMGX/examples/amgx_mpi_poisson7.c @@ -318,28 +318,45 @@ int main(int argc, char **argv) } } + int nrepeats = 1; + int tidx = findParamIndex(argv, argc, "-r"); + if(tidx != -1) + { + nrepeats = atoi(argv[tidx+1]); + printf("Running for %d repeats\n", nrepeats); + } + + + /* set the connectivity information (for the vector) */ AMGX_vector_bind(x, A); AMGX_vector_bind(b, A); /* upload the vector (and the connectivity information) */ AMGX_vector_upload(x, n, 1, h_x); AMGX_vector_upload(b, n, 1, h_b); - /* solver setup */ - //MPI barrier for stability (should be removed in practice to maximize performance) - MPI_Barrier(amgx_mpi_comm); - AMGX_solver_setup(solver, A); - /* solver solve */ - //MPI barrier for stability (should be removed in practice to maximize performance) - MPI_Barrier(amgx_mpi_comm); - AMGX_solver_solve(solver, b, x); - /* example of how to change parameters between non-linear iterations */ - //AMGX_config_add_parameters(&cfg, "config_version=2, default:tolerance=1e-12"); - //AMGX_solver_solve(solver, b, x); - /* example of how to replace coefficients between non-linear iterations */ - //AMGX_matrix_replace_coefficients(A, n, nnz, values, diag); - //AMGX_solver_setup(solver, A); - //AMGX_solver_solve(solver, b, x); - AMGX_solver_get_status(solver, &status); + for(int r = 0; r < nrepeats; ++r) + { + /* upload the vector (and the connectivity information) */ + AMGX_vector_upload(x, n, 1, h_x); + AMGX_vector_upload(b, n, 1, h_b); + /* solver setup */ + //MPI barrier for stability (should be removed in practice to maximize performance) + MPI_Barrier(amgx_mpi_comm); + AMGX_solver_setup(solver, A); + /* solver solve */ + //MPI barrier for stability (should be removed in practice to maximize performance) + MPI_Barrier(amgx_mpi_comm); + AMGX_solver_solve(solver, b, x); + /* example of how to change parameters between non-linear iterations */ + //AMGX_config_add_parameters(&cfg, "config_version=2, default:tolerance=1e-12"); + //AMGX_solver_solve(solver, b, x); + /* example of how to replace coefficients between non-linear iterations */ + //AMGX_matrix_replace_coefficients(A, n, nnz, values, diag); + //AMGX_solver_setup(solver, A); + //AMGX_solver_solve(solver, b, x); + AMGX_solver_get_status(solver, &status); + } + /* example of how to get (the local part of) the solution */ //int sizeof_v_val; //sizeof_v_val = ((NVAMG_GET_MODE_VAL(NVAMG_VecPrecision, mode) == NVAMG_vecDouble))? sizeof(double): sizeof(float); @@ -354,12 +371,12 @@ int main(int argc, char **argv) AMGX_resources_destroy(rsrc); /* destroy config (need to use AMGX_SAFE_CALL after this point) */ AMGX_SAFE_CALL(AMGX_config_destroy(cfg)) - /* shutdown and exit */ - AMGX_SAFE_CALL(AMGX_finalize_plugins()) - AMGX_SAFE_CALL(AMGX_finalize()) - /* close the library (if it was dynamically loaded) */ + /* shutdown and exit */ + AMGX_SAFE_CALL(AMGX_finalize_plugins()) + AMGX_SAFE_CALL(AMGX_finalize()) + /* close the library (if it was dynamically loaded) */ #ifdef AMGX_DYNAMIC_LOADING - amgx_libclose(lib_handle); + amgx_libclose(lib_handle); #endif MPI_Finalize(); CUDA_SAFE_CALL(cudaDeviceReset()); diff --git a/3rd_party/AMGX/template_plugin/CMakeLists.txt b/3rd_party/AMGX/template_plugin/CMakeLists.txt index 64a6c7e49..149a987c1 100644 --- a/3rd_party/AMGX/template_plugin/CMakeLists.txt +++ b/3rd_party/AMGX/template_plugin/CMakeLists.txt @@ -25,24 +25,34 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) +cmake_minimum_required (VERSION 3.13) ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) +cmake_minimum_required (VERSION 3.13) ENDIF (WIN32) set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../cmake" ${CMAKE_MODULE_PATH}) -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../base/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../core/include) - #select all sources FILE(GLOB_RECURSE SRCS "src/*.cu") -#Generic nvcc flags +add_library(amgx_template_plugin STATIC ${SRCS}) + +set(AMGX_INCLUDES + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/../base/include + ${CMAKE_CURRENT_SOURCE_DIR}/../core/include) + +target_sources(amgx_libs PRIVATE ${SRCS}) + +target_include_directories(amgx_libs PUBLIC ${AMGX_INCLUDES}) + +target_include_directories(amgx_template_plugin PUBLIC ${THRUST_DIR} ${AMGX_INCLUDES}) + +target_link_libraries(amgx_template_plugin amgx_libs) + +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") + set_target_properties(amgx_template_plugin PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") +endif() -CUDA_ADD_LIBRARY(amgx_template_plugin STATIC ${SRCS}) -target_link_libraries(amgx_template_plugin amgx_base) -#install(TARGETS amgx_template_plugin DESTINATION "lib/sublibs") +install(TARGETS amgx_template_plugin DESTINATION "lib/sublibs") diff --git a/3rd_party/AMGX/tests/CMakeLists.txt b/3rd_party/AMGX/tests/CMakeLists.txt index 06e3d0bce..7a6b8eb72 100644 --- a/3rd_party/AMGX/tests/CMakeLists.txt +++ b/3rd_party/AMGX/tests/CMakeLists.txt @@ -25,53 +25,57 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. IF (WIN32) -cmake_minimum_required (VERSION 2.8.8) +cmake_minimum_required (VERSION 3.13) ELSE (WIN32) -cmake_minimum_required (VERSION 2.8.0) +cmake_minimum_required (VERSION 3.13) ENDIF (WIN32) -FIND_PACKAGE(CUDA) - -CUDA_INCLUDE_DIRECTORIES(${THRUST_DIR}) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/tests/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/core/include) -CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/base/include) +set(AMGX_INCLUDES + ${THRUST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/tests/include + ${CMAKE_CURRENT_SOURCE_DIR}/core/include + ${CMAKE_CURRENT_SOURCE_DIR}/base/include) set(tests_all ${tests_all} src/testframework.cu src/test_utils.cu src/unit_test.cu) -CUDA_ADD_LIBRARY(amgx_tests_library STATIC ${tests_all}) +add_library(amgx_tests_libs OBJECT ${tests_all}) + +target_include_directories(amgx_tests_libs PRIVATE ${AMGX_INCLUDES}) + +add_library(amgx_tests_library STATIC $) + add_dependencies(amgx_tests_library amgx) + target_link_libraries(amgx_tests_library amgx) -# workaround -get_target_property(src_tmp amgx_tests_library SOURCES) -set(tests_all ${tests_all} ${src_tmp}) +add_executable(amgx_tests_launcher src/utest.cu $) + +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") + set_target_properties(amgx_tests_launcher PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + set_target_properties(amgx_tests_libs PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") + set_target_properties(amgx_tests_library PROPERTIES CUDA_ARCHITECTURES "${CUDA_ARCH}") -set(tests_obj_all) -foreach( src ${tests_all} ) - if(${src} MATCHES ${CMAKE_BINARY_DIR}) - set(tests_obj_all ${tests_obj_all} ${src}) - endif(${src} MATCHES ${CMAKE_BINARY_DIR}) -endforeach(src) -SET_SOURCE_FILES_PROPERTIES(${tests_obj_all} PROPERTIES GENERATED TRUE EXTERNAL_OBJECT TRUE) -#file(WRITE "tests_obj_all.txt" "${tests_obj_all}") # debug, write out all objects for amgx + target_compile_options(amgx_tests_launcher PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) + target_compile_options(amgx_tests_libs PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) + target_compile_options(amgx_tests_library PUBLIC $<$: ${CUDA_NVCC_FLAGS} >) +endif() -#CUDA_ADD_EXECUTABLE(amgx_tests_launcher ${tests_all} src/testframework.cu src/test_utils.cu src/unit_test.cu src/utest.cu) -CUDA_ADD_EXECUTABLE(amgx_tests_launcher src/utest.cu ${tests_obj_all}) +target_include_directories(amgx_tests_launcher PRIVATE ${AMGX_INCLUDES}) # create executables for host and device, scalar and block version -# set(libs_all ${cusparse_library}) GET_FILENAME_COMPONENT(CMAKE_C_COMPILER_NAME "${CMAKE_C_COMPILER}" NAME) IF(CMAKE_C_COMPILER_NAME MATCHES cl) - set(libs_all ${cusparse_library}) + set(libs_all ${CUSPARSE_LIB} ${CUBLAS_LIB}) ELSE(CMAKE_C_COMPILER_NAME MATCHES cl) - set(libs_all ${cusparse_library} rt dl) + set(libs_all ${CUSPARSE_LIB} ${CUBLAS_LIB} rt dl) ENDIF(CMAKE_C_COMPILER_NAME MATCHES cl) add_dependencies(amgx_tests_launcher amgx_tests_library) + target_link_libraries(amgx_tests_launcher amgxsh amgx ${libs_all}) if(${AMGX_PUBLIC_RELEASE} MATCHES "FALSE") install(TARGETS amgx_tests_launcher DESTINATION "lib/tests") endif(${AMGX_PUBLIC_RELEASE} MATCHES "FALSE") + diff --git a/3rd_party/gslib/ogs/include/ogsDefs.h b/3rd_party/gslib/ogs/include/ogsDefs.h new file mode 100644 index 000000000..c8e14715c --- /dev/null +++ b/3rd_party/gslib/ogs/include/ogsDefs.h @@ -0,0 +1,78 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +/* the supported types */ +typedef long long long_long; +#if 0 +#define OGS_FOR_EACH_TYPE(macro) \ + macro(double ) \ + macro(float ) \ + macro(int ) \ + macro(long ) \ + macro(long_long) +#else +#define OGS_FOR_EACH_TYPE(macro) \ + macro(double ) \ + macro(float ) +#endif + +/* the supported ops */ +#if 0 +#define OGS_FOR_EACH_OP(T,macro) \ + macro(T,add) \ + macro(T,mul) \ + macro(T,min) \ + macro(T,max) +#else +#define OGS_FOR_EACH_OP(T,macro) \ + macro(T,add) \ + macro(T,min) \ + macro(T,max) +#endif + +#define OGS_DO_add(a,b) a+=b +#define OGS_DO_mul(a,b) a*=b +#define OGS_DO_min(a,b) if(ba) a=b + +/* type size array */ +#define OGS_TYPE_SIZE_ITEM(T) sizeof(T), +#define OGS_DEFINE_TYPE_SIZES() \ + static const unsigned ogs_type_size[] = \ + { OGS_FOR_EACH_TYPE(OGS_TYPE_SIZE_ITEM) 0 }; + +/* mapping from ogs types to gs types */ +#define gs_int64_t gs_long_long +#define OGS_GS_MAP_TYPE_ITEM(T) gs_##T, +#define OGS_GS_DEFINE_TYPE_MAP() \ + static const gs_dom ogs_gs_type_map[] = \ + { OGS_FOR_EACH_TYPE(OGS_GS_MAP_TYPE_ITEM) gs_dom_n }; + +/* mapping from ogs ops to gs ops */ +#define OGS_GS_MAP_OP_ITEM(T,OP) gs_##OP, +#define OGS_GS_DEFINE_OP_MAP() \ + static const gs_op ogs_gs_op_map[] = \ + { OGS_FOR_EACH_OP(T,OGS_GS_MAP_OP_ITEM) gs_op_n }; diff --git a/3rd_party/gslib/ogs/include/ogsKernels.hpp b/3rd_party/gslib/ogs/include/ogsKernels.hpp index 4cd069326..bdeb41b8c 100644 --- a/3rd_party/gslib/ogs/include/ogsKernels.hpp +++ b/3rd_party/gslib/ogs/include/ogsKernels.hpp @@ -27,10 +27,13 @@ SOFTWARE. #ifndef OGS_KERNELS_HPP #define OGS_KERNELS_HPP 1 +#include #include "ogs.hpp" namespace ogs { + extern const int gatherNodesPerBlock; + extern int Nrefs; extern void* hostBuf; @@ -40,6 +43,11 @@ namespace ogs { extern occa::memory o_haloBuf; extern occa::memory h_haloBuf; + extern occa::kernel gatherScatterNewKernel_floatAdd; + extern occa::kernel gatherScatterNewKernel_doubleAdd; + extern occa::kernel gatherScatterNewKernel_doubleMin; + extern occa::kernel gatherScatterNewKernel_doubleMax; + extern occa::kernel gatherScatterKernel_floatAdd; extern occa::kernel gatherScatterKernel_floatMul; extern occa::kernel gatherScatterKernel_floatMin; diff --git a/3rd_party/gslib/ogs/ogs.hpp b/3rd_party/gslib/ogs/ogs.hpp index f2a83fe77..4173d0da2 100644 --- a/3rd_party/gslib/ogs/ogs.hpp +++ b/3rd_party/gslib/ogs/ogs.hpp @@ -151,11 +151,13 @@ typedef struct { int Nhalo; // number of halo nodes int NhaloGather; // number of gathered nodes on halo int NownedHalo; // number of owned halo nodes + int NrowBlocks; int *localGatherOffsets; int *localGatherIds; occa::memory o_localGatherOffsets; occa::memory o_localGatherIds; + occa::memory o_blockRowStarts; int *haloGatherOffsets; int *haloGatherIds; @@ -236,7 +238,7 @@ void *ogsHostMallocPinned(occa::device &device, size_t size, void *source, occa: #define USE_OOGS -enum oogs_mode { OOGS_AUTO, OOGS_DEFAULT, OOGS_HOSTMPI, OOGS_DEVICEMPI }; +enum oogs_mode { OOGS_LOCAL, OOGS_DEFAULT, OOGS_HOSTMPI, OOGS_DEVICEMPI, OOGS_AUTO }; enum oogs_modeExchange { OOGS_EX_PW, OOGS_EX_NBC }; typedef struct { @@ -288,6 +290,7 @@ oogs_t *setup(ogs_t *ogs, int nVec, int stride, const char *type, std::function< oogs_t *setup(int N, long long int *ids, const int k, const int stride, const char *type, MPI_Comm &comm, int verbose, occa::device device, std::function callback, oogs_mode mode); void gpu_mpi(int val); +void overlap(int val); int gpu_mpi(); void destroy(oogs_t *h); diff --git a/3rd_party/gslib/ogs/okl/gatherScatterNew.okl b/3rd_party/gslib/ogs/okl/gatherScatterNew.okl new file mode 100644 index 000000000..e6ec4dcee --- /dev/null +++ b/3rd_party/gslib/ogs/okl/gatherScatterNew.okl @@ -0,0 +1,91 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +// OCCA will #include "ogsDefs.h" before compilation + +/*------------------------------------------------------------------------------ + The basic gather-scatter kernel +------------------------------------------------------------------------------*/ +#define DEFINE_GATHERSCATTER(T,OP) \ +@kernel void gatherScatter_##T##_##OP(const dlong Nblocks, \ + const int Nentries, \ + const int Nvectors, \ + const dlong stride, \ + @restrict const dlong *blockStarts, \ + @restrict const dlong *gatherStarts, \ + @restrict const dlong *gatherIds, \ + @restrict T *q) \ +{ \ + for(dlong m=0;m0)) { + { + occa::properties props2 = ogs::kernelInfo; + props2["includes"] += DOGS "/include/ogsDefs.h"; + props2["defines/p_gatherNodesPerBlock"] = gatherNodesPerBlock; + props2["defines/init_" "float" "_add"] = (float) 0; + props2["defines/init_" "float" "_add"] = (float) 0; + props2["defines/init_" "float" "_min"] = (float) std::numeric_limits::max(); + props2["defines/init_" "float" "_max"] = (float) -std::numeric_limits::max(); + props2["defines/init_" "double" "_add"] = (double) 0; + props2["defines/init_" "double" "_min"] = (double) std::numeric_limits::max(); + props2["defines/init_" "double" "_max"] = (double) -std::numeric_limits::max(); + ogs::gatherScatterNewKernel_floatAdd = + device.buildKernel(DOGS "/okl/gatherScatterNew.okl", "gatherScatter_float_add", props2); + ogs::gatherScatterNewKernel_doubleAdd = + device.buildKernel(DOGS "/okl/gatherScatterNew.okl", "gatherScatter_double_add", props2); + ogs::gatherScatterNewKernel_doubleMin = + device.buildKernel(DOGS "/okl/gatherScatterNew.okl", "gatherScatter_double_min", props2); + ogs::gatherScatterNewKernel_doubleMax = + device.buildKernel(DOGS "/okl/gatherScatterNew.okl", "gatherScatter_double_max", props2); + } + ogs::gatherScatterKernel_floatAdd = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatAdd", props); ogs::gatherScatterKernel_floatMul = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMul", props); ogs::gatherScatterKernel_floatMin = device.buildKernel(DOGS "/okl/gatherScatter.okl", "gatherScatter_floatMin", props); @@ -313,20 +340,20 @@ void ogs::initKernels(MPI_Comm comm, occa::device device, bool verbose) { - ogs::scatterKernel_float = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_float", props); + ogs::scatterKernel_float = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_float", props); ogs::scatterKernel_double = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_double", props); - ogs::scatterKernel_int = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_int", props); - ogs::scatterKernel_long = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_long", props); + ogs::scatterKernel_int = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_int", props); + ogs::scatterKernel_long = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_long", props); - ogs::scatterVecKernel_float = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_float", props); + ogs::scatterVecKernel_float = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_float", props); ogs::scatterVecKernel_double = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_double", props); - ogs::scatterVecKernel_int = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_int", props); - ogs::scatterVecKernel_long = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_long", props); + ogs::scatterVecKernel_int = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_int", props); + ogs::scatterVecKernel_long = device.buildKernel(DOGS "/okl/scatterVec.okl", "scatterVec_long", props); - ogs::scatterManyKernel_float = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_float", props); + ogs::scatterManyKernel_float = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_float", props); ogs::scatterManyKernel_double = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_double", props); - ogs::scatterManyKernel_int = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_int", props); - ogs::scatterManyKernel_long = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_long", props); + ogs::scatterManyKernel_int = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_int", props); + ogs::scatterManyKernel_long = device.buildKernel(DOGS "/okl/scatterMany.okl", "scatterMany_long", props); } MPI_Barrier(comm); } diff --git a/3rd_party/gslib/ogs/src/ogsSetup.cpp b/3rd_party/gslib/ogs/src/ogsSetup.cpp index d01571269..68f1919f3 100644 --- a/3rd_party/gslib/ogs/src/ogsSetup.cpp +++ b/3rd_party/gslib/ogs/src/ogsSetup.cpp @@ -66,6 +66,48 @@ int compareLocalId(const void *a, const void *b){ return 0; } +void setupRowBlocks(ogs_t *ogs, occa::device &device) +{ + dlong blockSum=0; + ogs->NrowBlocks=0; + if (ogs->NlocalGather) ogs->NrowBlocks++; + for (dlong i=0;iNlocalGather;i++) { + dlong rowSize = ogs->localGatherOffsets[i+1]-ogs->localGatherOffsets[i]; + + if (rowSize > ogs::gatherNodesPerBlock) { + //this row is pathalogically big. We can't currently run this + std::cout << "Multiplicity of global node id: " << i << "in ogsSetup is too large."; + exit(1); + } + + if (blockSum+rowSize > ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block + ogs->NrowBlocks++; //count the previous block + blockSum=rowSize; //start a new row block + } else { + blockSum+=rowSize; //add this row to the block + } + } + + dlong* blockRowStarts = (dlong*) calloc(ogs->NrowBlocks+1,sizeof(dlong)); + + blockSum=0; + ogs->NrowBlocks=0; + if (ogs->NlocalGather) ogs->NrowBlocks++; + for (dlong i=0;iNlocalGather;i++) { + dlong rowSize = ogs->localGatherOffsets[i+1]-ogs->localGatherOffsets[i]; + + if (blockSum+rowSize > ogs::gatherNodesPerBlock) { //adding this row will exceed the nnz per block + blockRowStarts[ogs->NrowBlocks++] = i; //mark the previous block + blockSum=rowSize; //start a new row block + } else { + blockSum+=rowSize; //add this row to the block + } + } + blockRowStarts[ogs->NrowBlocks] = ogs->NlocalGather; + ogs->o_blockRowStarts = device.malloc((ogs->NrowBlocks+1)*sizeof(dlong), blockRowStarts); + free(blockRowStarts); +} + ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, int verbose, occa::device device){ @@ -349,6 +391,8 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, if(ogs->N) ogs->o_invDegree.copyFrom(ogs->invDegree); + setupRowBlocks(ogs, device); + return ogs; } diff --git a/3rd_party/gslib/ogs/src/oogs.cpp b/3rd_party/gslib/ogs/src/oogs.cpp index bdc7a073a..8843ac48f 100644 --- a/3rd_party/gslib/ogs/src/oogs.cpp +++ b/3rd_party/gslib/ogs/src/oogs.cpp @@ -18,6 +18,7 @@ static const unsigned transpose = 0; static const unsigned recv = 0^transpose, send = 1^transpose; static int OGS_MPI_SUPPORT = 0; +static int OGS_OVERLAP = 1; static int compiled = 0; typedef enum { mode_plain, mode_vec, mode_many, @@ -68,11 +69,11 @@ static void convertPwMap(const uint *restrict map, int *restrict starts, int *restrict ids) { - uint i,j; + uint i,j; int n=0, s=0; while((i=*map++)!=UINT_MAX) { // end of map starts[s] = n; - j=*map++; + j=*map++; do { ids[n] = j; n++; @@ -86,8 +87,8 @@ static void neighborAllToAll(int unit_size, oogs_t *gs) { ogs_t *ogs = gs->ogs; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; - const void* execdata = hgs->r.data; - const struct pw_data *pwd = (pw_data*) execdata; + const void* execdata = hgs->r.data; + const struct pw_data *pwd = (pw_data*) execdata; { uint bufOffset = 0; @@ -110,12 +111,12 @@ static void neighborAllToAll(int unit_size, oogs_t *gs) } } - unsigned char *bufRecv = (unsigned char*)gs->o_bufRecv.ptr(); - unsigned char *bufSend = (unsigned char*)gs->o_bufSend.ptr(); + unsigned char *bufRecv = (unsigned char*)gs->o_bufRecv.ptr(); + unsigned char *bufSend = (unsigned char*)gs->o_bufSend.ptr(); if(gs->mode != OOGS_DEVICEMPI) { ogs->device.finish(); // waiting for send buffers to be ready - bufRecv = (unsigned char*)gs->bufRecv; - bufSend = (unsigned char*)gs->bufSend; + bufRecv = (unsigned char*)gs->bufRecv; + bufSend = (unsigned char*)gs->bufSend; } MPI_Neighbor_alltoallv(bufSend, gs->nbc.sendcounts, gs->nbc.senddispls, MPI_UNSIGNED_CHAR, bufRecv, gs->nbc.recvcounts, gs->nbc.recvdispls, MPI_UNSIGNED_CHAR, @@ -126,15 +127,15 @@ static void pairwiseExchange(int unit_size, oogs_t *gs) { ogs_t *ogs = gs->ogs; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; - const void* execdata = hgs->r.data; - const struct pw_data *pwd = (pw_data*) execdata; + const void* execdata = hgs->r.data; + const struct pw_data *pwd = (pw_data*) execdata; const struct comm *comm = &hgs->comm; if(!gs->earlyPrepostRecv) { - unsigned char *buf = (unsigned char*)gs->o_bufRecv.ptr(); + unsigned char *buf = (unsigned char*)gs->o_bufRecv.ptr(); if(gs->mode != OOGS_DEVICEMPI) buf = (unsigned char *)gs->bufRecv; - comm_req *req = pwd->req; + comm_req *req = pwd->req; const struct pw_comm_data *c = &pwd->comm[recv]; const uint *p, *pe, *size=c->size; for(p=c->p,pe=p+c->n;p!=pe;++p) { @@ -148,10 +149,10 @@ static void pairwiseExchange(int unit_size, oogs_t *gs) unsigned char *buf = (unsigned char*)gs->o_bufSend.ptr(); if(gs->mode != OOGS_DEVICEMPI) { ogs->device.finish(); // waiting for send buffers to be ready - buf = (unsigned char*)gs->bufSend; + buf = (unsigned char*)gs->bufSend; } - comm_req *req = &pwd->req[pwd->comm[recv].n]; + comm_req *req = &pwd->req[pwd->comm[recv].n]; const struct pw_comm_data *c = &pwd->comm[send]; const uint *p, *pe, *size=c->size; for(p=c->p,pe=p+c->n;p!=pe;++p) { @@ -162,12 +163,75 @@ static void pairwiseExchange(int unit_size, oogs_t *gs) MPI_Waitall(pwd->comm[send].n + pwd->comm[recv].n, pwd->req, MPI_STATUSES_IGNORE); } } +void occaGatherScatterLocal(const dlong NlocalGather, + const dlong NrowBlocks, + occa::memory& o_bstart, + occa::memory& o_gstart, + occa::memory& o_gids, + const int Nvectors, + const dlong stride, + const char* type, + const char* op, + occa::memory& o_v) +{ +#if 1 + occaGatherScatterMany(NlocalGather, Nvectors, stride, o_gstart, + o_gids, type, op, o_v); +#else + const int Nentries = 1; + if (!strcmp(type, "float") && !strcmp(op, "add")){ + ogs::gatherScatterNewKernel_floatAdd(NrowBlocks, + Nentries, + Nvectors, + stride, + o_bstart, + o_gstart, + o_gids, + o_v); + } else if (!strcmp(type, "double") && !strcmp(op, "add")){ + ogs::gatherScatterNewKernel_doubleAdd(NrowBlocks, + Nentries, + Nvectors, + stride, + o_bstart, + o_gstart, + o_gids, + o_v); + } else if (!strcmp(type, "double") && !strcmp(op, "min")){ + ogs::gatherScatterNewKernel_doubleMin(NrowBlocks, + Nentries, + Nvectors, + stride, + o_bstart, + o_gstart, + o_gids, + o_v); + } else if (!strcmp(type, "double") && !strcmp(op, "max")){ + ogs::gatherScatterNewKernel_doubleMax(NrowBlocks, + Nentries, + Nvectors, + stride, + o_bstart, + o_gstart, + o_gids, + o_v); + } else { + printf("occaGatherScatterNewKernel: unsupported operation or datatype!\n"); + exit(1); + } +#endif +} void oogs::gpu_mpi(int val) { OGS_MPI_SUPPORT = val; } +void oogs::overlap(int val) +{ + OGS_OVERLAP = val; +} + int oogs::gpu_mpi() { return OGS_MPI_SUPPORT; @@ -195,16 +259,38 @@ void oogs::compile(const occa::device& device, std::string mode, MPI_Comm comm, if(mode == "HIP" || mode == "CUDA") { std::string fileName = DOGS; - if(mode == "CUDA") fileName += "/okl/oogs-half.cu"; - if(mode == "HIP") fileName += "/okl/oogs-half.hip"; + fileName += "/okl/"; + std::string extension; + if(mode == "CUDA") extension = ".cu"; + if(mode == "HIP") extension = ".hip"; occa::properties nativeProperties = props; nativeProperties["okl/enabled"] = false; - device.buildKernel(fileName.c_str(), "packBuf_halfAdd", nativeProperties); - device.buildKernel(fileName.c_str(), "unpackBuf_halfAdd", nativeProperties); + device.buildKernel(fileName + "oogs-half" + extension, "packBuf_halfAdd", nativeProperties); + device.buildKernel(fileName + "oogs-half" + extension, "unpackBuf_halfAdd", nativeProperties); } } compiled++; } + +void reallocBuffers(int unit_size, oogs_t *gs) +{ + ogs_t *ogs = gs->ogs; + struct gs_data *hgs = (gs_data*) ogs->haloGshSym; + const void* execdata = hgs->r.data; + const struct pw_data *pwd = (pw_data*) execdata; + + if (gs->o_bufSend.size() < pwd->comm[send].total*unit_size) { + if(gs->o_bufSend.size()) gs->o_bufSend.free(); + if(gs->h_buffSend.size()) gs->h_buffSend.free(); + gs->bufSend = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[send].total*unit_size, NULL, gs->o_bufSend, gs->h_buffSend); + } + if (gs->o_bufRecv.size() < pwd->comm[recv].total*unit_size) { + if(gs->o_bufRecv.size()) gs->o_bufRecv.free(); + if(gs->h_buffRecv.size()) gs->h_buffRecv.free(); + gs->bufRecv = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[recv].total*unit_size, NULL, gs->o_bufRecv, gs->h_buffRecv); + } +} + oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::function callback, oogs_mode gsMode) { oogs_t *gs = new oogs_t[1]; @@ -215,94 +301,101 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f struct gs_data *hgs = (gs_data*) ogs->haloGshSym; const void* execdata = hgs->r.data; const struct pw_data *pwd = (pw_data*) execdata; - const unsigned unit_size = nVec*sizeof(double); // just need to be big enough to run callcack + const unsigned unit_size = std::max(nVec, 6)*sizeof(double); // just need to be big enough to run callcack gs->comm = hgs->comm.c; int rank; MPI_Comm_rank(gs->comm, &rank); - gs->rank = rank; + gs->rank = rank; gs->mode = gsMode; if(!compiled) oogs::compile(device, device.mode(), gs->comm); - if(gsMode == OOGS_DEFAULT) return gs; - gs->packBufFloatAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_floatAdd", ogs::kernelInfo); - gs->unpackBufFloatAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_floatAdd", ogs::kernelInfo); - gs->packBufDoubleAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleAdd", ogs::kernelInfo); + if(gsMode == OOGS_DEFAULT) return gs; + gs->packBufFloatAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_floatAdd", ogs::kernelInfo); + gs->unpackBufFloatAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_floatAdd", ogs::kernelInfo); + gs->packBufDoubleAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleAdd", ogs::kernelInfo); gs->unpackBufDoubleAddKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleAdd", ogs::kernelInfo); - gs->packBufDoubleMinKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleMin", ogs::kernelInfo); + gs->packBufDoubleMinKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleMin", ogs::kernelInfo); gs->unpackBufDoubleMinKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleMin", ogs::kernelInfo); - gs->packBufDoubleMaxKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleMax", ogs::kernelInfo); + gs->packBufDoubleMaxKernel = device.buildKernel(DOGS "/okl/oogs.okl", "packBuf_doubleMax", ogs::kernelInfo); gs->unpackBufDoubleMaxKernel = device.buildKernel(DOGS "/okl/oogs.okl", "unpackBuf_doubleMax", ogs::kernelInfo); if(device.mode() == "HIP" || device.mode() == "CUDA") { std::string fileName = DOGS; - if(device.mode() == "CUDA") fileName += "/okl/oogs-half.cu"; - if(device.mode() == "HIP") fileName += "/okl/oogs-half.hip"; + fileName += "/okl/"; + std::string extension; + if(device.mode() == "CUDA") extension = ".cu"; + if(device.mode() == "HIP") extension = ".hip"; occa::properties nativeProperties = ogs::kernelInfo; nativeProperties["okl/enabled"] = false; - gs->packBufFloatToHalfAddKernel = device.buildKernel(fileName.c_str(), "packBuf_halfAdd", nativeProperties); - gs->unpackBufHalfToFloatAddKernel = device.buildKernel(fileName.c_str(), "unpackBuf_halfAdd", nativeProperties); + gs->packBufFloatToHalfAddKernel = + device.buildKernel(fileName + "oogs-half" + extension, "packBuf_halfAdd", nativeProperties); + gs->unpackBufHalfToFloatAddKernel = + device.buildKernel(fileName + "oogs-half" + extension, "unpackBuf_halfAdd", nativeProperties); } - if(ogs->NhaloGather == 0) return gs; - - gs->bufSend = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[send].total*unit_size, NULL, gs->o_bufSend, gs->h_buffSend); - int *scatterOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); - int *scatterIds = (int*) calloc(pwd->comm[send].total,sizeof(int)); - convertPwMap(pwd->map[send], scatterOffsets, scatterIds); - gs->o_scatterOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), scatterOffsets); - gs->o_scatterIds = ogs->device.malloc(pwd->comm[send].total*sizeof(int), scatterIds); - free(scatterOffsets); - free(scatterIds); - - gs->bufRecv = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[recv].total*unit_size, NULL, gs->o_bufRecv, gs->h_buffRecv); - int* gatherOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); - int *gatherIds = (int*) calloc(pwd->comm[recv].total,sizeof(int)); - convertPwMap(pwd->map[recv], gatherOffsets, gatherIds); - gs->o_gatherOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), gatherOffsets); - gs->o_gatherIds = ogs->device.malloc(pwd->comm[recv].total*sizeof(int), gatherIds); - free(gatherOffsets); - free(gatherIds); - - const int reorder = 0; - int* src = (int*) calloc(pwd->comm[recv].n, sizeof(int)); - int* dst = (int*) calloc(pwd->comm[send].n, sizeof(int)); - for(int i = 0; i < pwd->comm[recv].n; ++i) { - src[i] = pwd->comm[recv].p[i]; - } - for(int i = 0; i < pwd->comm[send].n; ++i) { - dst[i] = pwd->comm[send].p[i]; - } - MPI_Dist_graph_create_adjacent(gs->comm, - pwd->comm[recv].n, src, MPI_UNWEIGHTED, - pwd->comm[send].n, dst, MPI_UNWEIGHTED, - MPI_INFO_NULL, reorder, &gs->nbc.comm); - free(src); - free(dst); - gs->nbc.sendcounts = (int*) calloc(pwd->comm[send].n, sizeof(int)); - gs->nbc.senddispls = (int*) calloc(pwd->comm[send].n, sizeof(int)); - gs->nbc.recvcounts = (int*) calloc(pwd->comm[recv].n, sizeof(int)); - gs->nbc.recvdispls = (int*) calloc(pwd->comm[recv].n, sizeof(int)); - std::list oogs_mode_list; - oogs_mode_list.push_back(OOGS_DEFAULT); - oogs_mode_list.push_back(OOGS_HOSTMPI); - if(OGS_MPI_SUPPORT && ogs->device.mode() != "Serial") { - oogs_mode_list.push_back(OOGS_DEVICEMPI);; - } + oogs_mode_list.push_back(OOGS_LOCAL); std::list oogs_modeExchange_list; oogs_modeExchange_list.push_back(OOGS_EX_PW); - oogs_modeExchange_list.push_back(OOGS_EX_NBC); + + if(ogs->NhaloGather > 0) { + gs->bufSend = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[send].total*unit_size, NULL, gs->o_bufSend, gs->h_buffSend); + int *scatterOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); + int *scatterIds = (int*) calloc(pwd->comm[send].total,sizeof(int)); + convertPwMap(pwd->map[send], scatterOffsets, scatterIds); + gs->o_scatterOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), scatterOffsets); + gs->o_scatterIds = ogs->device.malloc(pwd->comm[send].total*sizeof(int), scatterIds); + free(scatterOffsets); + free(scatterIds); + + gs->bufRecv = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[recv].total*unit_size, NULL, gs->o_bufRecv, gs->h_buffRecv); + int* gatherOffsets = (int*) calloc(ogs->NhaloGather+1,sizeof(int)); + int *gatherIds = (int*) calloc(pwd->comm[recv].total,sizeof(int)); + convertPwMap(pwd->map[recv], gatherOffsets, gatherIds); + gs->o_gatherOffsets = ogs->device.malloc((ogs->NhaloGather+1)*sizeof(int), gatherOffsets); + gs->o_gatherIds = ogs->device.malloc(pwd->comm[recv].total*sizeof(int), gatherIds); + free(gatherOffsets); + free(gatherIds); + const int reorder = 0; + int* src = (int*) calloc(pwd->comm[recv].n, sizeof(int)); + int* dst = (int*) calloc(pwd->comm[send].n, sizeof(int)); + for(int i = 0; i < pwd->comm[recv].n; ++i) { + src[i] = pwd->comm[recv].p[i]; + } + for(int i = 0; i < pwd->comm[send].n; ++i) { + dst[i] = pwd->comm[send].p[i]; + } + MPI_Dist_graph_create_adjacent(gs->comm, + pwd->comm[recv].n, src, MPI_UNWEIGHTED, + pwd->comm[send].n, dst, MPI_UNWEIGHTED, + MPI_INFO_NULL, reorder, &gs->nbc.comm); + free(src); + free(dst); + gs->nbc.sendcounts = (int*) calloc(pwd->comm[send].n, sizeof(int)); + gs->nbc.senddispls = (int*) calloc(pwd->comm[send].n, sizeof(int)); + gs->nbc.recvcounts = (int*) calloc(pwd->comm[recv].n, sizeof(int)); + gs->nbc.recvdispls = (int*) calloc(pwd->comm[recv].n, sizeof(int)); + + + oogs_mode_list.push_back(OOGS_DEFAULT); + oogs_mode_list.push_back(OOGS_HOSTMPI); + if(OGS_MPI_SUPPORT && ogs->device.mode() != "Serial") { + oogs_mode_list.push_back(OOGS_DEVICEMPI);; + } + oogs_modeExchange_list.push_back(OOGS_EX_NBC); + } + if(gsMode == OOGS_AUTO) { - if(gs->rank == 0) printf("timing oogs modes: "); + if(gs->rank == 0) printf("timing gs modes: "); const int Ntests = 10; double elapsedMin = std::numeric_limits::max(); - oogs_mode fastestMode; - oogs_modeExchange fastestModeExchange; - int fastestPrepostRecv; + oogs_mode fastestMode = OOGS_DEFAULT; + oogs_modeExchange fastestModeExchange = OOGS_EX_PW; + int fastestPrepostRecv = 0; char* q = (char*) calloc(std::max(stride,ogs->N)*unit_size, sizeof(char)); occa::memory o_q = device.malloc(std::max(stride,ogs->N)*unit_size, q); @@ -313,26 +406,29 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f for (auto const& modeExchange : oogs_modeExchange_list) { gs->modeExchange = modeExchange; - if(gs->modeExchange == OOGS_EX_NBC && gs->mode == OOGS_DEVICEMPI) - continue; // not yet supported by some MPI implementations + if(gs->modeExchange == OOGS_EX_NBC && gs->mode == OOGS_DEVICEMPI) + continue; // not yet supported by some MPI implementations - // warum-up - gs->earlyPrepostRecv = 0; - oogs::start (o_q, nVec, stride, type, ogsAdd, gs); - if(callback) callback(); - oogs::finish(o_q, nVec, stride, type, ogsAdd, gs); - - int nPass = 1; - if(gs->modeExchange == OOGS_EX_PW) nPass = 2; + int nPass = 1; for(int pass = 0; pass < nPass; pass++) { gs->earlyPrepostRecv = pass; - if(gs->mode == OOGS_DEFAULT) { - if(!(gs->modeExchange == OOGS_EX_PW && gs->earlyPrepostRecv ==0)) continue; + // skip invalid combinations + if(gs->modeExchange != OOGS_EX_PW && gs->earlyPrepostRecv) + continue; + if(gs->mode == OOGS_DEFAULT || gs->mode == OOGS_LOCAL) { + if(gs->modeExchange != OOGS_EX_PW) continue; + if(gs->earlyPrepostRecv) continue; } - //printf("testing mode %d exchange %d earlyPrepost %d\n", gs->mode, gs->modeExchange, gs->earlyPrepostRecv); - double elapsedTest[Ntests]; +#if 0 + if(gs->rank == 0) + printf("\ntesting mode %d exchange %d earlyPrepost %d\n", + gs->mode, gs->modeExchange, gs->earlyPrepostRecv); +#endif + + // run Ntests measurements to eliminate runtime variations + double elapsedTest = std::numeric_limits::max(); for(int test=0;testcomm); @@ -342,16 +438,20 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f if(callback) callback(); oogs::finish(o_q, nVec, stride, type, ogsAdd, gs); - elapsedTest[test] = MPI_Wtime() - tStart; + device.finish(); + elapsedTest = std::min(elapsedTest, MPI_Wtime() - tStart); } - MPI_Allreduce(MPI_IN_PLACE, elapsedTest, Ntests, MPI_DOUBLE, MPI_MIN, gs->comm); - const double elapsed = elapsedTest[0]; - if(gs->rank == 0) printf("%gs ", elapsed); - if(elapsed < elapsedMin){ - elapsedMin = elapsed; - fastestMode = gs->mode; - fastestModeExchange = gs->modeExchange; - fastestPrepostRecv = gs->earlyPrepostRecv; + MPI_Allreduce(MPI_IN_PLACE, &elapsedTest, 1, MPI_DOUBLE, MPI_MAX, gs->comm); + + if(gs->rank == 0) printf("%.2es ", elapsedTest); + fflush(stdout); + if(elapsedTest < elapsedMin){ + if(gs->mode != OOGS_LOCAL) { + elapsedMin = elapsedTest; + fastestMode = gs->mode; + fastestModeExchange = gs->modeExchange; + fastestPrepostRecv = gs->earlyPrepostRecv; + } } } } @@ -373,10 +473,68 @@ oogs_t* oogs::setup(ogs_t *ogs, int nVec, dlong stride, const char *type, std::f #ifdef DISABLE_OOGS gs->mode = OOGS_DEFAULT; #endif - MPI_Barrier(gs->comm); - if(gs->rank == 0) printf("used config: %d.%d.%d\n", gs->mode, gs->modeExchange, gs->earlyPrepostRecv); - return gs; + double elapsedMinMPI = std::numeric_limits::max(); + { + const int earlyPrepostRecv = gs->earlyPrepostRecv; + gs->earlyPrepostRecv = 0; + const int Ntests = 10; + size_t Nbytes; + if (!strcmp(type, "float")) + Nbytes = sizeof(float); + else if (!strcmp(type, "double")) + Nbytes = sizeof(double); + else if (!strcmp(type, "int")) + Nbytes = sizeof(int); + else if (!strcmp(type, "long long int")) + Nbytes = sizeof(long long int); + + const size_t unit_size = nVec*Nbytes; + reallocBuffers(unit_size, gs); + + device.finish(); + for(int test=0;testcomm); + const double tStart = MPI_Wtime(); + if(gs->modeExchange == OOGS_EX_NBC) + neighborAllToAll(unit_size, gs); + else + pairwiseExchange(unit_size, gs); + elapsedMinMPI = std::min(elapsedMinMPI, MPI_Wtime() - tStart); + } + gs->earlyPrepostRecv = earlyPrepostRecv; + } + + { + double nBytesExchange = (pwd->comm[send].total + pwd->comm[recv].total)*unit_size; + MPI_Allreduce(MPI_IN_PLACE, &nBytesExchange, 1, MPI_DOUBLE, MPI_SUM, gs->comm); + MPI_Allreduce(MPI_IN_PLACE, &elapsedMinMPI, 1, MPI_DOUBLE, MPI_MAX, gs->comm); + + int size; + MPI_Comm_size(gs->comm, &size); + nBytesExchange /= size; + const std::string gsModeExchangeStr = (gs->modeExchange == OOGS_EX_NBC) ? "nbc": "pw"; + const std::string gsEarlyPrepostRecvStr = (gs->earlyPrepostRecv) ? "+early": ""; + if(gs->rank == 0) { + if(ogs->NhaloGather > 0) { + std::string gsModeStr; + switch(gs->mode) { + case OOGS_DEFAULT : gsModeStr = "+host"; break; + case OOGS_HOSTMPI : gsModeStr = "+hybrid"; break; + case OOGS_DEVICEMPI: gsModeStr = "+device"; break; + } + printf("\nused config: %s%s%s ", gsModeExchangeStr.c_str(), gsEarlyPrepostRecvStr.c_str(), gsModeStr.c_str()); + if(elapsedMinMPI > MPI_Wtick()) + printf("(MPI: %.2es / bi-bw: %.1fGB/s/rank)\n", elapsedMinMPI, nBytesExchange/elapsedMinMPI/1e9); + else + printf("\n"); + } else { + printf("\nused config: local\n"); + } + } + fflush(stdout); + } + return gs; } oogs_t* oogs::setup(dlong N, hlong *ids, int nVec, dlong stride, const char *type, MPI_Comm &comm, @@ -456,29 +614,10 @@ static void unpackBuf(oogs_t *gs, } } -void reallocBuffers(int unit_size, oogs_t *gs) -{ - ogs_t *ogs = gs->ogs; - struct gs_data *hgs = (gs_data*) ogs->haloGshSym; - const void* execdata = hgs->r.data; - const struct pw_data *pwd = (pw_data*) execdata; - - if (gs->o_bufSend.size() < pwd->comm[send].total*unit_size) { - if(gs->o_bufSend.size()) gs->o_bufSend.free(); - if(gs->h_buffSend.size()) gs->h_buffSend.free(); - gs->bufSend = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[send].total*unit_size, NULL, gs->o_bufSend, gs->h_buffSend); - } - if (gs->o_bufRecv.size() < pwd->comm[recv].total*unit_size) { - if(gs->o_bufRecv.size()) gs->o_bufRecv.free(); - if(gs->h_buffRecv.size()) gs->h_buffRecv.free(); - gs->bufRecv = (unsigned char*) ogsHostMallocPinned(ogs->device, pwd->comm[recv].total*unit_size, NULL, gs->o_bufRecv, gs->h_buffRecv); - } -} - -void oogs::start(occa::memory &o_v, const int k, const dlong stride, const char *_type, const char *op, oogs_t *gs) +void oogs::start(occa::memory &o_v, const int k, const dlong stride, const char *_type, const char *op, oogs_t *gs) { size_t Nbytes; - ogs_t *ogs = gs->ogs; + ogs_t *ogs = gs->ogs; const char* type = (!strcmp(_type,"floatCommHalf")) ? "float" : _type; if (!strcmp(_type, "floatCommHalf")) Nbytes = sizeof(float)/2; @@ -497,23 +636,23 @@ void oogs::start(occa::memory &o_v, const int k, const dlong stride, const char exit(-1); } - if(gs->mode == OOGS_DEFAULT) { + if(gs->mode == OOGS_DEFAULT) { if(k>1) ogsGatherScatterManyStart(o_v, k, stride, type, op, ogs); else ogsGatherScatterStart(o_v, type, op, ogs); - + return; } - if (ogs->NhaloGather) { + if (ogs->NhaloGather && gs->mode != OOGS_LOCAL) { reallocBuffers(Nbytes*k, gs); - packBuf(gs, ogs->NhaloGather, k, stride, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, + packBuf(gs, ogs->NhaloGather, k, stride, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, gs->o_scatterOffsets, gs->o_scatterIds, _type, op, o_v, gs->o_bufSend); if(gs->earlyPrepostRecv) { - unsigned char *buf = (unsigned char*)gs->o_bufRecv.ptr(); + unsigned char *buf = (unsigned char*)gs->o_bufRecv.ptr(); if(gs->mode != OOGS_DEVICEMPI) buf = (unsigned char *)gs->bufRecv; struct gs_data *hgs = (gs_data*) ogs->haloGshSym; @@ -535,10 +674,10 @@ void oogs::start(occa::memory &o_v, const int k, const dlong stride, const char } } -void oogs::finish(occa::memory &o_v, const int k, const dlong stride, const char *_type, const char *op, oogs_t *gs) +void oogs::finish(occa::memory &o_v, const int k, const dlong stride, const char *_type, const char *op, oogs_t *gs) { size_t Nbytes; - ogs_t *ogs = gs->ogs; + ogs_t *ogs = gs->ogs; const char* type = (!strcmp(_type,"floatCommHalf")) ? "float" : _type; if (!strcmp(_type, "floatCommHalf")) Nbytes = sizeof(float)/2; @@ -553,20 +692,22 @@ void oogs::finish(occa::memory &o_v, const int k, const dlong stride, const char exit(-1); } - if(gs->mode == OOGS_DEFAULT) { + if(gs->mode == OOGS_DEFAULT) { if(k>1) ogsGatherScatterManyFinish(o_v, k, stride, type, op, ogs); else ogsGatherScatterFinish(o_v, type, op, ogs); - + return; } - if(ogs->NlocalGather) - occaGatherScatterMany(ogs->NlocalGather, k, stride, ogs->o_localGatherOffsets, - ogs->o_localGatherIds, type, op, o_v); + if(ogs->NlocalGather) + occaGatherScatterLocal(ogs->NlocalGather, ogs->NrowBlocks, ogs->o_blockRowStarts, + ogs->o_localGatherOffsets, ogs->o_localGatherIds, + k, stride, type, op, o_v); - if (ogs->NhaloGather) { + if (ogs->NhaloGather && gs->mode != OOGS_LOCAL) { + if(!OGS_OVERLAP) ogs->device.finish(); ogs->device.setStream(ogs::dataStream); struct gs_data *hgs = (gs_data*) ogs->haloGshSym; @@ -586,7 +727,7 @@ void oogs::finish(occa::memory &o_v, const int k, const dlong stride, const char if(gs->mode == OOGS_HOSTMPI) gs->o_bufRecv.copyFrom(gs->bufRecv,pwd->comm[recv].total*Nbytes*k, 0, "async: true"); - unpackBuf(gs, ogs->NhaloGather, k, stride, gs->o_gatherOffsets, gs->o_gatherIds, + unpackBuf(gs, ogs->NhaloGather, k, stride, gs->o_gatherOffsets, gs->o_gatherIds, ogs->o_haloGatherOffsets, ogs->o_haloGatherIds, _type, op, gs->o_bufRecv, o_v); ogs->device.finish(); @@ -631,6 +772,6 @@ void oogs::destroy(oogs_t *gs) gs->packBufFloatToHalfAddKernel.free(); gs->unpackBufHalfToFloatAddKernel.free(); - + free(gs); } diff --git a/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h b/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h index 269613f4d..bc9302b53 100644 --- a/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h +++ b/3rd_party/hypre/src/IJ_mv/HYPRE_IJ_mv.h @@ -43,6 +43,8 @@ struct hypre_IJMatrix_struct; **/ typedef struct hypre_IJMatrix_struct *HYPRE_IJMatrix; +#pragma GCC visibility push(default) + /** * Create a matrix object. Each process owns some unique consecutive * range of rows, indicated by the global row indices {\tt ilower} and @@ -327,6 +329,8 @@ HYPRE_Int HYPRE_IJMatrixRead(const char *filename, HYPRE_Int HYPRE_IJMatrixPrint(HYPRE_IJMatrix matrix, const char *filename); +#pragma GCC visibility pop + /*@}*/ /*-------------------------------------------------------------------------- @@ -343,6 +347,8 @@ struct hypre_IJVector_struct; **/ typedef struct hypre_IJVector_struct *HYPRE_IJVector; +#pragma GCC visibility push(default) + /** * Create a vector object. Each process owns some unique consecutive * range of vector unknowns, indicated by the global indices {\tt @@ -490,6 +496,8 @@ HYPRE_Int HYPRE_IJVectorRead(const char *filename, HYPRE_Int HYPRE_IJVectorPrint(HYPRE_IJVector vector, const char *filename); +#pragma GCC visibility pop + /*@}*/ /*@}*/ diff --git a/3rd_party/hypre/src/examples/CMakeLists.txt b/3rd_party/hypre/src/examples/CMakeLists.txt new file mode 100644 index 000000000..505d55d0e --- /dev/null +++ b/3rd_party/hypre/src/examples/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +set(EXAMPLE_SRCS + ex1.c + ex2.c + ex3.c + ex4.c + ex5.c + ex6.c + ex7.c + ex8.c + ex9.c + ex11.c + ex12.c + ex13.c + ex14.c + ex15.c + ex16.c + ex17.c + ex18.c +) + +if (HYPRE_BIGINT) + list(APPEND EXAMPLE_SRCS + ex5big.c + ex15big.c + ) +endif() + +if (HYPRE_COMPLEX) + list(APPEND EXAMPLE_SRCS + ex18comp.c + ) +endif() + +add_hypre_executables(EXAMPLE_SRCS) + diff --git a/3rd_party/hypre/src/examples/Makefile b/3rd_party/hypre/src/examples/Makefile new file mode 100644 index 000000000..beab033c6 --- /dev/null +++ b/3rd_party/hypre/src/examples/Makefile @@ -0,0 +1,235 @@ +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +######################################################################## +# Compiler and external dependences +######################################################################## +CC = mpicc +F77 = mpif77 +CXX = mpicxx +F90 = mpifort +HYPRE_DIR = ../hypre + +######################################################################## +# Compiling and linking options +######################################################################## +COPTS = -g -Wall +CINCLUDES = -I$(HYPRE_DIR)/include +CDEFS = -DHAVE_CONFIG_H -DHYPRE_TIMING +CFLAGS = $(COPTS) $(CINCLUDES) $(CDEFS) +FOPTS = -g +FINCLUDES = $(CINCLUDES) +FFLAGS = $(FOPTS) $(FINCLUDES) +CXXOPTS = $(COPTS) -Wno-deprecated +CXXINCLUDES = $(CINCLUDES) -I.. +CXXDEFS = $(CDEFS) +IFLAGS_BXX = +CXXFLAGS = $(CXXOPTS) $(CXXINCLUDES) $(CXXDEFS) $(IFLAGS_BXX) +IF90FLAGS = +F90FLAGS = $(FFLAGS) $(IF90FLAGS) + + +LINKOPTS = $(COPTS) +LIBS = -L$(HYPRE_DIR)/lib -lHYPRE -lm +LFLAGS = $(LINKOPTS) $(LIBS) -lstdc++ +LFLAGS_B =\ + -L${HYPRE_DIR}/lib\ + -lbHYPREClient-C\ + -lbHYPREClient-CX\ + -lbHYPREClient-F\ + -lbHYPRE\ + -lsidl -ldl -lxml2 +LFLAGS77 = $(LFLAGS) +LFLAGS90 = + +######################################################################## +# Rules for compiling the source files +######################################################################## +.SUFFIXES: .c .f .cxx .f90 + +.c.o: + $(CC) $(CFLAGS) -c $< +.f.o: + $(F77) $(FFLAGS) -c $< +.cxx.o: + $(CXX) $(CXXFLAGS) -c $< + +######################################################################## +# List of all programs to be compiled +######################################################################## +ALLPROGS = ex1 ex2 ex3 ex4 ex5 ex5f ex6 ex7 ex8 ex9 ex11 ex12 ex12f \ + ex13 ex14 ex15 ex16 +BIGINTPROGS = ex5big ex15big +FORTRANPROGS = ex5f ex12f +MAXDIMPROGS = ex17 ex18 +COMPLEXPROGS = ex18comp + +all: $(ALLPROGS) + +default: all + +bigint: $(BIGINTPROGS) + +fortran: $(FORTRANPROGS) + +maxdim: $(MAXDIMPROGS) + +complex: $(COMPLEXPROGS) + +######################################################################## +# Example 1 +######################################################################## +ex1: ex1.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 2 +######################################################################## +ex2: ex2.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 3 +######################################################################## +ex3: ex3.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 4 +######################################################################## +ex4: ex4.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 5 +######################################################################## +ex5: ex5.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 5 with 64-bit integers +######################################################################## +ex5big: ex5big.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 5 Fortran 77 +######################################################################## +ex5f: ex5f.o + $(F77) -o $@ $^ $(LFLAGS77) + +######################################################################## +# Example 6 +######################################################################## +ex6: ex6.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 7 +######################################################################## +ex7: ex7.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 8 +######################################################################## +ex8: ex8.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 9 +######################################################################## +ex9: ex9.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 10 +######################################################################## +ex10: ex10.o + $(CXX) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 11 +######################################################################## +ex11: ex11.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 12 +######################################################################## +ex12: ex12.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 12 Fortran 77 +######################################################################## +ex12f: ex12f.o + $(F77) -o $@ $^ $(LFLAGS77) + +######################################################################## +# Example 13 +######################################################################## +ex13: ex13.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 14 +######################################################################## +ex14: ex14.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 15 +######################################################################## +ex15: ex15.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 15 with 64-bit integers +######################################################################## +ex15big: ex15big.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 16 +######################################################################## +ex16: ex16.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 17 +######################################################################## +ex17: ex17.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 18 +######################################################################## +ex18: ex18.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Example 18 (complex) +######################################################################## +ex18comp: ex18comp.o + $(CC) -o $@ $^ $(LFLAGS) + +######################################################################## +# Clean up +######################################################################## +clean: + rm -f $(ALLPROGS:=.o) + rm -f $(BIGINTPROGS:=.o) + rm -f $(FORTRANPROGS:=.o) + rm -f $(MAXDIMPROGS:=.o) + rm -f $(COMPLEXPROGS:=.o) + cd vis; make clean +distclean: clean + rm -f $(ALLPROGS) $(ALLPROGS:=*~) + rm -f $(BIGINTPROGS) $(BIGINTPROGS:=*~) + rm -f $(FORTRANLPROGS) $(FORTRANPROGS:=*~) + rm -f $(MAXDIMPROGS) $(MAXDIMPROGS:=*~) + rm -f $(COMPLEXPROGS) $(COMPLEXPROGS:=*~) + rm -fr README* diff --git a/3rd_party/hypre/src/examples/docs/2d-bi.htm b/3rd_party/hypre/src/examples/docs/2d-bi.htm new file mode 100644 index 000000000..715324cd9 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/2d-bi.htm @@ -0,0 +1,13 @@ +Example Codes +

2D Biharmonic Equation Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/2d-con-reac-diff.htm b/3rd_party/hypre/src/examples/docs/2d-con-reac-diff.htm new file mode 100644 index 000000000..65dba4c19 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/2d-con-reac-diff.htm @@ -0,0 +1,14 @@ +Example Codes +

2D Convection-Reaction-Diffusion Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/2d-laplace-eig.htm b/3rd_party/hypre/src/examples/docs/2d-laplace-eig.htm new file mode 100644 index 000000000..2ebdac118 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/2d-laplace-eig.htm @@ -0,0 +1,13 @@ +Example Codes +

2D Laplace Eigenproblem Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/2d-laplace.htm b/3rd_party/hypre/src/examples/docs/2d-laplace.htm new file mode 100644 index 000000000..a80c9b2ac --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/2d-laplace.htm @@ -0,0 +1,21 @@ +Example Codes +

2D Laplace Example Codes

+ +
+ + + + + + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/3d-maxwell.htm b/3rd_party/hypre/src/examples/docs/3d-maxwell.htm new file mode 100644 index 000000000..c46a855ad --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/3d-maxwell.htm @@ -0,0 +1,13 @@ +Example Codes +

3D Definite Maxwell Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/5pt.htm b/3rd_party/hypre/src/examples/docs/5pt.htm new file mode 100644 index 000000000..f4b2e22ed --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/5pt.htm @@ -0,0 +1,21 @@ +Example Codes +

5-Point Stencil Example Codes

+ +
+ + + + + + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/9pt.htm b/3rd_party/hypre/src/examples/docs/9pt.htm new file mode 100644 index 000000000..6531b2468 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/9pt.htm @@ -0,0 +1,13 @@ +Example Codes +

9-Point Stencil Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/Makefile b/3rd_party/hypre/src/examples/docs/Makefile new file mode 100644 index 000000000..573ec0cc5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/Makefile @@ -0,0 +1,15 @@ +######################################################################## +# hypre examples documentation +######################################################################## + +EXAMPLES_DIR = .. + +all: + @./build_doc.sh $(EXAMPLES_DIR) + +clean: + rm -f *.html + +distclean: clean + rm -f *~ + rm -rf ../*.html ../README_files diff --git a/3rd_party/hypre/src/examples/docs/README b/3rd_party/hypre/src/examples/docs/README new file mode 100644 index 000000000..5e7c766a7 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/README @@ -0,0 +1,46 @@ +1. Requirements. + +- The code should be well documented with references to the + user manual. + +- The user should be able to download and build the examples + independently of hypre (i.e. assuming only that hypre was + installed in some directory $HYPRE_DIR). In particular, no + assumptions for the locations of the examples or hypre + directories should be made. + +- The examples should mimic an application code, so a user + can pick one of them and use it as a starting template. + + +2. Adding a new example code. + +- Use the following naming convention: ex.c + +- Edit the Makefile. + +- Test if everything builds and runs correctly. + +- Documentation: create ex.htm in the docs directory, and include + it in the appropriate Interface, Equation, Discretization or Solver + documentation files. Add a short description in index.htm. Then do + "make" to build the documentation in the README_files directory and + create/update the README.html in the examples directory. + + NOTE: To clarify, the docs directory is NOT distributed + to users. To test what is in the distribution, do "make" + in the docs directory. This updates the examples/README_files + directory (which IS distributed) and the file README.html + which is created in the *examples* directory (from docs/index.htm + - which is why the links in index.htm do not - and should not - + work in the docs directory) + +3. Maintaining, updating and testing. + +- Changes in hypre interface should be reflected in the examples. + +- The outputs of some example runs should be kept as benchmarks. + If changes in hypre or the example itself result in different + output, this should be recorded. + +- The examples are part of hypre's regression testing. diff --git a/3rd_party/hypre/src/examples/docs/amg.htm b/3rd_party/hypre/src/examples/docs/amg.htm new file mode 100644 index 000000000..f72ad6f6c --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/amg.htm @@ -0,0 +1,18 @@ +Example Codes +

BoomerAMG Example Codes

+ +
+ + + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/build_doc.sh b/3rd_party/hypre/src/examples/docs/build_doc.sh new file mode 100755 index 000000000..43f8ed796 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/build_doc.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +examplesdir="$1" +currentdir=`pwd` + +# Create the README_files directory +if [ ! -d $examplesdir/README_files ]; then + mkdir $examplesdir/README_files +fi + +# Syntax highlighting +cd $examplesdir/README_files +for target in `ls ../*.c`; do + $currentdir/code2html.perl -l c -n -o html $target $target.html + mv $target.html . +done +for target in `ls ../*.f*`; do + $currentdir/code2html.perl -l f -n -o html $target $target.html + mv $target.html . +done +for target in `ls ../*.cxx`; do + $currentdir/code2html.perl -l c++ -n -o html $target $target.html + mv $target.html . +done +cd $currentdir + +# Copy the example files +for file in `ls ex*.htm`; do + cp -fp "$file" "$file"l +done + +# Replace the server side includes +for file in `ls *.htm`; do + $currentdir/replace-ssi.perl "$file" > $examplesdir/README_files/"$file"l +done + +# Copy images +cp -fp *.gif $examplesdir/README_files + +# Remove the html example files +rm -f ex*.html + +# Rename index.html +mv $examplesdir/README_files/index.html $examplesdir/README.html diff --git a/3rd_party/hypre/src/examples/docs/c++.htm b/3rd_party/hypre/src/examples/docs/c++.htm new file mode 100644 index 000000000..10edd4a75 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/c++.htm @@ -0,0 +1,13 @@ +Example Codes +

C++ Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/c.htm b/3rd_party/hypre/src/examples/docs/c.htm new file mode 100644 index 000000000..c4bcdb845 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/c.htm @@ -0,0 +1,26 @@ +Example Codes +

C Example Codes

+ +
+ + + + + + + + + + + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/code2html.perl b/3rd_party/hypre/src/examples/docs/code2html.perl new file mode 100755 index 000000000..093d18f08 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/code2html.perl @@ -0,0 +1,3876 @@ +#!/usr/bin/perl -w +my $vernr = "0.9.1"; +my $monthshort = "Jan"; +my $monthlong = "Jan"; +my $year = "2002"; +######################################################################## +# # +# Code2HTML # +# --------- # +# # +# Code2Html, peter@palfrader.org # +# # +# $Date$ +# $Revision$ +# $Id$ +# # +# AUTHOR # +# Peter Palfrader. Written in 1999, 2000, 2001, 2002. # +# A lot of other people. See CREDITS file. # +# # +# DESCRIPTION # +# code2html is a perlscript which converts a program # +# source code to syntax highlighted HTML by applying a set # +# of regular expressions depending on the language # +# the source code is written. # +# # +# see the man-page for details, # +# # +######################################################################## + +use strict; +use Getopt::Long; + +my $FILES_DISALLOWED_IN_CGI = 1; +# you may set this to false to allow file reading from your hd in +# cgi mode. This may be not good if your httpd runs as 'root' (yes, I've +# seen this!) and so any user could with some knowledge easily read +# your /etc/shadow for example! +my $FILES_REDIRECT_DISALLOWED = 1; +my $LANG_TEST_LENGTH = 1024; + + +# PP: I think Compress::Zlib could be nice for this. but it's not very widespread :( +# PP: A hash would be nicer but then it would not possible to get the keys in this very order (AFAIK) +# PP: If names contain meta characters, then those must be metaquoted (if you don't want the meta chars to be meta chars of course) +my @CGI_ENCODING = ( + ['bzip2' , '/usr/bin/bzip2' , '--stdout' ], + ['gzip' , '/bin/gzip' , '--stdout' ], + ['compress' , '/usr/bin/compress' , '-c' ] + ); + + + +# undefine the input record separator so everything gets loaded in one turn +undef $/; + + + +my $pure_version_message = "Code2Html, version $vernr, $monthshort $year, peter\@palfrader.org"; +my $version_message = "$pure_version_message\n"; + +my $short_short_help = "Try `code2html --help' for more information.\n"; +my $short_help = +"$pure_version_message +Usage: code2html [options] [input_file [output_file]] + +Convert a program source to syntax highlighted HTML, +or any other format for wich rules are defined. + +-l, --language-mode set language mode + --fallback LANG fallback language mode +-v, --verbose prints progress information to STDER +-n, --linenumbers print out the source code with line numbers +-P, --prefix optional prefix to use for linenumber anchors +-N, --linknumbers linenumbers will link to themselves +-t, --replace-tabs[=TABSTOP-WIDTH] + replace with spaces +-L, --language-file=LANGUAGE-FILE + specify an alternate file for definitions +-m, --modes print all available modes +-h, --help print this message +-V, --version print version +-c, --content-type prints a Content-Type header +-o, --output-format selects the output-format +-H, --no-header don't use the template + --template=FILE override template +-T, --title set title + +-w, --linewidth max characters per line +-b, --linebreakprefix prefix of the new lines + +see the man-page code2html for further help +"; + + + + + +my $USE_CGI_FOR_ERRORS = 0; # is switched on in parse params if necessary +$SIG{'__DIE__'} = + sub { + if ($USE_CGI_FOR_ERRORS) { print "Content-Type: text/plain\n\n", $0, ': ', $_[0], "\n"; } + else { print STDERR $0, ': ', $_[0]; }; + exit 1; + }; + +$SIG{'__WARN__'} = + sub { + unless ($USE_CGI_FOR_ERRORS) { print STDERR $0.': '.$_[0]; }; + }; + + + + + + + + + +my $DEFAULT_OUTPUTFORMAT='html'; +my $DEFAULT_OUTPUTFORMAT_IN_CGI='html'; +my $ENTITIES; +my %ENTITIES; + + +my %params = &parse_params; +if ($params{'what_to_do'} eq 'patch_html') { &patch_html(\%params) } +elsif ($params{'what_to_do'} eq 'normal' ) { &main(\%params) } +else { die("I don't know what to do :(\n") }; + + + + + + + + + + +sub main + { + my %params = %{shift()}; + + + print STDERR "getting patterns...\n" if ($params{'verbose'}); + # building up the database + # newer entries overwrite old ones + my @CONFIG_FILES; + push @CONFIG_FILES, "/etc/code2html.config"; + push @CONFIG_FILES, $ENV{'HOME'}."/.code2html.config" if (defined($ENV{'HOME'})); + push @CONFIG_FILES, split(/:/,$ENV{'CODE2HTML_CONFIG'}) if ($ENV{'CODE2HTML_CONFIG'}); + push @CONFIG_FILES, split(/:/,$params{'langfile'}) if defined($params{'langfile'}); + + my %STYLESHEET = %{ &get_default_stylesheet } ; + my %LANGUAGE = %{ &get_default_database } ; + + for (@CONFIG_FILES) { + if ( -r $_){ + # if I use `do $_` instead of scalar eval... %LANGUAGE is not exported and imported correctly (read: at all) (PP) + unless (scalar eval `cat $_`) { + warn "couldn't parse $_: $@" if $@; + }; + }; + }; + + + + + if (defined($params{'modes'}) && $params{'modes'}) + { + print "Defined modes: "; + print join( ', ', sort keys %LANGUAGE ), ".\n" ; + print "Defined outputformats: "; + print join( ', ', sort keys %STYLESHEET ), ".\n" ; + exit; + }; + + + + + + # set outputformat + die "Outputformat $params{'outputformat'} not defined" unless defined $STYLESHEET{$params{'outputformat'}}; + my %STYLE = % { $STYLESHEET{$params{'outputformat'}} }; + + # load alternate template if given + if (($params{'template'} ne "") && ( ! $params{'noheader'} )) { + open (FILE, $params{'template'}) || die ("Could not open template file $params{'template'}: $!"); + $STYLE{'template'} = ; + close (FILE); + }; + + # set up the global ENTITIES variables ( the scalar and the hash ) from the STYLE definition + $ENTITIES = $ { $STYLE{'entities'} }{'listofchars'}; + %ENTITIES = % { $ { $STYLE{'entities'} }{'replace_by' } }; + + # modify the header and footer so that the template variables are set correcly + unless ($STYLE{'template'} =~ /^(.*)%%code%%(.*)$/s) { + die "template does not contain a %%code%% variable"; + }; + $STYLE{'header'} = $1; + $STYLE{'footer'} = $2; + $STYLE{'header'} =~ s/%%title%%/$params{'title'}/g; + $STYLE{'footer'} =~ s/%%title%%/$params{'title'}/g; + $STYLE{'header'} =~ s/%%version%%/$vernr/g; + $STYLE{'footer'} =~ s/%%version%%/$vernr/g; + + + + # load the input file and set params{'langmode'} if it is not already. this is done by probing a + # set of rules defined in %LANGUAGE + my $code_ref; + print STDERR "loading input file...\n" if ($params{'verbose'}); + $code_ref = &get_input_file(\%params, \%LANGUAGE, $params{'langmode'}, $params{'alt_langmode'}); + + # select the rules for out language. + my $language_rules_ref = $LANGUAGE{ lc($params{'langmode'}) }->{'patterns'}; + + print STDERR "applying stylesheet...\n" if ($params{'verbose'}); + # Apply the Stylesheets + # set 'starttag' and 'endtag' for every rule according to its 'style' value + # the tags are defined in the stylesheet + &apply_stylesheets_to_rules( $language_rules_ref, \%STYLE ); + + print STDERR "outputting headers...\n" if ($params{'verbose'}); + &put_headers(\%params, \%STYLE); + + my $snippetlist_ref = [] ; + print STDERR "creating snippet-list...\n" if $params{'verbose'}; + &create_snippetlist( $language_rules_ref, $$code_ref, $snippetlist_ref, \%STYLE); + + print STDERR "outputting file...\n" if $params{'verbose'}; + return &put_output(\%params, $snippetlist_ref, \%STYLE); +} + + + + + +sub patch_html + { + my %params = %{shift()}; + my $code; + + open(FILEHANDLE, $params{'infile'}) || die("While opening '$params{'infile'}' for input: ".$!."\n"); + $code = ; + close(FILEHANDLE); + + $code =~ s/.*?//gs; + my $counter=0; + my @chunks = split ( /()/s , $code); + + $code = ''; + for (@chunks) + { + $code .= $_; + if ($_ =~ //s) + { + my $cmdline = $1; + my $input = $2; + $cmdline =~ s/^[ \t]*//g; + $cmdline =~ s/[ \t]*$//g; + @ARGV = split ( / / , $cmdline); + my %new_params = &parse_params; + + + $new_params{'input'} = $input if ($new_params{'infile'} eq "-"); + + + undef $new_params{'outfile'}; + ++$counter; + $new_params{'line_number_prefix'} = $counter unless (defined $new_params{'line_number_prefix'}); + + $new_params{'verbose'} = $params{'verbose'}; + + my $no_header = $new_params{'noheader'}; + $new_params{'noheader'} = 1; + $new_params{'dont_print_output'} = 1; + + if ($no_header) + { + $code .= ''.. + &main(\%new_params). + ''; + } + else + { + $code .= '
'.
+			  &main(\%new_params).
+			    '
'; + }; + }; + }; + + + open(FILEHANDLE, '>'.$params{'outfile'}) || die("While opening '$params{'outfile'}' for output: ".$!."\n"); + print FILEHANDLE $code; + close(FILEHANDLE); + }; + + + + + + +##################################################################### +################### get_input_data ################################## +##################################################################### +# Reads the input data for the cgi script. +# in : nothing +# out: a hash with the input data +sub get_input_data + { + my $input_data; + my %f; + if($ENV{'REQUEST_METHOD'} eq 'GET') { $input_data = $ENV{'QUERY_STRING'}; } + else { read(STDIN, $input_data, $ENV{'CONTENT_LENGTH'}); }; + + + if ($ENV{'CONTENT_TYPE'} =~ m/^multipart\/form-data; boundary=(.*)$/i) + { + my $boundary = quotemeta($1); + my @blocks = split(/$boundary/, $input_data); + + for (@blocks) + { + if (my $dummy = m/name="(.*?)"/i) + { + my $name = $1; + $_ =~ s/\r\n/\n/g; + m/\n\n(.*)\n/s; + my $value = $1; + $f{$name}=$value; + }; + }; + } + elsif ($ENV{'CONTENT_TYPE'} =~ m/^multipart\/form-data;$/i) # if the boundary is not in the enviroment variable we'll guess + { + my $dummy = $input_data =~ m/^(.*?)(\n|\r)/; + my $boundary = $1; + + my @blocks = split(/$boundary/, $input_data); + + for (@blocks) + { + if (my $dummy = m/name="(.*?)"/i) + { + my $name = $1; + $_ =~ s/\r\n/\n/g; + m/\n\n(.*)\n/s; + my $value = $1; + $f{$name}=$value; + }; + }; + } + else + { + my @form_fields = split(/&/, $input_data); + + for (@form_fields) + { + my ($name, $value) = split(/=/, $_); + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; + + $f{$name} = $value; + } + }; + + return %f; + }; + +################################################################################ +####################### parse_params ########################################### +################################################################################ +sub parse_params + { + my %RESULT; + + if (defined($ENV{'GATEWAY_INTERFACE'}) && (!scalar(@ARGV))) # if there is a CGI enviroment and no parameters/options given + { + $USE_CGI_FOR_ERRORS = 1; + $RESULT{'content-type'} = 1; + $RESULT{'what_to_do'} = 'normal'; + + my %input = &get_input_data; + + $input{'input-selector'} = $input{'input_selector'} unless (defined $input{'input-selector'}); + $input{'no-encoding'} = $input{'no_encoding'} unless (defined $input{'no-encoding'}); + $input{'line-numbers'} = $input{'line_numbers'} unless (defined $input{'line-numbers'}); + $input{'replace-tabs'} = $input{'replace_tabs'} unless (defined $input{'replace-tabs'}); + $input{'language-mode'} = $input{'language_mode'} unless (defined $input{'language-mode'}); + $input{'cgi-input1'} = $input{'cgi_input1'} unless (defined $input{'cgi-input1'}); + $input{'cgi-input2'} = $input{'cgi_input2'} unless (defined $input{'cgi-input2'}); + + if ($input{'input-selector'} =~ /^cgi[-_]input[12]$/ ) + { + my $input_selector = $input{'input-selector'}; + die("CGI parse error: $input_selector does not exist!") unless (defined $input{$input_selector}); + $RESULT{'input'} = $input{$input_selector}; + $RESULT{'title'} = 'code2html result of cgi input form'; + } + elsif ($input{'input-selector'} eq "file") + { + die('CGI parse error: option not supported due to security reasons!') if ($FILES_DISALLOWED_IN_CGI); + die('CGI parse error: filename not defined!') unless (defined $input{'filename'}); + $RESULT{'infile'} = $input{'filename'}; + $RESULT{'title'} = $RESULT{'infile'}; + } + elsif ($input{'input-selector'} eq "REDIRECT_URL") + { + die('CGI parse error: option not supported due to security reasons!') if ($FILES_REDIRECT_DISALLOWED); + die('CGI parse error: ENV: REDIRECT_URL not defined!') unless (defined $ENV{'REDIRECT_URL'}); + $RESULT{'infile'} = $ENV{'DOCUMENT_ROOT'}.$ENV{'REDIRECT_URL'}; + $RESULT{'title'} = $RESULT{'infile'}; + } + else + { + die('CGI parse error: input selector not given!'); + }; + + if ((!defined ($input{'no-encoding'})) || $input{'no-encoding'}) + { + for (@CGI_ENCODING) + { + if ( ($ENV{'HTTP_ACCEPT_ENCODING'} =~ m/\b $_->[0] \b/x) && # PP: if supported by the browser + (-x $_->[1]) ) # PP: and executable by the script + { + $RESULT{'encoding'} = $_->[0]; + $RESULT{'encoder' } = $_->[1] .' '. $_->[2]; + last; + }; + } + }; + + $RESULT{'linenumbers'} = 'none'; + if ($input{'line-numbers'} eq "yes") { $RESULT{'linenumbers'} = 'normal'; }; + if ($input{'line-numbers'} eq "link") { $RESULT{'linenumbers'} = 'linked'; }; + if (defined($input{'replace_tabs'})) { $RESULT{'replacetabs'} = $input{'replace-tabs'} }; + if (defined($input{'fallback'})) { $RESULT{'alt_langmode'} = $input{'fallback'} }; + if (defined($input{'language_mode'})) { $RESULT{'langmode'} = $input{'language-mode'} }; + if (defined($input{'title'})) { $RESULT{'title'} = $input{'title'} }; + + $RESULT{'content_type'} = 1; + $RESULT{'outputformat'} = $DEFAULT_OUTPUTFORMAT_IN_CGI; + $RESULT{'outfile'} = '-'; + } + else + { + my $verbose = 0; + my $linenumbers = 0; + my $linknumbers = 0; + my $replace_tabs = 0; + my $language_file = ''; + my $language_mode = ''; + my $modes = 0; + my $fallback = ''; + my $help = 0; + my $version = 0; + my $content_type = 0; + my $no_header = 0; + my $outputformat = $DEFAULT_OUTPUTFORMAT; + my $template = ''; + my $title = "__NOTHING__$$"; # some magix ;( + my $prefix = undef; + my $linewidth = undef; + my $linebreakprefix = undef; + my $linebreakprefixdefault = '» '; + + my $patch_html; + + + # Get Options does not like - as a parameters (used for STDIN and STDOUT) + # So we're using a stupid magix again + @ARGV = map { $_ eq '-' ? "__STD__$$" : $_ } @ARGV; + + Getopt::Long::config('bundling'); + unless ( GetOptions( + "--verbose" , \$verbose , + "-v" , \$verbose , + + "--linenumbers" , \$linenumbers , + "-n" , \$linenumbers , + + "--linknumbers" , \$linknumbers , + "-N" , \$linknumbers , + + "--prefix=s" , \$prefix , + "-P=s" , \$prefix , + + "--replace-tabs=i" , \$replace_tabs , + "--replace_tabs=i" , \$replace_tabs , + "-t=i" , \$replace_tabs , + + "--language-file=s" , \$language_file , + "--language_file=s" , \$language_file , + "-L=s" , \$language_file , + + "--language-mode=s" , \$language_mode , + "--language_mode=s" , \$language_mode , + "-l=s" , \$language_mode , + + "--title=s" , \$title , + "-T=s" , \$title , + + "--modes" , \$modes , + "-m" , \$modes , + + "--fallback=s" , \$fallback , + + "--output=s" , \$outputformat , + "-o=s" , \$outputformat , + + "--template=s" , \$template , + + "--help" , \$help , + "-h" , \$help , + + "--version" , \$version , + "-V" , \$version , + + "--content-type" , \$content_type , + "--content_type" , \$content_type , + "-c" , \$content_type , + + "--no-header" , \$no_header , + "--no_header" , \$no_header , + "-H" , \$no_header , + + + "--patch-html" , \$patch_html , + "--patch_html" , \$patch_html , + "-p" , \$patch_html , + + "--linewidth=i" , \$linewidth , + "-w=i" , \$linewidth , + "--linebreakprefix=s" , \$linebreakprefix , + "-b=s" , \$linebreakprefix , + ) + ) + { + print STDERR $short_short_help; + exit 1; + } + + #reversing magix + @ARGV = map { $_ eq "__STD__$$" ? '-' : $_ } @ARGV; + + if ($help) { print STDERR $short_help; exit 0; }; + if ($version) { print $version_message; exit 0; }; + + if ($patch_html) + { + $RESULT{'what_to_do'} = 'patch_html'; + $RESULT{'verbose'} = $verbose; + + if (!defined ($RESULT{'infile'} = shift(@ARGV))) { $RESULT{'infile'} = '-' }; + if (!defined ($RESULT{'outfile'} = shift(@ARGV))) { $RESULT{'outfile'} = $RESULT{'infile'}}; + if (defined (shift(@ARGV))) { print STDERR "too many parameters!\n"; + print STDERR $short_help; + exit 1; + }; + } + else + { + $RESULT{'what_to_do'} = 'normal'; + + $RESULT{'verbose'} = $verbose; + if ($linknumbers) { $RESULT{'linenumbers'} = 'linked' } + elsif ($linenumbers) { $RESULT{'linenumbers'} = 'normal' } + else { $RESULT{'linenumbers'} = 'none' }; + $RESULT{'line_number_prefix'} = $prefix; + $RESULT{'replacetabs'} = $replace_tabs; + $RESULT{'langfile'} = $language_file; + $RESULT{'modes'} = $modes; + $RESULT{'alt_langmode'} = $fallback; + $RESULT{'content_type'} = $content_type; + $RESULT{'noheader'} = $no_header; + $RESULT{'langmode'} = $language_mode; + $RESULT{'template'} = $template; + $RESULT{'outputformat'} = $outputformat; + $RESULT{'linewidth'} = $linewidth; + $RESULT{'linebreakprefix'}= $linebreakprefix; + + if (defined ($RESULT{'linebreakprefix'}) && + !defined ($RESULT{'linewidth'})) { + printf (STDERR "--linebreakprefix|-b does not make sense without --linewidth|-w!\n"); + print STDERR $short_help; + exit 1; + } + if (defined ($RESULT{'linewidth'})) { + if ($RESULT{'linewidth'} <= 0) { + printf (STDERR "linewidth must be greater then 0!\n"); + print STDERR $short_help; + exit 1; + } + if (!defined ($RESULT{'linebreakprefix'})) { + $RESULT{'linebreakprefix'} = $linebreakprefixdefault; + } + } + + if (!defined ($RESULT{'infile'} = shift(@ARGV))) { $RESULT{'infile'} = '-'}; + if (!defined ($RESULT{'outfile'} = shift(@ARGV))) { $RESULT{'outfile'} = '-'}; + if (defined (shift(@ARGV))) { print STDERR "too many parameters!\n"; + print STDERR $short_help; + exit 1; + }; + }; + #the magix again + $RESULT{'title'} = $title eq "__NOTHING__$$" ? ($RESULT{'infile'} eq '-' ? 'STDIN' : $RESULT{'infile'}) : $title; + }; + + + return %RESULT; + }; + + +################################################################################ +####################### checkTabulator ######################################### +################################################################################ +sub checkTabulator +{ + my ($line, $TABSTOP) = @_; + + while ((my $at = index($line, "\t")) != -1) + { + my $cnt = ($TABSTOP - ($at % $TABSTOP)); + my $replace_with = ' ' x $cnt if ($cnt); + $line =~ s/\t/$replace_with/; + }; + + return $line; +} + +################################################################################ +####################### splitLine ############################################## +################################################################################ +sub splitLine +{ + my ($line, $linewidth, $prefix) = @_; + + my $length = length ($line); + my $pos = 0; + + while ($length - $pos > $linewidth) + { + my $maxoff = ($pos + $linewidth > $length) ? ($length - 1) + : ($pos + $linewidth); + my $newpos = rindex ($line, " ", $maxoff); + if ($newpos > $pos) { + $pos = $newpos; + $line = substr ($line, 0, $pos)."\0$prefix".substr ($line, $pos + 1, $length); + } else { + $pos = $pos + $linewidth + 1; + $line = substr ($line, 0, $pos)."\0$prefix".substr ($line, $pos, $length); + } + }; + + return $line; +} + +################################################################################ +####################### get_input_file ######################################### +################################################################################ +sub get_input_file + { + + # in : \%params + # in : \%LANGUAGE; + # in/out : $langmode; + # in/out : $alt_langmode; + # returns: input file + + my %PARAMS = %{$_[0]}; + my %LANGUAGE = %{$_[1]}; + my $langmode = $_[2]; + my $alt_langmode = $_[3]; + my $code; + + + if (defined $PARAMS{'input'}) + { + $code = $PARAMS{'input'}; + $code =~ s/\r//g; + } + else + { + open(FILEHANDLE, $PARAMS{'infile'}) || die("While opening '$PARAMS{'infile'}' for input: ".$!."\n"); + $code = ; + close(FILEHANDLE); + }; + + if ($PARAMS{'replacetabs'} != 0) + { + $code = join ( + "\n", + map{ + &checkTabulator($_, $PARAMS{'replacetabs'}) + } + my @dummy = split(/\n/, $code) + ); + }; + + + + if (defined ($PARAMS{'linewidth'})) + { + $code = join ( + "\n", + map{ + &splitLine($_, $PARAMS{'linewidth'}, + $PARAMS{'linebreakprefix'}) + } + my @dummy = split(/\n/, $code) + ); + }; + + + + if ((!defined($langmode)) || ($langmode eq '')) + { + my $test_code = substr($code, 0, $LANG_TEST_LENGTH); + warn("language mode not given. guessing...\n"); + + $langmode = ''; + + for (keys %LANGUAGE) + { + if ( (($LANGUAGE{$_}->{'filename'} ne '') && ($PARAMS{'infile'} =~ m/$LANGUAGE{$_}->{filename}/)) || + (($LANGUAGE{$_}->{'regex'} ne '') && ($test_code =~ m/$LANGUAGE{$_}->{regex}/ )) ) + { + $langmode = $_; + last; + }; + }; + + if ($langmode eq '') + { + if ((defined($alt_langmode)) && ($alt_langmode ne '')) + { + warn("Guessing language mode failed. Using fallback mode: '$alt_langmode'\n"); + $langmode = $alt_langmode; + $alt_langmode = ''; + } + else + { + die("Guessing language mode failed.\n") + }; + } + else + { + warn("using '$langmode'\n"); + }; + }; + + $_[2] = $langmode; + $_[3] = $alt_langmode; + return \$code; + }; + + +################################################################################ +####################### put_headers ############################################ +################################################################################ +sub put_headers +{ + my %PARAMS = %{shift()}; + my $STYLE_REF = shift(); + + if (defined($PARAMS{'outfile'})) + { + unless ($PARAMS{'outfile'} eq '-'){ + open(SAVEOUT, ">&STDOUT"); print SAVEOUT ''; # so perl does not typo warn + open (STDOUT, '>'.$PARAMS{'outfile'}) || die("While redirecting STDOUT to '$PARAMS{'outfile'}' for output: ".$!."\n"); + }; + + if (defined $PARAMS{'encoding'}) + { + $|=1; # PP: so the header is written before the data! + # PP: this took me hours of debugging :( + print "Content-Type: $$STYLE_REF{'content-type'}\n" if ($PARAMS{'content_type'}); + print "Content-Encoding: $PARAMS{'encoding'}\n\n"; + open (FILEHANDLE, "|$PARAMS{'encoder'}") || die("While opening '$PARAMS{'encoder'}': ".$!."\n"); + } + else + { + open( FILEHANDLE, ">&STDOUT" ) ; + print FILEHANDLE "Content-Type: $$STYLE_REF{'content-type'}\n\n" if ($PARAMS{'content_type'}); + }; + + print FILEHANDLE $$STYLE_REF{'header'} unless $PARAMS{'noheader'}; + } +}; + +################################################################################ +####################### apply_stylesheets_to_rules ############################# +################################################################################ +sub apply_stylesheets_to_rules + { + my ( $regexps_ref, $style_ref ) = @_; + + for ( @$regexps_ref ) { +# warn ("Style '".$_->{style}."' not defined in stylesheet.\n") unless defined $ { $$style_ref{'tags'} } { $_->{style} }; + if (defined ($ { $$style_ref{'tags'} } { $_->{style} })) { + $_->{'starttag'} = $ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'start' }; + $_->{'endtag'} = $ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'stop' }; + } else { + # no style no formating; if style == '' formating is done by childregex + warn ("Style '".$_->{style}."' not defined in stylesheet.\n") if ($_->{style} ne ''); + $_->{'starttag'} = ''; #$ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'start' }; + $_->{'endtag'} = ''; #$ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'stop' }; + } + apply_stylesheets_to_rules( $_->{childregex}, $style_ref ) if $_->{childregex}; + }; + }; + +################################################################################ +####################### create_snippetlist ##################################### +################################################################################ +sub create_snippetlist + { + my ( $regexps_ref, $code, $snippetlist_ref, $style_ref ) = @_ ; + my $length = length( $code ); + + ## An array of regular expression sturctures, each of which is an + ## array. @res is kept sorted by starting position of the RExen and + ## then by the position of the regex in the language file. This allows + ## us to just evaluate $res[0], and to hand write fast code that typically + ## handles 90% of the cases without resorting to the _big_ guns. + ## + ## FWIW, I pronounce '@res' REEZE, as in the plural of '$re'. + ## + my @res ; + + my $pos ; + + for ( @$regexps_ref ) { + pos( $code ) = 0 ; +#++$m ; + next unless $code =~ m/($_->{regex})/gms ; + + $pos = pos( $code ) ; +# $res[@res] = [ +# $_->{regex}, +# $ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'start' }, +# $ { $ { $$style_ref{'tags'} } { $_->{style} } } { 'stop' }, +# $_->{childregex}, +# $pos - length( $1 ), +# $pos, +# scalar( @res ), +# ] ; + $res[@res] = [ + $_->{regex}, + $_->{starttag}, + $_->{endtag}, + $_->{childregex}, + $pos - length( $1 ), + $pos, + scalar( @res ), + ] ; + } + + ## 90% of all child regexes end up with 0 or 1 regex that needs to be + ## worried about. Trimming out the 0's speeds things up a bit and + ## makes the below loop simpler, since there's always at least + ## 1 regexp. It donsn't speed things up much by itself: the percentage + ## of times this fires is really small. But it does simplify the loop + ## below and speed it up. + unless ( @res ) { + $code =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $code ; + return ; + } + + @res = sort { $a->[4] <=> $b->[4] || $a->[6] <=> $b->[6] } @res ; + + ## Add a dummy at the end, which makes the logic below simpler / faster. + $res[@res] = [ + undef, + undef, + undef, + undef, + $length, + $length, + scalar( @res ), + ] ; + + ## These are declared here for (minor) speed improvement. + my $re ; + my $match_spos ; + my $match_pos ; + my $re_spos ; + my $re_pos ; + my $re_num ; + my $prefix ; + my $snippet ; + my $rest ; + my $i ; + my $l ; + +my @changed_res ; +my $j ; + + $pos = 0 ; +MAIN: + while ( $pos < $length ) { + $re = $res[0] ; + + $match_spos = $re->[4] ; + $match_pos = $re->[5] ; + + if ( $match_spos > $pos ) { + $prefix = substr( $code, $pos, $match_spos - $pos ) ; + $prefix =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $prefix ; + } + + if ( $match_pos > $match_spos ) { + $snippet = substr( $code, $match_spos, $match_pos - $match_spos ) ; + if ( @{$re->[3]} ) { + push @$snippetlist_ref, $re->[1] ; + create_snippetlist( $re->[3], $snippet, $snippetlist_ref, $style_ref ) ; + push @$snippetlist_ref, $re->[2] ; + } + else { + $snippet =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $re->[1], $snippet, $re->[2]; + } + } + + $pos = $match_pos ; + + ## + ## Hand coded optimizations. Luckily, the cases that arise most often + ## are the easiest to tune. + ## + +# =pod + + if ( $res[1]->[4] >= $pos ) { + ## Only first regex needs to be moved, 2nd and later are still valid. + ## This is often 90% of the cases for Perl or C (others not tested, + ## just uncomment the $n, $o, and $p lines and try it yourself). +#++$n{1} ; +#++$m ; + pos( $code ) = $pos ; + unless ( $code =~ m/($re->[0])/gms ) { +#++$o{'0'} ; + if ( @res == 2 ) { + ## If the only regexp left is the dummy, we're done. + $rest = substr( $code, $pos ) ; + $rest =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $rest ; + last ; + } + shift @res ; + } + else { + $re->[5] = $re_pos = pos( $code ) ; + $re->[4] = $re_spos = $re_pos - length( $1 ) ; + + ## Walk down the array looking for $re's new home. + ## The first few loop iterations are unrolled and done manually + ## for speed, which handles 85 to 90% of the cases where only + ## $re needs to be moved. + ## + ## Here's where that dummy regexp at the end of the array comes + ## in handy: we don't need to worry about array size here, since + ## it will always be after $re no matter what. The unrolled + ## loop stuff is outdented to make the conditionals fit on one + ## 80 char line. + ## Element 4 in @{$res[x]} is the start position of the match. + ## Element 6 is the order in which it was declared in the lang file. + $re_num = $re->[6] ; + if ( ( $re_spos <=> $res[1]->[4] || $re_num <=> $res[1]->[6] ) <= 0 ) { +#++$o{'1'} ; + next + } + $res[0] = $res[1] ; + +#++$o{'2'} ; + if ( ( $re_spos <=> $res[2]->[4] || $re_num <=> $res[2]->[6] ) <= 0 ) { + $res[1] = $re ; + next ; + } + $res[1] = $res[2] ; + + if ( ( $re_spos <=> $res[3]->[4] || $re_num <=> $res[3]->[6] ) <= 0 ) { +#++$o{'3'} ; + $res[2] = $re ; + next ; + } + $res[2] = $res[3] ; + + if ( ( $re_spos <=> $res[4]->[4] || $re_num <=> $res[4]->[6] ) <= 0 ) { +#++$o{'3'} ; + $res[3] = $re ; + next ; + } + $res[3] = $res[4] ; + + if ( ( $re_spos <=> $res[5]->[4] || $re_num <=> $res[5]->[6] ) <= 0 ) { +#++$o{'4'} ; + $res[4] = $re ; + next ; + } + $res[4] = $res[5] ; + +#++$o{'ugh'} ; + $i = 6 ; + $l = $#res ; + for ( ; $i < $l ; ++$i ) { + last + if ( + ( $re_spos <=> $res[$i]->[4] || $re_num <=> $res[$i]->[6] ) + <= 0 + ) ; + $res[$i-1] = $res[$i] ; + } +#++$p{sprintf( "%2d", $i )} ; + $res[$i-1] = $re ; + } + + next ; + } + +# =cut + + ## + ## End optimizations. You can comment them all out and this net + ## does all the work, just more slowly. If you do that, then + ## you also need to comment out the code below that deals with + ## the second entry in @res. + ## + +#my $ni = 0 ; + ## First re always needs to be tweaked +#++$m ; +#++$ni ; + pos( $code ) = $pos ; + unless ( $code =~ m/($re->[0])/gms ) { + if ( @res == 2 ) { + ## If the only regexp left is the dummy, we're done. + $rest = substr( $code, $pos ) ; + $rest =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $rest ; + last ; + } + shift @res ; + @changed_res = () ; + $i = 0 ; + } + else { + $re->[5] = $re_pos = pos( $code ) ; + $re->[4] = $re_pos - length( $1 ) ; + @changed_res = ( $re ) ; + $i = 1 ; + } + + ## If the optimizations above are in, the second one always + ## needs to be tweaked, too. + $re = $res[$i] ; +#++$m ; +#++$ni ; + pos( $code ) = $pos ; + unless ( $code =~ m/($re->[0])/gms ) { + if ( @res == 2 ) { + ## If the only regexp left is the dummy, we're done. + $rest = substr( $code, $pos ) ; + $rest =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $rest ; + last ; + } + shift @res ; + } + else { + $re->[5] = $re_pos = pos( $code ) ; + $re->[4] = $re_spos = $re_pos - length( $1 ) ; + if ( @changed_res && + ( $changed_res[0]->[4] <=> $re_spos || + $changed_res[0]->[6] <=> $re->[6] + ) > 0 + ) { + unshift @changed_res, $re ; + } + else { + $changed_res[$i] = $re ; + } + ++$i ; + } + + for ( ; ; ++$i ) { + local $_ = $res[$i] ; +#++$m ; + last if $_->[4] >= $pos ; +#++$ni ; +#++$m ; + pos( $code ) = $pos ; + unless ( $code =~ m/($_->[0])/gms ) { + if ( @res <= 2 ) { + $rest = substr( $code, $pos ) ; + $rest =~ s/($ENTITIES)/$ENTITIES{$1}/ge ; + push @$snippetlist_ref, $rest ; + last MAIN ; + } + ## If this regex is no longer needed, remove it by not pushing it + ## on to @changed_res. This means we need one less slot in @res. + shift @res ; + redo ; + } + + $_->[5] = $re_pos = pos( $code ) ; + $_->[4] = $re_spos = $re_pos - length( $1 ) ; + + ## Insertion sort in to @changed_res + $re_num = $_->[6] ; + for ( $j = $#changed_res ; $j > -1 ; --$j ) { + last + if ( + ( $changed_res[$j]->[4] <=> $re_spos || + $changed_res[$j]->[6] <=> $re_num + ) < 0 + ) ; + $changed_res[$j+1] = $changed_res[$j] ; + } + $changed_res[$j+1] = $_ ; + } + + ## Merge sort @changed_res and @res in to @res + $j = 0 ; + $l = $#res ; + for ( @changed_res ) { + while ( + $i < $l && + ( $_->[4] <=> $res[$i]->[4] || $_->[6] <=> $res[$i]->[6] ) > 0 + ) { + $res[$j++] = $res[$i++] ; + } + $res[$j++] = $_ ; + } +# =cut + } +}; + +################################################################################## +######################### create_snippetlist ##################################### +################################################################################## +##sub create_snippetlist +## { +## my ( $regexps_ref, $code, $snippetlist_ref ) = @_ ; + +## my $length = length( $code ); +## my @regexps; +## $regexps[scalar(@$regexps_ref)] = undef; + +## my $head_ptr = undef; +## my $current_ptr; +## my $help_ptr; + +## my $index = 0; + +## for (@$regexps_ref) +## { +## $current_ptr = $regexps[$index]; #0: start_ptr 1: length 2: next_ptr, 3: regex, 4:start, 5:end, 6: child 7: index +## $current_ptr->[7] = $index++; +## $current_ptr->[6] = $$_{'childregex'}; +## $current_ptr->[5] = $$_{'endtag'}; +## $current_ptr->[4] = $$_{'starttag'}; +## $current_ptr->[3] = $$_{'regex'}; + + +## pos( $code ) = 0; +## if ( $code =~ /($current_ptr->[3])/gms ) { $current_ptr->[0] = pos ($code) - length($1); $current_ptr->[1] = length($1); } else {next}; + +## if (!defined ($head_ptr) || $current_ptr->[0] < $head_ptr->[0] ) +## { +## $current_ptr->[2] = $head_ptr; +## $head_ptr = $current_ptr; +## } +## else +## { +## $help_ptr = $head_ptr; +## $help_ptr = $help_ptr->[2] +## while (defined ( $help_ptr->[2] ) && ($current_ptr->[0] >= $help_ptr->[2]->[0]) ); #iow: while (defined help->next && current->pos <= help->next->pos) + +## $current_ptr->[2] = $help_ptr->[2]; +## $help_ptr->[2] = $current_ptr; +## }; +## }; + + +## my $endpos = 0; +## my $oldhead; + +## my %entities ; +## $entities{'&'} = '&' ; +## $entities{'<'} = '<' ; +## $entities{'>'} = '>' ; +## $entities{'"'} = '"' ; + +## my $snippet; +## while (defined $head_ptr) +## { +## if ($head_ptr->[0] - $endpos > 0) { +## $snippet = substr($code, $endpos, $head_ptr->[0] - $endpos); +## $snippet =~ s/($ENTITIES)/$ENTITIES{$1}/ge; #"]); +## push @$snippetlist_ref, $snippet; +## }; +## push @$snippetlist_ref, $head_ptr->[4]; + +## &create_snippetlist( $head_ptr->[6], substr($code, $head_ptr->[0], $head_ptr->[1]) , $snippetlist_ref); +## push @$snippetlist_ref, $head_ptr->[5]; + +## $endpos = $head_ptr->[0] + $head_ptr->[1]; + +## # update & repair list : + +## $oldhead = $head_ptr; +## # 1) shift now invalid matches from list + +## $help_ptr = $head_ptr; +## $help_ptr = $help_ptr->[2] +## while (defined ( $help_ptr->[2] ) && ($endpos > $help_ptr->[2]->[0]) ); +## $head_ptr = $help_ptr->[2]; +## $help_ptr->[2] = undef; + +## # 2) rematch invalid matches and insert them into the list + +## while (defined $oldhead) +## { +## $current_ptr = $oldhead; +## $oldhead = $oldhead->[2]; + +## pos( $code ) = $endpos; +## if ( $code =~ /($current_ptr->[3])/gms ) { $current_ptr->[0] = pos ($code) - length($1); $current_ptr->[1] = length($1); } else {next}; +## if (!defined ($head_ptr) || +## ($current_ptr->[0] < $head_ptr->[0]) || +## ( +## ( $current_ptr->[0] == $head_ptr->[0]) && +## ( $current_ptr->[7] < $head_ptr->[7]) +## ) +## ) +## { +## $current_ptr->[2] = $head_ptr; +## $head_ptr = $current_ptr; +## } +## else +## { +## $help_ptr = $head_ptr; +## $help_ptr = $help_ptr->[2] +## while (defined ( $help_ptr->[2] ) && +## ( +## ($current_ptr->[0] > $help_ptr->[2]->[0]) || +## ( +## ( $current_ptr->[0] == $help_ptr->[2]->[0]) && +## ( $current_ptr->[7] > $help_ptr->[2]->[7]) +## ) +## ) +## ); #iow: while (defined help->next && current->pos <= help->next->pos) # if two patterns match at the same pos +## # the one that was declared earlier is taken + +## $current_ptr->[2] = $help_ptr->[2]; +## $help_ptr->[2] = $current_ptr; +## }; +## }; + +## # 3) done +## }; + +## $snippet = substr($code, $endpos); $snippet =~ s/($ENTITIES)/$ENTITIES{$1}/ge; #" ]); +## push @$snippetlist_ref, $snippet; +## }; + + + +################################################################################ +####################### put_output ############################################# +################################################################################ +sub put_output { + my ( $params, $snippetlist_ref, $STYLE_REF ) = @_ ; + + my $result; + + my $prefix = ''; + $prefix = $params->{'line_number_prefix'}.'_' if defined $params->{'line_number_prefix'}; + $result = & { $ { $$STYLE_REF{'linenumbers'} }{$params->{'linenumbers'}} } (join ('', @$snippetlist_ref), $prefix); + + if (defined ($params{'linewidth'})) { + $result =~ tr=\0=\n=; + } + + print FILEHANDLE $result unless (defined $params->{'dont_print_output'} && $params->{'dont_print_output'}); + print FILEHANDLE $$STYLE_REF{'footer'} unless $params->{'noheader'}; + + if (defined($params->{'outfile'})) { + unless ($params->{'outfile'} eq '-'){ + close (FILEHANDLE); + close (STDOUT); + open (STDOUT, ">&SAVEOUT"); + }; + }; + return $result; +}; + + + + +################################################################################ +####################### get_default_stylesheet ################################# +################################################################################ +sub get_default_stylesheet +{ + +my %STYLESHEET; + + +########## +########## different color modes for html. +# those are named html-dark, html-nobc and html-light. +# html-light is also named html +# the only difference between html-light and html-nobc is +# that html-light defines a body background and text color. +# nobc stands for no body colors. + +$STYLESHEET{'html-light'} = { 'template' => +' + + %%title%% + + +download the original source code. +
+%%code%%
+
+
+syntax highlighted by Code2HTML, v. %%version%% + + +', + 'content-type' => 'text/html', + 'entities' => { 'listofchars' => '[<>&"]', # a regex actually + 'replace_by' => { + '&' => '&', + '<' => '<', + '>' => '>', + '"' => '"' + } + }, + 'linenumbers' => { + 'none' => sub { + return $_[0]; + }, + 'normal' => sub { + # o as the first parameter is the joined snippetlist + # o the second is an optional prefix, needed if more than one block + # in a file is highlighted. needed in patch-mode. may be empty + # the sub should the return a scalar made up of the joined lines including linenumbers + my @lines = split ( /\n/, $_[0] ); + + my $nr = 0; + my $lengthofnr = length(@lines); + my $format = qq{%${lengthofnr}u %s\n} ; + join ('', map ( {$nr++; sprintf ( $format , $nr, $nr, $_ )} @lines)); + }, + 'linked' => sub { + # this should do the same as above only with linenumbers that link to themselves + # If this style does not support this, use the same as above. + my @lines = split ( /\n/, $_[0] ); + + my $nr = 0; + my $lengthofnr = length(@lines); + my $format = qq{%$ {lengthofnr}u %s\n}; + join ('', map ( {$nr++; sprintf ( $format , $nr, $nr, $nr, $_ )} @lines)); + } + }, + 'tags' => { + 'comment' => { 'start' => '', + 'stop' => '' }, + 'doc comment' => { 'start' => '', + 'stop' => '' }, + 'string' => { 'start' => '', + 'stop' => '' }, + 'esc string' => { 'start' => '', + 'stop' => '' }, + 'character' => { 'start' => '', + 'stop' => '' }, + 'esc character' => { 'start' => '', + 'stop' => '' }, + 'numeric' => { 'start' => '', + 'stop' => '' }, + + 'identifier' => { 'start' => '', + 'stop' => '' }, + 'predefined identifier' => { 'start' => '', + 'stop' => '' }, + + 'type' => { 'start' => '', + 'stop' => '' }, + 'predefined type' => { 'start' => '', + 'stop' => '' }, + + 'reserved word' => { 'start' => '', + 'stop' => '' }, + 'library function' => { 'start' => '', + 'stop' => '' }, + + 'include' => { 'start' => '', + 'stop' => '' }, + 'preprocessor' => { 'start' => '', + 'stop' => '' }, + + 'braces' => { 'start' => '', + 'stop' => '' }, + 'symbol' => { 'start' => '', + 'stop' => '' }, + + 'function header' => { 'start' => '', + 'stop' => '' }, + 'function header name' => { 'start' => '', + 'stop' => '' }, + 'function header args' => { 'start' => '', + 'stop' => '' }, + + 'regex' => { 'start' => '', + 'stop' => '' }, + + 'text' => { 'start' => '', + 'stop' => ''}, + + # HTML + 'entity' => { 'start' => '', + 'stop' => '' }, + + # MAKEFILE + 'assignment' => { 'start' => '', + 'stop' => '' }, + 'dependency line' => { 'start' => '', + 'stop' => '' }, + 'dependency target' => { 'start' => '', + 'stop' => '' }, + 'dependency continuation'=> { 'start' => '', + 'stop' => '' }, + 'continuation' => { 'start' => '', + 'stop' => '' }, + 'macro' => { 'start' => '', + 'stop' => '' }, + 'int macro' => { 'start' => '', + 'stop' => '' }, + 'esc $$$' => { 'start' => '', + 'stop' => '' }, + + # PATCH + 'separator' => { 'start' => '', + 'stop' => '' }, + 'line spec' => { 'start' => '', + 'stop' => '' }, + 'deletion' => { 'start' => '', + 'stop' => '' }, + 'insertion' => { 'start' => '', + 'stop' => '' } + + } + }; +# html-light is also called html + +$STYLESHEET{'html'} = $STYLESHEET{'html-light'}; + + +# html-nobc is a modification of html-light +# in such a way, that the body tag does not define +# a background and a text color +# nobc stands for no body colors. + +%{$STYLESHEET{'html-nobg'}} = %{$STYLESHEET{'html-light'}}; +${ $STYLESHEET{'html-nobg'}} {'template'} = ' + + %%title%% + + +
+%%code%%
+
+
+syntax highlighted by Code2HTML, v. %%version%% + + +'; + + +# html-dark is a modification of html-light +# in such a way, that the body tag does define +# different colors and that the colors are different. + +%{$STYLESHEET{'html-dark'}} = %{$STYLESHEET{'html-light'}}; +${ $STYLESHEET{'html-dark'}} {'template'} = ' + + %%title%% + + +
+%%code%%
+
+
+syntax highlighted by Code2HTML, v. %%version%% + + +'; +${ $STYLESHEET{'html-dark'}} {'tags'} = { + 'comment' => { 'start' => '', + 'stop' => '' }, + 'doc comment' => { 'start' => '', + 'stop' => '' }, + 'string' => { 'start' => '', + 'stop' => '' }, + 'esc string' => { 'start' => '', + 'stop' => '' }, + 'character' => { 'start' => '', + 'stop' => '' }, + 'esc character' => { 'start' => '', + 'stop' => '' }, + 'numeric' => { 'start' => '', + 'stop' => '' }, + + 'identifier' => { 'start' => '', + 'stop' => '' }, + 'predefined identifier' => { 'start' => '', + 'stop' => '' }, + + 'type' => { 'start' => '', + 'stop' => '' }, + 'predefined type' => { 'start' => '', + 'stop' => '' }, + + 'reserved word' => { 'start' => '', + 'stop' => '' }, + 'library function' => { 'start' => '', + 'stop' => '' }, + + 'include' => { 'start' => '', + 'stop' => '' }, + 'preprocessor' => { 'start' => '', + 'stop' => '' }, + + 'braces' => { 'start' => '', + 'stop' => '' }, + 'symbol' => { 'start' => '', + 'stop' => '' }, + + 'function header' => { 'start' => '', + 'stop' => '' }, + 'function header name' => { 'start' => '', + 'stop' => '' }, + 'function header args' => { 'start' => '', + 'stop' => '' }, + + 'regex' => { 'start' => '', + 'stop' => '' }, + + 'text' => { 'start' => '', + 'stop' => ''}, + + # HTML + 'entity' => { 'start' => '', + 'stop' => '' }, + + # MAKEFILE + 'assignment' => { 'start' => '', + 'stop' => '' }, + 'dependency line' => { 'start' => '', + 'stop' => '' }, + 'dependency target' => { 'start' => '', + 'stop' => '' }, + 'dependency continuation'=> { 'start' => '', + 'stop' => '' }, + 'continuation' => { 'start' => '', + 'stop' => '' }, + 'macro' => { 'start' => '', + 'stop' => '' }, + 'int macro' => { 'start' => '', + 'stop' => '' }, + 'esc $$$' => { 'start' => '', + 'stop' => '' }, + + # PATCH + 'separator' => { 'start' => '', + 'stop' => '' }, + 'line spec' => { 'start' => '', + 'stop' => '' }, + 'deletion' => { 'start' => '', + 'stop' => '' }, + 'insertion' => { 'start' => '', + 'stop' => '' } + }; + +##### +# +# nocolor +# +%{$STYLESHEET{'html-nocolor'}} = %{$STYLESHEET{'html-nobg'}}; +${ $STYLESHEET{'html-nocolor'}} {'tags'} = { + 'comment' => { + 'start' => '', + 'stop' => '' + }, + 'doc comment' => { + 'start' => '', + 'stop' => '' + }, + 'string' => { + 'start' => '', + 'stop' => '' + }, + 'esc string' => { + 'start' => '', + 'stop' => '' + }, + 'character' => { + 'start' => '', + 'stop' => '' + }, + 'esc character' => { + 'start' => '', + 'stop' => '' + }, + 'numeric' => { + 'start' => '', + 'stop' => '' + }, + 'identifier' => { + 'start' => '', + 'stop' => '' + }, + 'predefined identifier' => { + 'start' => '', + 'stop' => '' + }, + 'type' => { + 'start' => '', + 'stop' => '' + }, + 'predefined type' => { + 'start' => '', + 'stop' => '' + }, + 'reserved word' => { + 'start' => '', + 'stop' => '' + }, + 'library function' => { + 'start' => '', + 'stop' => '' + }, + 'include' => { + 'start' => '', + 'stop' => '' + }, + 'preprocessor' => { + 'start' => '', + 'stop' => '' + }, + 'braces' => { + 'start' => '', + 'stop' => '' + }, + 'symbol' => { + 'start' => '', + 'stop' => '' + }, + 'function header' => { + 'start' => '', + 'stop' => '' + }, + 'function header name' => { + 'start' => '', + 'stop' => '' + }, + 'function header args' => { + 'start' => '', + 'stop' => '' + }, + 'regex' => { + 'start' => '', + 'stop' => '' + }, + 'text' => { + 'start' => '', + 'stop' => '' + }, + # HTML + 'entity' => { + 'start' => '', + 'stop' => '' + }, + # MAKEFILE + 'assignment' => { + 'start' => '', + 'stop' => '' + }, + 'dependency line' => { + 'start' => '', + 'stop' => '' + }, + 'dependency target' => { + 'start' => '', + 'stop' => '' + }, + 'dependency continuation' => { + 'start' => '', + 'stop' => '' + }, + 'continuation' => { + 'start' => '', + 'stop' => '' + }, + 'macro' => { + 'start' => '', + 'stop' => '' + }, + 'int macro' => { + 'start' => '', + 'stop' => '' + }, + 'esc $$$' => { + 'start' => '', + 'stop' => '' + }, + # PATCH + 'separator' => { + 'start' => '', + 'stop' => '' }, + 'line spec' => { + 'start' => '', + 'stop' => '' + }, + 'deletion' => { + 'start' => '', + 'stop' => '' + }, + 'insertion' => { + 'start' => '', + 'stop' => '' + } +}; + + + +##### +# +# simple +# +%{$STYLESHEET{'html-simple'}} = %{$STYLESHEET{'html-nocolor'}}; +${ $STYLESHEET{'html-simple'}} {'template'} = ' + + %%title%% + + + +

%%title%%

+
+%%code%%
+    
+ '; + + + + +# Vincent Sanders +# html-fntlck is a modification of html-light +# in such a way, that the body tag does define +# different colors and that the colors are different. +#it is supposed to be the colours i get from emacs default font-lock mode + +%{$STYLESHEET{'html-fntlck'}} = %{$STYLESHEET{'html-light'}}; +${ $STYLESHEET{'html-fntlck'}} {'template'} = ' + + %%title%% + + +
+%%code%%
+
+
+syntax highlighted by Code2HTML, v. %%version%% + + +'; +${ $STYLESHEET{'html-fntlck'}} {'tags'} = { + 'comment' => { 'start' => '', + 'stop' => '' }, + 'doc comment' => { 'start' => '', + 'stop' => '' }, + 'string' => { 'start' => '', + 'stop' => '' }, + 'esc string' => { 'start' => '', + 'stop' => '' }, + 'character' => { 'start' => '', + 'stop' => '' }, + 'esc character' => { 'start' => '', + 'stop' => '' }, + 'numeric' => { 'start' => '', + 'stop' => '' }, + + 'identifier' => { 'start' => '', + 'stop' => '' }, + 'predefined identifier' => { 'start' => '', + 'stop' => '' }, + + 'type' => { 'start' => '', + 'stop' => '' }, + 'predefined type' => { 'start' => '', + 'stop' => '' }, + + 'reserved word' => { 'start' => '', + 'stop' => '' }, + 'library function' => { 'start' => '', + 'stop' => '' }, + + 'include' => { 'start' => '', + 'stop' => '' }, + 'preprocessor' => { 'start' => '', + 'stop' => '' }, + + 'braces' => { 'start' => '', + 'stop' => '' }, + 'symbol' => { 'start' => '', + 'stop' => '' }, + + 'function header' => { 'start' => '', + 'stop' => '' }, + 'function header name' => { 'start' => '', + 'stop' => '' }, + 'function header args' => { 'start' => '', + 'stop' => '' }, + + 'regex' => { 'start' => '', + 'stop' => '' }, + + 'text' => { 'start' => '', + 'stop' => ''}, + + # HTML + 'entity' => { 'start' => '', + 'stop' => '' }, + + # MAKEFILE + 'assignment' => { 'start' => '', + 'stop' => '' }, + 'dependency line' => { 'start' => '', + 'stop' => '' }, + 'dependency target' => { 'start' => '', + 'stop' => '' }, + 'dependency continuation'=> { 'start' => '', + 'stop' => '' }, + 'continuation' => { 'start' => '', + 'stop' => '' }, + 'macro' => { 'start' => '', + 'stop' => '' }, + 'int macro' => { 'start' => '', + 'stop' => '' }, + 'esc $$$' => { 'start' => '', + 'stop' => '' }, + + # PATCH + 'separator' => { 'start' => '', + 'stop' => '' }, + 'line spec' => { 'start' => '', + 'stop' => '' }, + 'deletion' => { 'start' => '', + 'stop' => '' }, + 'insertion' => { 'start' => '', + 'stop' => '' } + + }; + + +return \%STYLESHEET; + +}; + + + +################################################################################ +####################### get_default_database ################################### +################################################################################ +sub get_default_database +{ + +my %LANGUAGE; + +# written by PP +$LANGUAGE{'plain'} = { + 'filename' => '', + 'regex' => '', + 'patterns' => [] + }; + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'ada'} = { + 'filename' => '(?i)\\.a(d[asb]?)?$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'Comments', + 'regex' => '--.*?$', + 'style' => 'comment', + 'childregex' => [], + }, + { + 'name' => 'String Literals', + 'regex' => '".*?("|$)', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'Character Literals', + 'regex' => '\'.\'', + 'style' => 'character', + 'childregex' => [] + }, + { + 'name' => 'Ada Attributes', + 'regex' => '\'[a-zA-Z][a-zA-Z_]+\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'Numeric Literals', + 'regex' => '(((2|8|10|16)#[_0-9a-fA-F]*#)|[0-9.]+)', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'Withs Pragmas Use', + 'regex' => '\\b(?i)((with|pragma|use)[ \\t\\n\\f\\r]+[a-zA-Z0-9_.]+;)+\\b', + 'style' => 'include', + 'childregex' => [] + }, + { + 'name' => 'Predefined Types', + 'regex' => '\\b(?i)(boolean|character|count|duration|float|integer|long_float|long_integer|priority|short_float|short_integer|string)\\b', + 'style' => 'predefined type', + 'childregex' => [] + }, + { + 'name' => 'Predefined Subtypes', + 'regex' => '\\b(?i)field|natural|number_base|positive|priority\\b', + 'style' => 'predefined type', + 'childregex' => [] + }, + { + 'name' => 'Reserved Words', + 'regex' => '\\b(?i)(abort|abs|accept|access|and|array|at|begin|body|case|constant|declare|delay|delta|digits|do|else|elsif|end|entry|exception|exit|for|function|generic|goto|if|in|is|limited|loop|mod|new|not|null|of|or|others|out|package|pragma|private|procedure|raise|range|record|rem|renames|return|reverse|select|separate|subtype|task|terminate|then|type|use|when|while|with|xor)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'Ada 95 Only', + 'regex' => '\\b(?i)(abstract|tagged|all|protected|aliased|requeue|until)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'Identifiers', + 'regex' => '\\b[a-zA-Z][a-zA-Z0-9_]*\\b', + 'style' => 'identifier', + 'childregex' => [] + }, + { + 'name' => 'Dot All', + 'regex' => '(?i)\\.all\\b', + 'style' => 'predefined identifier', + 'childregex' => [] + } + ] + }; +$LANGUAGE{'ada95'} = $LANGUAGE{'ada'}; + + + + + + + + + + + + + + + +# written by JA +$LANGUAGE{'awk'} = { + 'filename' => '(?i)\\.awk$', + 'regex' => '^\\s*#\\s*![^\\s]*awk', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '#.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'string', + 'regex' => '\'\'|\'.*?([^\\\\](\\\\\\\\)*)\'|\'\\\\\\\\\'', +# 'regex' => '\'\'|\'\\\\\\\\\'|\'[^\'\\\\]\'|\'[^\'].*?[^\\\\]\'', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'function header', + 'regex' => 'function[\\t ]+([a-zA-Z0-9_]+)[\\t \\n]*(\\{|\\n)', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'function coloring', + 'regex' => '[\\t ]([a-zA-Z0-9_]+)', + 'style' => 'function header name', + 'childregex' => [] + } + ] + }, + { + 'name' => 'regex matching I 1', + 'regex' => '(\\b| )?(/)(\\\\/|[^/\\n])*(/[gimesox]*)', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'regex matching I 2', + 'regex' => '(?:\\b| )(?:(?:m|q|qq)([!"#$%&\'*+-/]))(\\\\\\2|[^\\2\\n])*(\\2[gimesox]*)', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'regex matching II', + 'regex' => '(?:\\b| )?(?:s([!"#$%&\'*+-/]))(?:\\\\\\2|[^\\2\\n])*?(\\2)[^(\\2)\\n]*?(\\2[gimesox]*)', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'translate', + 'regex' => '(?:\\b| )(?:(?:tr|y)([^\w\s]))(?:\\\\\\2|[^\\2\\n])*?(\\2)[^(\\2)\\n]*?(\\2[gimesox]*)', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'keywords', + 'regex' => '\\b(BEGIN|END|ARGC|ARGIND|ARGV|CONVFMT|ENVIRON|ERRNO|FIELDWIDTHS|FILENAME|FNR|FS|IGNORECASE|NF|NR|OFMT|OFS|ORS|RS|RT|RSTART|RLENGTH|SUBSEP)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keywords 2', + 'regex' => '\\b(if|while|do|for|in|break|continue|delete|exit|next|nextfile|function)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'library fns', + 'regex' => '\\b(close|getline|print|printf|system|fflush|atan2|cos|exp|int|log|rand|sin|sqrt|srand|gensub|gsub|index|length|split|sprintf|sub|substr|tolower|toupper|systime|strftime)\\b', + 'style' => 'library function', + 'childregex' => [] + }, + { + 'name' => 'braces and parens', + 'regex' => '[\\[\\]\\{\\}\\(\\)]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => '<< stuff', + 'regex' => '<<\'([^\\n]*)\';.*?^\\2$', + 'style' => 'text', + 'childregex' => [] + }, + { + 'name' => '<< stuff', + 'regex' => '<<([^\\n]*).*?^\\2$', + 'style' => 'text', + 'childregex' => [] + } + ] + }; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'c'} = { + 'filename' => '\\.[ch]$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'doc comment', + 'regex' => '/\\*\\*.*?\\*/', + 'style' => 'doc comment', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'preprocessor line', + 'regex' => '^[ \\t]*#.*?$', + 'style' => 'preprocessor', + 'childregex' => [ + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => '', + 'regex' => '<.*?>', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '[^/]/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + } + ] + }, + { + 'name' => 'character constant', + 'regex' => '\'(\\\\)?.\'', + 'style' => 'character', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'storage keyword', + 'regex' => '\\b(const|extern|auto|register|static|unsigned|signed|volatile|char|double|float|int|long|short|void|typedef|struct|union|enum)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keyword', + 'regex' => '\\b(return|goto|if|else|case|default|switch|break|continue|while|do|for|sizeof)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces', + 'regex' => '[\\{\\}]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>\\(\\)\\[\\]!])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'c++'} = { + 'filename' => '\\.(c(c|pp|xx)|h(h|pp|xx)|C(C|PP|XX)?|H(H|PP|XX)?|i)$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'doc comment', + 'regex' => '/\\*\\*.*?\\*/', + 'style' => 'doc comment', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|"\\\\\\\\"|".*?([^\\\\](\\\\\\\\)*)"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'preprocessor line', + 'regex' => '^[ \\t]*#.*?$', + 'style' => 'preprocessor', + 'childregex' => [ + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => '', + 'regex' => '<.*?>', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '[^/]/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + } + ] + }, + { + 'name' => 'character constant', + 'regex' => '\'(\\\\)?.\'', + 'style' => 'character', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'storage keyword', + 'regex' => '\\b(class|typename|typeid|template|friend|virtual|inline|explicit|operator|overload|public|private|protected|const|extern|auto|register|static|mutable|unsigned|signed|volatile|char|double|float|int|long|short|bool|wchar_t|void|typedef|struct|union|enum)\\b', + 'style' => 'reserved word', + 'childregex' => [], + }, + { + 'name' => 'keyword', + 'regex' => '\\b(new|delete|this|return|goto|if|else|case|default|switch|break|continue|while|do|for|catch|throw|sizeof|true|false|namespace|using|dynamic_cast|static_cast|reinterpret_cast)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces', + 'regex' => '[\\{\\}]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>\\(\\)\\[\\]!])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }; +$LANGUAGE{'cc'} = $LANGUAGE{'c++'}; +$LANGUAGE{'cpp'} = $LANGUAGE{'c++'}; +$LANGUAGE{'cxx'} = $LANGUAGE{'c++'}; + + + + + + + + + + +# taken from nedit +# modified by tk +$LANGUAGE{'f'} = { + 'filename' => '\\.(f(f|or|77)|F(F|OR|77))$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '^[C|c].*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b([0-9]+(\\.[0-9]*)?([DEde][-+]?[0-9]*)?|\\.[0-9]+([DEde][-+]?[0-9]*)?)\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'storage keyword', + 'regex' => '\\b(BYTE|[Bb]yte|CHARACTER|[Cc]haracter|COMPLEX|[Cc]omplex|DOUBLE *COMPLEX|[Dd]ouble *[Cc]omplex|DOUBLE *PRECISION|[Dd]ouble *[Pp]recision|DOUBLE|[Dd]ouble|INTEGER|[Ii]nteger|REAL|[Rr]eal)(\\*[0-9]+)?\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keyword', + 'regex' => '\\b(ACCEPT|[Aa]ccept|ASSIGN|[Aa]ssign|AUTOMATIC|[Aa]utomatic|BACKSPACE|[Bb]ackspace|BLOCK|[Bb]lock|CALL|[Cc]all|CLOSE|[Cc]lose|COMMON|[Cc]ommon|CONTINUE|[Cc]ontinue|DATA|[Dd]ata|DECODE|[Dd]ecode|DELETE|[Dd]elete|DIMENSION|[Dd]imension|DO|[Dd]o|ELSE|[Ee]lse|ELSEIF|[Ee]lseif|ENCODE|[Ee]ncode|END *FILE|[Ee]nd *[Ff]ile|ENDFILE|[Ee]ndfile|END|[Ee]nd|ENDIF|[Ee]ndif|ENTRY|[Ee]ntry|EQUIVALENCE|[Ee]quivalence|EXIT|[Ee]xit|EXTERNAL|[Ee]xternal|FORMAT|[Ff]ormat|FUNCTION|[Ff]unction|GOTO|[Gg]oto|IF|[Ii]f|IMPLICIT|[Ii]mplicit|INCLUDE|[Ii]nclude|INQUIRE|[Ii]nquire|INTRINSIC|[Ii]ntrinsic|LOGICAL|[Ll]ogical|MAP|[Mm]ap|NONE|[Nn]one|ON|[Oo]n|OPEN|[Oo]pen|PARAMETER|[Pp]arameter|PAUSE|[Pp]ause|POINTER|[Pp]ointer|PRINT|[Pp]rint|PROGRAM|[Pp]rogram|READ|[Rr]ead|RECORD|[Rr]ecord|RETURN|[Rr]eturn|REWIND|[Rr]ewind|SAVE|[Ss]ave|STATIC|[Ss]tatic|STOP|[Ss]top|STRUCTURE|[Ss]tructure|SUBROUTINE|[Ss]ubroutine|SYSTEM|[Ss]ystem|THEN|[Tt]hen|TO|[Tt]o|TYPE|[Tt]ype|UNION|[Uu]nion|UNLOCK|[Uu]nlock|VIRTUAL|[Vv]irtual|VOLATILE|[Vv]olatile|WHILE|[Ww]hile|WRITE|[Ww]rite)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>\\(\\)\\[\\]!])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }; + + + + + + + + + +# written by VRS +$LANGUAGE{'gpasm'} = { + 'filename' => '(?i)\\.(asm|inc)$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'args', + 'regex' => '^.*$', + 'style' => 'symbol', + 'childregex' => [ + { + 'name' => 'comment', + 'regex' => ';.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'labels', + 'regex' => '^[A-Za-z_][A-Za-z_0-9]*:?', + 'style' => 'identifier', + 'childregex' => [] + }, + + { + 'name' => 'menonics', + 'regex' => '^[ \t]+[A-Za-z_][A-Za-z_0-9]*', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + } + + + ] + } + ] + }; + + + + + + + + +# written by JA +$LANGUAGE{'groff'} = { + 'filename' => '\\.groff$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '\\\\".*?$', + 'style' => 'comment', + 'childregex' => [] + } + ] + }; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'html'} = { + 'filename' => '(?i)\\.html?$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'entity', + 'regex' => '\\&[-.a-zA-Z0-9#]*;?', + 'style' => 'entity', + 'childregex' => [] + }, + { + 'name' => 'tag', + 'regex' => '<(/|!)?[-.a-zA-Z0-9]*.*?>', + 'style' => 'predefined identifier', + 'childregex' => [ + { + 'name' => 'double quote string', + 'regex' => '".*?"', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'single quote string', + 'regex' => '\'.*?\'', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'brackets', + 'regex' => '[<>]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'attribute', + 'regex' => '[^\'" ]+(?=.)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + } + ] + }; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'java'} = { + 'filename' => '\\.java$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'doc comment', + 'regex' => '/\\*\\*.*?\\*/', + 'style' => 'doc comment', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'single quoted', + 'regex' => '\'\'|\'.*?([^\\\\](\\\\\\\\)*)\'|\'\\\\\\\\\'', +# 'regex' => '\'\'|\'\\\\\\\\\'|\'[^\'\\\\]\'|\'[^\'].*?[^\\\\]\'', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'include', + 'regex' => '\\b(import|package)\\b.*?$', + 'style' => 'include', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\(.|\\n)', + 'style' => 'esc character', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '[^/]/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + } + ] + }, + { + 'name' => 'storage keyword', + 'regex' => '\\b(abstract|boolean|byte|char|class|double|extends|final|float|int|interface|long|native|private|protected|public|short|static|transient|synchronized|void|volatile|implements)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keyword', + 'regex' => '\\b(break|case|catch|continue|default|do|else|false|finally|for|if|instanceof|new|null|return|super|switch|this|throw|throws|true|try|while)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces and parens', + 'regex' => '[\\{\\}\\(\\)\\[\\]]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'Identifiers', + 'regex' => '\\b[a-zA-Z_][a-zA-Z0-9_]*\\b', + 'style' => 'identifier', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>!])', + 'style' => 'symbol', + 'childregex' => [] + } + ] + }; + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'javascript'} = { + 'filename' => '(?i)\\.js$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'events', + 'regex' => '\\b(onAbort|onBlur|onClick|onChange|onDblClick|onDragDrop|onError|onFocus|onKeyDown|onKeyPress|onLoad|onMouseDown|onMouseMove|onMouseOut|onMouseOver|onMouseUp|onMove|onResize|onSelect|onSubmit|onUnload)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces', + 'regex' => '[\\{\\}]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'statements', + 'regex' => '\\b(break|continue|else|for|if|in|new|return|this|typeof|var|while|with)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'function', + 'regex' => 'function[\\t ]+([a-zA-Z0-9_]+)[\\t \\(]+.*?[\\n{]', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'function args', + 'regex' => '\\(.*?\\)', + 'style' => 'function header args', + 'childregex' => [] + }, + { + 'name' => 'function name', + 'regex' => '[\\t ][a-zA-Z0-9_]+', + 'style' => 'function header name', + 'childregex' => [] + } + ] + }, + { + 'name' => 'built in object type', + 'regex' => '\\b(anchor|Applet|Area|Array|button|checkbox|Date|document|elements|FileUpload|form|frame|Function|hidden|history|Image|link|location|Math|navigator|Option|password|Plugin|radio|reset|select|string|submit|text|textarea|window)\\b', + 'style' => 'predefined type', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '".*?("|$)', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'colors', + 'regex' => '(aliceblue|antiquewhite|aqua|aquamarine|azure|beige|bisque|black|blanchedalmond|blue|blueviolet|brown|burlywood|cadetblue|chartreuse|chocolate|coral|cornflowerblue|cornsilk|crimson|cyan|darkblue|darkcyan|darkgoldenrod|darkgray|darkgreen|darkkhaki|darkmagenta|darkolivegreen|darkorange|darkorchid|darkred|darksalmon|darkseagreen|darkslateblue|darkslategray|darkturquoise|darkviolet|deeppink|deepskyblue|dimgray|dodgerblue|firebrick|floralwhite|forestgreen|fuchsia|gainsboro|ghostwhite|gold|goldenrod|gray|green|greenyellow|honeydew|hotpink|indianred|indigo|ivory|khaki|lavender|lavenderblush|lawngreen|lemonchiffon|lightblue|lightcoral|lightcyan|lightgoldenrodyellow|lightgreen|lightgrey|lightpink|lightsalmon|lightseagreen|lightskyblue|lightslategray|lightsteelblue|lightyellow|lime|limegreen|linen|magenta|#008000|mediumaquamarine|mediumblue|mediumorchid|mediumpurple|mediumseagreen|mediumslateblue|mediumspringgreen|mediumturquoise|mediumvioletred|midnightblue|mintcream|mistyrose|moccasin|navajowhite|navy|oldlace|olive|olivedrab|orange|orangered|orchid|palegoldenrod|palegreen|paleturquoise|palevioletred|papayawhip|peachpuff|peru|pink|plum|powderblue|purple|red|rosybrown|royalblue|saddlebrown|salmon|sandybrown|seagreen|seashell|sienna|silver|skyblue|slateblue|slategray|snow|springgreen|steelblue|tan|teal|thistle|tomato|turquoise|violet|wheat|white|whitesmoke|yellow|yellowgreen|#[A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9])', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => 'string', + 'regex' => '\'.*?(\'|$)', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'colors', + 'regex' => '(aliceblue|antiquewhite|aqua|aquamarine|azure|beige|bisque|black|blanchedalmond|blue|blueviolet|brown|burlywood|cadetblue|chartreuse|chocolate|coral|cornflowerblue|cornsilk|crimson|cyan|darkblue|darkcyan|darkgoldenrod|darkgray|darkgreen|darkkhaki|darkmagenta|darkolivegreen|darkorange|darkorchid|darkred|darksalmon|darkseagreen|darkslateblue|darkslategray|darkturquoise|darkviolet|deeppink|deepskyblue|dimgray|dodgerblue|firebrick|floralwhite|forestgreen|fuchsia|gainsboro|ghostwhite|gold|goldenrod|gray|green|greenyellow|honeydew|hotpink|indianred|indigo|ivory|khaki|lavender|lavenderblush|lawngreen|lemonchiffon|lightblue|lightcoral|lightcyan|lightgoldenrodyellow|lightgreen|lightgrey|lightpink|lightsalmon|lightseagreen|lightskyblue|lightslategray|lightsteelblue|lightyellow|lime|limegreen|linen|magenta|#008000|mediumaquamarine|mediumblue|mediumorchid|mediumpurple|mediumseagreen|mediumslateblue|mediumspringgreen|mediumturquoise|mediumvioletred|midnightblue|mintcream|mistyrose|moccasin|navajowhite|navy|oldlace|olive|olivedrab|orange|orangered|orchid|palegoldenrod|palegreen|paleturquoise|palevioletred|papayawhip|peachpuff|peru|pink|plum|powderblue|purple|red|rosybrown|royalblue|saddlebrown|salmon|sandybrown|seagreen|seashell|sienna|silver|skyblue|slateblue|slategray|snow|springgreen|steelblue|tan|teal|thistle|tomato|turquoise|violet|wheat|white|whitesmoke|yellow|yellowgreen|#[A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9][A-Fa-f0-9])', + 'style' => 'identifier', + 'childregex' => [], + } + ] + }, + { + 'name' => 'event capturing', + 'regex' => '\\b(captureEvents|releaseEvents|routeEvent|handleEvent)\\b.*?(\\)|$)', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'predefined methods', + 'regex' => '\\b(abs|acos|alert|anchor|asin|atan|atan2|back|big|blink|blur|bold|ceil|charAt|clear|clearTimeout|click|close|confirm|cos|escape|eval|exp|fixed|floor|focus|fontcolor|fontsize|forward|getDate|getDay|getHours|getMinutes|getMonth|getSeconds|getTime|getTimezoneOffset|getYear|go|indexOf|isNaN|italics|javaEnabled|join|lastIndexOf|link|log|max|min|open|parse|parseFloat|parseInt|pow|prompt|random|reload|replace|reset|reverse|round|scroll|select|setDate|setHours|setMinutes|setMonth|setSeconds|setTimeout|setTime|setYear|sin|small|sort|split|sqrt|strike|sub|submit|substring|sup|taint|tan|toGMTString|toLocaleString|toLowerCase|toString|toUpperCase|unescape|untaint|UTC|write|writeln)\\b', + 'style' => 'library function', + 'childregex' => [] + }, + { + 'name' => 'properties', + 'regex' => '\\b(action|alinkColor|anchors|appCodeName|appName|appVersion|bgColor|border|checked|complete|cookie|defaultChecked|defaultSelected|defaultStatus|defaultValue|description|E|elements|enabledPlugin|encoding|fgColor|filename|forms|frames|hash|height|host|hostname|href|hspace|index|lastModified|length|linkColor|links|LN2|LN10|LOG2E|LOG10E|lowsrc|method|name|opener|options|parent|pathname|PI|port|protocol|prototype|referrer|search|selected|selectedIndex|self|SQRT1_2|SQRT2|src|status|target|text|title|top|type|URL|userAgent|value|vlinkColor|vspace|width|window)\\b', + 'style' => 'predefined identifier', + 'childregex' => [] + }, + { + 'name' => 'operators', + 'regex' => '([=;->/&|])', + 'style' => 'symbol', + 'childregex' => [] + } + ] + }; +$LANGUAGE{'js'} = $LANGUAGE{'javascript'}; + + + + + + + + +# written by Andreas Krennmair +# extremely incomplete + +$LANGUAGE{'lisp'} = { + 'filename' => '\\.(lsp|l)$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'parens', + 'regex' => '[()]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => ';.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '".*?("|$)', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'keywords', + 'regex' => '\\b(defun |xyz)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'numeric constant', + 'regex' => '(#\([0-9]+ [0-9]+\)|[0-9]+)', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([-a-zA-Z]+)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }; + + + + + + + + + + +# written by JA +$LANGUAGE{'m4'} = { + 'filename' => '\\.m4$', + 'regex' => '', + 'patterns' => [ + { + 'regex' => 'dnl.*?$', + 'style' => 'doc comment', + 'childregex' => [] + }, + { + 'regex' => '#.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'regex' => '\\b(define|undefine|defn|pushdef|popdef|indir|builtin|changequote|changecom|changeword|m4wrap|m4exit|include|sinclude|divert|undivert|divnum|cleardiv|shift|dumpdef|traceon|traceoff|debugfile|debugmode|len|index|regexp|substr|translit|patsubst|format|incr|decr|syscmd|esyscmd|sysval|maketemp|errprint)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'regex' => '\\b(ifdef|ifelse|loops)\\b', + 'style' => 'reserved word', + 'childregex' => [ + { + 'regex' => '[$]\\$?({[^}]*}|[^a-zA-Z0-9_/\\t\\n\\.,\\\\[\\\\{\\\\(]|[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*)?', + 'style' => 'identifier', + 'childregex' => [] + } + ] + } + ] + }; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'make'} = { + 'filename' => '[Mm]akefile.*', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'Comment', + 'regex' => '#.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'Assignment', + 'regex' => '^( *| [ \\t]*)[A-Za-z0-9_+]*[ \\t]*(\\+|:)?=', + 'style' => 'assignment', + 'childregex' => [] + }, + { + 'name' => 'Dependency Line', + 'regex' => '^ *([A-Za-z0-9./$(){} _%+-]|\\n)*::?', + 'style' => 'dependency line', + 'childregex' => [ + { + 'name' => 'Dependency Target', + 'regex' => '[A-Za-z0-9./$(){} _%+-]+', + 'style' => 'dependency target', + 'childregex' => [] + }, + { + 'name' => 'Dependency Continuation', + 'regex' => '\\\\\\n', + 'style' => 'dependency continuation', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '#.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'macro', + 'regex' => '\\$([A-Za-z0-9_]|\\([^)]*\\)|{[^}]*})', + 'style' => 'macro', + 'childregex' => [] + }, + { + 'name' => 'int macro', + 'regex' => '\\$([<@*?%]|\\$@)', + 'style' => 'int macro', + 'childregex' => [] + } + ] + }, + { + 'name' => 'Continuation', + 'regex' => '\\\\$', + 'style' => 'continuation', + 'childregex' => [] + }, + { + 'name' => 'Macro', + 'regex' => '\\$([A-Za-z0-9_]|\\([^)]*\\)|{[^}]*})', + 'style' => 'macro', + 'childregex' => [] + }, + { + 'name' => 'Internal Macro', + 'regex' => '\\$([<@*?%]|\\$@)', + 'style' => 'int macro', + 'childregex' => [] + }, + { + 'name' => 'Escaped $$$', + 'regex' => '\\$\\$', + 'style' => 'esc $$$', + 'childregex' => [] + }, + { + 'name' => 'Include', + 'regex' => '^include[ \\t]', + 'style' => 'include', + 'childregex' => [] + } + ] + }; +$LANGUAGE{'makefile'} = $LANGUAGE{'make'}; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +$LANGUAGE{'pas'} = { + 'filename' => '(?i)\\.p(as)?$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'comment1 (* *)', + 'regex' => '\\(\\*.*?\\*\\)', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'comment2 { }', + 'regex' => '\\{.*?\\}', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '\'.*?(\'|$)', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'preprocessor line', + 'regex' => '^[ \\t]*#.*?$', + 'style' => 'preprocessor', + 'childregex' => [ + { + 'name' => 'comment1 (* *)', + 'regex' => '\\(\\*.*?\\*\\)', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'comment2 { }', + 'regex' => '\\{.*?\\}', + 'style' => 'comment', + 'childregex' => [] + } + ] + }, + { + 'name' => 'character constant', + 'regex' => '\'.\'', + 'style' => 'character', + 'childregex' => [] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|[0-9.]+((e|E)(\\+|-)?)?[0-9]*)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'storage and ops', + 'regex' => '\\b(?i)(and|array|const|div|export|file|function|import|in|label|mod|module|nil|not|only|or|packed|pow|pragma|procedure|program|protected|qualified|record|restricted|set|type|var)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keywords', + 'regex' => '\\b(?i)(begin|case|do|downto|else|end|for|goto|if|of|otherwise|repeat|then|to|until|while|with)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'sumbols', + 'regex' => '([\\*\\-\\+=:;<>\\(\\)\\[\\]!]|[^/]/[^/])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9.^]*[a-zA-Z_0-9]|[a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [ + { + 'regex' => '(\\.|\\^)+', + 'style' => 'symbol', + 'childregex' => [] + } + ] + } + ], + }; +$LANGUAGE{'pascal'} = $LANGUAGE{'pas'}; + + + + + + + + + + + + + + + +# taken from nedit +# modified by PP +# modified by BS +# modified by JD +# modified by JP +$LANGUAGE{'perl'} = { + 'filename' => '(?i)\\.p([lm5]|od)$', + 'regex' => '^\\s*#\\s*![^\\s]*perl', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '(?:#.*?(?:\r?\n\s*)+)+', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'variables', + 'regex' => '[\\$@%]\\$?(?:{[^}]*}|[^a-zA-Z0-9_/\\t\\n\\.,\\\\[\\\\{\\\\(]|[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*)?', + 'style' => 'identifier', + 'childregex' => [] + }, + { + 'name' => '"" string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + }, + { + 'name' => 'variables', + 'regex' => '[\\$@%]\\$?(?:{[^}]*}|[^a-zA-Z0-9_/\\t\\n\\.,\\\\[\\\\{\\\\(]|[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*)?', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => '\'\' string', + 'regex' => '\'\'|\'.*?([^\\\\](\\\\\\\\)*)\'|\'\\\\\\\\\'', +# 'regex' => '\'\'|\'\\\\\\\\\'|\'[^\'\\\\]\'|\'[^\'].*?[^\\\\]\'', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'more strings - q// qw//', + 'regex' => '(?:\\b| )(?:q|qw)([^\w\s])(?:\\\\\\2|[^\\2\\n])*\\2', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'more strings - qq// qx//', + 'regex' => '(?:\\b| )(?:qq|qx)([^\w\s])(?:\\\\\\2|[^\\2\\n])*\\2', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + }, + { + 'name' => 'variables', + 'regex' => '[\\$@%]\\$?(?:{[^}]*}|[^a-zA-Z0-9_/\\t\\n\\.,\\\\[\\\\{\\\\(]|[0-9]+|[a-zA-Z_][a-zA-Z0-9_]*)?', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => 'subroutine header', + 'regex' => 'sub[\\t ]+(?:[a-zA-Z0-9_]+)[\\t \\n]*(?:\\{|\\(|\\n)', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'subroutine header coloring', + 'regex' => '[\\t ][a-zA-Z0-9_]+', + 'style' => 'function header name', + 'childregex' => [] + } + ] + }, + { + 'name' => 'regex matching I', + 'regex' => '(?:\\b| )?(?:/(?:\\\\/|[^/\\n])*(?:/[gimesox]*)|s([^\w\s])(?:\\\\\\2|[^\\2\\n])*?(\\2)[^(\\2)\\n]*?(\\2[gimesox]*))', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'regex matching II', + 'regex' => '(?:\\b| )(?:m|qq?|tr|y)([^\w\s])(?:\\\\\\2|[^\\2\\n])*(?:\\2[gimesox]*)', + 'style' => 'regex', + 'childregex' => [] + }, + { + 'name' => 'keywords', + 'regex' => '\\b(my|local|new|if|until|while|elsif|else|eval|unless|for|foreach|continue|exit|die|last|goto|next|redo|return|local|exec|do|use|require|package|eval|BEGIN|END|eq|ne|not|\\|\\||\\&\\&|and|or)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'library functions', + 'regex' => '\\b(?:a(?:bs|ccept|larm|tan2)|b(?:ind|inmode|less)|c(?:aller|hdir|hmod|homp|hop|hr|hroot|hown|losedir|lose|onnect|os|rypt)|d(?:bmclose|bmopen|efined|elete|ie|ump)|e(?:ach|nd(?:grent|hostent|netent|protoent|pwent|servent)|of|xec|xists|xp)|f(?:ctnl|ileno|lock|ork|ormat|ormline)|g(?:et(?:c|grent|grgid|grnam|hostbyaddr|hostbyname|hostent|login|netbyaddr|netbyname|netent|peername|pgrp|ppid|priority|protobyname|protobynumber|protoent|pwent|pwnam|pwuid|servbyname|servbyport|servent|sockname|sockopt)|lob|mtime|rep)|hex|i(?:mport|ndex|nt|octl)|join|keys|kill|l(?:cfirst|c|ength|ink|isten|og|ocaltime|stat)|m(?:ap|kdir|sgctl|sgget|sgrcv)|no|o(?:ct|pendir|pen|rd)|p(?:ack|ipe|op|os|rintf|rint|ush)|quotemeta|r(?:and|eaddir|ead|eadlink|ecv|ef|ename|eset|everse|ewinddir|index|mdir)|s(?:calar|eekdir|eek|elect|emctl|emget|emop|end|et(?:grent|hostent|netent|pgrp|priority|protoent|pwent|sockopt)|hift|hmctl|hmget|hmread|hmwrite|hutdown|in|leep|ocket|ocketpair|ort|plice|plit|printf|qrt|rand|tat|tudy|ubstr|ymlink|yscall|ysopen|ysread|ystem|yswrite)|t(?:elldir|ell|ie|ied|ime|imes|runcate)|u(?:c|cfirst|mask|ndef|nlink|npack|nshift|ntie|time)|values|vec|w(?:ait|aitpid|antarray|arn|rite)|qw|-[rwxoRWXOezsfdlpSbctugkTBMAC])\\b', + 'style' => 'library function', + 'childregex' => [] + }, + { + 'name' => 'braces, parens and brakets', + 'regex' => '[\\[\\]\\{\\}\\(\\)]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => '<< stuff', + 'regex' => '<<(?:("|\')([^\\n]*)\\2|\\w*).*?^\\3$', + 'style' => 'text', + 'childregex' => [] + }, + { + 'name' => 'POD', + 'regex' => '^=.*?^(?:=cut|\\Z)', + 'style' => 'doc comment', + 'childregex' => [] + } + ] + }; + + + + + + + + + + + + + + + +# Thanks to Matt Giwer +$LANGUAGE{'pov'} = { + 'filename' => '(?i)\\.pov$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'doc comment', + 'regex' => '/\\*\\*.*?\\*/', + 'style' => 'doc comment', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'preprocessor line', + 'regex' => '^[ \\t]*#.*?$', + 'style' => 'preprocessor', + 'childregex' => [ + { + 'name' => 'string', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', +# 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => '', + 'regex' => '<.*?>', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'comment', + 'regex' => '[^/]/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'cplus comment', + 'regex' => '//.*?$', + 'style' => 'comment', + 'childregex' => [] + } + ] + }, + { + 'name' => 'character constant', + 'regex' => '\'(\\\\)?.\'', + 'style' => 'character', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'keyword', + 'regex' => '\\b(abs|absorption|acos|acosh|adaptive|adc_bailout|agate|agate_turb|all|alpha|ambient|ambient_light|angle|aperture|append|arc_angle|area_light|array|asc|asin|asinh|assumed_gamma|atan|atan2|atanh|average|background|bezier_spline|bicubic_patch|black_hole|blob|blue|blur_samples|bounded_by|box|boxed|bozo|break|brick|brick_size|brightness|brilliance|bumps|bump_map|bump_size|camera|case|caustics|ceil|checker|chr|clipped_by|clock|clock_delta|color|color_map|colour|colour_map|component|composite|concat|cone|confidence|conic_sweep|control0|control1|cos|cosh|count|crackle|crand|cube|cubic|cubic_spline|cubic_wave|cylinder|cylindrical|debug|declare|default|defined|degrees|density|density_file|density_map|dents|difference|diffuse|dimensions|dimension_size|direction|disc|distance|distance_maximum|div|eccentricity|else|emission|end|error|error_bound|exp|extinction|fade_distance|fade_power|falloff|falloff_angle|false|fclose|file_exists|filter|finish|fisheye|flatness|flip|floor|focal_point|fog|fog_alt|fog_offset|fog_type|fopen|frequency|gif|global_settings|gradient|granite|gray_threshold|green|height_field|hexagon|hf_gray_16|hierarchy|hollow|hypercomplex|if|ifdef|iff|ifndef|image_map|include|int|interior|interpolate|intersection|intervals|inverse|ior|irid|irid_wavelength|jitter|julia_fractal|lambda|lathe|leopard|light_source|linear_spline|linear_sweep|local|location|log|looks_like|look_at|low_error_factor|macro|mandel|map_type|marble|material|material_map|matrix|max|max_intersections|max_iteration|max_trace_level|media|media_attenuation|media_interaction|merge|mesh|metallic|min|minimum_reuse|mod|mortar|nearest_count|no|normal|normal_map|no_shadow|number_of_waves|object|octaves|off|offset|omega|omnimax|on|once|onion|open|orthographic|panoramic|perspective|pgm|phase|phong|phong_size|pi|pigment|pigment_map|planar|plane|png|point_at|poly|polygon|poly_wave|pot|pow|ppm|precision|prism|pwr|quadratic_spline|quadric|quartic|quaternion|quick_color|quick_colour|quilted|radial|radians|radiosity|radius|rainbow|ramp_wave|rand|range|ratio|read|reciprocal|recursion_limit|red|reflection|reflection_exponent|refraction|render|repeat|rgb|rgbf|rgbft|rgbt|right|ripples|rotate|roughness|samples|scale|scallop_wave|scattering|seed|shadowless|sin|sine_wave|sinh|sky|sky_sphere|slice|slope_map|smooth|smooth_triangle|sor|specular|sphere|spherical|spiral1|spiral2|spotlight|spotted|sqr|sqrt|statistics|str|strcmp|strength|strlen|strlwr|strupr|sturm|substr|superellipsoid|switch|sys|t|tan|tanh|text|texture|texture_map|tga|thickness|threshold|tightness|tile2|tiles|torus|track|transform|translate|transmit|triangle|triangle_wave|true|ttf|turbulence|turb_depth|type|u|ultra_wide_angle|undef|union|up|use_color|use_colour|use_index|u_steps|v|val|variance|vaxis_rotate|vcross|vdot|version|vlength|vnormalize|vrotate|v_steps|warning|warp|water_level|waves|while|width|wood|wrinkles|write|x|y|yes|z)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces', + 'regex' => '[\\{\\}]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>\\(\\)\\[\\]!])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }; +$LANGUAGE{'povray'} = $LANGUAGE{'pov'}; + + + + +# by Tom Good +$LANGUAGE{'python'} = { + 'filename' => '(?i)\\.py$', + 'regex' => '^\\s*#\\s*![^\\s]*python', + 'patterns' => [ + { + 'name' => 'python comment', + 'regex' => '#.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'single quote string', + 'regex' => '\'.*?\'', + 'style' => 'string', + 'childregex' => [] + }, + + { + 'name' => 'string', + 'regex' => '""|"\\\\\\\\"|".*?([^\\\\](\\\\\\\\)*)"', + 'regex' => '""|".*?([^\\\\](\\\\\\\\)*)"|"\\\\\\\\"', + 'regex' => '""|"\\\\\\\\"|"[^"\\\\]"|"[^"].*?[^\\\\]"', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'character constant', + 'regex' => '\'(\\\\)?.\'', + 'style' => 'character', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '\\\\.', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'numeric constant', + 'regex' => '\\b((0(x|X)[0-9a-fA-F]*)|(([0-9]+\\.?[0-9]*)|(\\.[0-9]+))((e|E)(\\+|-)?[0-9]+)?)(L|l|UL|ul|u|U|F|f)?\\b', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'keyword', + 'regex' => '\\b(and|assert|break|class|continue|del|elif|else|except|exec|finally|for|from|global|if|import|in|is|lambda|not|or|pass|print|raise|return|try|while)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'braces', + 'regex' => '[\\{\\}]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '([\\*\\-\\+=:;%&\\|<>\\(\\)\\[\\]!])', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'identifiers', + 'regex' => '([a-zA-Z_][a-zA-Z_0-9]*)', + 'style' => 'identifier', + 'childregex' => [] + }, + { + 'name' => 'function', + 'regex' => '[\\t ]*def[\\t ]+([a-zA-Z0-9_]+)[\\t \\(]+.*?[\\n{]', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'function args', + 'regex' => '\\(.*?\\)', + 'style' => 'function header args', + 'childregex' => [] + }, + { + 'name' => 'function name', + 'regex' => '[\\t ][a-zA-Z0-9_]+', + 'style' => 'function header name', + 'childregex' => [] + } + ] + }, + { + 'name' => 'library functions', + 'regex' => '\\b(__import__|abs|apply|buffer|callable|chr|cmp|coerce|compile|complex|delatter|dir|divmod|eval|execfile|filter|float|getattr|globals|hasattr|hash|hex|id|input|int|intern|isinstance|issubclass|len|list|locals|long|map|max|min|oct|open|ord|pow|range|raw_input|reduce|reload|repr|round|setattr|slice|str|tuple|type|unichr|unicode|vars|xrange|zip)\\b', + 'style' => 'library function', + 'childregex' => [] + }, + ] + }; + + + +# by Joshua Swink +$LANGUAGE{'ruby'} = { + 'filename' => '\\.rb$', + 'regex' => '^\\s*#\\s*![^\\s]*\\bruby\\b', + 'patterns' => [ + { + 'name' => 'comment', + 'regex' => '(?:#.*?(?:\r?\n\s*)+)+', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'predefined variables', + 'regex' => '(?:\\$(?:[!@&`\'+\\d~=/\\\\,;.<>_*\\$?:"]|DEBUG|FILENAME|LOAD_PATH|stdin|stdout|stderr|VERBOSE|-[0adFiIlpv])|\\b(?:TRUE|FALSE|NIL|STDIN|STDOUT|STDERR|ENV|ARGF|ARGV|DATA|RUBY_VERSION|RUBY_RELEASE_DATE|RUBY_PLATFORM)\\b)', + 'style' => 'predefined identifier', + 'childregex' => [] + }, + { + 'name' => 'variables', + 'regex' => '[\\$@](?:{[^}]*}|[^\\w/\\t\\n\\.,\\\\[\\\\{\\\\(]|[0-9]+|[a-zA-Z_][\\w.]*)?', + 'style' => 'identifier', + 'childregex' => [] + }, + { + 'name' => '"" string', + 'regex' => '""|"(?:\\\\\\\\)+"|".*?(?:[^\\\\](?:\\\\\\\\)*)"|%[Qwx]?([^\\w\\[\\](){}<>])\\2|%[Qwx]?([^\\w\\[\\](){}<>]).*?(?:[^\\\\](?:\\\\\\\\)*)\\3|%[Qwx]?([^\\w\\[\\](){}<>])\\\\\\\\\\4|%[Qwx]?\\[\\]|%[Qwx]?\\[.*?([^\\\\](\\\\\\\\)*)\\]|%[Qwx]?\\[\\\\\\\\\\]|%[Qwx]?\\{\\}|%[Qwx]?\\{.*?([^\\\\](\\\\\\\\)*)\\}|%[Qwx]?\\{\\\\\\\\\\}|%[Qwx]?\\(\\)|%[Qwx]?\\(.*?([^\\\\](\\\\\\\\)*)\\)|%[Qwx]?\\(\\\\\\\\\\)|%[Qwx]?<>|%[Qwx]?<.*?([^\\\\](\\\\\\\\)*)>|%[Qwx]?<\\\\\\\\>', + + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex', => '\\\\(?:x[\\da-fA-F]{2}|\d\d\d|c.|M-\\\\C-.|M-.|C-.|.)', + 'style' => 'esc character', + 'childregex' => [] + }, + { + 'name' => 'string expression', + 'regex' => '#[\\$\\@][a-zA-Z_][\\w.]*|#\\{[\\$\\@]?[^\\}]*\\}', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => '\'\' string', + 'regex' => '\'\'|\'(?:\\\\\\\\)+\'|\'.*?(?:[^\\\\](?:\\\\\\\\)*)\'|%q([^\\w\\[\\](){}<>])\\2|%q([^\\w\\[\\](){}<>]).*?(?:[^\\\\](?:\\\\\\\\)*)\\3|%q([^\\w\\[\\](){}<>])\\\\\\\\\\4|%q\\[\\]|%q\\[.*?([^\\\\](\\\\\\\\)*)\\]|%q\\[\\\\\\\\\\]|%q\\{\\}|%q\\{.*?([^\\\\](\\\\\\\\)*)\\}|%q\\{\\\\\\\\\\}|%q\\(\\)|%q\\(.*?([^\\\\](\\\\\\\\)*)\\)|%q\\(\\\\\\\\\\)|%q<>|%q<.*?([^\\\\](\\\\\\\\)*)>|%q<\\\\\\\\>', + 'style' => 'string', + 'childregex' => [ + { + 'name' => 'esc character', + 'regex' => '(?:\\\\\'|\\\\\\\\)', + 'style' => 'esc character', + 'childregex' => [] + } + ] + }, + { + 'name' => 'subroutine header', + 'regex' => 'def[\\t ]+\\w[\\w.]*(?:\\([^)]*\\))?', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'arg list', + 'regex' => '\\(.*\\)', + 'style' => 'function header args', + 'childregex' => [ + { + 'name' => 'arg list parens', + 'regex' => '[\\(\\)]', + 'style' => 'symbol', + 'childregex' => [] + } + ] + }, + { + 'name' => 'subroutine header', + 'regex' => '[\\t ]\w+', + 'style' => 'function header name', + 'childregex' => [] + } + ] + }, + { + 'name' => 'class header', + 'regex' => 'class[\\t ]+\\w+(?:\\s*<\\s*\\w+)?', + 'style' => 'function header', + 'childregex' => [ + { + 'name' => 'class ancestor', + 'regex' => '<\\s*\\w+', + 'style' => 'include', + 'childregex' => [ + { + 'name' => 'inheritance doohickey', + 'regex' => '<', + 'style' => 'symbol', + 'childregex' => [] + } + ] + }, + { + 'name' => 'class main', + 'regex' => '[\\t ]\\w+', + 'style' => 'type', + 'childregex' => [] + } + ] + }, + { + 'name' => 'regex matching 0', + 'regex' => '(?:%r([^\\w\\[\\](){}<>])\\2|%r([^\\w\\[\\](){}<>]).*?(?:[^\\\\](?:\\\\\\\\)*)\\3|%r([^\\w\\[\\](){}<>])\\\\\\\\\\4|%r\\[\\]|%r\\[.*?([^\\\\](\\\\\\\\)*)\\]|%r\\[\\\\\\\\\\]|%r\\{\\}|%r\\{.*?([^\\\\](\\\\\\\\)*)\\}|%r\\{\\\\\\\\\\}|%r\\(\\)|%r\\(.*?([^\\\\](\\\\\\\\)*)\\)|%r\\(\\\\\\\\\\)|%r<>|%r<.*?([^\\\\](\\\\\\\\)*)>|%r<\\\\\\\\>)[ixpno]*', + 'style' => 'regex', + 'childregex' => [ + { + 'name' => 'string expression', + 'regex' => '#[\\$\\@][a-zA-Z_][\\w.]*|#\\{[\\$\\@]?[a-zA-Z_][^\\}]*\\}', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => 'regex matching I', + 'regex' => '(?:\\b| )?(?:/(?:\\\\/|[^/\\n])*(?:/[ixpno]*))', + 'style' => 'regex', + 'childregex' => [ + { + 'name' => 'string expression', + 'regex' => '#[\\$\\@][a-zA-Z_][\\w.]*|#\\{[\\$\\@]?[a-zA-Z_][^\\}]*\\}', + 'style' => 'identifier', + 'childregex' => [] + } + ] + }, + { + 'name' => 'reserved words', + 'regex' => '\\b(BEGIN|class|ensure|nil|self|when|END|def|false|not|super|while|alias|defined|for|or|then|yield|and|do|if|redo|true|begin|else|in|rescue|undef|break|elsif|module|retry|unless|case|end|next|return|until)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'kernel module methods', + 'regex', => '\\b(Array|Float|Integer|String|at_exit|autoload|binding|caller|catch|chop|chomp|chomp!|eval|exec|exit|fail|fork|format|gets|global_variables|gsub|iterator|lambda|load|local_variables|loop|open|p|print|printf|proc|putc|puts|raise|rand|readline|readlines|require|select|sleep|split|sprintf|srand|sub|syscall|system|test|trace_var|trap|untrace_var)\\b', + 'style' => 'library function', + 'childregex' => [] + }, + { + 'name' => 'braces, parens and brakets', + 'regex' => '[\\[\\]\\{\\}\\(\\)]', + 'style' => 'braces', + 'childregex' => [] + }, + { + 'name' => '<< stuff', + 'regex' => '<<(?:("|\')([^\\n]*)\\2|\\w*).*?^\\3$', + 'style' => 'text', + 'childregex' => [] + }, + { + 'name' => 'symbols', + 'regex' => '(?:[:*-+<>=^!,/]+|\.\.+)', + 'style' => 'symbol', + 'childregex' => [] + }, + { + 'name' => 'numbers', + 'regex' => '\d[\d.]*', + 'style' => 'numeric', + 'childregex' => [] + }, + { + 'name' => 'embedded documentation', + 'regex' => '^=.*?^(?:=end|\\Z)', + 'style' => 'doc comment', + 'childregex' => [] + } + ] + }; + +# taken from nedit +# modified by PP +# very inclomplete! +$LANGUAGE{'sql'} = { + 'filename' => '(?i)\\.sql$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'keywords I', + 'regex' => '(?i)(,|%|<|>|:=|=|\\(|\\)|\\bselect|on|from|order by|desc|where|and|or|not|null|true|false)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'comment I', + 'regex' => '--.*?$', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'comment II', + 'regex' => '/\\*.*?\\*/', + 'style' => 'comment', + 'childregex' => [] + }, + { + 'name' => 'string', + 'regex' => '\'\'|\'.*?([^\\\\](\\\\\\\\)*)\'|\'\\\\\\\\\'', +# 'regex' => '(\'\'|\'[^\'\\\\]\'|\'[^\'].*?[^\\\\]\')', + 'style' => 'string', + 'childregex' => [] + }, + { + 'name' => 'keywords II', + 'regex' => '(?i)end if;|\\b(create|replace|begin|end|function|return|fetch|open|close|into|is|in|when|others|grant|on|to|exception|show|set|out|pragma|as|package)\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'keywords III', + 'regex' => '(?i)\\balter\\b', + 'style' => 'reserved word', + 'childregex' => [] + }, + { + 'name' => 'datatypes', + 'regex' => '(?i)\\b(integer|blol|date|numeric|character|varying|varchar|char)\\b', + 'style' => 'predefined type', + 'childregex' => [] + }, + { + 'name' => 'words', + 'regex' => '(?i)\\b(constraint|key|references|primary|table|foreign|add|insert|group by)\\b', + 'style' => 'reserved word', + 'childregex' => [] + } + ] + }; + + + +$LANGUAGE{'patch'} = { + 'filename' => '(?i)\\.patch$|\\.diff$', + 'regex' => '', + 'patterns' => [ + { + 'name' => 'header', + 'regex' => '^Index: .*?$|^===== .*?$|^diff .*?$|^--- .*?$|^\+\+\+ .*?$', + 'style' => 'separator', + 'childregex' => [] + }, + { + 'name' => 'hunk', + 'regex' => '^@@ .*?$', + 'style' => 'line spec', + 'childregex' => [] + }, + { + 'name' => 'from', + 'regex' => '^-.*?$', + 'style' => 'deletion', + 'childregex' => [] + }, + { + 'name' => 'to', + 'regex' => '^\+.*?$', + 'style' => 'insertion', + 'childregex' => [] + } + ] + }; + + + +##### +# +# LANGUAGE: shell script +# + +$LANGUAGE{'shellscript'} = { + 'filename' => '\\.(sh|shell)$', + 'regex' => '^\\s*#\\s*![^\\s]*(sh|bash|ash|zsh|ksh)', + 'patterns' => [ { + 'name' => 'comment', +# 'regex' => '^[ \t]*[^$]?\#[^!]?.*?$', + 'regex' => '(^| )#([^\\!].)*?$', + 'style' => 'comment', + 'childregex' => [] + }, { + 'name' => 'identifier', + 'regex' => '[a-zA-Z][a-zA-Z0-9_]*=', + 'style' => '', + 'childregex' => [ { + 'name' => 'identifier', + 'regex' => '[a-zA-Z][a-zA-Z0-9_]*', + 'style' => 'identifier', + 'childregex' => [] + } ] + }, { + 'name' => 'identifier', + 'regex' => '\\$([0-9#\\*]|[a-zA-Z][a-zA-Z0-9_]*)', + 'style' => 'identifier', + 'childregex' => [] + }, { + 'name' => 'interpreter line', + 'regex' => '^[ \t]*#!.*?$', + 'style' => 'preprocessor', + childregex => [] + }, { + 'name' => 'string', + 'regex' => '""|"(\\\\"|[^\\"])*"', + 'style' => 'string', + childregex => [ { + 'name' => 'identifier', + 'regex' => '\\$([0-9#\\*]|[a-zA-Z][a-zA-Z0-9_]*)', + 'style' => 'identifier', + 'childregex' => [] + } ] + } ] +}; + +$LANGUAGE{'sh'} = $LANGUAGE{'shellscript'}; +return \%LANGUAGE; + +}; diff --git a/3rd_party/hypre/src/examples/docs/ex1.htm b/3rd_party/hypre/src/examples/docs/ex1.htm new file mode 100644 index 000000000..4d61d6d3e --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex1.htm @@ -0,0 +1,12 @@ +

Example 1

+

+This is a two processor example. Each processor owns one +box in the grid. For reference, the two grid boxes are those +in the example diagram in the struct interface chapter +of the User's Manual. Note that in this example code, we have +used the two boxes shown in the diagram as belonging +to processor 0 (and given one box to each processor). The +solver is PCG with no preconditioner. +

+We recommend viewing examples 1-4 sequentially for +a nice overview/tutorial of the struct interface. diff --git a/3rd_party/hypre/src/examples/docs/ex10.htm b/3rd_party/hypre/src/examples/docs/ex10.htm new file mode 100644 index 000000000..4cf9efdb0 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex10.htm @@ -0,0 +1,11 @@ +

Example 10

+

+This code solves a system corresponding to a discretization +of the Laplace equation with zero boundary conditions on the +unit square. The domain is split into a n x n grid of +quadrilateral elements and each processors owns a horizontal +strip of size m x n, where m = n/nprocs. We use bilinear +finite element discretization, so there are nodes (vertices) +that are shared between neighboring processors. The Finite +Element Interface is used to assemble the matrix and solve +the problem. Nine different solvers are available. diff --git a/3rd_party/hypre/src/examples/docs/ex11.htm b/3rd_party/hypre/src/examples/docs/ex11.htm new file mode 100644 index 000000000..8a7460ca5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex11.htm @@ -0,0 +1,8 @@ +

Example 11

+

+This example solves the 2-D Laplacian eigenvalue problem with zero boundary +conditions on an nxn grid. The number of unknowns is N=n^2. The standard +5-point stencil is used, and we solve for the interior nodes only. +

+We use the same matrix as in Examples 3 and 5. The eigensolver is LOBPCG with +AMG preconditioner. diff --git a/3rd_party/hypre/src/examples/docs/ex12.htm b/3rd_party/hypre/src/examples/docs/ex12.htm new file mode 100644 index 000000000..8fdadcff7 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex12.htm @@ -0,0 +1,10 @@ +

Example 12

+

+The grid layout is the same as ex1, but with nodal unknowns. The solver is PCG +preconditioned with either PFMG or BoomerAMG, selected on the command line. +

+We recommend viewing the Struct examples before viewing this and the other +SStruct examples. This is one of the simplest SStruct examples, used primarily +to demonstrate how to set up non-cell-centered problems, and to demonstrate how +easy it is to switch between structured solvers (PFMG) and solvers designed for +more general settings (AMG). diff --git a/3rd_party/hypre/src/examples/docs/ex12f.htm b/3rd_party/hypre/src/examples/docs/ex12f.htm new file mode 100644 index 000000000..eade707f4 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex12f.htm @@ -0,0 +1,10 @@ +

Example 12 (Fortran version)

+

+The grid layout is the same as ex1, but with nodal unknowns. The solver is PCG +preconditioned with either PFMG or BoomerAMG, set in the code. +

+We recommend viewing the Struct examples before viewing this and the other +SStruct examples. This is one of the simplest SStruct examples, used primarily +to demonstrate how to set up non-cell-centered problems, and to demonstrate how +easy it is to switch between structured solvers (PFMG) and solvers designed for +more general settings (AMG). diff --git a/3rd_party/hypre/src/examples/docs/ex13.htm b/3rd_party/hypre/src/examples/docs/ex13.htm new file mode 100644 index 000000000..2e05fbc43 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex13.htm @@ -0,0 +1,38 @@ +

Example 13

+

+This code solves the 2D Laplace equation using bilinear finite element +discretization on a mesh with an "enhanced connectivity" point. Specifically, +we solve -Delta u = 1 with zero boundary conditions on a star-shaped domain +consisting of identical rhombic parts each meshed with a uniform n x n grid. +Every part is assigned to a different processor and all parts meet at the +origin, equally subdividing the 2*pi angle there. The case of six processors +(parts) looks as follows: +

+

+                                    +
+                                   / \
+                                  /   \
+                                 /     \
+                       +--------+   1   +---------+
+                        \        \     /         /
+                         \    2   \   /    0    /
+                          \        \ /         /
+                           +--------+---------+
+                          /        / \         \
+                         /    3   /   \    5    \
+                        /        /     \         \
+                       +--------+   4   +---------+
+                                 \     /
+                                  \   /
+                                   \ /
+                                    +
+
+

+Note that in this problem we use nodal variables, which will be shared between +the different parts, so the node at the origin, for example, will belong to all +parts. +

+We recommend viewing the Struct examples before viewing this and the other +SStruct examples. The primary role of this particular SStruct example is to +demonstrate how to set up non-cell-centered problems, and specifically problems +with an "enhanced connectivity" point. diff --git a/3rd_party/hypre/src/examples/docs/ex14.htm b/3rd_party/hypre/src/examples/docs/ex14.htm new file mode 100644 index 000000000..a5473a86e --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex14.htm @@ -0,0 +1,6 @@ +

Example 14

+

+This is a version of Example 13, which uses the SStruct +FEM input functions instead of stencils to describe a problem on a mesh with an "enhanced +connectivity" point. This is the recommended way to set up a finite element +problem in the SStruct interface. diff --git a/3rd_party/hypre/src/examples/docs/ex15.htm b/3rd_party/hypre/src/examples/docs/ex15.htm new file mode 100644 index 000000000..bf370db89 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex15.htm @@ -0,0 +1,22 @@ +

Example 15

+

+This code solves a 3D electromagnetic diffusion (definite curl-curl) problem +using the lowest order Nedelec, or "edge" finite element discretization on a +uniform hexahedral meshing of the unit cube. The right-side corresponds to a +unit force and we use uniform zero Dirichlet boundary conditions. The overall +problem reads: curl alpha curl E + beta E = 1, with E x n = 0 on the boundary, +where alpha and beta are piecewise-constant material coefficients. + +

+The linear system is split in parallel using the SStruct interface with a n x n +x n grid on each processors. Note that, the number of processors should +therefore be a perfect cube! + +

+This code is mainly meant as an illustration of using the Auxiliary-space +Maxwell Solver (AMS) through the SStruct interface. It uses two grids -- one +for the nodal and one for the edge variables, and we show how to constructs the +rectangular "discrete gradient" matrix that connects them. Finally, this is +also an example of setting up a finite element discretization in the SStruct +interface, and we recommend viewing Example 13 and Example 14 before viewing this example. diff --git a/3rd_party/hypre/src/examples/docs/ex15big.htm b/3rd_party/hypre/src/examples/docs/ex15big.htm new file mode 100644 index 000000000..3b2145b87 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex15big.htm @@ -0,0 +1,14 @@ +

Example 15 (64-bit version)

+

+This example is a slight modification of Example 15 that illustrates the 64-bit +integer support in hypre needed to run problems with more than 2B unknowns. +

+Specifically, the changes compared to Example 15 are as follows: +

    +
  1. All integer arguments to HYPRE functions should be declared of type HYPRE_Int. +
  2. Variables of type HYPRE_Int are 64-bit integers, so they should be printed in the %lld format (not %d). +
+

+To enable the 64-bit integer support, you need to build hypre with the +--enable-bigint option of the configure script. We recommend comparing this +example with Example 15. diff --git a/3rd_party/hypre/src/examples/docs/ex16.htm b/3rd_party/hypre/src/examples/docs/ex16.htm new file mode 100644 index 000000000..456152df5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex16.htm @@ -0,0 +1,7 @@ +

Example 16

+

+This code solves the 2D Laplace equation using a high order Q3 finite element +discretization. Specifically, we solve -Delta u = 1 with zero boundary +conditions on a unit square domain meshed with a uniform grid. The mesh is +distributed across an N x N process grid, with each processor containing an n x +n sub-mesh of data, so the global mesh is nN x nN. diff --git a/3rd_party/hypre/src/examples/docs/ex2.htm b/3rd_party/hypre/src/examples/docs/ex2.htm new file mode 100644 index 000000000..eb7e9775f --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex2.htm @@ -0,0 +1,11 @@ +

Example 2

+

+This is a two processor example and is similar to the previous +structured interface example (Example 1). However, in +this case the grid boxes are exactly those in the example +diagram in the struct interface chapter of the User's Manual. +(Processor 0 owns two boxes and processor 1 owns one box.) +The solver is PCG with SMG preconditioner. +

+We recommend viewing example 1 before viewing this +example. diff --git a/3rd_party/hypre/src/examples/docs/ex3.htm b/3rd_party/hypre/src/examples/docs/ex3.htm new file mode 100644 index 000000000..d40fd876d --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex3.htm @@ -0,0 +1,30 @@ +

Example 3

+

+This code solves a system corresponding to a discretization +of the Laplace equation with zero boundary conditions on the +unit square. The domain is split into an N x N processor grid. +Thus, the given number of processors should be a perfect square. +Each processor's piece of the grid has n x n cells with n x n +nodes connected by the standard 5-point stencil. Note that the +struct interface assumes a cell-centered grid, and, therefore, +the nodes are not shared. This example demonstrates more +features than the previous two struct examples (Example 1 and +Example 2). Two solvers are available. +

+To incorporate the boundary conditions, we do the following: +Let x_i and x_b be the interior and boundary parts of the +solution vector x. We can split the matrix A as +

+

A = [A_ii A_ib; A_bi A_bb].
+

+Let u_0 be the Dirichlet B.C. We can simply say that x_b = u_0. +If b_i is the right-hand side, then we just need to solve in +the interior: +

+

A_ii x_i = b_i - A_ib u_0.
+

+For this partitcular example, u_0 = 0, so we are just solving +A_ii x_i = b_i. +

+We recommend viewing examples 1 and 2 before viewing this +example. diff --git a/3rd_party/hypre/src/examples/docs/ex4.htm b/3rd_party/hypre/src/examples/docs/ex4.htm new file mode 100644 index 000000000..45a7d4eb5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex4.htm @@ -0,0 +1,38 @@ +

Example 4

+

+This example differs from the previous structured example +(Example 3) in that a more sophisticated stencil and +boundary conditions are implemented. The method illustrated +here to implement the boundary conditions is much more general +than that in the previous example. Also symmetric storage is +utilized when applicable. +

+This code solves the convection-reaction-diffusion problem +div (-K grad u + B u) + C u = F in the unit square with +boundary condition u = U0. The domain is split into N x N +processor grid. Thus, the given number of processors should +be a perfect square. Each processor has a n x n grid, with +nodes connected by a 5-point stencil. Note that the struct +interface assumes a cell-centered grid, and, therefore, the +nodes are not shared. +

+To incorporate the boundary conditions, we do the following: +Let x_i and x_b be the interior and boundary parts of the +solution vector x. If we split the matrix A as +

A = [A_ii A_ib; A_bi A_bb],
+

+then we solve +

[A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0].
+

+Note that this differs from the previous example in that we +are actually solving for the boundary conditions (so they +may not be exact as in ex3, where we only solved for the +interior). This approach is useful for more general types +of b.c. +

+A number of solvers are available. More information can be +found in the Solvers and Preconditioners chapter of the +User's Manual. +

+We recommend viewing examples 1, 2, and 3 before viewing this +example. diff --git a/3rd_party/hypre/src/examples/docs/ex5.htm b/3rd_party/hypre/src/examples/docs/ex5.htm new file mode 100644 index 000000000..534fbc22e --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex5.htm @@ -0,0 +1,12 @@ +

Example 5

+

+This example solves the 2-D +Laplacian problem with zero boundary conditions +on an nxn grid. The number of unknowns is N=n^2. +The standard 5-point stencil is used, and we solve +for the interior nodes only. +

+This example solves the same problem as Example 3. +Available solvers are AMG, PCG, PCG with AMG or +Parasails preconditioners, or Flexible GMRES with +AMG preconditioner. diff --git a/3rd_party/hypre/src/examples/docs/ex5big.htm b/3rd_party/hypre/src/examples/docs/ex5big.htm new file mode 100644 index 000000000..26f6f2afb --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex5big.htm @@ -0,0 +1,14 @@ +

Example 5 (64-bit version)

+

+This example is a slight modification of Example 5 that illustrates the 64-bit +integer support in hypre needed to run problems with more than 2B unknowns. +

+Specifically, the changes compared to Example 5 are as follows: +

    +
  1. All integer arguments to HYPRE functions should be declared of type HYPRE_Int. +
  2. Variables of type HYPRE_Int are 64-bit integers, so they should be printed in the %lld format (not %d). +
+

+To enable the 64-bit integer support, you need to build hypre with the +--enable-bigint option of the configure script. We recommend comparing this +example with Example 5. diff --git a/3rd_party/hypre/src/examples/docs/ex5f.htm b/3rd_party/hypre/src/examples/docs/ex5f.htm new file mode 100644 index 000000000..a04ed1698 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex5f.htm @@ -0,0 +1,11 @@ +

Example 5 (Fortran version)

+

+This example solves the 2-D +Laplacian problem with zero boundary conditions +on an nxn grid. The number of unknowns is N=n^2. +The standard 5-point stencil is used, and we solve +for the interior nodes only. +

+This example solves the same problem as Example 3. +Available solvers are AMG, PCG, and PCG with AMG or +Parasails preconditioners. diff --git a/3rd_party/hypre/src/examples/docs/ex6.htm b/3rd_party/hypre/src/examples/docs/ex6.htm new file mode 100644 index 000000000..58604dde5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex6.htm @@ -0,0 +1,14 @@ +

Example 6

+

+This is a two processor example and is the same problem +as is solved with the structured interface in Example 2. +(The grid boxes are exactly those in the example +diagram in the struct interface chapter of the User's Manual. +Processor 0 owns two boxes and processor 1 owns one box.) +This is the simplest sstruct example. There is one part and +one variable. The solver is PCG with SMG preconditioner. We use a +structured solver for this example. +

+We recommend comparing this example with Example 2. + + diff --git a/3rd_party/hypre/src/examples/docs/ex7.htm b/3rd_party/hypre/src/examples/docs/ex7.htm new file mode 100644 index 000000000..0e0d71fb3 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex7.htm @@ -0,0 +1,37 @@ +

Example 7

+

+This example uses the sstruct interface to solve the same +problem as was solved in Example 4 with the struct interface. +Therefore, there is only one part and one variable. +

+This code solves the convection-reaction-diffusion problem +div (-K grad u + B u) + C u = F in the unit square with +boundary condition u = U0. The domain is split into N x N +processor grid. Thus, the given number of processors should +be a perfect square. Each processor has a n x n grid, with +nodes connected by a 5-point stencil. We use cell-centered +variables, and, therefore, the nodes are not shared. +

+To incorporate the boundary conditions, we do the following: +Let x_i and x_b be the interior and boundary parts of the +solution vector x. If we split the matrix A as +

A = [A_ii A_ib; A_bi A_bb],
+

+then we solve +

[A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0].
+

+Note that this differs from the previous example in that we +are actually solving for the boundary conditions (so they +may not be exact as in ex3, where we only solved for the +interior). This approach is useful for more general types +of b.c. +

+As in the previous example (Example 6), we use a structured +solver. A number of structured solvers are available. +More information can be found in the Solvers and Preconditioners +chapter of the User's Manual. +

+We recommend viewing Examples 6 before viewing this +example. + + diff --git a/3rd_party/hypre/src/examples/docs/ex8.htm b/3rd_party/hypre/src/examples/docs/ex8.htm new file mode 100644 index 000000000..cb668125b --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex8.htm @@ -0,0 +1,13 @@ +

Example 8

+

+This is a two processor example which solves a similar +problem to the one in Example 2, and Example 6 (The grid +boxes are exactly those in the example diagram in the +struct interface chapter of the User's Manual.) +

+The difference with the previous examples is that we use +three parts, two with a 5-point and one with a 9-point +discretization stencil. The solver is PCG with split-SMG +preconditioner. +

+We recommend comparing this example with Example 2 and Example 6. diff --git a/3rd_party/hypre/src/examples/docs/ex9.htm b/3rd_party/hypre/src/examples/docs/ex9.htm new file mode 100644 index 000000000..2a6f3da84 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ex9.htm @@ -0,0 +1,23 @@ +

Example 9

+

+This code solves a system corresponding to a discretization +of the biharmonic problem treated as a system of equations +on the unit square. Specifically, instead of solving +Delta^2(u) = f with zero boundary conditions for u and +Delta(u), we solve the system A x = b, where +

+

A = [ Delta -I ; 0 Delta], x = [ u ; v] and b = [ 0 ; f] +
+

+The corresponding boundary conditions are u = 0 and v = 0. +

+The domain is split into an N x N processor grid. Thus, the +given number of processors should be a perfect square. +Each processor's piece of the grid has n x n cells with n x n +nodes. We use cell-centered variables, and, therefore, the +nodes are not shared. Note that we have two variables, u and +v, and need only one part to describe the domain. We use the +standard 5-point stencil to discretize the Laplace operators. +The boundary conditions are incorporated as in Example 3. +

+We recommend viewing Examples 3, 6 and 7 before this example. diff --git a/3rd_party/hypre/src/examples/docs/fe.htm b/3rd_party/hypre/src/examples/docs/fe.htm new file mode 100644 index 000000000..be4162cef --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/fe.htm @@ -0,0 +1,17 @@ +Example Codes +

Finite Elements Example Codes

+ +
+ + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/fei.htm b/3rd_party/hypre/src/examples/docs/fei.htm new file mode 100644 index 000000000..1a4e149c8 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/fei.htm @@ -0,0 +1,13 @@ +Example Codes +

FEI Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/flexgmres-amg.htm b/3rd_party/hypre/src/examples/docs/flexgmres-amg.htm new file mode 100644 index 000000000..5ebc9af84 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/flexgmres-amg.htm @@ -0,0 +1,14 @@ +Example Codes +

Flexible GMRES-AMG Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/fortran.htm b/3rd_party/hypre/src/examples/docs/fortran.htm new file mode 100644 index 000000000..91d73d87c --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/fortran.htm @@ -0,0 +1,14 @@ +Example Codes +

Fortran Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-amg.htm b/3rd_party/hypre/src/examples/docs/gmres-amg.htm new file mode 100644 index 000000000..8795117c7 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-amg.htm @@ -0,0 +1,14 @@ +Example Codes +

GMRES-AMG Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-euclid.htm b/3rd_party/hypre/src/examples/docs/gmres-euclid.htm new file mode 100644 index 000000000..3972053a5 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-euclid.htm @@ -0,0 +1,13 @@ +Example Codes +

GMRES-Euclid Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-parasails.htm b/3rd_party/hypre/src/examples/docs/gmres-parasails.htm new file mode 100644 index 000000000..100ff09dc --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-parasails.htm @@ -0,0 +1,13 @@ +Example Codes +

GMRES-ParaSails Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-pfmg.htm b/3rd_party/hypre/src/examples/docs/gmres-pfmg.htm new file mode 100644 index 000000000..0f0ae15f2 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-pfmg.htm @@ -0,0 +1,14 @@ +Example Codes +

GMRES-PFMG Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-smg.htm b/3rd_party/hypre/src/examples/docs/gmres-smg.htm new file mode 100644 index 000000000..66106b5dd --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-smg.htm @@ -0,0 +1,14 @@ +Example Codes +

GMRES-SMG Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/gmres-syspfmg.htm b/3rd_party/hypre/src/examples/docs/gmres-syspfmg.htm new file mode 100644 index 000000000..f66b4f34a --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/gmres-syspfmg.htm @@ -0,0 +1,13 @@ +Example Codes +

GMRES-SysPFMG Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/hypre_wiw.gif b/3rd_party/hypre/src/examples/docs/hypre_wiw.gif new file mode 100644 index 000000000..10196a591 Binary files /dev/null and b/3rd_party/hypre/src/examples/docs/hypre_wiw.gif differ diff --git a/3rd_party/hypre/src/examples/docs/ij.htm b/3rd_party/hypre/src/examples/docs/ij.htm new file mode 100644 index 000000000..39f51bfaf --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/ij.htm @@ -0,0 +1,13 @@ +Example Codes +

IJ Interface Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/index.htm b/3rd_party/hypre/src/examples/docs/index.htm new file mode 100644 index 000000000..0ce58cf18 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/index.htm @@ -0,0 +1,105 @@ +Example Codes + +
+

Hypre example codes

+
+

+Clicking on any of the categories below displays examples +that contain the described feature. Additionally, a comprehensive +list of all examples follows the category lists. +

+

+The numerical results from most of the example codes below can be examined using +the GLVis visualization tool. See +the vis sub-directory for more details. +

+

+Users are encouraged to submit any example codes that they have +created and would like to share. Contact a member of the +hypre team. +

+ +

Interface

+ + +

Equation (PDE)

+ + +

Discretization

+ + +

Solver

+ + +

Programming language

+ + + +

All example codes

+
    +
  • Example 1: the simplest Struct example
  • +
  • Example 2: a simple two processor Struct example
  • +
  • Example 3: a Struct solver for the 5-pt discretization of the 2D Laplace equation
  • +
  • Example 4: several Struct solvers for a variable coefficient 2D Convection-Reaction-Diffusion equation
  • +
  • Example 5: unstructured solvers for the 5-pt discretization of the 2D Laplace equation
  • +
  • Example 5big: a 64-bit version of Example 5
  • +
  • Example 5f: Fortran version of Example 5
  • +
  • Example 6: a simple two processor SStruct example
  • +
  • Example 7: several SStruct solvers for a variable coefficient 2D Convection-Reaction-Diffusion equation
  • +
  • Example 8: two processor SStruct example with multiple parts
  • +
  • Example 9: a SStruct example for the biharmonic problem treated as a system of equations
  • +
  • Example 10: FEI example with bilinear finite elements for the 2D Laplace equation
  • +
  • Example 11: eigensolver for the 5-pt discretization of the 2D Laplace equation
  • +
  • Example 12: nodal version of Example 1
  • +
  • Example 12f: Fortran version of Example 12
  • +
  • Example 13: a SStruct example of bilinear finite elements on a mesh with an "enhanced connectivity" point
  • +
  • Example 14: version of Example 13 using the SStruct FEM input functions (instead of stencils)
  • +
  • Example 15: SStruct solver for a 3D definite Maxwell problem
  • +
  • Example 15big: a 64-bit version of Example 15
  • +
  • Example 16: a SStruct example of a high order Q3 finite element discretization
  • +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/lobpcg-amg.htm b/3rd_party/hypre/src/examples/docs/lobpcg-amg.htm new file mode 100644 index 000000000..a56b9ff05 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/lobpcg-amg.htm @@ -0,0 +1,13 @@ +Example Codes +

LOBPGC-AMG Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-amg.htm b/3rd_party/hypre/src/examples/docs/pcg-amg.htm new file mode 100644 index 000000000..641f2d94c --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-amg.htm @@ -0,0 +1,15 @@ +Example Codes +

PCG-AMG Example Codes

+ +
+ + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-ams.htm b/3rd_party/hypre/src/examples/docs/pcg-ams.htm new file mode 100644 index 000000000..b9dd0af93 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-ams.htm @@ -0,0 +1,13 @@ +Example Codes +

PCG-AMS Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-euclid.htm b/3rd_party/hypre/src/examples/docs/pcg-euclid.htm new file mode 100644 index 000000000..94ba41ef6 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-euclid.htm @@ -0,0 +1,13 @@ +Example Codes +

PCG-Euclid Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-parasails.htm b/3rd_party/hypre/src/examples/docs/pcg-parasails.htm new file mode 100644 index 000000000..542b827a0 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-parasails.htm @@ -0,0 +1,14 @@ +Example Codes +

PCG-ParaSails Example Codes

+ +
+ + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-pfmg.htm b/3rd_party/hypre/src/examples/docs/pcg-pfmg.htm new file mode 100644 index 000000000..a13dc8623 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-pfmg.htm @@ -0,0 +1,15 @@ +Example Codes +

PCG-PFMG Example Codes

+ +
+ + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-smg.htm b/3rd_party/hypre/src/examples/docs/pcg-smg.htm new file mode 100644 index 000000000..d728d14f7 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-smg.htm @@ -0,0 +1,16 @@ +Example Codes +

PCG-SMG Example Codes

+ +
+ + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg-split-smg.htm b/3rd_party/hypre/src/examples/docs/pcg-split-smg.htm new file mode 100644 index 000000000..ea0739ad7 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg-split-smg.htm @@ -0,0 +1,13 @@ +Example Codes +

PCG-split-SMG Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pcg.htm b/3rd_party/hypre/src/examples/docs/pcg.htm new file mode 100644 index 000000000..f632fdc44 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pcg.htm @@ -0,0 +1,16 @@ +Example Codes +

CG Example Codes

+ +
+ + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/pfmg.htm b/3rd_party/hypre/src/examples/docs/pfmg.htm new file mode 100644 index 000000000..ec9276035 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/pfmg.htm @@ -0,0 +1,15 @@ +Example Codes +

PFMG Example Codes

+ +
+ + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/replace-ssi.perl b/3rd_party/hypre/src/examples/docs/replace-ssi.perl new file mode 100755 index 000000000..d738334b0 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/replace-ssi.perl @@ -0,0 +1,32 @@ +#!/usr/bin/perl + +# This replaces the server-side includes. + +use IO::File; + +while (<>) { + &process_or_include($_); +} + +sub process_or_include { + local $_ = shift; + if (/^/) { + &include($1); + } else { + &process($_); + } +} + +sub include { + my $name = shift; + my $F = IO::File->new($name) + or die "Cannot open $name: $!"; + while (<$F>) { + &process_or_include($_); + } +} + +sub process { + my $line = shift; + print "$line"; +} diff --git a/3rd_party/hypre/src/examples/docs/smg.htm b/3rd_party/hypre/src/examples/docs/smg.htm new file mode 100644 index 000000000..05ff12bdb --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/smg.htm @@ -0,0 +1,15 @@ +Example Codes +

SMG Example Codes

+ +
+ + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/sstruct.htm b/3rd_party/hypre/src/examples/docs/sstruct.htm new file mode 100644 index 000000000..7ea585335 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/sstruct.htm @@ -0,0 +1,21 @@ +Example Codes +

SStruct Interface Example Codes

+ +
+ + + + + + + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/struct.htm b/3rd_party/hypre/src/examples/docs/struct.htm new file mode 100644 index 000000000..b7927ffbe --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/struct.htm @@ -0,0 +1,16 @@ +Example Codes +

Struct Interface Example Codes

+ +
+ + + + + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/docs/syspfmg.htm b/3rd_party/hypre/src/examples/docs/syspfmg.htm new file mode 100644 index 000000000..246b1b4a0 --- /dev/null +++ b/3rd_party/hypre/src/examples/docs/syspfmg.htm @@ -0,0 +1,13 @@ +Example Codes +

SysPFMG Example Codes

+ +
+ + + +
+ +
+ +
+ diff --git a/3rd_party/hypre/src/examples/ex1.c b/3rd_party/hypre/src/examples/ex1.c new file mode 100644 index 000000000..e6dc71182 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex1.c @@ -0,0 +1,344 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 1 + + Interface: Structured interface (Struct) + + Compile with: make ex1 (may need to edit HYPRE_DIR in Makefile) + + Sample run: mpirun -np 2 ex1 + + Description: This is a two processor example. Each processor owns one + box in the grid. For reference, the two grid boxes are those + in the example diagram in the struct interface chapter + of the User's Manual. Note that in this example code, we have + used the two boxes shown in the diagram as belonging + to processor 0 (and given one box to each processor). The + solver is PCG with no preconditioner. + + We recommend viewing examples 1-4 sequentially for + a nice overview/tutorial of the struct interface. +*/ + +#include + +/* Struct linear solvers header */ +#include "HYPRE_struct_ls.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i, j, myid, num_procs; + + int vis = 0; + + HYPRE_StructGrid grid; + HYPRE_StructStencil stencil; + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_StructSolver solver; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs != 2) + { + if (myid == 0) printf("Must run with 2 processors!\n"); + MPI_Finalize(); + + return(0); + } + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* 1. Set up a grid. Each processor describes the piece + of the grid that it owns. */ + { + /* Create an empty 2D grid object */ + HYPRE_StructGridCreate(MPI_COMM_WORLD, 2, &grid); + + /* Add boxes to the grid */ + if (myid == 0) + { + int ilower[2]={-3,1}, iupper[2]={-1,2}; + HYPRE_StructGridSetExtents(grid, ilower, iupper); + } + else if (myid == 1) + { + int ilower[2]={0,1}, iupper[2]={2,4}; + HYPRE_StructGridSetExtents(grid, ilower, iupper); + } + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_StructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_StructStencilCreate(2, 5, &stencil); + + /* Define the geometry of the stencil. Each represents a + relative offset (in the index space). */ + { + int entry; + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + /* Assign each of the 5 stencil entries */ + for (entry = 0; entry < 5; entry++) + HYPRE_StructStencilSetElement(stencil, entry, offsets[entry]); + } + } + + /* 3. Set up a Struct Matrix */ + { + /* Create an empty matrix object */ + HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &A); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_StructMatrixInitialize(A); + + /* Set the matrix coefficients. Each processor assigns coefficients + for the boxes in the grid that it owns. Note that the coefficients + associated with each stencil entry may vary from grid point to grid + point if desired. Here, we first set the same stencil entries for + each grid point. Then we make modifications to grid points near + the boundary. */ + if (myid == 0) + { + int ilower[2]={-3,1}, iupper[2]={-1,2}; + int stencil_indices[5] = {0,1,2,3,4}; /* labels for the stencil entries - + these correspond to the offsets + defined above */ + int nentries = 5; + int nvalues = 30; /* 6 grid points, each with 5 stencil entries */ + double values[30]; + + /* We have 6 grid points, each with 5 stencil entries */ + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + } + else if (myid == 1) + { + int ilower[2]={0,1}, iupper[2]={2,4}; + int stencil_indices[5] = {0,1,2,3,4}; + int nentries = 5; + int nvalues = 60; /* 12 grid points, each with 5 stencil entries */ + double values[60]; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + } + + /* Set the coefficients reaching outside of the boundary to 0 */ + if (myid == 0) + { + double values[3]; + for (i = 0; i < 3; i++) + values[i] = 0.0; + { + /* values below our box */ + int ilower[2]={-3,1}, iupper[2]={-1,1}; + int stencil_indices[1] = {3}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + { + /* values to the left of our box */ + int ilower[2]={-3,1}, iupper[2]={-3,2}; + int stencil_indices[1] = {1}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + { + /* values above our box */ + int ilower[2]={-3,2}, iupper[2]={-1,2}; + int stencil_indices[1] = {4}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + } + else if (myid == 1) + { + double values[4]; + for (i = 0; i < 4; i++) + values[i] = 0.0; + { + /* values below our box */ + int ilower[2]={0,1}, iupper[2]={2,1}; + int stencil_indices[1] = {3}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + { + /* values to the right of our box */ + int ilower[2]={2,1}, iupper[2]={2,4}; + int stencil_indices[1] = {2}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + { + /* values above our box */ + int ilower[2]={0,4}, iupper[2]={2,4}; + int stencil_indices[1] = {4}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + { + /* values to the left of our box + (that do not border the other box on proc. 0) */ + int ilower[2]={0,3}, iupper[2]={0,4}; + int stencil_indices[1] = {1}; + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_StructMatrixAssemble(A); + } + + /* 4. Set up Struct Vectors for b and x. Each processor sets the vectors + corresponding to its boxes. */ + { + /* Create an empty vector object */ + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_StructVectorInitialize(b); + HYPRE_StructVectorInitialize(x); + + /* Set the vector coefficients */ + if (myid == 0) + { + int ilower[2]={-3,1}, iupper[2]={-1,2}; + double values[6]; /* 6 grid points */ + + for (i = 0; i < 6; i ++) + values[i] = 1.0; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < 6; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + } + else if (myid == 1) + { + int ilower[2]={0,1}, iupper[2]={2,4}; + double values[12]; /* 12 grid points */ + + for (i = 0; i < 12; i ++) + values[i] = 1.0; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < 12; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + } + + /* This is a collective call finalizing the vector assembly. + The vectors are now ``ready to be used'' */ + HYPRE_StructVectorAssemble(b); + HYPRE_StructVectorAssemble(x); + } + + /* 5. Set up and use a solver (See the Reference Manual for descriptions + of all of the options.) */ + { + /* Create an empty PCG Struct solver */ + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters */ + HYPRE_StructPCGSetTol(solver, 1.0e-06); /* convergence tolerance */ + HYPRE_StructPCGSetPrintLevel(solver, 2); /* amount of info. printed */ + + /* Setup and solve */ + HYPRE_StructPCGSetup(solver, A, b, x); + HYPRE_StructPCGSolve(solver, A, b, x); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex1.sh */ + if (vis) + { + GLVis_PrintStructGrid(grid, "vis/ex1.mesh", myid, NULL, NULL); + GLVis_PrintStructVector(x, "vis/ex1.sol", myid); + GLVis_PrintData("vis/ex1.data", myid, num_procs); + } + + /* Free memory */ + HYPRE_StructGridDestroy(grid); + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + HYPRE_StructPCGDestroy(solver); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex10.cxx b/3rd_party/hypre/src/examples/ex10.cxx new file mode 100644 index 000000000..df16a63db --- /dev/null +++ b/3rd_party/hypre/src/examples/ex10.cxx @@ -0,0 +1,540 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 10 + + Interface: Finite Element Interface (FEI) + + Compile with: make ex10 + + Sample run: mpirun -np 4 ex10 -n 120 -solver 2 + + To see options: ex10 -help + + Description: This code solves a system corresponding to a discretization + of the Laplace equation -Delta u = 1 with zero boundary + conditions on the unit square. The domain is split into + a n x n grid of quadrilateral elements and each processors + owns a horizontal strip of size m x n, where m = n/nprocs. We + use bilinear finite element discretization, so there are + nodes (vertices) that are shared between neighboring + processors. The Finite Element Interface is used to assemble + the matrix and solve the problem. Nine different solvers are + available. +*/ + +#include +#include +#include +#include "_hypre_utilities.h" +#include "LLNL_FEI_Impl.h" + +using namespace std; + +#include "vis.c" + +int main(int argc, char *argv[]) +{ + int i, j, k; + + int nprocs, mypid; + + int n, m, offset; + double h; + + int solverID; + int vis; + + // Initialize MPI + MPI_Init(&argc, &argv); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + MPI_Comm_rank(MPI_COMM_WORLD, &mypid); + + // Set default parameters + n = 4*nprocs; + solverID = 2; + vis = 0; + + // Parse command line + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solverID = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (mypid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: %d)\n", 4*nprocs); + printf(" -solver : solver ID\n"); + printf(" 0 - DS-PCG\n"); + printf(" 1 - ParaSails-PCG\n"); + printf(" 2 - AMG-PCG (default)\n"); + printf(" 3 - AMGSA-PCG\n"); + printf(" 4 - Euclid-PCG\n"); + printf(" 5 - DS-GMRES\n"); + printf(" 6 - AMG-GMRES\n"); + printf(" 7 - AMGSA-GMRES\n"); + printf(" 8 - Euclid-GMRES\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + // Each processor owns a m x n grid of quadrilateral finite elements. + // The unknowns are located in the nodes (vertices of the mesh) and + // are numbered globally starting from the lower left corner and moving + // row-wise to the upper right corner. + m = n / nprocs; + offset = mypid*(m*(n+1)); + + h = 1.0 / n; // mesh size + + // 1. FEI initialization phase + + // Instantiate the FEI object + LLNL_FEI_Impl *feiPtr = new LLNL_FEI_Impl(MPI_COMM_WORLD); + + // Set the matrix storage type to HYPRE + { + char **paramStrings = new char*[1]; + paramStrings[0] = new char[100]; + strcpy(paramStrings[0], "externalSolver HYPRE"); + feiPtr->parameters(1, paramStrings); + delete [] paramStrings[0]; + delete [] paramStrings; + } + + // The unknowns in FEI are called fields. Each field has an + // identifier (fieldID) and rank (fieldSize). + int nFields = 1; + int *fieldSizes = new int[nFields]; fieldSizes[0] = 1; + int *fieldIDs = new int[nFields]; fieldIDs[0] = 0; + + // Pass the field information to the FEI + feiPtr->initFields(nFields, fieldSizes, fieldIDs); + + // Elements are grouped into blocks (in this case one block), and we + // have to describe the number of elements in the block (nElems) as + // well as the fields (unknowns) per element. + int elemBlkID = 0; + int nElems = m*n; + int elemNNodes = 4; // number of (shared) nodes per element + int *nodeNFields = new int[elemNNodes]; // fields per node + int **nodeFieldIDs = new int*[elemNNodes]; // node-fields IDs + int elemNFields = 0; // number of (non-shared) fields per element + int *elemFieldIDs = NULL; // element-fields IDs + for (i = 0; i < elemNNodes; i++) + { + nodeNFields[i] = 1; + nodeFieldIDs[i] = new int[nodeNFields[i]]; + nodeFieldIDs[i][0] = fieldIDs[0]; + } + + // Pass the block information to the FEI. The interleave parameter + // controls how different fields are ordered in the element matrices. + int interleave = 0; + feiPtr->initElemBlock(elemBlkID, nElems, elemNNodes, nodeNFields, + nodeFieldIDs, elemNFields, elemFieldIDs, interleave); + + // List the global indexes (IDs) of the nodes in each element + int **elemConn = new int*[nElems]; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + { + elemConn[i*n+j] = new int[elemNNodes]; // element with coordinates (i,j) + elemConn[i*n+j][0] = offset + i*(n+1)+j; // node in the lower left + elemConn[i*n+j][1] = elemConn[i*n+j][0]+1; // node in the lower right + elemConn[i*n+j][2] = elemConn[i*n+j][1]+n+1; // node in the upper right + elemConn[i*n+j][3] = elemConn[i*n+j][2]-1; // node in the upper left + } + + // Pass the element topology information to the FEI + for (i = 0; i < nElems; i++) + feiPtr->initElem(elemBlkID, i, elemConn[i]); + + // List the global indexes of nodes that are shared between processors + int nShared, *SharedIDs, *SharedLengs, **SharedProcs; + if (mypid == 0) + { + // Nodes in the top row are shared + nShared = n+1; + SharedIDs = new int[nShared]; + for (i = 0; i < nShared; i++) + SharedIDs[i] = offset + m*(n+1) + i; + SharedLengs = new int[nShared]; + for (i = 0; i < nShared; i++) + SharedLengs[i] = 2; + SharedProcs = new int*[nShared]; + for (i = 0; i < nShared; i++) + { + SharedProcs[i] = new int[SharedLengs[i]]; + SharedProcs[i][0] = mypid; + SharedProcs[i][1] = mypid+1; + } + } + else if (mypid == nprocs-1) + { + // Nodes in the bottom row are shared + nShared = n+1; + SharedIDs = new int[nShared]; + for (i = 0; i < nShared; i++) + SharedIDs[i] = offset + i; + SharedLengs = new int[nShared]; + for (i = 0; i < nShared; i++) + SharedLengs[i] = 2; + SharedProcs = new int*[nShared]; + for (i = 0; i < nShared; i++) + { + SharedProcs[i] = new int[SharedLengs[i]]; + SharedProcs[i][0] = mypid-1; + SharedProcs[i][1] = mypid; + } + } + else + { + // Nodes in the top and bottom rows are shared + nShared = 2*(n+1); + SharedIDs = new int[nShared]; + for (i = 0; i < n+1; i++) + { + SharedIDs[i] = offset + i; + SharedIDs[n+1+i] = offset + m*(n+1) + i; + } + SharedLengs = new int[nShared]; + for (i = 0; i < nShared; i++) + SharedLengs[i] = 2; + SharedProcs = new int*[nShared]; + for (i = 0; i < n+1; i++) + { + SharedProcs[i] = new int[SharedLengs[i]]; + SharedProcs[i][0] = mypid-1; + SharedProcs[i][1] = mypid; + + SharedProcs[n+1+i] = new int[SharedLengs[n+1+i]]; + SharedProcs[n+1+i][0] = mypid; + SharedProcs[n+1+i][1] = mypid+1; + } + } + + // Pass the shared nodes information to the FEI + if (nprocs != 1 && nShared > 0) + feiPtr->initSharedNodes(nShared, SharedIDs, SharedLengs, SharedProcs); + + // Finish the FEI initialization phase + feiPtr->initComplete(); + + // 2. FEI load phase + + // Specify the boundary conditions + int nBCs, *BCEqn; + double **alpha, **beta, **gamma; + if (mypid == 0) + { + // Nodes in the bottom row and left and right columns + nBCs = n+1 + 2*m; + BCEqn = new int[nBCs]; + for (i = 0; i < n+1; i++) + BCEqn[i] = offset + i; + for (i = 0; i < m; i++) + { + BCEqn[n+1+2*i] = offset + (i+1)*(n+1); + BCEqn[n+2+2*i] = offset + (i+1)*(n+1)+n; + } + } + else if (mypid == nprocs-1) + { + // Nodes in the top row and left and right columns + nBCs = n+1 + 2*m; + BCEqn = new int[nBCs]; + for (i = 0; i < n+1; i++) + BCEqn[i] = offset + m*(n+1) + i; + for (i = 0; i < m; i++) + { + BCEqn[n+1+2*i] = offset + i*(n+1); + BCEqn[n+2+2*i] = offset + i*(n+1)+n; + } + } + else + { + // Nodes in the left and right columns + nBCs = 2*(m+1); + BCEqn = new int[nBCs]; + for (i = 0; i < m+1; i++) + { + BCEqn[2*i] = offset + i*(n+1); + BCEqn[2*i+1] = offset + i*(n+1)+n; + } + } + + // The arrays alpha, beta and gamma specify the type of boundary + // condition (essential, natural, mixed). The most general form + // for Laplace problems is alpha U + beta dU/dn = gamma. In this + // example we impose zero Dirichlet boundary conditions. + alpha = new double*[nBCs]; + beta = new double*[nBCs]; + gamma = new double*[nBCs]; + for (i = 0; i < nBCs; i++) + { + alpha[i] = new double[1]; alpha[i][0] = 1.0; + beta[i] = new double[1]; beta[i][0] = 0.0; + gamma[i] = new double[1]; gamma[i][0] = 0.0; + } + + // Pass the boundary condition information to the FEI + feiPtr->loadNodeBCs(nBCs, BCEqn, fieldIDs[0], alpha, beta, gamma); + + // Specify element stiffness matrices + double ***elemStiff = new double**[nElems]; + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + { + // Element with coordinates (i,j) + elemStiff[i*n+j] = new double*[elemNNodes]; + for (k = 0; k < elemNNodes; k++) + elemStiff[i*n+j][k] = new double[elemNNodes]; + + // Stiffness matrix for the reference square + // 3 +---+ 2 + // | | + // 0 +---+ 1 + + double **A = elemStiff[i*n+j]; + + for (k = 0; k < 4; k++) + A[k][k] = 2/3.; + + A[0][1] = A[1][0] = -1/6.; + A[0][2] = A[2][0] = -1/3.; + A[0][3] = A[3][0] = -1/6.; + A[1][2] = A[2][1] = -1/6.; + A[1][3] = A[3][1] = -1/3.; + A[2][3] = A[3][2] = -1/6.; + } + + // Specify element load vectors + double *elemLoad = new double[nElems*elemNNodes]; + for (i = 0; i < nElems*elemNNodes; i++) + elemLoad[i] = h*h/4; + + // Assemble the matrix. The elemFormat parameter describes + // the storage (symmetric/non-symmetric, row/column-wise) + // of the element stiffness matrices. + int elemFormat = 0; + for (i = 0; i < nElems; i++) + feiPtr->sumInElem(elemBlkID, i, elemConn[i], elemStiff[i], + &(elemLoad[i*elemNNodes]), elemFormat); + + // Finish the FEI load phase + feiPtr->loadComplete(); + + // Clean up + for (i = 0; i < nElems; i++) delete [] elemConn[i]; + delete [] elemConn; + for (i = 0; i < nElems; i++) + { + for (j = 0; j < elemNNodes; j++) delete [] elemStiff[i][j]; + delete [] elemStiff[i]; + } + delete [] elemStiff; + delete [] elemLoad; + + delete [] BCEqn; + for (i = 0; i < nBCs; i++) + { + delete [] alpha[i]; + delete [] beta[i]; + delete [] gamma[i]; + } + delete [] alpha; + delete [] beta; + delete [] gamma; + + if (nShared > 0) + { + delete [] SharedIDs; + delete [] SharedLengs; + for (i = 0; i < nShared; i++) delete [] SharedProcs[i]; + delete [] SharedProcs; + } + + delete [] nodeNFields; + for (i = 0; i < elemNNodes; i++) delete [] nodeFieldIDs[i]; + delete [] nodeFieldIDs; + + delete [] fieldSizes; + delete [] fieldIDs; + + // 3. Set up problem parameters and pass them to the FEI + { + int nParams = 19; + char **paramStrings = new char*[nParams]; + for (i = 0; i < nParams; i++) + paramStrings[i] = new char[100]; + + strcpy(paramStrings[0], "outputLevel 2"); + switch(solverID) + { + case 0: + strcpy(paramStrings[1], "solver cg"); + strcpy(paramStrings[2], "preconditioner diagonal"); + break; + case 1: + strcpy(paramStrings[1], "solver cg"); + strcpy(paramStrings[2], "preconditioner parasails"); + break; + default: + case 2: + strcpy(paramStrings[1], "solver cg"); + strcpy(paramStrings[2], "preconditioner boomeramg"); + break; + case 3: + strcpy(paramStrings[1], "solver cg"); + strcpy(paramStrings[2], "preconditioner mli"); + break; + case 4: + strcpy(paramStrings[1], "solver cg"); + strcpy(paramStrings[2], "preconditioner euclid"); + break; + case 5: + strcpy(paramStrings[1], "solver gmres"); + strcpy(paramStrings[2], "preconditioner diagonal"); + break; + case 6: + strcpy(paramStrings[1], "solver gmres"); + strcpy(paramStrings[2], "preconditioner boomeramg"); + break; + case 7: + strcpy(paramStrings[1], "solver gmres"); + strcpy(paramStrings[2], "preconditioner mli"); + break; + case 8: + strcpy(paramStrings[1], "solver gmres"); + strcpy(paramStrings[2], "preconditioner euclid"); + break; + } + strcpy(paramStrings[3], "maxIterations 100"); + strcpy(paramStrings[4], "tolerance 1e-6"); + strcpy(paramStrings[5], "gmresDim 30"); + strcpy(paramStrings[6], "amgNumSweeps 1"); + strcpy(paramStrings[7], "amgCoarsenType hmis"); + strcpy(paramStrings[8], "amgRelaxType hybridsym"); + strcpy(paramStrings[9], "amgSystemSize 1"); + strcpy(paramStrings[10], "amgStrongThreshold 0.25"); + strcpy(paramStrings[11], "MLI smoother HSGS"); + strcpy(paramStrings[12], "MLI numSweeps 1"); + strcpy(paramStrings[13], "MLI smootherWeight 1.0"); + strcpy(paramStrings[14], "MLI nodeDOF 1"); + strcpy(paramStrings[15], "MLI nullSpaceDim 1"); + strcpy(paramStrings[16], "MLI minCoarseSize 50"); + strcpy(paramStrings[17], "MLI outputLevel 0"); + strcpy(paramStrings[18], "parasailsSymmetric outputLevel 0"); + + feiPtr->parameters(nParams, paramStrings); + + for (i = 0; i < nParams; i++) + delete [] paramStrings[i]; + delete [] paramStrings; + } + + // 4. Solve the system + int status; + feiPtr->solve(&status); + + // 5. Save the solution for GLVis visualization, see vis/glvis-ex10.sh + if (vis) + { + int numNodes, *nodeIDList, *solnOffsets; + double *solnValues; + + // Get the number of nodes in the element block + feiPtr->getNumBlockActNodes(elemBlkID, &numNodes); + + // Get their global IDs + nodeIDList = new int[numNodes]; + feiPtr->getBlockNodeIDList(elemBlkID, numNodes, nodeIDList); + + // Get the values corresponding to nodeIDList + solnOffsets = new int[numNodes]; + solnValues = new double[numNodes]; + feiPtr->getBlockNodeSolution(elemBlkID, numNodes, nodeIDList, + solnOffsets, solnValues); + + // Find the location of the ith local node + for (i = 0; i < numNodes; i++) + solnOffsets[nodeIDList[i]-offset] = i; + + // Save the ordered nodal values to a file + char sol_out[20]; + sprintf(sol_out, "%s.%06d", "vis/ex10.sol", mypid); + ofstream sol(sol_out); + sol << "FiniteElementSpace\n" + << "FiniteElementCollection: H1_2D_P1\n" + << "VDim: 1\n" + << "Ordering: 0\n\n"; + for (i = 0; i < numNodes; i++) + sol << solnValues[solnOffsets[i]] << endl; + + // Save local finite element mesh + GLVis_PrintLocalSquareMesh("vis/ex10.mesh", n, m, h, 0, mypid*h*m, mypid); + + // additional visualization data + if (mypid == 0) + { + char data_out[20]; + sprintf(data_out, "%s", "vis/ex10.data"); + ofstream data(data_out); + data << "np " << nprocs << endl; + } + + // Clean up + delete [] solnValues; + delete [] solnOffsets; + delete [] nodeIDList; + } + delete feiPtr; + + // Finalize MPI + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex11.c b/3rd_party/hypre/src/examples/ex11.c new file mode 100644 index 000000000..b38459109 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex11.c @@ -0,0 +1,357 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 11 + + Interface: Linear-Algebraic (IJ) + + Compile with: make ex11 + + Sample run: mpirun -np 4 ex11 + + Description: This example solves the 2-D Laplacian eigenvalue + problem with zero boundary conditions on an nxn grid. + The number of unknowns is N=n^2. The standard 5-point + stencil is used, and we solve for the interior nodes + only. + + We use the same matrix as in Examples 3 and 5. + The eigensolver is LOBPCG with AMG preconditioner. +*/ + +#include +#include "_hypre_utilities.h" +#include "krylov.h" +#include "HYPRE.h" +#include "HYPRE_parcsr_ls.h" + +/* lobpcg stuff */ +#include "HYPRE_lobpcg.h" +#include "interpreter.h" +#include "HYPRE_MatvecFunctions.h" +#include "temp_multivector.h" +#include "_hypre_parcsr_mv.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i; + int myid, num_procs; + int N, n; + int blockSize; + + int ilower, iupper; + int local_size, extra; + + int vis; + + HYPRE_IJMatrix A; + HYPRE_ParCSRMatrix parcsr_A; + HYPRE_IJVector b; + HYPRE_ParVector par_b; + HYPRE_IJVector x; + HYPRE_ParVector par_x; + HYPRE_ParVector* pvx; + + HYPRE_Solver precond, lobpcg_solver; + mv_InterfaceInterpreter* interpreter; + HYPRE_MatvecFunctions matvec_fn; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Default problem parameters */ + n = 33; + blockSize = 10; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-blockSize") == 0 ) + { + arg_index++; + blockSize = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size in each direction (default: 33)\n"); + printf(" -blockSize : eigenproblem block size (default: 10)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Preliminaries: want at least one processor per row */ + if (n*n < num_procs) n = sqrt(num_procs) + 1; + N = n*n; /* global number of rows */ + + /* Each processor knows only of its own rows - the range is denoted by ilower + and iupper. Here we partition the rows. We account for the fact that + N may not divide evenly by the number of processors. */ + local_size = N/num_procs; + extra = N - local_size*num_procs; + + ilower = local_size*myid; + ilower += hypre_min(myid, extra); + + iupper = local_size*(myid+1); + iupper += hypre_min(myid+1, extra); + iupper = iupper - 1; + + /* How many rows do I have? */ + local_size = iupper - ilower + 1; + + /* Create the matrix. + Note that this is a square matrix, so we indicate the row partition + size twice (since number of rows = number of cols) */ + HYPRE_IJMatrixCreate(MPI_COMM_WORLD, ilower, iupper, ilower, iupper, &A); + + /* Choose a parallel csr format storage (see the User's Manual) */ + HYPRE_IJMatrixSetObjectType(A, HYPRE_PARCSR); + + /* Initialize before setting coefficients */ + HYPRE_IJMatrixInitialize(A); + + /* Now go through my local rows and set the matrix entries. + Each row has at most 5 entries. For example, if n=3: + + A = [M -I 0; -I M -I; 0 -I M] + M = [4 -1 0; -1 4 -1; 0 -1 4] + + Note that here we are setting one row at a time, though + one could set all the rows together (see the User's Manual). + */ + { + int nnz; + double values[5]; + int cols[5]; + + for (i = ilower; i <= iupper; i++) + { + nnz = 0; + + /* The left identity block:position i-n */ + if ((i-n)>=0) + { + cols[nnz] = i-n; + values[nnz] = -1.0; + nnz++; + } + + /* The left -1: position i-1 */ + if (i%n) + { + cols[nnz] = i-1; + values[nnz] = -1.0; + nnz++; + } + + /* Set the diagonal: position i */ + cols[nnz] = i; + values[nnz] = 4.0; + nnz++; + + /* The right -1: position i+1 */ + if ((i+1)%n) + { + cols[nnz] = i+1; + values[nnz] = -1.0; + nnz++; + } + + /* The right identity block:position i+n */ + if ((i+n)< N) + { + cols[nnz] = i+n; + values[nnz] = -1.0; + nnz++; + } + + /* Set the values for row i */ + HYPRE_IJMatrixSetValues(A, 1, &nnz, &i, cols, values); + } + } + + /* Assemble after setting the coefficients */ + HYPRE_IJMatrixAssemble(A); + /* Get the parcsr matrix object to use */ + HYPRE_IJMatrixGetObject(A, (void**) &parcsr_A); + + /* Create sample rhs and solution vectors */ + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&b); + HYPRE_IJVectorSetObjectType(b, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(b); + HYPRE_IJVectorAssemble(b); + HYPRE_IJVectorGetObject(b, (void **) &par_b); + + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&x); + HYPRE_IJVectorSetObjectType(x, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(x); + HYPRE_IJVectorAssemble(x); + HYPRE_IJVectorGetObject(x, (void **) &par_x); + + /* Create a preconditioner and solve the eigenproblem */ + + /* AMG preconditioner */ + { + HYPRE_BoomerAMGCreate(&precond); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetNumSweeps(precond, 2); /* 2 sweeps of smoothing */ + HYPRE_BoomerAMGSetTol(precond, 0.0); /* conv. tolerance zero */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); /* do only one iteration! */ + } + + /* LOBPCG eigensolver */ + { + int time_index; + + int maxIterations = 100; /* maximum number of iterations */ + int pcgMode = 1; /* use rhs as initial guess for inner pcg iterations */ + int verbosity = 1; /* print iterations info */ + double tol = 1.e-8; /* absolute tolerance (all eigenvalues) */ + int lobpcgSeed = 775; /* random seed */ + + mv_MultiVectorPtr eigenvectors = NULL; + mv_MultiVectorPtr constraints = NULL; + double *eigenvalues = NULL; + + if (myid != 0) + verbosity = 0; + + /* define an interpreter for the ParCSR interface */ + interpreter = hypre_CTAlloc(mv_InterfaceInterpreter, 1, HYPRE_MEMORY_HOST); + HYPRE_ParCSRSetupInterpreter(interpreter); + HYPRE_ParCSRSetupMatvec(&matvec_fn); + + /* eigenvectors - create a multivector */ + eigenvectors = + mv_MultiVectorCreateFromSampleVector(interpreter, blockSize, par_x); + mv_MultiVectorSetRandom (eigenvectors, lobpcgSeed); + + /* eigenvectors - get a pointer */ + { + mv_TempMultiVector* tmp = (mv_TempMultiVector*) mv_MultiVectorGetData(eigenvectors); + pvx = (HYPRE_ParVector*)(tmp -> vector); + } + + /* eigenvalues - allocate space */ + eigenvalues = (double*) calloc( blockSize, sizeof(double) ); + + HYPRE_LOBPCGCreate(interpreter, &matvec_fn, &lobpcg_solver); + HYPRE_LOBPCGSetMaxIter(lobpcg_solver, maxIterations); + HYPRE_LOBPCGSetPrecondUsageMode(lobpcg_solver, pcgMode); + HYPRE_LOBPCGSetTol(lobpcg_solver, tol); + HYPRE_LOBPCGSetPrintLevel(lobpcg_solver, verbosity); + + /* use a preconditioner */ + HYPRE_LOBPCGSetPrecond(lobpcg_solver, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSolve, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup, + precond); + + HYPRE_LOBPCGSetup(lobpcg_solver, (HYPRE_Matrix)parcsr_A, + (HYPRE_Vector)par_b, (HYPRE_Vector)par_x); + + time_index = hypre_InitializeTiming("LOBPCG Solve"); + hypre_BeginTiming(time_index); + + HYPRE_LOBPCGSolve(lobpcg_solver, constraints, eigenvectors, eigenvalues ); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* clean-up */ + HYPRE_BoomerAMGDestroy(precond); + HYPRE_LOBPCGDestroy(lobpcg_solver); + hypre_TFree(eigenvalues, HYPRE_MEMORY_HOST); + hypre_TFree(interpreter, HYPRE_MEMORY_HOST); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex11.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int nvalues = local_size; + double *values; + + /* get the local solution */ + values = hypre_VectorData(hypre_ParVectorLocalVector( + (hypre_ParVector*)pvx[blockSize-1])); + + sprintf(filename, "%s.%06d", "vis/ex11.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution */ + for (i = 0; i < nvalues; i++) + fprintf(file, "%.14e\n", values[i]); + + fflush(file); + fclose(file); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex11.mesh", n-1); + } + + /* Clean up */ + HYPRE_IJMatrixDestroy(A); + HYPRE_IJVectorDestroy(b); + HYPRE_IJVectorDestroy(x); + + /* Finalize MPI*/ + MPI_Finalize(); + + return(0); +} diff --git a/3rd_party/hypre/src/examples/ex12.c b/3rd_party/hypre/src/examples/ex12.c new file mode 100644 index 000000000..679c092e4 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex12.c @@ -0,0 +1,496 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 12 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex12 (may need to edit HYPRE_DIR in Makefile) + + Sample runs: mpirun -np 2 ex12 -pfmg + mpirun -np 2 ex12 -boomeramg + + Description: The grid layout is the same as ex1, but with nodal unknowns. The + solver is PCG preconditioned with either PFMG or BoomerAMG, + selected on the command line. + + We recommend viewing the Struct examples before viewing this + and the other SStruct examples. This is one of the simplest + SStruct examples, used primarily to demonstrate how to set up + non-cell-centered problems, and to demonstrate how easy it is + to switch between structured solvers (PFMG) and solvers + designed for more general settings (AMG). +*/ + +#include +#include +#include + +#include "HYPRE_sstruct_ls.h" +#include "HYPRE_parcsr_ls.h" +#include "HYPRE_krylov.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i, j, myid, num_procs; + + int vis = 0; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructStencil stencil; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + /* We only have one part and one variable */ + int nparts = 1; + int nvars = 1; + int part = 0; + int var = 0; + + int precond_id = 1; + int object_type = HYPRE_STRUCT; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs != 2) + { + if (myid == 0) printf("Must run with 2 processors!\n"); + exit(1); + } + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-pfmg") == 0 ) + { + arg_index++; + precond_id = 1; + object_type = HYPRE_STRUCT; + } + else if ( strcmp(argv[arg_index], "-boomeramg") == 0 ) + { + arg_index++; + precond_id = 2; + object_type = HYPRE_PARCSR; + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -pfmg : use the structured PFMG solver (default)\n"); + printf(" -boomeramg : use the unstructured BoomerAMG solver\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* 1. Set up the grid. Here we use only one part. Each processor describes + the piece of the grid that it owns. */ + { + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, 2, nparts, &grid); + + /* Add boxes to the grid */ + if (myid == 0) + { + int ilower[2]={-3,1}, iupper[2]={-1,2}; + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + else if (myid == 1) + { + int ilower[2]={0,1}, iupper[2]={2,4}; + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each part. */ + { + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_NODE}; + + HYPRE_SStructGridSetVariables(grid, part, nvars, vartypes); + } + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_SStructStencilCreate(2, 5, &stencil); + + /* Define the geometry of the stencil. Each represents a relative offset + (in the index space). */ + { + int entry; + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + /* Assign numerical values to the offsets so that we can easily refer + to them - the last argument indicates the variable for which we are + assigning this stencil */ + for (entry = 0; entry < 5; entry++) + HYPRE_SStructStencilSetEntry(stencil, entry, offsets[entry], var); + } + } + + /* 3. Set up the Graph - this determines the non-zero structure of the matrix + and allows non-stencil relationships between the parts */ + { + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Now we need to tell the graph which stencil to use for each variable on + each part (we only have one variable and one part) */ + HYPRE_SStructGraphSetStencil(graph, part, var, stencil); + + /* Here we could establish connections between parts if we had more than + one part using the graph. For example, we could use + HYPRE_GraphAddEntries() routine or HYPRE_GridSetNeighborPart() */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up a SStruct Matrix */ + { + /* Create an empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set the object type (by default HYPRE_SSTRUCT). This determines the + data structure used to store the matrix. For PFMG we need to use + HYPRE_STRUCT, and for BoomerAMG we need HYPRE_PARCSR (set above). */ + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Get ready to set values */ + HYPRE_SStructMatrixInitialize(A); + + /* Set the matrix coefficients. Each processor assigns coefficients for + the boxes in the grid that it owns. Note that the coefficients + associated with each stencil entry may vary from grid point to grid + point if desired. Here, we first set the same stencil entries for each + grid point. Then we make modifications to grid points near the + boundary. Note that the ilower values are different from those used in + ex1 because of the way nodal variables are referenced. Also note that + some of the stencil values are set on both processor 0 and processor 1. + See the User and Reference manuals for more details. */ + if (myid == 0) + { + int ilower[2]={-4,0}, iupper[2]={-1,2}; + int stencil_indices[5] = {0,1,2,3,4}; /* labels for the stencil entries - + these correspond to the offsets + defined above */ + int nentries = 5; + int nvalues = 60; /* 12 grid points, each with 5 stencil entries */ + double values[60]; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, nentries, + stencil_indices, values); + } + else if (myid == 1) + { + int ilower[2]={-1,0}, iupper[2]={2,4}; + int stencil_indices[5] = {0,1,2,3,4}; + int nentries = 5; + int nvalues = 100; /* 20 grid points, each with 5 stencil entries */ + double values[100]; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, nentries, + stencil_indices, values); + } + + /* Set the coefficients reaching outside of the boundary to 0. Note that + * both ilower *and* iupper may be different from those in ex1. */ + if (myid == 0) + { + double values[4]; + for (i = 0; i < 4; i++) + values[i] = 0.0; + { + /* values below our box */ + int ilower[2]={-4,0}, iupper[2]={-1,0}; + int stencil_indices[1] = {3}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + { + /* values to the left of our box */ + int ilower[2]={-4,0}, iupper[2]={-4,2}; + int stencil_indices[1] = {1}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + { + /* values above our box */ + int ilower[2]={-4,2}, iupper[2]={-2,2}; + int stencil_indices[1] = {4}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + } + else if (myid == 1) + { + double values[5]; + for (i = 0; i < 5; i++) + values[i] = 0.0; + { + /* values below our box */ + int ilower[2]={-1,0}, iupper[2]={2,0}; + int stencil_indices[1] = {3}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + { + /* values to the right of our box */ + int ilower[2]={2,0}, iupper[2]={2,4}; + int stencil_indices[1] = {2}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + { + /* values above our box */ + int ilower[2]={-1,4}, iupper[2]={2,4}; + int stencil_indices[1] = {4}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + { + /* values to the left of our box + (that do not border the other box on proc. 0) */ + int ilower[2]={-1,3}, iupper[2]={-1,4}; + int stencil_indices[1] = {1}; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var, 1, + stencil_indices, values); + } + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_SStructMatrixAssemble(A); + } + + /* 5. Set up SStruct Vectors for b and x. */ + { + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* As with the matrix, set the appropriate object type for the vectors */ + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + /* Set the vector coefficients. Again, note that the ilower values are + different from those used in ex1, and some of the values are set on + both processors. */ + if (myid == 0) + { + int ilower[2]={-4,0}, iupper[2]={-1,2}; + double values[12]; /* 12 grid points */ + + for (i = 0; i < 12; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < 12; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + else if (myid == 1) + { + int ilower[2]={0,1}, iupper[2]={2,4}; + double values[20]; /* 20 grid points */ + + for (i = 0; i < 20; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < 20; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + /* This is a collective call finalizing the vector assembly. + The vectors are now ``ready to be used'' */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + + /* 6. Set up and use a solver (See the Reference Manual for descriptions + of all of the options.) */ + if (precond_id == 1) /* PFMG */ + { + HYPRE_StructMatrix sA; + HYPRE_StructVector sb; + HYPRE_StructVector sx; + + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + /* Because we are using a struct solver, we need to get the + object of the matrix and vectors to pass in to the struct solvers */ + HYPRE_SStructMatrixGetObject(A, (void **) &sA); + HYPRE_SStructVectorGetObject(b, (void **) &sb); + HYPRE_SStructVectorGetObject(x, (void **) &sx); + + /* Create an empty PCG Struct solver */ + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set PCG parameters */ + HYPRE_StructPCGSetTol(solver, 1.0e-06); + HYPRE_StructPCGSetPrintLevel(solver, 2); + HYPRE_StructPCGSetMaxIter(solver, 50); + + /* Create the Struct PFMG solver for use as a preconditioner */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &precond); + + /* Set PFMG parameters */ + HYPRE_StructPFMGSetMaxIter(precond, 1); + HYPRE_StructPFMGSetTol(precond, 0.0); + HYPRE_StructPFMGSetZeroGuess(precond); + HYPRE_StructPFMGSetNumPreRelax(precond, 2); + HYPRE_StructPFMGSetNumPostRelax(precond, 2); + /* non-Galerkin coarse grid (more efficient for this problem) */ + HYPRE_StructPFMGSetRAPType(precond, 1); + /* R/B Gauss-Seidel */ + HYPRE_StructPFMGSetRelaxType(precond, 2); + /* skip relaxation on some levels (more efficient for this problem) */ + HYPRE_StructPFMGSetSkipRelax(precond, 1); + + + /* Set preconditioner and solve */ + HYPRE_StructPCGSetPrecond(solver, HYPRE_StructPFMGSolve, + HYPRE_StructPFMGSetup, precond); + HYPRE_StructPCGSetup(solver, sA, sb, sx); + HYPRE_StructPCGSolve(solver, sA, sb, sx); + + /* Free memory */ + HYPRE_StructPCGDestroy(solver); + HYPRE_StructPFMGDestroy(precond); + } + else if (precond_id == 2) /* BoomerAMG */ + { + HYPRE_ParCSRMatrix parA; + HYPRE_ParVector parb; + HYPRE_ParVector parx; + + HYPRE_Solver solver; + HYPRE_Solver precond; + + /* Because we are using a struct solver, we need to get the + object of the matrix and vectors to pass in to the struct solvers */ + HYPRE_SStructMatrixGetObject(A, (void **) &parA); + HYPRE_SStructVectorGetObject(b, (void **) &parb); + HYPRE_SStructVectorGetObject(x, (void **) &parx); + + /* Create an empty PCG Struct solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set PCG parameters */ + HYPRE_ParCSRPCGSetTol(solver, 1.0e-06); + HYPRE_ParCSRPCGSetPrintLevel(solver, 2); + HYPRE_ParCSRPCGSetMaxIter(solver, 50); + + /* Create the BoomerAMG solver for use as a preconditioner */ + HYPRE_BoomerAMGCreate(&precond); + + /* Set BoomerAMG parameters */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); + HYPRE_BoomerAMGSetTol(precond, 0.0); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetRelaxType(precond, 6); /* Sym G.S./Jacobi hybrid */ + HYPRE_BoomerAMGSetNumSweeps(precond, 1); + + /* Set preconditioner and solve */ + HYPRE_ParCSRPCGSetPrecond(solver, HYPRE_BoomerAMGSolve, + HYPRE_BoomerAMGSetup, precond); + HYPRE_ParCSRPCGSetup(solver, parA, parb, parx); + HYPRE_ParCSRPCGSolve(solver, parA, parb, parx); + + /* Free memory */ + HYPRE_ParCSRPCGDestroy(solver); + HYPRE_BoomerAMGDestroy(precond); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex12.sh */ + if (vis) + { + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + GLVis_PrintSStructGrid(grid, "vis/ex12.mesh", myid, NULL, NULL); + GLVis_PrintSStructVector(x, 0, "vis/ex12.sol", myid); + GLVis_PrintData("vis/ex12.data", myid, num_procs); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex12f.f b/3rd_party/hypre/src/examples/ex12f.f new file mode 100644 index 000000000..99da97a10 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex12f.f @@ -0,0 +1,485 @@ +! Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +! HYPRE Project Developers. See the top-level COPYRIGHT file for details. +! +! SPDX-License-Identifier: (Apache-2.0 OR MIT) + +! +! Example 12 +! +! Interface: Semi-Structured interface (SStruct) +! +! Compile with: make ex12f (may need to edit HYPRE_DIR in Makefile) +! +! Sample runs: mpirun -np 2 ex12f +! +! Description: The grid layout is the same as ex1, but with nodal +! unknowns. The solver is PCG preconditioned with either PFMG or +! BoomerAMG, set with 'precond_id' below. +! +! We recommend viewing the Struct examples before viewing this and +! the other SStruct examples. This is one of the simplest SStruct +! examples, used primarily to demonstrate how to set up +! non-cell-centered problems, and to demonstrate how easy it is to +! switch between structured solvers (PFMG) and solvers designed for +! more general settings (AMG). +! + + program ex12f + + implicit none + + include 'mpif.h' + include 'HYPREf.h' + + integer ierr + integer i, j, myid, num_procs + + integer*8 grid + integer*8 graph + integer*8 stencil + integer*8 A + integer*8 b + integer*8 x + + integer nparts + integer nvars + integer part + integer var + + integer precond_id, object_type + + integer ilower(2), iupper(2) + integer vartypes(1) + integer offsets(2,5) + integer ent + integer nentries, nvalues, stencil_indices(5) + + double precision values(100), tol + +! This comes from 'sstruct_mv/HYPRE_sstruct_mv.h' + integer HYPRE_SSTRUCT_VARIABLE_NODE + parameter( HYPRE_SSTRUCT_VARIABLE_NODE = 1 ) + + integer*8 sA + integer*8 sb + integer*8 sx + integer*8 parA + integer*8 parb + integer*8 parx + integer*8 solver + integer*8 precond + + character*32 matfile + +! We only have one part and one variable + nparts = 1 + nvars = 1 + part = 0 + var = 0 + +! Initialize MPI + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, myid, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, num_procs, ierr) + + if (num_procs .ne. 2) then + if (myid .eq. 0) then + print *, "Must run with 2 processors!" + stop + endif + endif + +! Set preconditioner id (PFMG = 1, BoomerAMG = 2) + precond_id = 1 + + if (precond_id .eq. 1) then + object_type = HYPRE_STRUCT + else if (precond_id .eq. 2) then + object_type = HYPRE_PARCSR + else + if (myid .eq. 0) then + print *, "Invalid solver!" + stop + endif + endif + +!----------------------------------------------------------------------- +! 1. Set up the grid. Here we use only one part. Each processor +! describes the piece of the grid that it owns. +!----------------------------------------------------------------------- + +! Create an empty 2D grid object + call HYPRE_SStructGridCreate(MPI_COMM_WORLD, 2, nparts, grid, + + ierr) + +! Add boxes to the grid + if (myid .eq. 0) then + ilower(1) = -3 + ilower(2) = 1 + iupper(1) = -1 + iupper(2) = 2 + call HYPRE_SStructGridSetExtents(grid, part, ilower, iupper, + + ierr) + else if (myid .eq. 1) then + ilower(1) = 0 + ilower(2) = 1 + iupper(1) = 2 + iupper(2) = 4 + call HYPRE_SStructGridSetExtents(grid, part, ilower, iupper, + + ierr) + endif + +! Set the variable type and number of variables on each part + vartypes(1) = HYPRE_SSTRUCT_VARIABLE_NODE + call HYPRE_SStructGridSetVariables(grid, part, nvars, vartypes, + + ierr) + +! This is a collective call finalizing the grid assembly + call HYPRE_SStructGridAssemble(grid, ierr) + +!----------------------------------------------------------------------- +! 2. Define the discretization stencil +!----------------------------------------------------------------------- + +! Create an empty 2D, 5-pt stencil object + call HYPRE_SStructStencilCreate(2, 5, stencil, ierr) + +! Define the geometry of the stencil. Each represents a relative +! offset (in the index space). + offsets(1,1) = 0 + offsets(2,1) = 0 + offsets(1,2) = -1 + offsets(2,2) = 0 + offsets(1,3) = 1 + offsets(2,3) = 0 + offsets(1,4) = 0 + offsets(2,4) = -1 + offsets(1,5) = 0 + offsets(2,5) = 1 + +! Assign numerical values to the offsets so that we can easily refer +! to them - the last argument indicates the variable for which we +! are assigning this stencil + do ent = 1, 5 + call HYPRE_SStructStencilSetEntry(stencil, + + ent-1, offsets(1,ent), var, ierr) + enddo + +!----------------------------------------------------------------------- +! 3. Set up the Graph - this determines the non-zero structure of +! the matrix and allows non-stencil relationships between the parts +!----------------------------------------------------------------------- + +! Create the graph object + call HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, graph, ierr) + +! See MatrixSetObjectType below + call HYPRE_SStructGraphSetObjectType(graph, object_type, ierr) + +! Now we need to tell the graph which stencil to use for each +! variable on each part (we only have one variable and one part) + call HYPRE_SStructGraphSetStencil(graph, part, var, stencil, ierr) + +! Here we could establish connections between parts if we had more +! than one part using the graph. For example, we could use +! HYPRE_GraphAddEntries() routine or HYPRE_GridSetNeighborPart() + +! Assemble the graph + call HYPRE_SStructGraphAssemble(graph, ierr) + +!----------------------------------------------------------------------- +! 4. Set up a SStruct Matrix +!----------------------------------------------------------------------- + +! Create an empty matrix object + call HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, A, ierr) + +! Set the object type (by default HYPRE_SSTRUCT). This determines +! the data structure used to store the matrix. For PFMG we use +! HYPRE_STRUCT, and for BoomerAMG we use HYPRE_PARCSR (set above). + call HYPRE_SStructMatrixSetObjectTyp(A, object_type, ierr) + +! Get ready to set values + call HYPRE_SStructMatrixInitialize(A, ierr) + +! Set the matrix coefficients. Each processor assigns coefficients +! for the boxes in the grid that it owns. Note that the +! coefficients associated with each stencil entry may vary from grid +! point to grid point if desired. Here, we first set the same +! stencil entries for each grid point. Then we make modifications +! to grid points near the boundary. Note that the ilower values are +! different from those used in ex1 because of the way nodal +! variables are referenced. Also note that some of the stencil +! values are set on both processor 0 and processor 1. See the User +! and Reference manuals for more details. + +! Stencil entry labels correspond to the offsets defined above + do i = 1, 5 + stencil_indices(i) = i-1 + enddo + nentries = 5 + + if (myid .eq. 0) then + ilower(1) = -4 + ilower(2) = 0 + iupper(1) = -1 + iupper(2) = 2 +! 12 grid points, each with 5 stencil entries + nvalues = 60 + else if (myid .eq. 1) then + ilower(1) = -1 + ilower(2) = 0 + iupper(1) = 2 + iupper(2) = 4 +! 12 grid points, each with 5 stencil entries + nvalues = 100 + endif + + do i = 1, nvalues, nentries + values(i) = 4.0 + do j = 1, nentries-1 + values(i+j) = -1.0 + enddo + enddo + + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, nentries, stencil_indices, values, ierr) + +! Set the coefficients reaching outside of the boundary to 0. Note +! that both ilower *and* iupper may be different from those in ex1. + + do i = 1, 5 + values(i) = 0.0 + enddo + + if (myid .eq. 0) then + +! values below our box + ilower(1) = -4 + ilower(2) = 0 + iupper(1) = -1 + iupper(2) = 0 + stencil_indices(1) = 3 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) +! values to the left of our box + ilower(1) = -4 + ilower(2) = 0 + iupper(1) = -4 + iupper(2) = 2 + stencil_indices(1) = 1 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) +! values above our box + ilower(1) = -4 + ilower(2) = 2 + iupper(1) = -2 + iupper(2) = 2 + stencil_indices(1) = 4 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) + + else if (myid .eq. 1) then + +! values below our box + ilower(1) = -1 + ilower(2) = 0 + iupper(1) = 2 + iupper(2) = 0 + stencil_indices(1) = 3 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) +! values to the right of our box + ilower(1) = 2 + ilower(2) = 0 + iupper(1) = 2 + iupper(2) = 4 + stencil_indices(1) = 2 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) +! values above our box + ilower(1) = -1 + ilower(2) = 4 + iupper(1) = 2 + iupper(2) = 4 + stencil_indices(1) = 4 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) +! values to the left of our box +! (that do not border the other box on proc. 0) + ilower(1) = -1 + ilower(2) = 3 + iupper(1) = -1 + iupper(2) = 4 + stencil_indices(1) = 1 + call HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + + var, 1, stencil_indices, values, ierr) + + endif + +! This is a collective call finalizing the matrix assembly + call HYPRE_SStructMatrixAssemble(A, ierr) + +! matfile = 'ex12f.out' +! matfile(10:10) = char(0) +! call HYPRE_SStructMatrixPrint(matfile, A, 0, ierr) + +! Create an empty vector object + call HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, b, ierr) + call HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, x, ierr) + +! As with the matrix, set the appropriate object type for the vectors + call HYPRE_SStructVectorSetObjectTyp(b, object_type, ierr) + call HYPRE_SStructVectorSetObjectTyp(x, object_type, ierr) + +! Indicate that the vector coefficients are ready to be set + call HYPRE_SStructVectorInitialize(b, ierr) + call HYPRE_SStructVectorInitialize(x, ierr) + +! Set the vector coefficients. Again, note that the ilower values +! are different from those used in ex1, and some of the values are +! set on both processors. + + if (myid .eq. 0) then + + ilower(1) = -4 + ilower(2) = 0 + iupper(1) = -1 + iupper(2) = 2 + + do i = 1, 12 + values(i) = 1.0 + enddo + call HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, + + var, values, ierr) + do i = 1, 12 + values(i) = 0.0 + enddo + call HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, + + var, values, ierr) + + else if (myid .eq. 1) then + + ilower(1) = 0 + ilower(2) = 1 + iupper(1) = 2 + iupper(2) = 4 + + do i = 1, 20 + values(i) = 1.0 + enddo + call HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, + + var, values, ierr) + do i = 1, 20 + values(i) = 0.0 + enddo + call HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, + + var, values, ierr) + + endif + +! This is a collective call finalizing the vector assembly + call HYPRE_SStructVectorAssemble(b, ierr) + call HYPRE_SStructVectorAssemble(x, ierr) + +!----------------------------------------------------------------------- +! 6. Set up and use a solver (See the Reference Manual for +! descriptions of all of the options.) +!----------------------------------------------------------------------- + + tol = 1.0E-6 + + if (precond_id .eq. 1) then + +! PFMG + +! Because we are using a struct solver, we need to get the object +! of the matrix and vectors to pass in to the struct solvers + call HYPRE_SStructMatrixGetObject(A, sA, ierr) + call HYPRE_SStructVectorGetObject(b, sb, ierr) + call HYPRE_SStructVectorGetObject(x, sx, ierr) + +! Create an empty PCG Struct solver + call HYPRE_StructPCGCreate(MPI_COMM_WORLD, solver, ierr) +! Set PCG parameters + call HYPRE_StructPCGSetTol(solver, tol, ierr) + call HYPRE_StructPCGSetPrintLevel(solver, 2, ierr) + call HYPRE_StructPCGSetMaxIter(solver, 50, ierr) + +! Create the Struct PFMG solver for use as a preconditioner + call HYPRE_StructPFMGCreate(MPI_COMM_WORLD, precond, ierr) +! Set PFMG parameters + call HYPRE_StructPFMGSetMaxIter(precond, 1, ierr) + call HYPRE_StructPFMGSetTol(precond, 0.0d0, ierr) + call HYPRE_StructPFMGSetZeroGuess(precond, ierr) + call HYPRE_StructPFMGSetNumPreRelax(precond, 2, ierr) + call HYPRE_StructPFMGSetNumPostRelax(precond, 2, ierr) +! Non-Galerkin coarse grid (more efficient for this problem) + call HYPRE_StructPFMGSetRAPType(precond, 1, ierr) +! R/B Gauss-Seidel + call HYPRE_StructPFMGSetRelaxType(precond, 2, ierr) +! Skip relaxation on some levels (more efficient for this problem) + call HYPRE_StructPFMGSetSkipRelax(precond, 1, ierr) +! Set preconditioner (PFMG = 1) and solve + call HYPRE_StructPCGSetPrecond(solver, 1, precond, ierr) + call HYPRE_StructPCGSetup(solver, sA, sb, sx, ierr) + call HYPRE_StructPCGSolve(solver, sA, sb, sx, ierr) + +! Free memory + call HYPRE_StructPCGDestroy(solver, ierr) + call HYPRE_StructPFMGDestroy(precond, ierr) + + else if (precond_id .eq. 2) then + +! BoomerAMG + +! Because we are using a struct solver, we need to get the object +! of the matrix and vectors to pass in to the struct solvers + call HYPRE_SStructMatrixGetObject(A, parA, ierr) + call HYPRE_SStructVectorGetObject(b, parb, ierr) + call HYPRE_SStructVectorGetObject(x, parx, ierr) + +! Create an empty PCG Struct solver + call HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, solver, ierr) +! Set PCG parameters + call HYPRE_ParCSRPCGSetTol(solver, tol, ierr) + call HYPRE_ParCSRPCGSetPrintLevel(solver, 2, ierr) + call HYPRE_ParCSRPCGSetMaxIter(solver, 50, ierr) + +! Create the BoomerAMG solver for use as a preconditioner + call HYPRE_BoomerAMGCreate(precond, ierr) +! Set BoomerAMG parameters + call HYPRE_BoomerAMGSetMaxIter(precond, 1, ierr) + call HYPRE_BoomerAMGSetTol(precond, 0.0, ierr) +! Print amg solution info + call HYPRE_BoomerAMGSetPrintLevel(precond, 1, ierr) + call HYPRE_BoomerAMGSetCoarsenType(precond, 6, ierr) + call HYPRE_BoomerAMGSetOldDefault(precond, ierr) +! Sym G.S./Jacobi hybrid + call HYPRE_BoomerAMGSetRelaxType(precond, 6, ierr) + call HYPRE_BoomerAMGSetNumSweeps(precond, 1, ierr) +! Set preconditioner (BoomerAMG = 2) and solve + call HYPRE_ParCSRPCGSetPrecond(solver, 2, precond, ierr) + call HYPRE_ParCSRPCGSetup(solver, parA, parb, parx, ierr) + call HYPRE_ParCSRPCGSolve(solver, parA, parb, parx, ierr) + +! Free memory + call HYPRE_ParCSRPCGDestroy(solver, ierr) + call HYPRE_BoomerAMGDestroy(precond, ierr) + + endif + +! Free memory + call HYPRE_SStructGridDestroy(grid, ierr) + call HYPRE_SStructStencilDestroy(stencil, ierr) + call HYPRE_SStructGraphDestroy(graph, ierr) + call HYPRE_SStructMatrixDestroy(A, ierr) + call HYPRE_SStructVectorDestroy(b, ierr) + call HYPRE_SStructVectorDestroy(x, ierr) + +! Finalize MPI + call MPI_Finalize(ierr) + + stop + end + diff --git a/3rd_party/hypre/src/examples/ex13.c b/3rd_party/hypre/src/examples/ex13.c new file mode 100644 index 000000000..b3e96b636 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex13.c @@ -0,0 +1,691 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 13 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex13 + + Sample run: mpirun -np 6 ex13 -n 10 + + To see options: ex13 -help + + Description: This code solves the 2D Laplace equation using bilinear + finite element discretization on a mesh with an "enhanced + connectivity" point. Specifically, we solve -Delta u = 1 + with zero boundary conditions on a star-shaped domain + consisting of identical rhombic parts each meshed with a + uniform n x n grid. Every part is assigned to a different + processor and all parts meet at the origin, equally + subdividing the 2*pi angle there. The case of six processors + (parts) looks as follows: + + + + / \ + / \ + / \ + +--------+ 1 +---------+ + \ \ / / + \ 2 \ / 0 / + \ \ / / + +--------+---------+ + / / \ \ + / 3 / \ 5 \ + / / \ \ + +--------+ 4 +---------+ + \ / + \ / + \ / + + + + Note that in this problem we use nodal variables, which are + shared between the different parts. The node at the origin, + for example, belongs to all parts as illustrated below: + + . + / \ + . . + / \ / \ + o . * + .---.---o \ / \ / *---.---. + \ \ \ o * / / / + .---.---o \ / *---.---. + \ \ \ x / / / + @---@---x x---z---z + @---@---x x---z---z + / / / x \ \ \ + .---.---a / \ #---.---. + / / / a # \ \ \ + .---.---a / \ / \ #---.---. + a . # + \ / \ / + . . + \ / + . + + We recommend viewing the Struct examples before viewing this + and the other SStruct examples. The primary role of this + particular SStruct example is to demonstrate a stencil-based + way to set up finite element problems in SStruct, and + specifically to show how to handle problems with an "enhanced + connectivity" point. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_mv.h" +#include "HYPRE_sstruct_ls.h" +#include "HYPRE.h" + +#ifndef M_PI +#define M_PI 3.14159265358979 +#endif + +#include "vis.c" + +/* + This routine computes the bilinear finite element stiffness matrix and + load vector on a rhombus with angle gamma. Specifically, let R be the + rhombus + [3]------[2] + / / + / / + [0]------[1] + + with sides of length h. The finite element stiffness matrix + + S_ij = (grad phi_i,grad phi_j)_R + + with bilinear finite element functions {phi_i} has the form + + / 4-k -1 -2+k -1 \ + alpha . | -1 4+k -1 -2-k | + | -2+k -1 4-k -1 | + \ -1 -2-k -1 4+k / + + where alpha = 1/(6*sin(gamma)) and k = 3*cos(gamma). The load vector + corresponding to a right-hand side of 1 is + + F_j = (1,phi_j)_R = h^2/4 * sin(gamma) +*/ +void ComputeFEMRhombus (double S[4][4], double F[4], double gamma, double h) +{ + int i, j; + + double h2_4 = h*h/4; + double sing = sin(gamma); + double alpha = 1/(6*sing); + double k = 3*cos(gamma); + + S[0][0] = alpha * (4-k); + S[0][1] = alpha * (-1); + S[0][2] = alpha * (-2+k); + S[0][3] = alpha * (-1); + S[1][1] = alpha * (4+k); + S[1][2] = alpha * (-1); + S[1][3] = alpha * (-2-k); + S[2][2] = alpha * (4-k); + S[2][3] = alpha * (-1); + S[3][3] = alpha * (4+k); + + /* The stiffness matrix is symmetric */ + for (i = 1; i < 4; i++) + for (j = 0; j < i; j++) + S[i][j] = S[j][i]; + + for (i = 0; i < 4; i++) + F[i] = h2_4*sing; +} + + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + int n; + double gamma, h; + int vis; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructStencil stencil; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + HYPRE_Solver solver; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 10; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 10)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Set the rhombus angle, gamma, and the mesh size, h, depending on the + number of processors np and the given n */ + if (num_procs < 3) + { + if (myid ==0) printf("Must run with at least 3 processors!\n"); + MPI_Finalize(); + exit(1); + } + gamma = 2*M_PI/num_procs; + h = 1.0/n; + + /* 1. Set up the grid. We will set up the grid so that processor X owns + part X. Note that each part has its own index space numbering. Later + we relate the parts to each other. */ + { + int ndim = 2; + int nparts = num_procs; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Set the extents of the grid - each processor sets its grid boxes. Each + part has its own relative index space numbering */ + { + int part = myid; + int ilower[2] = {1,1}; /* lower-left cell touching the origin */ + int iupper[2] = {n,n}; /* upper-right cell */ + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each part. These need + to be set in each part which is neighboring or contains boxes owned by + the processor. */ + { + int i; + int nvars = 1; + + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_NODE}; + for (i = 0; i < nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* Now we need to set the spatial relation between each of the parts. + Since we are using nodal variables, we have to use SetSharedPart to + establish the connection at the origin. */ + { + /* Relation to the clockwise-previous neighbor part, e.g. 0 and 1 for + the case of 6 parts. Note that we could have used SetNeighborPart + here instead of SetSharedPart. */ + { + int part = myid; + /* the box of cells intersecting the boundary in the current part */ + int ilower[2] = {1,1}, iupper[2] = {1,n}; + /* share all data on the left side of the box */ + int offset[2] = {-1,0}; + + int shared_part = (myid+1) % num_procs; + /* the box of cells intersecting the boundary in the neighbor */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {n,1}; + /* share all data on the bottom of the box */ + int shared_offset[2] = {0,-1}; + + /* x/y-direction on the current part is -y/x on the neighbor */ + int index_map[2] = {1,0}; + int index_dir[2] = {-1,1}; + + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + + /* Relation to the clockwise-following neighbor part, e.g. 0 and 5 for + the case of 6 parts. Note that we could have used SetNeighborPart + here instead of SetSharedPart. */ + { + int part = myid; + /* the box of cells intersecting the boundary in the current part */ + int ilower[2] = {1,1}, iupper[2] = {n,1}; + /* share all data on the bottom of the box */ + int offset[2] = {0,-1}; + + int shared_part = (myid+num_procs-1) % num_procs; + /* the box of cells intersecting the boundary in the neighbor */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {1,n}; + /* share all data on the left side of the box */ + int shared_offset[2] = {-1,0}; + + /* x/y-direction on the current part is y/-x on the neighbor */ + int index_map[2] = {1,0}; + int index_dir[2] = {1,-1}; + + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + + /* Relation to all other parts, e.g. 0 and 2,3,4. This can be + described only by SetSharedPart. */ + { + int part = myid; + /* the (one cell) box that touches the origin */ + int ilower[2] = {1,1}, iupper[2] = {1,1}; + /* share all data in the bottom left corner (i.e. the origin) */ + int offset[2] = {-1,-1}; + + int shared_part; + /* the box of one cell that touches the origin */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {1,1}; + /* share all data in the bottom left corner (i.e. the origin) */ + int shared_offset[2] = {-1,-1}; + + /* x/y-direction on the current part is -x/-y on the neighbor, but + in this case the arguments are not really important since we are + only sharing a point */ + int index_map[2] = {0,1}; + int index_dir[2] = {-1,-1}; + + for (shared_part = 0; shared_part < myid-1; shared_part++) + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + + for (shared_part = myid+2; shared_part < num_procs; shared_part++) + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + } + + /* Now the grid is ready to be used */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencils. Since this is a finite element + discretization we define here a full 9-point stencil. We will later + use four sub-stencils for the rows of the local stiffness matrix. */ + { + int ndim = 2; + int var = 0; + int entry; + + /* Define the geometry of the 9-point stencil */ + int stencil_size = 9; + int offsets[9][2] = {{0,0}, /* [8] [4] [7] */ + {-1,0}, {1,0}, /* \ | / */ + {0,-1}, {0,1}, /* [1]-[0]-[2] */ + {-1,-1}, {1,-1}, /* / | \ */ + {1,1}, {-1,1}}; /* [5] [3] [6] */ + + HYPRE_SStructStencilCreate(ndim, stencil_size, &stencil); + + for (entry = 0; entry < stencil_size; entry++) + HYPRE_SStructStencilSetEntry(stencil, entry, offsets[entry], var); + } + + /* 3. Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + int part; + int var = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(graph, HYPRE_PARCSR); + + /* Now we need to tell the graph which stencil to use for each + variable on each part (we only have one variable) */ + for (part = 0; part < num_procs; part++) + HYPRE_SStructGraphSetStencil(graph, part, var, stencil); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up the SStruct Matrix and right-hand side vector */ + { + int part = myid; + int var = 0; + + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + /* Use a ParCSR storage */ + HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + + /* Set the matrix and vector entries by finite element assembly */ + { + /* local stifness matrix and load vector */ + double S[4][4], F[4]; + + /* The index of the local nodes 0-3 relative to the cell index, + i.e. node k in cell (i,j) is in the upper-right corner of the + cell (i,j) + node_index_offset[k]. */ + int node_index_offset[4][2] = {{-1,-1},{0,-1},{0,0},{-1,0}}; + + /* The cell sub-stencils of nodes 0-3 indexed from the full stencil, + i.e. we take the full stencil in each node of a fixed cell, and + restrict it to that as is done in the finite element stiffness + matrix: + [4] [7] [8] [4] [1]-[0] [0]-[2] + | / \ | / | | \ + [0]-[2] , [1]-[0] , [5] [3] , [3] [6] + + Note that the ordering of the local nodes remains fixed, and + therefore the above sub-stencil at node k corresponds to the kth row + of the local stiffness matrix and the kth entry of the local load + vector. */ + int node_stencil[4][4] = {{0,2,7,4},{1,0,4,8},{5,3,0,1},{3,6,2,0}}; + + int i, j, k; + int index[2]; + int nentries = 4; + + /* set the values in the interior cells */ + { + ComputeFEMRhombus(S, F, gamma, h); + + for (i = 1; i <= n; i++) + for (j = 1; j <= n; j++) + for (k = 0; k < 4; k++) /* node k in cell (i,j) */ + { + index[0] = i + node_index_offset[k][0]; + index[1] = j + node_index_offset[k][1]; + HYPRE_SStructMatrixAddToValues(A, part, index, var, + nentries, node_stencil[k], + &S[k][0]); + HYPRE_SStructVectorAddToValues(b, part, index, var, &F[k]); + } + } + + /* cells having nodes 1,2 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 1,2 from S and F */ + for (k = 0; k < 4; k++) + { + S[1][k] = S[k][1] = 0.0; + S[2][k] = S[k][2] = 0.0; + } + S[1][1] = 1.0; + S[2][2] = 1.0; + F[1] = 0.0; + F[2] = 0.0; + + for (i = n; i <= n; i++) + for (j = 1; j <= n; j++) + for (k = 0; k < 4; k++) /* node k in cell (n,j) */ + { + index[0] = i + node_index_offset[k][0]; + index[1] = j + node_index_offset[k][1]; + HYPRE_SStructMatrixAddToValues(A, part, index, var, + nentries, node_stencil[k], + &S[k][0]); + HYPRE_SStructVectorAddToValues(b, part, index, var, &F[k]); + } + } + + /* cells having nodes 2,3 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 2,3 from S and F */ + for (k = 0; k < 4; k++) + { + S[2][k] = S[k][2] = 0.0; + S[3][k] = S[k][3] = 0.0; + } + S[2][2] = 1.0; + S[3][3] = 1.0; + F[2] = 0.0; + F[3] = 0.0; + + for (i = 1; i <= n; i++) + for (j = n; j <= n; j++) + for (k = 0; k < 4; k++) /* node k in cell (i,n) */ + { + index[0] = i + node_index_offset[k][0]; + index[1] = j + node_index_offset[k][1]; + HYPRE_SStructMatrixAddToValues(A, part, index, var, + nentries, node_stencil[k], + &S[k][0]); + HYPRE_SStructVectorAddToValues(b, part, index, var, &F[k]); + } + } + + /* cells having nodes 1,2,3 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 2,3 from S and F */ + for (k = 0; k < 4; k++) + { + S[1][k] = S[k][1] = 0.0; + S[2][k] = S[k][2] = 0.0; + S[3][k] = S[k][3] = 0.0; + } + S[1][1] = 1.0; + S[2][2] = 1.0; + S[3][3] = 1.0; + F[1] = 0.0; + F[2] = 0.0; + F[3] = 0.0; + + for (i = n; i <= n; i++) + for (j = n; j <= n; j++) + for (k = 0; k < 4; k++) /* node k in cell (n,n) */ + { + index[0] = i + node_index_offset[k][0]; + index[1] = j + node_index_offset[k][1]; + HYPRE_SStructMatrixAddToValues(A, part, index, var, + nentries, node_stencil[k], + &S[k][0]); + HYPRE_SStructVectorAddToValues(b, part, index, var, &F[k]); + } + } + } + } + + /* Collective calls finalizing the matrix and vector assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + + /* 5. Set up SStruct Vector for the solution vector x */ + { + int part = myid; + int var = 0; + int nvalues = (n+1)*(n+1); + double *values; + + /* Since the SetBoxValues() calls below set the values of the nodes in + the upper-right corners of the cells, the nodal box should start + from (0,0) instead of (1,1). */ + int ilower[2] = {0,0}; + int iupper[2] = {n,n}; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(x); + /* Set the values for the initial guess */ + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + + free(values); + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(x); + } + + /* 6. Set up and call the solver (Solver options can be found in the + Reference Manual.) */ + { + double final_res_norm; + int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + /* Extract the ParCSR objects needed in the solver */ + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + + /* Here we construct a BoomerAMG solver. See the other SStruct examples + as well as the Reference manual for additional solver choices. */ + HYPRE_BoomerAMGCreate(&solver); + HYPRE_BoomerAMGSetOldDefault(solver); + HYPRE_BoomerAMGSetStrongThreshold(solver, 0.25); + HYPRE_BoomerAMGSetTol(solver, 1e-6); + HYPRE_BoomerAMGSetPrintLevel(solver, 2); + HYPRE_BoomerAMGSetMaxIter(solver, 50); + + /* call the setup */ + HYPRE_BoomerAMGSetup(solver, par_A, par_b, par_x); + + /* call the solve */ + HYPRE_BoomerAMGSolve(solver, par_A, par_b, par_x); + + /* get some info */ + HYPRE_BoomerAMGGetNumIterations(solver, &its); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(solver, + &final_res_norm); + /* clean up */ + HYPRE_BoomerAMGDestroy(solver); + + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + /* Save the solution for GLVis visualization, see vis/glvis-ex13.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int i, part = myid, var = 0; + int nvalues = (n+1)*(n+1); + double *values = (double*) calloc(nvalues, sizeof(double)); + int ilower[2] = {0,0}; + int iupper[2] = {n,n}; + + /* get all local data (including a local copy of the shared values) */ + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, values); + + sprintf(filename, "%s.%06d", "vis/ex13.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* finite element space header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: H1_2D_P1\n"); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* save solution */ + for (i = 0; i < nvalues; i++) + fprintf(file, "%.14e\n", values[i]); + + fflush(file); + fclose(file); + free(values); + + /* save local finite element mesh */ + GLVis_PrintLocalRhombusMesh("vis/ex13.mesh", n, myid, gamma); + + /* additional visualization data */ + GLVis_PrintData("vis/ex13.data", myid, num_procs); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex14.c b/3rd_party/hypre/src/examples/ex14.c new file mode 100644 index 000000000..235f7a493 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex14.c @@ -0,0 +1,648 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 14 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex14 + + Sample run: mpirun -np 6 ex14 -n 10 + + To see options: ex14 -help + + Description: This code solves the 2D Laplace equation using bilinear + finite element discretization on a mesh with an "enhanced + connectivity" point. Specifically, we solve -Delta u = 1 + with zero boundary conditions on a star-shaped domain + consisting of identical rhombic parts each meshed with a + uniform n x n grid. Every part is assigned to a different + processor and all parts meet at the origin, equally + subdividing the 2*pi angle there. The case of six processors + (parts) looks as follows: + + + + / \ + / \ + / \ + +--------+ 1 +---------+ + \ \ / / + \ 2 \ / 0 / + \ \ / / + +--------+---------+ + / / \ \ + / 3 / \ 5 \ + / / \ \ + +--------+ 4 +---------+ + \ / + \ / + \ / + + + + Note that in this problem we use nodal variables, which are + shared between the different parts. The node at the origin, + for example, belongs to all parts as illustrated below: + + . + / \ + . . + / \ / \ + o . * + .---.---o \ / \ / *---.---. + \ \ \ o * / / / + .---.---o \ / *---.---. + \ \ \ x / / / + @---@---x x---z---z + @---@---x x---z---z + / / / x \ \ \ + .---.---a / \ #---.---. + / / / a # \ \ \ + .---.---a / \ / \ #---.---. + a . # + \ / \ / + . . + \ / + . + + This example is a identical to Example 13, except that it + uses the SStruct FEM input functions instead of stencils to + describe the problem. This is the recommended way to set up a + finite element problem in the SStruct interface. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_mv.h" +#include "HYPRE_sstruct_ls.h" +#include "HYPRE.h" + +#ifndef M_PI +#define M_PI 3.14159265358979 +#endif + +#include "vis.c" + +/* + This routine computes the bilinear finite element stiffness matrix and + load vector on a rhombus with angle gamma. Specifically, let R be the + rhombus + [3]------[2] + / / + / / + [0]------[1] + + with sides of length h. The finite element stiffness matrix + + S_ij = (grad phi_i,grad phi_j)_R + + with bilinear finite element functions {phi_i} has the form + + / 4-k -1 -2+k -1 \ + alpha . | -1 4+k -1 -2-k | + | -2+k -1 4-k -1 | + \ -1 -2-k -1 4+k / + + where alpha = 1/(6*sin(gamma)) and k = 3*cos(gamma). The load vector + corresponding to a right-hand side of 1 is + + F_j = (1,phi_j)_R = h^2/4 * sin(gamma) +*/ +void ComputeFEMRhombus (double S[4][4], double F[4], double gamma, double h) +{ + int i, j; + + double h2_4 = h*h/4; + double sing = sin(gamma); + double alpha = 1/(6*sing); + double k = 3*cos(gamma); + + S[0][0] = alpha * (4-k); + S[0][1] = alpha * (-1); + S[0][2] = alpha * (-2+k); + S[0][3] = alpha * (-1); + S[1][1] = alpha * (4+k); + S[1][2] = alpha * (-1); + S[1][3] = alpha * (-2-k); + S[2][2] = alpha * (4-k); + S[2][3] = alpha * (-1); + S[3][3] = alpha * (4+k); + + /* The stiffness matrix is symmetric */ + for (i = 1; i < 4; i++) + for (j = 0; j < i; j++) + S[i][j] = S[j][i]; + + for (i = 0; i < 4; i++) + F[i] = h2_4*sing; +} + + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + int n; + double gamma, h; + int vis; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + HYPRE_Solver solver; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 10; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 10)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Set the rhombus angle, gamma, and the mesh size, h, depending on the + number of processors np and the given n */ + if (num_procs < 3) + { + if (myid ==0) printf("Must run with at least 3 processors!\n"); + MPI_Finalize(); + exit(1); + } + gamma = 2*M_PI/num_procs; + h = 1.0/n; + + /* 1. Set up the grid. We will set up the grid so that processor X owns + part X. Note that each part has its own index space numbering. Later + we relate the parts to each other. */ + { + int ndim = 2; + int nparts = num_procs; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Set the extents of the grid - each processor sets its grid boxes. Each + part has its own relative index space numbering */ + { + int part = myid; + int ilower[2] = {1,1}; /* lower-left cell touching the origin */ + int iupper[2] = {n,n}; /* upper-right cell */ + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each part. These need + to be set in each part which is neighboring or contains boxes owned by + the processor. */ + { + int i; + int nvars = 1; + + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_NODE}; + for (i = 0; i < nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* Set the ordering of the variables in the finite element problem. This + is done by listing the variable offset directions relative to the + element's center. See the Reference Manual for more details. */ + { + int part = myid; + int ordering[12] = { 0, -1, -1, /* [3]------[2] */ + 0, +1, -1, /* / / */ + 0, +1, +1, /* / / */ + 0, -1, +1 }; /* [0]------[1] */ + + HYPRE_SStructGridSetFEMOrdering(grid, part, ordering); + } + + /* Now we need to set the spatial relation between each of the parts. + Since we are using nodal variables, we have to use SetSharedPart to + establish the connection at the origin. */ + { + /* Relation to the clockwise-previous neighbor part, e.g. 0 and 1 for + the case of 6 parts. Note that we could have used SetNeighborPart + here instead of SetSharedPart. */ + { + int part = myid; + /* the box of cells intersecting the boundary in the current part */ + int ilower[2] = {1,1}, iupper[2] = {1,n}; + /* share all data on the left side of the box */ + int offset[2] = {-1,0}; + + int shared_part = (myid+1) % num_procs; + /* the box of cells intersecting the boundary in the neighbor */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {n,1}; + /* share all data on the bottom of the box */ + int shared_offset[2] = {0,-1}; + + /* x/y-direction on the current part is -y/x on the neighbor */ + int index_map[2] = {1,0}; + int index_dir[2] = {-1,1}; + + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + + /* Relation to the clockwise-following neighbor part, e.g. 0 and 5 for + the case of 6 parts. Note that we could have used SetNeighborPart + here instead of SetSharedPart. */ + { + int part = myid; + /* the box of cells intersecting the boundary in the current part */ + int ilower[2] = {1,1}, iupper[2] = {n,1}; + /* share all data on the bottom of the box */ + int offset[2] = {0,-1}; + + int shared_part = (myid+num_procs-1) % num_procs; + /* the box of cells intersecting the boundary in the neighbor */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {1,n}; + /* share all data on the left side of the box */ + int shared_offset[2] = {-1,0}; + + /* x/y-direction on the current part is y/-x on the neighbor */ + int index_map[2] = {1,0}; + int index_dir[2] = {1,-1}; + + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + + /* Relation to all other parts, e.g. 0 and 2,3,4. This can be + described only by SetSharedPart. */ + { + int part = myid; + /* the (one cell) box that touches the origin */ + int ilower[2] = {1,1}, iupper[2] = {1,1}; + /* share all data in the bottom left corner (i.e. the origin) */ + int offset[2] = {-1,-1}; + + int shared_part; + /* the box of one cell that touches the origin */ + int shared_ilower[2] = {1,1}, shared_iupper[2] = {1,1}; + /* share all data in the bottom left corner (i.e. the origin) */ + int shared_offset[2] = {-1,-1}; + + /* x/y-direction on the current part is -x/-y on the neighbor, but + in this case the arguments are not really important since we are + only sharing a point */ + int index_map[2] = {0,1}; + int index_dir[2] = {-1,-1}; + + for (shared_part = 0; shared_part < myid-1; shared_part++) + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + + for (shared_part = myid+2; shared_part < num_procs; shared_part++) + HYPRE_SStructGridSetSharedPart(grid, part, ilower, iupper, offset, + shared_part, shared_ilower, + shared_iupper, shared_offset, + index_map, index_dir); + } + } + + /* Now the grid is ready to be used */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + int part; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(graph, HYPRE_PARCSR); + + /* Indicate that this problem uses finite element stiffness matrices and + load vectors, instead of stencils. */ + for (part = 0; part < num_procs; part++) + HYPRE_SStructGraphSetFEM(graph, part); + + /* The local stiffness matrix is full, so there is no need to call + HYPRE_SStructGraphSetFEMSparsity to set its sparsity pattern. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 3. Set up the SStruct Matrix and right-hand side vector */ + { + int part = myid; + + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + /* Use a ParCSR storage */ + HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + + /* Set the matrix and vector entries by finite element assembly */ + { + /* local stifness matrix and load vector */ + double S[4][4], F[4]; + + int i, j, k; + int index[2]; + + /* set the values in the interior cells */ + { + ComputeFEMRhombus(S, F, gamma, h); + + for (i = 1; i <= n; i++) + for (j = 1; j <= n; j++) + { + index[0] = i; + index[1] = j; + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + + /* cells having nodes 1,2 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 1,2 from S and F */ + for (k = 0; k < 4; k++) + { + S[1][k] = S[k][1] = 0.0; + S[2][k] = S[k][2] = 0.0; + } + S[1][1] = 1.0; + S[2][2] = 1.0; + F[1] = 0.0; + F[2] = 0.0; + + for (i = n; i <= n; i++) + for (j = 1; j <= n; j++) + { + index[0] = i; + index[1] = j; + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + + /* cells having nodes 2,3 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 2,3 from S and F */ + for (k = 0; k < 4; k++) + { + S[2][k] = S[k][2] = 0.0; + S[3][k] = S[k][3] = 0.0; + } + S[2][2] = 1.0; + S[3][3] = 1.0; + F[2] = 0.0; + F[3] = 0.0; + + for (i = 1; i <= n; i++) + for (j = n; j <= n; j++) + { + index[0] = i; + index[1] = j; + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + + } + + /* cells having nodes 1,2,3 on the domain boundary */ + { + ComputeFEMRhombus(S, F, gamma, h); + + /* eliminate nodes 2,3 from S and F */ + for (k = 0; k < 4; k++) + { + S[1][k] = S[k][1] = 0.0; + S[2][k] = S[k][2] = 0.0; + S[3][k] = S[k][3] = 0.0; + } + S[1][1] = 1.0; + S[2][2] = 1.0; + S[3][3] = 1.0; + F[1] = 0.0; + F[2] = 0.0; + F[3] = 0.0; + + for (i = n; i <= n; i++) + for (j = n; j <= n; j++) + { + index[0] = i; + index[1] = j; + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + } + } + + /* Collective calls finalizing the matrix and vector assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + + /* 4. Set up SStruct Vector for the solution vector x */ + { + int part = myid; + int var = 0; + int nvalues = (n+1)*(n+1); + double *values; + + /* Since the SetBoxValues() calls below set the values of the nodes in + the upper-right corners of the cells, the nodal box should start + from (0,0) instead of (1,1). */ + int ilower[2] = {0,0}; + int iupper[2] = {n,n}; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(x); + /* Set the values for the initial guess */ + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + + free(values); + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(x); + } + + /* 5. Set up and call the solver (Solver options can be found in the + Reference Manual.) */ + { + double final_res_norm; + int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + /* Extract the ParCSR objects needed in the solver */ + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + + /* Here we construct a BoomerAMG solver. See the other SStruct examples + as well as the Reference manual for additional solver choices. */ + HYPRE_BoomerAMGCreate(&solver); + HYPRE_BoomerAMGSetOldDefault(solver); + HYPRE_BoomerAMGSetStrongThreshold(solver, 0.25); + HYPRE_BoomerAMGSetTol(solver, 1e-6); + HYPRE_BoomerAMGSetPrintLevel(solver, 2); + HYPRE_BoomerAMGSetMaxIter(solver, 50); + + /* call the setup */ + HYPRE_BoomerAMGSetup(solver, par_A, par_b, par_x); + + /* call the solve */ + HYPRE_BoomerAMGSolve(solver, par_A, par_b, par_x); + + /* get some info */ + HYPRE_BoomerAMGGetNumIterations(solver, &its); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(solver, + &final_res_norm); + /* clean up */ + HYPRE_BoomerAMGDestroy(solver); + + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + /* Save the solution for GLVis visualization, see vis/glvis-ex13.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int i, part = myid, var = 0; + int nvalues = (n+1)*(n+1); + double *values = (double*) calloc(nvalues, sizeof(double)); + int ilower[2] = {0,0}; + int iupper[2] = {n,n}; + + /* get all local data (including a local copy of the shared values) */ + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, values); + + sprintf(filename, "%s.%06d", "vis/ex14.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* finite element space header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: H1_2D_P1\n"); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* save solution */ + for (i = 0; i < nvalues; i++) + fprintf(file, "%.14e\n", values[i]); + + fflush(file); + fclose(file); + free(values); + + /* save local finite element mesh */ + GLVis_PrintLocalRhombusMesh("vis/ex14.mesh", n, myid, gamma); + + /* additional visualization data */ + GLVis_PrintData("vis/ex14.data", myid, num_procs); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex15.c b/3rd_party/hypre/src/examples/ex15.c new file mode 100644 index 000000000..38aa80f6b --- /dev/null +++ b/3rd_party/hypre/src/examples/ex15.c @@ -0,0 +1,1072 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 15 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex15 + + Sample run: mpirun -np 8 ex15 -n 10 + + To see options: ex15 -help + + Description: This code solves a 3D electromagnetic diffusion (definite + curl-curl) problem using the lowest order Nedelec, or "edge" + finite element discretization on a uniform hexahedral meshing + of the unit cube. The right-hand-side corresponds to a unit + vector force and we use uniform zero Dirichlet boundary + conditions. The overall problem reads: + curl alpha curl E + beta E = 1, + with E x n = 0 on the boundary, where alpha and beta are + piecewise-constant material coefficients. + + The linear system is split in parallel using the SStruct + interface with an n x n x n grid on each processors, and + similar N x N x N processor grid. Therefore, the number of + processors should be a perfect cube. + + This example code is mainly meant as an illustration of using + the Auxiliary-space Maxwell Solver (AMS) through the SStruct + interface. It is also an example of setting up a finite + element discretization in the SStruct interface, and we + recommend viewing Example 13 and Example 14 before viewing + this example. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_mv.h" +#include "HYPRE_sstruct_ls.h" +#include "_hypre_parcsr_ls.h" +#include "HYPRE.h" + +#include "vis.c" + +int optionAlpha, optionBeta; + +/* Curl-curl coefficient alpha = mu^{-1} */ +double alpha(double x, double y, double z) +{ + switch (optionAlpha) + { + case 0: /* uniform coefficient */ + return 1.0; + case 1: /* smooth coefficient */ + return x*x+exp(y)+sin(z); + case 2: /* small outside of an interior cube */ + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25) && (fabs(z-0.5) < 0.25)) + return 1.0; + else + return 1.0e-6; + case 3: /* small outside of an interior ball */ + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)+(z-0.5)*(z-0.5)) < 0.0625) + return 1.0; + else + return 1.0e-6; + case 4: /* random coefficient */ + return hypre_Rand(); + default: + return 1.0; + } +} + +/* Mass coefficient beta = sigma */ +double beta(double x, double y, double z) +{ + switch (optionBeta) + { + case 0: /* uniform coefficient */ + return 1.0; + case 1: /* smooth coefficient */ + return x*x+exp(y)+sin(z); + case 2:/* small outside of interior cube */ + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25) && (fabs(z-0.5) < 0.25)) + return 1.0; + else + return 1.0e-6; + case 3: /* small outside of an interior ball */ + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)+(z-0.5)*(z-0.5)) < 0.0625) + return 1.0; + else + return 1.0e-6; + case 4: /* random coefficient */ + return hypre_Rand(); + default: + return 1.0; + } +} + +/* + This routine computes the lowest order Nedelec, or "edge" finite element + stiffness matrix and load vector on a cube of size h. The 12 edges {e_i} + are numbered in terms of the vertices as follows: + + [7]------[6] + /| /| e_0 = 01, e_1 = 12, e_2 = 32, e_3 = 03, + / | / | e_4 = 45, e_5 = 56, e_6 = 76, e_7 = 47, + [4]------[5] | e_8 = 04, e_9 = 15, e_10 = 26, e_11 = 37. + | [3]----|-[2] + | / | / The edges are oriented from first to the + |/ |/ second vertex, e.g. e_0 is from [0] to [1]. + [0]------[1] + + We allow for different scaling of the curl-curl and the mass parts of the + matrix with coefficients alpha and beta respectively: + + S_ij = alpha (curl phi_i,curl phi_j) + beta (phi_i, phi_j). + + The load vector corresponding to a right-hand side of {1,1,1} is + + F_j = (1,phi_j) = h^2/4. +*/ +void ComputeFEMND1(double S[12][12], double F[12], + double x, double y, double z, double h) +{ + int i, j; + + double h2_4 = h*h/4; + + double cS1 = alpha(x,y,z)/(6.0*h), cS2 = 2*cS1, cS4 = 2*cS2; + double cM1 = beta(x,y,z)*h/36.0, cM2 = 2*cM1, cM4 = 2*cM2; + + S[ 0][ 0] = cS4 + cM4; S[ 0][ 1] = cS2; S[ 0][ 2] = -cS1 + cM2; + S[ 0][ 3] = -cS2; S[ 0][ 4] = -cS1 + cM2; S[ 0][ 5] = cS1; + S[ 0][ 6] = -cS2 + cM1; S[ 0][ 7] = -cS1; S[ 0][ 8] = -cS2; + S[ 0][ 9] = cS2; S[ 0][10] = cS1; S[ 0][11] = -cS1; + + S[ 1][ 1] = cS4 + cM4; S[ 1][ 2] = -cS2; S[ 1][ 3] = -cS1 + cM2; + S[ 1][ 4] = cS1; S[ 1][ 5] = -cS1 + cM2; S[ 1][ 6] = -cS1; + S[ 1][ 7] = -cS2 + cM1; S[ 1][ 8] = -cS1; S[ 1][ 9] = -cS2; + S[ 1][10] = cS2; S[ 1][11] = cS1; + + S[ 2][ 2] = cS4 + cM4; S[ 2][ 3] = cS2; S[ 2][ 4] = -cS2 + cM1; + S[ 2][ 5] = -cS1; S[ 2][ 6] = -cS1 + cM2; S[ 2][ 7] = cS1; + S[ 2][ 8] = -cS1; S[ 2][ 9] = cS1; S[ 2][10] = cS2; + S[ 2][11] = -cS2; + + S[ 3][ 3] = cS4 + cM4; S[ 3][ 4] = -cS1; S[ 3][ 5] = -cS2 + cM1; + S[ 3][ 6] = cS1; S[ 3][ 7] = -cS1 + cM2; S[ 3][ 8] = -cS2; + S[ 3][ 9] = -cS1; S[ 3][10] = cS1; S[ 3][11] = cS2; + + S[ 4][ 4] = cS4 + cM4; S[ 4][ 5] = cS2; S[ 4][ 6] = -cS1 + cM2; + S[ 4][ 7] = -cS2; S[ 4][ 8] = cS2; S[ 4][ 9] = -cS2; + S[ 4][10] = -cS1; S[ 4][11] = cS1; + + S[ 5][ 5] = cS4 + cM4; S[ 5][ 6] = -cS2; S[ 5][ 7] = -cS1 + cM2; + S[ 5][ 8] = cS1; S[ 5][ 9] = cS2; S[ 5][10] = -cS2; + S[ 5][11] = -cS1; + + S[ 6][ 6] = cS4 + cM4; S[ 6][ 7] = cS2; S[ 6][ 8] = cS1; + S[ 6][ 9] = -cS1; S[ 6][10] = -cS2; S[ 6][11] = cS2; + + S[ 7][ 7] = cS4 + cM4; S[ 7][ 8] = cS2; S[ 7][ 9] = cS1; + S[ 7][10] = -cS1; S[ 7][11] = -cS2; + + S[ 8][ 8] = cS4 + cM4; S[ 8][ 9] = -cS1 + cM2; S[ 8][10] = -cS2 + cM1; + S[ 8][11] = -cS1 + cM2; + + S[ 9][ 9] = cS4 + cM4; S[ 9][10] = -cS1 + cM2; S[ 9][11] = -cS2 + cM1; + + S[10][10] = cS4 + cM4; S[10][11] = -cS1 + cM2; + + S[11][11] = cS4 + cM4; + + /* The stiffness matrix is symmetric */ + for (i = 1; i < 12; i++) + for (j = 0; j < i; j++) + S[i][j] = S[j][i]; + + for (i = 0; i < 12; i++) + F[i] = h2_4; +} + + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + int n, N, pi, pj, pk; + double h; + int vis; + + double tol, theta; + int maxit, cycle_type; + int rlx_type, rlx_sweeps, rlx_weight, rlx_omega; + int amg_coarsen_type, amg_agg_levels, amg_rlx_type; + int amg_interp_type, amg_Pmax; + int singular_problem ; + + int time_index; + + HYPRE_SStructGrid edge_grid; + HYPRE_SStructGraph A_graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + HYPRE_SStructGrid node_grid; + HYPRE_SStructGraph G_graph; + HYPRE_SStructStencil G_stencil[3]; + HYPRE_SStructMatrix G; + HYPRE_SStructVector xcoord, ycoord, zcoord; + + HYPRE_Solver solver, precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 10; + vis = 0; + optionAlpha = 0; + optionBeta = 0; + maxit = 100; + tol = 1e-6; + cycle_type = 13; + rlx_type = 2; + rlx_sweeps = 1; + rlx_weight = 1.0; + rlx_omega = 1.0; + amg_coarsen_type = 10; + amg_agg_levels = 1; + amg_rlx_type = 6; + theta = 0.25; + amg_interp_type = 6; + amg_Pmax = 4; + singular_problem = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-a") == 0 ) + { + arg_index++; + optionAlpha = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-b") == 0 ) + { + arg_index++; + optionBeta = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-maxit") == 0 ) + { + arg_index++; + maxit = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-tol") == 0 ) + { + arg_index++; + tol = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-type") == 0 ) + { + arg_index++; + cycle_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlx") == 0 ) + { + arg_index++; + rlx_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxn") == 0 ) + { + arg_index++; + rlx_sweeps = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxw") == 0 ) + { + arg_index++; + rlx_weight = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxo") == 0 ) + { + arg_index++; + rlx_omega = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-ctype") == 0 ) + { + arg_index++; + amg_coarsen_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-amgrlx") == 0 ) + { + arg_index++; + amg_rlx_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-agg") == 0 ) + { + arg_index++; + amg_agg_levels = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-itype") == 0 ) + { + arg_index++; + amg_interp_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-pmax") == 0 ) + { + arg_index++; + amg_Pmax = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-sing") == 0 ) + { + arg_index++; + singular_problem = 1; + } + else if ( strcmp(argv[arg_index], "-theta") == 0 ) + { + arg_index++; + theta = atof(argv[arg_index++]); + } + + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 10)\n"); + printf(" -a : choice for the curl-curl coefficient (default: 1)\n"); + printf(" -b : choice for the mass coefficient (default: 1)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + printf("PCG-AMS solver options: \n"); + printf(" -maxit : maximum number of iterations (100) \n"); + printf(" -tol : convergence tolerance (1e-6) \n"); + printf(" -type : 3-level cycle type (0-8, 11-14) \n"); + printf(" -theta : BoomerAMG threshold (0.25) \n"); + printf(" -ctype : BoomerAMG coarsening type \n"); + printf(" -agg : Levels of BoomerAMG agg. coarsening \n"); + printf(" -amgrlx : BoomerAMG relaxation type \n"); + printf(" -itype : BoomerAMG interpolation type \n"); + printf(" -pmax : BoomerAMG interpolation truncation \n"); + printf(" -rlx : relaxation type \n"); + printf(" -rlxn : number of relaxation sweeps \n"); + printf(" -rlxw : damping parameter (usually <=1) \n"); + printf(" -rlxo : SOR parameter (usually in (0,2)) \n"); + printf(" -sing : curl-curl only (singular) problem \n"); + printf("\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Figure out the processor grid (N x N x N). The local problem size is n^3, + while pi, pj and pk indicate the position in the processor grid. */ + N = pow(num_procs,1.0/3.0) + 0.5; + if (num_procs != N*N*N) + { + if (myid == 0) printf("Can't run on %d processors, try %d.\n", + num_procs, N*N*N); + MPI_Finalize(); + exit(1); + } + h = 1.0 / (N*n); + pk = myid / (N*N); + pj = myid/N - pk*N; + pi = myid - pj*N - pk*N*N; + + /* Start timing */ + time_index = hypre_InitializeTiming("SStruct Setup"); + hypre_BeginTiming(time_index); + + /* 1. Set up the edge and nodal grids. Note that we do this simultaneously + to make sure that they have the same extents. For simplicity we use + only one part to represent the unit cube. */ + { + int ndim = 3; + int nparts = 1; + + /* Create empty 2D grid objects */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &node_grid); + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &edge_grid); + + /* Set the extents of the grid - each processor sets its grid boxes. */ + { + int part = 0; + int ilower[3] = {1 + pi*n, 1 + pj*n, 1 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + + HYPRE_SStructGridSetExtents(node_grid, part, ilower, iupper); + HYPRE_SStructGridSetExtents(edge_grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each grid. */ + { + int i; + int nnodevars = 1; + int nedgevars = 3; + + HYPRE_SStructVariable nodevars[1] = {HYPRE_SSTRUCT_VARIABLE_NODE}; + HYPRE_SStructVariable edgevars[3] = {HYPRE_SSTRUCT_VARIABLE_XEDGE, + HYPRE_SSTRUCT_VARIABLE_YEDGE, + HYPRE_SSTRUCT_VARIABLE_ZEDGE}; + for (i = 0; i < nparts; i++) + { + HYPRE_SStructGridSetVariables(node_grid, i, nnodevars, nodevars); + HYPRE_SStructGridSetVariables(edge_grid, i, nedgevars, edgevars); + } + } + + /* Since there is only one part, there is no need to call the + SetNeighborPart or SetSharedPart functions, which determine the spatial + relation between the parts. See Examples 12, 13 and 14 for + illustrations of these calls. */ + + /* Now the grids are ready to be used */ + HYPRE_SStructGridAssemble(node_grid); + HYPRE_SStructGridAssemble(edge_grid); + } + + /* 2. Create the finite element stiffness matrix A and load vector b. */ + { + int part = 0; /* this problem has only one part */ + + /* Set the ordering of the variables in the finite element problem. This + is done by listing the variable offset directions relative to the + element's center. See the Reference Manual for more details. */ + { + int ordering[48] = { 0, 0, -1, -1, /* x-edge [0]-[1] */ + 1, +1, 0, -1, /* y-edge [1]-[2] */ + /* [7]------[6] */ 0, 0, +1, -1, /* x-edge [3]-[2] */ + /* /| /| */ 1, -1, 0, -1, /* y-edge [0]-[3] */ + /* / | / | */ 0, 0, -1, +1, /* x-edge [4]-[5] */ + /* [4]------[5] | */ 1, +1, 0, +1, /* y-edge [5]-[6] */ + /* | [3]----|-[2] */ 0, 0, +1, +1, /* x-edge [7]-[6] */ + /* | / | / */ 1, -1, 0, +1, /* y-edge [4]-[7] */ + /* |/ |/ */ 2, -1, -1, 0, /* z-edge [0]-[4] */ + /* [0]------[1] */ 2, +1, -1, 0, /* z-edge [1]-[5] */ + 2, +1, +1, 0, /* z-edge [2]-[6] */ + 2, -1, +1, 0 }; /* z-edge [3]-[7] */ + + HYPRE_SStructGridSetFEMOrdering(edge_grid, part, ordering); + } + + /* Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &A_graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(A_graph, HYPRE_PARCSR); + + /* Indicate that this problem uses finite element stiffness matrices and + load vectors, instead of stencils. */ + HYPRE_SStructGraphSetFEM(A_graph, part); + + /* The edge finite element matrix is full, so there is no need to call the + HYPRE_SStructGraphSetFEMSparsity() function. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(A_graph); + } + + /* Set up the SStruct Matrix and right-hand side vector */ + { + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, A_graph, &A); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &b); + /* Use a ParCSR storage */ + HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + } + + /* Set the matrix and vector entries by finite element assembly */ + { + /* local stiffness matrix and load vector */ + double S[12][12], F[12]; + + int i, j, k; + int index[3]; + + for (i = 1; i <= n; i++) + for (j = 1; j <= n; j++) + for (k = 1; k <= n; k++) + { + /* Compute the FEM matrix and r.h.s. for cell (i,j,k) with + coefficients evaluated at the cell center. */ + index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n; + ComputeFEMND1(S,F,(pi*n+i)*h-h/2,(pj*n+j)*h-h/2,(pk*n+k)*h-h/2,h); + + /* Eliminate boundary conditions on x = 0 */ + if (index[0] == 1) + { + int ii, jj, bc_edges[4] = { 3, 11, 7, 8 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on y = 0 */ + if (index[1] == 1) + { + int ii, jj, bc_edges[4] = { 0, 9, 4, 8 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on z = 0 */ + if (index[2] == 1) + { + int ii, jj, bc_edges[4] = { 0, 1, 2, 3 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on x = 1 */ + if (index[0] == N*n) + { + int ii, jj, bc_edges[4] = { 1, 10, 5, 9 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on y = 1 */ + if (index[1] == N*n) + { + int ii, jj, bc_edges[4] = { 2, 10, 6, 11 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on z = 1 */ + if (index[2] == N*n) + { + int ii, jj, bc_edges[4] = { 4, 5, 6, 7 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + + /* Assemble the matrix */ + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + + /* Assemble the vector */ + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + + /* Collective calls finalizing the matrix and vector assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + } + + /* 3. Create the discrete gradient matrix G, which is needed in AMS. */ + { + int part = 0; + int stencil_size = 2; + + /* Define the discretization stencil relating the edges and nodes of the + grid. */ + { + int ndim = 3; + int entry; + int var = 0; /* the node variable */ + + /* The discrete gradient stencils connect edge to node variables. */ + int Gx_offsets[2][3] = {{-1,0,0},{0,0,0}}; /* x-edge [7]-[6] */ + int Gy_offsets[2][3] = {{0,-1,0},{0,0,0}}; /* y-edge [5]-[6] */ + int Gz_offsets[2][3] = {{0,0,-1},{0,0,0}}; /* z-edge [2]-[6] */ + + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[0]); + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[1]); + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[2]); + + for (entry = 0; entry < stencil_size; entry++) + { + HYPRE_SStructStencilSetEntry(G_stencil[0], entry, Gx_offsets[entry], var); + HYPRE_SStructStencilSetEntry(G_stencil[1], entry, Gy_offsets[entry], var); + HYPRE_SStructStencilSetEntry(G_stencil[2], entry, Gz_offsets[entry], var); + } + } + + /* Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + int nvars = 3; + int var; /* the edge variables */ + + /* Create the discrete gradient graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &G_graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(G_graph, HYPRE_PARCSR); + + /* Since the discrete gradient relates edge and nodal variables (it is a + rectangular matrix), we have to specify the domain (column) grid. */ + HYPRE_SStructGraphSetDomainGrid(G_graph, node_grid); + + /* Tell the graph which stencil to use for each edge variable on each + part (we only have one part). */ + for (var = 0; var < nvars; var++) + HYPRE_SStructGraphSetStencil(G_graph, part, var, G_stencil[var]); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(G_graph); + } + + /* Set up the SStruct Matrix */ + { + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, G_graph, &G); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(G, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(G); + } + + /* Set the discrete gradient values, assuming a "natural" orientation of + the edges (i.e. one in agreement with the coordinate directions). */ + { + int i; + int nedges = n*(n+1)*(n+1); + double *values; + int stencil_indices[2] = {0,1}; /* the nodes of each edge */ + + values = (double*) calloc(2*nedges, sizeof(double)); + + /* The edge orientation is fixed: from first to second node */ + for (i = 0; i < nedges; i++) + { + values[2*i] = -1.0; + values[2*i+1] = 1.0; + } + + /* Set the values in the discrete gradient x-edges */ + { + int var = 0; + int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + /* Set the values in the discrete gradient y-edges */ + { + int var = 1; + int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + /* Set the values in the discrete gradient z-edges */ + { + int var = 2; + int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + + free(values); + } + + /* Finalize the matrix assembly */ + HYPRE_SStructMatrixAssemble(G); + } + + /* 4. Create the vectors of nodal coordinates xcoord, ycoord and zcoord, + which are needed in AMS. */ + { + int i, j, k; + int part = 0; + int var = 0; /* the node variable */ + int index[3]; + double xval, yval, zval; + + /* Create empty vector objects */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &xcoord); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &ycoord); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &zcoord); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(xcoord, HYPRE_PARCSR); + HYPRE_SStructVectorSetObjectType(ycoord, HYPRE_PARCSR); + HYPRE_SStructVectorSetObjectType(zcoord, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(xcoord); + HYPRE_SStructVectorInitialize(ycoord); + HYPRE_SStructVectorInitialize(zcoord); + + /* Compute and set the coordinates of the nodes */ + for (i = 0; i <= n; i++) + for (j = 0; j <= n; j++) + for (k = 0; k <= n; k++) + { + index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n; + + xval = index[0]*h; + yval = index[1]*h; + zval = index[2]*h; + + HYPRE_SStructVectorSetValues(xcoord, part, index, var, &xval); + HYPRE_SStructVectorSetValues(ycoord, part, index, var, &yval); + HYPRE_SStructVectorSetValues(zcoord, part, index, var, &zval); + } + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(xcoord); + HYPRE_SStructVectorAssemble(ycoord); + HYPRE_SStructVectorAssemble(zcoord); + } + + /* 5. Set up a SStruct Vector for the solution vector x */ + { + int part = 0; + int nvalues = n*(n+1)*(n+1); + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &x); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(x); + + /* Set the values for the initial guess x-edge */ + { + int var = 0; + int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + /* Set the values for the initial guess y-edge */ + { + int var = 1; + int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + /* Set the values for the initial guess z-edge */ + { + int var = 2; + int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + free(values); + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(x); + } + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("SStruct phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* 6. Set up and call the PCG-AMS solver (Solver options can be found in the + Reference Manual.) */ + { + double final_res_norm; + int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + HYPRE_ParCSRMatrix par_G; + HYPRE_ParVector par_xcoord; + HYPRE_ParVector par_ycoord; + HYPRE_ParVector par_zcoord; + + /* Extract the ParCSR objects needed in the solver */ + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + HYPRE_SStructMatrixGetObject(G, (void **) &par_G); + HYPRE_SStructVectorGetObject(xcoord, (void **) &par_xcoord); + HYPRE_SStructVectorGetObject(ycoord, (void **) &par_ycoord); + HYPRE_SStructVectorGetObject(zcoord, (void **) &par_zcoord); + + if (myid == 0) + printf("Problem size: %d\n\n", + hypre_ParCSRMatrixGlobalNumRows((hypre_ParCSRMatrix*)par_A)); + + /* Start timing */ + time_index = hypre_InitializeTiming("AMS Setup"); + hypre_BeginTiming(time_index); + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, maxit); /* max iterations */ + HYPRE_PCGSetTol(solver, tol); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 0); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Create AMS preconditioner */ + HYPRE_AMSCreate(&precond); + + /* Set AMS parameters */ + HYPRE_AMSSetMaxIter(precond, 1); + HYPRE_AMSSetTol(precond, 0.0); + HYPRE_AMSSetCycleType(precond, cycle_type); + HYPRE_AMSSetPrintLevel(precond, 1); + + /* Set discrete gradient */ + HYPRE_AMSSetDiscreteGradient(precond, par_G); + + /* Set vertex coordinates */ + HYPRE_AMSSetCoordinateVectors(precond, + par_xcoord, par_ycoord, par_zcoord); + + if (singular_problem) + HYPRE_AMSSetBetaPoissonMatrix(precond, NULL); + + /* Smoothing and AMG options */ + HYPRE_AMSSetSmoothingOptions(precond, + rlx_type, rlx_sweeps, + rlx_weight, rlx_omega); + HYPRE_AMSSetAlphaAMGOptions(precond, + amg_coarsen_type, amg_agg_levels, + amg_rlx_type, theta, amg_interp_type, + amg_Pmax); + HYPRE_AMSSetBetaAMGOptions(precond, + amg_coarsen_type, amg_agg_levels, + amg_rlx_type, theta, amg_interp_type, + amg_Pmax); + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, + (HYPRE_PtrToSolverFcn) HYPRE_AMSSolve, + (HYPRE_PtrToSolverFcn) HYPRE_AMSSetup, + precond); + + /* Call the setup */ + HYPRE_ParCSRPCGSetup(solver, par_A, par_b, par_x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("AMS Solve"); + hypre_BeginTiming(time_index); + + /* Call the solve */ + HYPRE_ParCSRPCGSolve(solver, par_A, par_b, par_x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get some info */ + HYPRE_PCGGetNumIterations(solver, &its); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_AMSDestroy(precond); + HYPRE_ParCSRPCGDestroy(solver); + + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + /* Save the solution for GLVis visualization, see vis/glvis-ex15.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int part = 0; + int nvalues = n*(n+1)*(n+1); + double *xvalues, *yvalues, *zvalues; + + xvalues = (double*) calloc(nvalues, sizeof(double)); + yvalues = (double*) calloc(nvalues, sizeof(double)); + zvalues = (double*) calloc(nvalues, sizeof(double)); + + /* Get local solution in the x-edges */ + { + int var = 0; + int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, xvalues); + } + /* Get local solution in the y-edges */ + { + int var = 1; + int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, yvalues); + } + /* Get local solution in the z-edges */ + { + int var = 2; + int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n}; + int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, zvalues); + } + + sprintf(filename, "%s.%06d", "vis/ex15.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* Finite element space header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: Local_Hex_ND1\n"); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* Save solution with replicated shared data, i.e., element by element, + using the same numbering as the local finite element unknowns. */ + { + int i, j, k, s; + + /* Initial x-, y- and z-edge indices in the values arrays */ + int oi[4] = { 0, n, n*(n+1), n*(n+1)+n }; /* e_0, e_2, e_4, e_6 */ + int oj[4] = { 0, 1, n*(n+1), n*(n+1)+1 }; /* e_3, e_1, e_7, e_5 */ + int ok[4] = { 0, 1, n+1, n+2 }; /* e_8, e_9, e_11, e_10 */ + /* Loop over the cells while updating the above offsets */ + for (k = 0; k < n; k++) + { + for (j = 0; j < n; j++) + { + for (i = 0; i < n; i++) + { + fprintf(file, + "%.14e\n%.14e\n%.14e\n%.14e\n" + "%.14e\n%.14e\n%.14e\n%.14e\n" + "%.14e\n%.14e\n%.14e\n%.14e\n", + xvalues[oi[0]], yvalues[oj[1]], xvalues[oi[1]], yvalues[oj[0]], + xvalues[oi[2]], yvalues[oj[3]], xvalues[oi[3]], yvalues[oj[2]], + zvalues[ok[0]], zvalues[ok[1]], zvalues[ok[3]], zvalues[ok[2]]); + + for (s=0; s<4; s++) oi[s]++, oj[s]++, ok[s]++; + } + for (s=0; s<4; s++) oj[s]++, ok[s]++; + } + for (s=0; s<4; s++) oi[s]+=n, ok[s]+=n+1; + } + } + + fflush(file); + fclose(file); + free(xvalues); + free(yvalues); + free(zvalues); + + /* Save local finite element mesh */ + GLVis_PrintLocalCubicMesh("vis/ex15.mesh", n, n, n, h, + pi*h*n, pj*h*n, pk*h*n, myid); + + /* Additional visualization data */ + GLVis_PrintData("vis/ex15.data", myid, num_procs); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(edge_grid); + HYPRE_SStructGraphDestroy(A_graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + HYPRE_SStructGridDestroy(node_grid); + HYPRE_SStructGraphDestroy(G_graph); + HYPRE_SStructStencilDestroy(G_stencil[0]); + HYPRE_SStructStencilDestroy(G_stencil[1]); + HYPRE_SStructStencilDestroy(G_stencil[2]); + HYPRE_SStructMatrixDestroy(G); + HYPRE_SStructVectorDestroy(xcoord); + HYPRE_SStructVectorDestroy(ycoord); + HYPRE_SStructVectorDestroy(zcoord); + + /* Finalize MPI */ + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex15big.c b/3rd_party/hypre/src/examples/ex15big.c new file mode 100644 index 000000000..4c81daf47 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex15big.c @@ -0,0 +1,958 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 15big + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex15big + + Sample run: mpirun -np 8 ex15big -n 10 + + To see options: ex15big -help + + Description: This example is a slight modification of Example 15 that + illustrates the 64-bit integer support in hypre needed to + runproblems with more than 2B unknowns. + + Specifically, the changes compared to Example 15 are as + follows: + + 1) All integer arguments to HYPRE functions should be + declared of type HYPRE_Int. + + 2) Variables of type HYPRE_Int are 64-bit integers, so + they should be printed in the %lld format (not %d). + + To enable the 64-bit integer support, you need to build + hypre with the --enable-bigint option of 'configure'. + We recommend comparing this example with Example 15. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_mv.h" +#include "HYPRE_sstruct_ls.h" +#include "_hypre_parcsr_ls.h" +#include "HYPRE.h" + +int optionAlpha, optionBeta; + +/* Curl-curl coefficient alpha = mu^{-1} */ +double alpha(double x, double y, double z) +{ + switch (optionAlpha) + { + case 0: /* uniform coefficient */ + return 1.0; + case 1: /* smooth coefficient */ + return x*x+exp(y)+sin(z); + case 2: /* small outside of an interior cube */ + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25) && (fabs(z-0.5) < 0.25)) + return 1.0; + else + return 1.0e-6; + case 3: /* small outside of an interior ball */ + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)+(z-0.5)*(z-0.5)) < 0.0625) + return 1.0; + else + return 1.0e-6; + case 4: /* random coefficient */ + return hypre_Rand(); + default: + return 1.0; + } +} + +/* Mass coefficient beta = sigma */ +double beta(double x, double y, double z) +{ + switch (optionBeta) + { + case 0: /* uniform coefficient */ + return 1.0; + case 1: /* smooth coefficient */ + return x*x+exp(y)+sin(z); + case 2:/* small outside of interior cube */ + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25) && (fabs(z-0.5) < 0.25)) + return 1.0; + else + return 1.0e-6; + case 3: /* small outside of an interior ball */ + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)+(z-0.5)*(z-0.5)) < 0.0625) + return 1.0; + else + return 1.0e-6; + case 4: /* random coefficient */ + return hypre_Rand(); + default: + return 1.0; + } +} + +/* + This routine computes the lowest order Nedelec, or "edge" finite element + stiffness matrix and load vector on a cube of size h. The 12 edges {e_i} + are numbered in terms of the vertices as follows: + + [7]------[6] + /| /| e_0 = 01, e_1 = 12, e_2 = 32, e_3 = 03, + / | / | e_4 = 45, e_5 = 56, e_6 = 76, e_7 = 47, + [4]------[5] | e_8 = 04, e_9 = 15, e_10 = 26, e_11 = 37. + | [3]----|-[2] + | / | / The edges are oriented from first to the + |/ |/ second vertex, e.g. e_0 is from [0] to [1]. + [0]------[1] + + We allow for different scaling of the curl-curl and the mass parts of the + matrix with coefficients alpha and beta respectively: + + S_ij = alpha (curl phi_i,curl phi_j) + beta (phi_i, phi_j). + + The load vector corresponding to a right-hand side of {1,1,1} is + + F_j = (1,phi_j) = h^2/4. +*/ +void ComputeFEMND1(double S[12][12], double F[12], + double x, double y, double z, double h) +{ + int i, j; + + double h2_4 = h*h/4; + + double cS1 = alpha(x,y,z)/(6.0*h), cS2 = 2*cS1, cS4 = 2*cS2; + double cM1 = beta(x,y,z)*h/36.0, cM2 = 2*cM1, cM4 = 2*cM2; + + S[ 0][ 0] = cS4 + cM4; S[ 0][ 1] = cS2; S[ 0][ 2] = -cS1 + cM2; + S[ 0][ 3] = -cS2; S[ 0][ 4] = -cS1 + cM2; S[ 0][ 5] = cS1; + S[ 0][ 6] = -cS2 + cM1; S[ 0][ 7] = -cS1; S[ 0][ 8] = -cS2; + S[ 0][ 9] = cS2; S[ 0][10] = cS1; S[ 0][11] = -cS1; + + S[ 1][ 1] = cS4 + cM4; S[ 1][ 2] = -cS2; S[ 1][ 3] = -cS1 + cM2; + S[ 1][ 4] = cS1; S[ 1][ 5] = -cS1 + cM2; S[ 1][ 6] = -cS1; + S[ 1][ 7] = -cS2 + cM1; S[ 1][ 8] = -cS1; S[ 1][ 9] = -cS2; + S[ 1][10] = cS2; S[ 1][11] = cS1; + + S[ 2][ 2] = cS4 + cM4; S[ 2][ 3] = cS2; S[ 2][ 4] = -cS2 + cM1; + S[ 2][ 5] = -cS1; S[ 2][ 6] = -cS1 + cM2; S[ 2][ 7] = cS1; + S[ 2][ 8] = -cS1; S[ 2][ 9] = cS1; S[ 2][10] = cS2; + S[ 2][11] = -cS2; + + S[ 3][ 3] = cS4 + cM4; S[ 3][ 4] = -cS1; S[ 3][ 5] = -cS2 + cM1; + S[ 3][ 6] = cS1; S[ 3][ 7] = -cS1 + cM2; S[ 3][ 8] = -cS2; + S[ 3][ 9] = -cS1; S[ 3][10] = cS1; S[ 3][11] = cS2; + + S[ 4][ 4] = cS4 + cM4; S[ 4][ 5] = cS2; S[ 4][ 6] = -cS1 + cM2; + S[ 4][ 7] = -cS2; S[ 4][ 8] = cS2; S[ 4][ 9] = -cS2; + S[ 4][10] = -cS1; S[ 4][11] = cS1; + + S[ 5][ 5] = cS4 + cM4; S[ 5][ 6] = -cS2; S[ 5][ 7] = -cS1 + cM2; + S[ 5][ 8] = cS1; S[ 5][ 9] = cS2; S[ 5][10] = -cS2; + S[ 5][11] = -cS1; + + S[ 6][ 6] = cS4 + cM4; S[ 6][ 7] = cS2; S[ 6][ 8] = cS1; + S[ 6][ 9] = -cS1; S[ 6][10] = -cS2; S[ 6][11] = cS2; + + S[ 7][ 7] = cS4 + cM4; S[ 7][ 8] = cS2; S[ 7][ 9] = cS1; + S[ 7][10] = -cS1; S[ 7][11] = -cS2; + + S[ 8][ 8] = cS4 + cM4; S[ 8][ 9] = -cS1 + cM2; S[ 8][10] = -cS2 + cM1; + S[ 8][11] = -cS1 + cM2; + + S[ 9][ 9] = cS4 + cM4; S[ 9][10] = -cS1 + cM2; S[ 9][11] = -cS2 + cM1; + + S[10][10] = cS4 + cM4; S[10][11] = -cS1 + cM2; + + S[11][11] = cS4 + cM4; + + /* The stiffness matrix is symmetric */ + for (i = 1; i < 12; i++) + for (j = 0; j < i; j++) + S[i][j] = S[j][i]; + + for (i = 0; i < 12; i++) + F[i] = h2_4; +} + + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + int n, N, pi, pj, pk; + double h; + + double tol, theta; + int maxit, cycle_type; + int rlx_type, rlx_sweeps, rlx_weight, rlx_omega; + int amg_coarsen_type, amg_agg_levels, amg_rlx_type; + int amg_interp_type, amg_Pmax; + int singular_problem ; + + HYPRE_Int time_index; + + HYPRE_SStructGrid edge_grid; + HYPRE_SStructGraph A_graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + HYPRE_SStructGrid node_grid; + HYPRE_SStructGraph G_graph; + HYPRE_SStructStencil G_stencil[3]; + HYPRE_SStructMatrix G; + HYPRE_SStructVector xcoord, ycoord, zcoord; + + HYPRE_Solver solver, precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 10; + optionAlpha = 0; + optionBeta = 0; + maxit = 100; + tol = 1e-6; + cycle_type = 13; + rlx_type = 2; + rlx_sweeps = 1; + rlx_weight = 1.0; + rlx_omega = 1.0; + amg_coarsen_type = 10; + amg_agg_levels = 1; + amg_rlx_type = 6; + theta = 0.25; + amg_interp_type = 6; + amg_Pmax = 4; + singular_problem = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-a") == 0 ) + { + arg_index++; + optionAlpha = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-b") == 0 ) + { + arg_index++; + optionBeta = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-maxit") == 0 ) + { + arg_index++; + maxit = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-tol") == 0 ) + { + arg_index++; + tol = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-type") == 0 ) + { + arg_index++; + cycle_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlx") == 0 ) + { + arg_index++; + rlx_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxn") == 0 ) + { + arg_index++; + rlx_sweeps = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxw") == 0 ) + { + arg_index++; + rlx_weight = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rlxo") == 0 ) + { + arg_index++; + rlx_omega = atof(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-ctype") == 0 ) + { + arg_index++; + amg_coarsen_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-amgrlx") == 0 ) + { + arg_index++; + amg_rlx_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-agg") == 0 ) + { + arg_index++; + amg_agg_levels = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-itype") == 0 ) + { + arg_index++; + amg_interp_type = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-pmax") == 0 ) + { + arg_index++; + amg_Pmax = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-sing") == 0 ) + { + arg_index++; + singular_problem = 1; + } + else if ( strcmp(argv[arg_index], "-theta") == 0 ) + { + arg_index++; + theta = atof(argv[arg_index++]); + } + + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 10)\n"); + printf(" -a : choice for the curl-curl coefficient (default: 1)\n"); + printf(" -b : choice for the mass coefficient (default: 1)\n"); + printf("\n"); + printf("PCG-AMS solver options: \n"); + printf(" -maxit : maximum number of iterations (100) \n"); + printf(" -tol : convergence tolerance (1e-6) \n"); + printf(" -type : 3-level cycle type (0-8, 11-14) \n"); + printf(" -theta : BoomerAMG threshold (0.25) \n"); + printf(" -ctype : BoomerAMG coarsening type \n"); + printf(" -agg : Levels of BoomerAMG agg. coarsening \n"); + printf(" -amgrlx : BoomerAMG relaxation type \n"); + printf(" -itype : BoomerAMG interpolation type \n"); + printf(" -pmax : BoomerAMG interpolation truncation \n"); + printf(" -rlx : relaxation type \n"); + printf(" -rlxn : number of relaxation sweeps \n"); + printf(" -rlxw : damping parameter (usually <=1) \n"); + printf(" -rlxo : SOR parameter (usually in (0,2)) \n"); + printf(" -sing : curl-curl only (singular) problem \n"); + printf("\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Figure out the processor grid (N x N x N). The local problem size is n^3, + while pi, pj and pk indicate the position in the processor grid. */ + N = pow(num_procs,1.0/3.0) + 0.5; + if (num_procs != N*N*N) + { + if (myid == 0) printf("Can't run on %d processors, try %d.\n", + num_procs, N*N*N); + MPI_Finalize(); + exit(1); + } + h = 1.0 / (N*n); + pk = myid / (N*N); + pj = myid/N - pk*N; + pi = myid - pj*N - pk*N*N; + + /* Start timing */ + time_index = hypre_InitializeTiming("SStruct Setup"); + hypre_BeginTiming(time_index); + + /* 1. Set up the edge and nodal grids. Note that we do this simultaneously + to make sure that they have the same extents. For simplicity we use + only one part to represent the unit cube. */ + { + HYPRE_Int ndim = 3; + HYPRE_Int nparts = 1; + + /* Create empty 2D grid objects */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &node_grid); + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &edge_grid); + + /* Set the extents of the grid - each processor sets its grid boxes. */ + { + HYPRE_Int part = 0; + HYPRE_Int ilower[3] = {1 + pi*n, 1 + pj*n, 1 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + + HYPRE_SStructGridSetExtents(node_grid, part, ilower, iupper); + HYPRE_SStructGridSetExtents(edge_grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each grid. */ + { + HYPRE_Int i; + HYPRE_Int nnodevars = 1; + HYPRE_Int nedgevars = 3; + + HYPRE_SStructVariable nodevars[1] = {HYPRE_SSTRUCT_VARIABLE_NODE}; + HYPRE_SStructVariable edgevars[3] = {HYPRE_SSTRUCT_VARIABLE_XEDGE, + HYPRE_SSTRUCT_VARIABLE_YEDGE, + HYPRE_SSTRUCT_VARIABLE_ZEDGE}; + for (i = 0; i < nparts; i++) + { + HYPRE_SStructGridSetVariables(node_grid, i, nnodevars, nodevars); + HYPRE_SStructGridSetVariables(edge_grid, i, nedgevars, edgevars); + } + } + + /* Since there is only one part, there is no need to call the + SetNeighborPart or SetSharedPart functions, which determine the spatial + relation between the parts. See Examples 12, 13 and 14 for + illustrations of these calls. */ + + /* Now the grids are ready to be used */ + HYPRE_SStructGridAssemble(node_grid); + HYPRE_SStructGridAssemble(edge_grid); + } + + /* 2. Create the finite element stiffness matrix A and load vector b. */ + { + HYPRE_Int part = 0; /* this problem has only one part */ + + /* Set the ordering of the variables in the finite element problem. This + is done by listing the variable offset directions relative to the + element's center. See the Reference Manual for more details. */ + { + HYPRE_Int ordering[48] = { 0, 0, -1, -1, /* x-edge [0]-[1] */ + 1, +1, 0, -1, /* y-edge [1]-[2] */ + /* [7]------[6] */ 0, 0, +1, -1, /* x-edge [3]-[2] */ + /* /| /| */ 1, -1, 0, -1, /* y-edge [0]-[3] */ + /* / | / | */ 0, 0, -1, +1, /* x-edge [4]-[5] */ + /* [4]------[5] | */ 1, +1, 0, +1, /* y-edge [5]-[6] */ + /* | [3]----|-[2] */ 0, 0, +1, +1, /* x-edge [7]-[6] */ + /* | / | / */ 1, -1, 0, +1, /* y-edge [4]-[7] */ + /* |/ |/ */ 2, -1, -1, 0, /* z-edge [0]-[4] */ + /* [0]------[1] */ 2, +1, -1, 0, /* z-edge [1]-[5] */ + 2, +1, +1, 0, /* z-edge [2]-[6] */ + 2, -1, +1, 0 }; /* z-edge [3]-[7] */ + + HYPRE_SStructGridSetFEMOrdering(edge_grid, part, ordering); + } + + /* Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + HYPRE_Int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &A_graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(A_graph, HYPRE_PARCSR); + + /* Indicate that this problem uses finite element stiffness matrices and + load vectors, instead of stencils. */ + HYPRE_SStructGraphSetFEM(A_graph, part); + + /* The edge finite element matrix is full, so there is no need to call the + HYPRE_SStructGraphSetFEMSparsity() function. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(A_graph); + } + + /* Set up the SStruct Matrix and right-hand side vector */ + { + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, A_graph, &A); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &b); + /* Use a ParCSR storage */ + HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + } + + /* Set the matrix and vector entries by finite element assembly */ + { + /* local stiffness matrix and load vector */ + double S[12][12], F[12]; + + int i, j, k; + HYPRE_Int index[3]; + + for (i = 1; i <= n; i++) + for (j = 1; j <= n; j++) + for (k = 1; k <= n; k++) + { + /* Compute the FEM matrix and r.h.s. for cell (i,j,k) with + coefficients evaluated at the cell center. */ + index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n; + ComputeFEMND1(S,F,(pi*n+i)*h-h/2,(pj*n+j)*h-h/2,(pk*n+k)*h-h/2,h); + + /* Eliminate boundary conditions on x = 0 */ + if (index[0] == 1) + { + int ii, jj, bc_edges[4] = { 3, 11, 7, 8 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on y = 0 */ + if (index[1] == 1) + { + int ii, jj, bc_edges[4] = { 0, 9, 4, 8 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on z = 0 */ + if (index[2] == 1) + { + int ii, jj, bc_edges[4] = { 0, 1, 2, 3 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on x = 1 */ + if (index[0] == N*n) + { + int ii, jj, bc_edges[4] = { 1, 10, 5, 9 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on y = 1 */ + if (index[1] == N*n) + { + int ii, jj, bc_edges[4] = { 2, 10, 6, 11 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + /* Eliminate boundary conditions on z = 1 */ + if (index[2] == N*n) + { + int ii, jj, bc_edges[4] = { 4, 5, 6, 7 }; + for (ii = 0; ii < 4; ii++) + { + for (jj = 0; jj < 12; jj++) + S[bc_edges[ii]][jj] = S[jj][bc_edges[ii]] = 0.0; + S[bc_edges[ii]][bc_edges[ii]] = 1.0; + F[bc_edges[ii]] = 0.0; + } + } + + /* Assemble the matrix */ + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + + /* Assemble the vector */ + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + + /* Collective calls finalizing the matrix and vector assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + } + + /* 3. Create the discrete gradient matrix G, which is needed in AMS. */ + { + HYPRE_Int part = 0; + HYPRE_Int stencil_size = 2; + + /* Define the discretization stencil relating the edges and nodes of the + grid. */ + { + HYPRE_Int ndim = 3; + HYPRE_Int entry; + HYPRE_Int var = 0; /* the node variable */ + + /* The discrete gradient stencils connect edge to node variables. */ + HYPRE_Int Gx_offsets[2][3] = {{-1,0,0},{0,0,0}}; /* x-edge [7]-[6] */ + HYPRE_Int Gy_offsets[2][3] = {{0,-1,0},{0,0,0}}; /* y-edge [5]-[6] */ + HYPRE_Int Gz_offsets[2][3] = {{0,0,-1},{0,0,0}}; /* z-edge [2]-[6] */ + + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[0]); + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[1]); + HYPRE_SStructStencilCreate(ndim, stencil_size, &G_stencil[2]); + + for (entry = 0; entry < stencil_size; entry++) + { + HYPRE_SStructStencilSetEntry(G_stencil[0], entry, Gx_offsets[entry], var); + HYPRE_SStructStencilSetEntry(G_stencil[1], entry, Gy_offsets[entry], var); + HYPRE_SStructStencilSetEntry(G_stencil[2], entry, Gz_offsets[entry], var); + } + } + + /* Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + HYPRE_Int nvars = 3; + HYPRE_Int var; /* the edge variables */ + + /* Create the discrete gradient graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, edge_grid, &G_graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(G_graph, HYPRE_PARCSR); + + /* Since the discrete gradient relates edge and nodal variables (it is a + rectangular matrix), we have to specify the domain (column) grid. */ + HYPRE_SStructGraphSetDomainGrid(G_graph, node_grid); + + /* Tell the graph which stencil to use for each edge variable on each + part (we only have one part). */ + for (var = 0; var < nvars; var++) + HYPRE_SStructGraphSetStencil(G_graph, part, var, G_stencil[var]); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(G_graph); + } + + /* Set up the SStruct Matrix */ + { + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, G_graph, &G); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(G, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(G); + } + + /* Set the discrete gradient values, assuming a "natural" orientation of + the edges (i.e. one in agreement with the coordinate directions). */ + { + int i; + int nedges = n*(n+1)*(n+1); + double *values; + HYPRE_Int stencil_indices[2] = {0,1}; /* the nodes of each edge */ + + values = (double*) calloc(2*nedges, sizeof(double)); + + /* The edge orientation is fixed: from first to second node */ + for (i = 0; i < nedges; i++) + { + values[2*i] = -1.0; + values[2*i+1] = 1.0; + } + + /* Set the values in the discrete gradient x-edges */ + { + HYPRE_Int var = 0; + HYPRE_Int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + /* Set the values in the discrete gradient y-edges */ + { + HYPRE_Int var = 1; + HYPRE_Int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + /* Set the values in the discrete gradient z-edges */ + { + HYPRE_Int var = 2; + HYPRE_Int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructMatrixSetBoxValues(G, part, ilower, iupper, var, + stencil_size, stencil_indices, + values); + } + + free(values); + } + + /* Finalize the matrix assembly */ + HYPRE_SStructMatrixAssemble(G); + } + + /* 4. Create the vectors of nodal coordinates xcoord, ycoord and zcoord, + which are needed in AMS. */ + { + int i, j, k; + HYPRE_Int part = 0; + HYPRE_Int var = 0; /* the node variable */ + HYPRE_Int index[3]; + double xval, yval, zval; + + /* Create empty vector objects */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &xcoord); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &ycoord); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, node_grid, &zcoord); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(xcoord, HYPRE_PARCSR); + HYPRE_SStructVectorSetObjectType(ycoord, HYPRE_PARCSR); + HYPRE_SStructVectorSetObjectType(zcoord, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(xcoord); + HYPRE_SStructVectorInitialize(ycoord); + HYPRE_SStructVectorInitialize(zcoord); + + /* Compute and set the coordinates of the nodes */ + for (i = 0; i <= n; i++) + for (j = 0; j <= n; j++) + for (k = 0; k <= n; k++) + { + index[0] = i + pi*n; index[1] = j + pj*n; index[2] = k + pk*n; + + xval = index[0]*h; + yval = index[1]*h; + zval = index[2]*h; + + HYPRE_SStructVectorSetValues(xcoord, part, index, var, &xval); + HYPRE_SStructVectorSetValues(ycoord, part, index, var, &yval); + HYPRE_SStructVectorSetValues(zcoord, part, index, var, &zval); + } + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(xcoord); + HYPRE_SStructVectorAssemble(ycoord); + HYPRE_SStructVectorAssemble(zcoord); + } + + /* 5. Set up a SStruct Vector for the solution vector x */ + { + HYPRE_Int part = 0; + int nvalues = n*(n+1)*(n+1); + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, edge_grid, &x); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(x); + + /* Set the values for the initial guess x-edge */ + { + HYPRE_Int var = 0; + HYPRE_Int ilower[3] = {1 + pi*n, 0 + pj*n, 0 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + /* Set the values for the initial guess y-edge */ + { + HYPRE_Int var = 1; + HYPRE_Int ilower[3] = {0 + pi*n, 1 + pj*n, 0 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + /* Set the values for the initial guess z-edge */ + { + HYPRE_Int var = 2; + HYPRE_Int ilower[3] = {0 + pi*n, 0 + pj*n, 1 + pk*n}; + HYPRE_Int iupper[3] = {n + pi*n, n + pj*n, n + pk*n}; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + free(values); + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(x); + } + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("SStruct phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* 6. Set up and call the PCG-AMS solver (Solver options can be found in the + Reference Manual.) */ + { + double final_res_norm; + HYPRE_Int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + HYPRE_ParCSRMatrix par_G; + HYPRE_ParVector par_xcoord; + HYPRE_ParVector par_ycoord; + HYPRE_ParVector par_zcoord; + + /* Extract the ParCSR objects needed in the solver */ + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + HYPRE_SStructMatrixGetObject(G, (void **) &par_G); + HYPRE_SStructVectorGetObject(xcoord, (void **) &par_xcoord); + HYPRE_SStructVectorGetObject(ycoord, (void **) &par_ycoord); + HYPRE_SStructVectorGetObject(zcoord, (void **) &par_zcoord); + + if (myid == 0) + printf("Problem size: %lld\n\n", + hypre_ParCSRMatrixGlobalNumRows((hypre_ParCSRMatrix*)par_A)); + + /* Start timing */ + time_index = hypre_InitializeTiming("AMS Setup"); + hypre_BeginTiming(time_index); + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, maxit); /* max iterations */ + HYPRE_PCGSetTol(solver, tol); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 0); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Create AMS preconditioner */ + HYPRE_AMSCreate(&precond); + + /* Set AMS parameters */ + HYPRE_AMSSetMaxIter(precond, 1); + HYPRE_AMSSetTol(precond, 0.0); + HYPRE_AMSSetCycleType(precond, cycle_type); + HYPRE_AMSSetPrintLevel(precond, 1); + + /* Set discrete gradient */ + HYPRE_AMSSetDiscreteGradient(precond, par_G); + + /* Set vertex coordinates */ + HYPRE_AMSSetCoordinateVectors(precond, + par_xcoord, par_ycoord, par_zcoord); + + if (singular_problem) + HYPRE_AMSSetBetaPoissonMatrix(precond, NULL); + + /* Smoothing and AMG options */ + HYPRE_AMSSetSmoothingOptions(precond, + rlx_type, rlx_sweeps, + rlx_weight, rlx_omega); + HYPRE_AMSSetAlphaAMGOptions(precond, + amg_coarsen_type, amg_agg_levels, + amg_rlx_type, theta, amg_interp_type, + amg_Pmax); + HYPRE_AMSSetBetaAMGOptions(precond, + amg_coarsen_type, amg_agg_levels, + amg_rlx_type, theta, amg_interp_type, + amg_Pmax); + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, + (HYPRE_PtrToSolverFcn) HYPRE_AMSSolve, + (HYPRE_PtrToSolverFcn) HYPRE_AMSSetup, + precond); + + /* Call the setup */ + HYPRE_ParCSRPCGSetup(solver, par_A, par_b, par_x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("AMS Solve"); + hypre_BeginTiming(time_index); + + /* Call the solve */ + HYPRE_ParCSRPCGSolve(solver, par_A, par_b, par_x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get some info */ + HYPRE_PCGGetNumIterations(solver, &its); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_AMSDestroy(precond); + HYPRE_ParCSRPCGDestroy(solver); + + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(edge_grid); + HYPRE_SStructGraphDestroy(A_graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + HYPRE_SStructGridDestroy(node_grid); + HYPRE_SStructGraphDestroy(G_graph); + HYPRE_SStructStencilDestroy(G_stencil[0]); + HYPRE_SStructStencilDestroy(G_stencil[1]); + HYPRE_SStructStencilDestroy(G_stencil[2]); + HYPRE_SStructMatrixDestroy(G); + HYPRE_SStructVectorDestroy(xcoord); + HYPRE_SStructVectorDestroy(ycoord); + HYPRE_SStructVectorDestroy(zcoord); + + /* Finalize MPI */ + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex16.c b/3rd_party/hypre/src/examples/ex16.c new file mode 100644 index 000000000..14787188e --- /dev/null +++ b/3rd_party/hypre/src/examples/ex16.c @@ -0,0 +1,668 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 16 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex16 + + Sample run: mpirun -np 4 ex16 -n 10 + + To see options: ex16 -help + + Description: This code solves the 2D Laplace equation using a high order + Q3 finite element discretization. Specifically, we solve + -Delta u = 1 with zero boundary conditions on a unit square + domain meshed with a uniform grid. The mesh is distributed + across an N x N process grid, with each processor containing + an n x n sub-mesh of data, so the global mesh is nN x nN. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_mv.h" +#include "HYPRE_sstruct_ls.h" +#include "HYPRE.h" + +#include "vis.c" + +/* + This routine computes the stiffness matrix for the Laplacian on a square of + size h, using bi-cubic elements with degrees of freedom in lexicographical + ordering. So, the element looks as follows: + + [12]-[13]-[14]-[15] + | | + [8] [9] [10] [11] + | | + [4] [5] [6] [7] + | | + [0]--[1]--[2]--[3] +*/ +void ComputeFEMQ3 (double S[16][16], double F[16], double h) +{ + int i, j; + double s = 1.0/33600; + double h2_64 = h*h/64; + + S[ 0][ 0] = 18944*s; + S[ 0][ 1] = -4770*s; + S[ 0][ 2] = 792*s; + S[ 0][ 3] = 574*s; + S[ 0][ 4] = -4770*s; + S[ 0][ 5] = -18711*s; + S[ 0][ 6] = 6075*s; + S[ 0][ 7] = -2439*s; + S[ 0][ 8] = 792*s; + S[ 0][ 9] = 6075*s; + S[ 0][10] = -1944*s; + S[ 0][11] = 747*s; + S[ 0][12] = 574*s; + S[ 0][13] = -2439*s; + S[ 0][14] = 747*s; + S[ 0][15] = -247*s; + + S[ 1][ 1] = 75600*s; + S[ 1][ 2] = -25002*s; + S[ 1][ 3] = 792*s; + S[ 1][ 4] = -18711*s; + S[ 1][ 5] = -39852*s; + S[ 1][ 6] = -7047*s; + S[ 1][ 7] = 6075*s; + S[ 1][ 8] = 6075*s; + S[ 1][ 9] = 9720*s; + S[ 1][10] = 3159*s; + S[ 1][11] = -1944*s; + S[ 1][12] = -2439*s; + S[ 1][13] = -108*s; + S[ 1][14] = -2295*s; + S[ 1][15] = 747*s; + + S[ 2][ 2] = 75600*s; + S[ 2][ 3] = -4770*s; + S[ 2][ 4] = 6075*s; + S[ 2][ 5] = -7047*s; + S[ 2][ 6] = -39852*s; + S[ 2][ 7] = -18711*s; + S[ 2][ 8] = -1944*s; + S[ 2][ 9] = 3159*s; + S[ 2][10] = 9720*s; + S[ 2][11] = 6075*s; + S[ 2][12] = 747*s; + S[ 2][13] = -2295*s; + S[ 2][14] = -108*s; + S[ 2][15] = -2439*s; + + S[ 3][ 3] = 18944*s; + S[ 3][ 4] = -2439*s; + S[ 3][ 5] = 6075*s; + S[ 3][ 6] = -18711*s; + S[ 3][ 7] = -4770*s; + S[ 3][ 8] = 747*s; + S[ 3][ 9] = -1944*s; + S[ 3][10] = 6075*s; + S[ 3][11] = 792*s; + S[ 3][12] = -247*s; + S[ 3][13] = 747*s; + S[ 3][14] = -2439*s; + S[ 3][15] = 574*s; + + S[ 4][ 4] = 75600*s; + S[ 4][ 5] = -39852*s; + S[ 4][ 6] = 9720*s; + S[ 4][ 7] = -108*s; + S[ 4][ 8] = -25002*s; + S[ 4][ 9] = -7047*s; + S[ 4][10] = 3159*s; + S[ 4][11] = -2295*s; + S[ 4][12] = 792*s; + S[ 4][13] = 6075*s; + S[ 4][14] = -1944*s; + S[ 4][15] = 747*s; + + S[ 5][ 5] = 279936*s; + S[ 5][ 6] = -113724*s; + S[ 5][ 7] = 9720*s; + S[ 5][ 8] = -7047*s; + S[ 5][ 9] = -113724*s; + S[ 5][10] = 24057*s; + S[ 5][11] = 3159*s; + S[ 5][12] = 6075*s; + S[ 5][13] = 9720*s; + S[ 5][14] = 3159*s; + S[ 5][15] = -1944*s; + + S[ 6][ 6] = 279936*s; + S[ 6][ 7] = -39852*s; + S[ 6][ 8] = 3159*s; + S[ 6][ 9] = 24057*s; + S[ 6][10] = -113724*s; + S[ 6][11] = -7047*s; + S[ 6][12] = -1944*s; + S[ 6][13] = 3159*s; + S[ 6][14] = 9720*s; + S[ 6][15] = 6075*s; + + S[ 7][ 7] = 75600*s; + S[ 7][ 8] = -2295*s; + S[ 7][ 9] = 3159*s; + S[ 7][10] = -7047*s; + S[ 7][11] = -25002*s; + S[ 7][12] = 747*s; + S[ 7][13] = -1944*s; + S[ 7][14] = 6075*s; + S[ 7][15] = 792*s; + + S[ 8][ 8] = 75600*s; + S[ 8][ 9] = -39852*s; + S[ 8][10] = 9720*s; + S[ 8][11] = -108*s; + S[ 8][12] = -4770*s; + S[ 8][13] = -18711*s; + S[ 8][14] = 6075*s; + S[ 8][15] = -2439*s; + + S[ 9][ 9] = 279936*s; + S[ 9][10] = -113724*s; + S[ 9][11] = 9720*s; + S[ 9][12] = -18711*s; + S[ 9][13] = -39852*s; + S[ 9][14] = -7047*s; + S[ 9][15] = 6075*s; + + S[10][10] = 279936*s; + S[10][11] = -39852*s; + S[10][12] = 6075*s; + S[10][13] = -7047*s; + S[10][14] = -39852*s; + S[10][15] = -18711*s; + + S[11][11] = 75600*s; + S[11][12] = -2439*s; + S[11][13] = 6075*s; + S[11][14] = -18711*s; + S[11][15] = -4770*s; + + S[12][12] = 18944*s; + S[12][13] = -4770*s; + S[12][14] = 792*s; + S[12][15] = 574*s; + + S[13][13] = 75600*s; + S[13][14] = -25002*s; + S[13][15] = 792*s; + + S[14][14] = 75600*s; + S[14][15] = -4770*s; + + S[15][15] = 18944*s; + + /* The stiffness matrix is symmetric */ + for (i = 1; i < 16; i++) + for (j = 0; j < i; j++) + S[i][j] = S[j][i]; + + F[ 0] = h2_64; + F[ 1] = 3*h2_64; + F[ 2] = 3*h2_64; + F[ 3] = h2_64; + F[ 4] = 3*h2_64; + F[ 5] = 9*h2_64; + F[ 6] = 9*h2_64; + F[ 7] = 3*h2_64; + F[ 8] = 3*h2_64; + F[ 9] = 9*h2_64; + F[10] = 9*h2_64; + F[11] = 3*h2_64; + F[12] = h2_64; + F[13] = 3*h2_64; + F[14] = 3*h2_64; + F[15] = h2_64; +} + + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + int n, N, pi, pj; + double h; + int vis; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + HYPRE_Solver solver; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 10; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 10)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Figure out the processor grid (N x N). The local problem size is n^2, + while pi and pj indicate the position in the processor grid. */ + N = pow(num_procs,1.0/2.0) + 0.5; + if (num_procs != N*N) + { + if (myid == 0) + { + printf("Can't run on %d processors, try %d.\n", num_procs, N*N); + } + MPI_Finalize(); + exit(1); + } + h = 1.0 / (N*n); + pj = myid / N; + pi = myid - pj*N; + + /* 1. Set up the grid. For simplicity we use only one part to represent the + unit square. */ + { + int ndim = 2; + int nparts = 1; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Set the extents of the grid - each processor sets its grid boxes. */ + { + int part = 0; + int ilower[2] = {1 + pi*n, 1 + pj*n}; + int iupper[2] = {n + pi*n, n + pj*n}; + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Set the variable type and number of variables on each part. There is + one variable of type NODE, two of type XFACE, two of type YFACE, and + four of type CELL. */ + { + int i; + int nvars = 9; + + HYPRE_SStructVariable vars[9] = {HYPRE_SSTRUCT_VARIABLE_NODE, + HYPRE_SSTRUCT_VARIABLE_XFACE, + HYPRE_SSTRUCT_VARIABLE_XFACE, + HYPRE_SSTRUCT_VARIABLE_YFACE, + HYPRE_SSTRUCT_VARIABLE_YFACE, + HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL}; + for (i = 0; i < nparts; i++) + { + HYPRE_SStructGridSetVariables(grid, i, nvars, vars); + } + } + + /* Set the ordering of the variables in the finite element problem. This + is done by listing the variable numbers and offset directions relative + to the element's center. See the Reference Manual for more details. + The ordering and location of the nine variables in each element is as + follows (notation is [order# : variable#]): + + [12:0]-[13:3]-[14:4]-[15:0] + | | + | | + [8:2] [9:7] [10:8] [11:2] + | | + | | + [4:1] [5:5] [6:6] [7:1] + | | + | | + [0:0]--[1:3]--[2:4]--[3:0] + */ + { + int part = 0; + int ordering[48] = { 0,-1,-1, 3, 0,-1, 4, 0,-1, 0,+1,-1, + 1,-1, 0, 5, 0, 0, 6, 0, 0, 1,+1, 0, + 2,-1, 0, 7, 0, 0, 8, 0, 0, 2,+1, 0, + 0,-1,+1, 3, 0,+1, 4, 0,+1, 0,+1,+1 }; + + HYPRE_SStructGridSetFEMOrdering(grid, part, ordering); + } + + /* Now the grid is ready to be used */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Set up the Graph - this determines the non-zero structure of the + matrix. */ + { + int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + HYPRE_SStructGraphSetObjectType(graph, HYPRE_PARCSR); + + /* Indicate that this problem uses finite element stiffness matrices and + load vectors, instead of stencils. */ + HYPRE_SStructGraphSetFEM(graph, part); + + /* The local stiffness matrix is full, so there is no need to call + HYPRE_SStructGraphSetFEMSparsity() to set its sparsity pattern. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 3. Set up the SStruct Matrix and right-hand side vector */ + { + int part = 0; + + /* Create the matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + /* Use a ParCSR storage */ + HYPRE_SStructMatrixSetObjectType(A, HYPRE_PARCSR); + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + /* Use a ParCSR storage */ + HYPRE_SStructVectorSetObjectType(b, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + + /* Set the matrix and vector entries by finite element assembly */ + { + /* Local stifness matrix and load vector */ + double S[16][16], F[16]; + + int i, j; + int index[2]; + + for (j = 1; j <= n; j++) + { + for (i = 1; i <= n; i++) + { + index[0] = i + pi*n; + index[1] = j + pj*n; + + /* Compute the FEM matrix and rhs */ + ComputeFEMQ3(S, F, h); + + /* Set boundary conditions */ + { + int ii, jj, bdy, dd; + int set_bc[4] = {0, 0, 0, 0}; + int bc_dofs[4][4] = {{ 0, 4, 8, 12}, /* x = 0 boundary */ + { 0, 1, 2, 3}, /* y = 0 boundary */ + { 3, 7, 11, 15}, /* x = 1 boundary */ + {12, 13, 14, 15}}; /* y = 1 boundary */ + + /* Determine the boundary conditions to be set */ + if (index[0] == 1) set_bc[0] = 1; /* x = 0 boundary */ + if (index[1] == 1) set_bc[1] = 1; /* y = 0 boundary */ + if (index[0] == N*n) set_bc[2] = 1; /* x = 1 boundary */ + if (index[1] == N*n) set_bc[3] = 1; /* y = 1 boundary */ + + /* Modify the FEM matrix and rhs on each boundary by setting + rows and columns of S to the identity and F to zero */ + for (bdy = 0; bdy < 4; bdy++) + { + /* Only modify if boundary condition needs to be set */ + if (set_bc[bdy]) + { + for (dd = 0; dd < 4; dd++) + { + for (jj = 0; jj < 16; jj++) + { + ii = bc_dofs[bdy][dd]; + S[ii][jj] = 0.0; /* row */ + S[jj][ii] = 0.0; /* col */ + } + S[ii][ii] = 1.0; /* diagonal */ + F[ii] = 0.0; /* rhs */ + } + } + } + } + + /* Add this elements contribution to the matrix */ + HYPRE_SStructMatrixAddFEMValues(A, part, index, &S[0][0]); + + /* Add this elements contribution to the rhs */ + HYPRE_SStructVectorAddFEMValues(b, part, index, F); + } + } + } + } + + /* Collective calls finalizing the matrix and vector assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + + /* 4. Set up SStruct Vector for the solution vector x */ + { + int part = 0; + int var, nvars = 9; + int nvalues = (n+1)*(n+1); + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + /* Set the object type to ParCSR */ + HYPRE_SStructVectorSetObjectType(x, HYPRE_PARCSR); + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(x); + + /* Set the values for the initial guess one variable at a time. Since the + SetBoxValues() calls below set the values to the right and up from the + cell center, ilower needs to be adjusted. */ + for (var = 0; var < nvars; var++) + { + int ilower[2] = {1 + pi*n, 1 + pj*n}; + int iupper[2] = {n + pi*n, n + pj*n}; + + switch(var) + { + case 0: /* NODE */ + ilower[0]--; + ilower[1]--; + break; + case 1: case 2: /* XFACE */ + ilower[0]--; + break; + case 3: case 4: /* YFACE */ + ilower[1]--; + break; + } + + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + free(values); + + /* Finalize the vector assembly */ + HYPRE_SStructVectorAssemble(x); + } + + /* 5. Set up and call the solver (Solver options can be found in the + Reference Manual.) */ + { + double final_res_norm; + int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + /* Extract the ParCSR objects needed in the solver */ + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + + /* Here we construct a BoomerAMG solver. See the other SStruct examples + as well as the Reference manual for additional solver choices. */ + HYPRE_BoomerAMGCreate(&solver); + HYPRE_BoomerAMGSetCoarsenType(solver, 6); + HYPRE_BoomerAMGSetStrongThreshold(solver, 0.25); + HYPRE_BoomerAMGSetTol(solver, 1e-6); + HYPRE_BoomerAMGSetPrintLevel(solver, 2); + HYPRE_BoomerAMGSetMaxIter(solver, 50); + + /* call the setup */ + HYPRE_BoomerAMGSetup(solver, par_A, par_b, par_x); + + /* call the solve */ + HYPRE_BoomerAMGSolve(solver, par_A, par_b, par_x); + + /* get some info */ + HYPRE_BoomerAMGGetNumIterations(solver, &its); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(solver, + &final_res_norm); + /* clean up */ + HYPRE_BoomerAMGDestroy(solver); + + /* Gather the solution vector */ + HYPRE_SStructVectorGather(x); + + /* Save the solution for GLVis visualization, see vis/glvis-ex16.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int part = 0; + int i, j, k, index[2]; + int nvalues = n*n*16; + double X[16], *values; + + /* GLVis-to-hypre local renumbering */ + int g2h[16] = {0, 3, 15, 12, 1, 2, 7, 11, 14, 13, 8, 4, 5, 6, 9, 10}; + + values = (double*) calloc(nvalues, sizeof(double)); + + nvalues = 0; + for (j = 1; j <= n; j++) + { + for (i = 1; i <= n; i++) + { + index[0] = i + pi*n; + index[1] = j + pj*n; + + /* Get local element solution values X */ + HYPRE_SStructVectorGetFEMValues(x, part, index, X); + + /* Copy local solution X into values array */ + for (k = 0; k < 16; k++) + { + values[nvalues] = X[g2h[k]]; + nvalues++; + } + } + } + + sprintf(filename, "%s.%06d", "vis/ex16.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* Finite element space header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: Local_Quad_Q3\n"); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* Save solution with replicated shared data */ + for (i = 0; i < nvalues; i++) + fprintf(file, "%.14e\n", values[i]); + + fflush(file); + fclose(file); + free(values); + + /* Save local finite element mesh */ + GLVis_PrintLocalSquareMesh("vis/ex16.mesh", n, n, h, + pi*h*n, pj*h*n, myid); + + /* Additional visualization data */ + GLVis_PrintData("vis/ex16.data", myid, num_procs); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex17.c b/3rd_party/hypre/src/examples/ex17.c new file mode 100644 index 000000000..e12ae1695 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex17.c @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 17 + + Interface: Structured interface (Struct) + + Compile with: make ex17 + + Sample run: mpirun -np 16 ex17 -n 10 + + To see options: ex17 -help + + Description: This code solves an "NDIM-D Laplacian" using CG. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_struct_ls.h" + +#define NDIM 4 +#define NSTENC (2*NDIM+1) + +int main (int argc, char *argv[]) +{ + int d, i, j; + int myid, num_procs; + int n, N, nvol, div, rem; + int p[NDIM], ilower[NDIM], iupper[NDIM]; + + int solver_id; + + HYPRE_StructGrid grid; + HYPRE_StructStencil stencil; + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_StructSolver solver; + + int num_iterations; + double final_res_norm; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set defaults */ + n = 10; + solver_id = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 33)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - CG (default)\n"); + printf(" 1 - GMRES\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + nvol = pow(n, NDIM); + + /* Figure out the processor grid (N x N x N x N). The local problem size for + the interior nodes is indicated by n (n x n x n x n). p indicates the + position in the processor grid. */ + N = pow(num_procs, 1.0/NDIM) + 1.0e-6; + div = pow(N, NDIM); + rem = myid; + if (num_procs != div) + { + printf("Num procs is not a perfect NDIM-th root!\n"); + MPI_Finalize(); + exit(1); + } + for (d = NDIM-1; d >= 0; d--) + { + div /= N; + p[d] = rem / div; + rem %= div; + } + + /* Figure out the extents of each processor's piece of the grid. */ + for (d = 0; d < NDIM; d++) + { + ilower[d] = p[d]*n; + iupper[d] = ilower[d] + n-1; + } + + /* 1. Set up a grid */ + { + /* Create an empty 2D grid object */ + HYPRE_StructGridCreate(MPI_COMM_WORLD, NDIM, &grid); + + /* Add a new box to the grid */ + HYPRE_StructGridSetExtents(grid, ilower, iupper); + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_StructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create an empty NDIM-D, NSTENC-pt stencil object */ + HYPRE_StructStencilCreate(NDIM, NSTENC, &stencil); + + /* Define the geometry of the stencil */ + { + int entry; + int offset[NDIM]; + + entry = 0; + for (d = 0; d < NDIM; d++) + { + offset[d] = 0; + } + HYPRE_StructStencilSetElement(stencil, entry++, offset); + for (d = 0; d < NDIM; d++) + { + offset[d] = -1; + HYPRE_StructStencilSetElement(stencil, entry++, offset); + offset[d] = 1; + HYPRE_StructStencilSetElement(stencil, entry++, offset); + offset[d] = 0; + } + } + } + + /* 3. Set up a Struct Matrix */ + { + int nentries = NSTENC; + int nvalues = nentries*nvol; + double *values; + int stencil_indices[NSTENC]; + + /* Create an empty matrix object */ + HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &A); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_StructMatrixInitialize(A); + + values = (double*) calloc(nvalues, sizeof(double)); + + for (j = 0; j < nentries; j++) + { + stencil_indices[j] = j; + } + + /* Set the standard stencil at each grid point; fix boundaries later */ + for (i = 0; i < nvalues; i += nentries) + { + values[i] = NSTENC; /* Use absolute row sum */ + for (j = 1; j < nentries; j++) + { + values[i+j] = -1.0; + } + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + + free(values); + } + + /* 4. Incorporate zero boundary conditions: go along each edge of the domain + and set the stencil entry that reaches to the boundary to zero.*/ + { + int bc_ilower[NDIM]; + int bc_iupper[NDIM]; + int nentries = 1; + int nvalues = nentries*nvol/n; /* number of stencil entries times the + length of one side of my grid box */ + double *values; + int stencil_indices[1]; + + values = (double*) calloc(nvalues, sizeof(double)); + for (j = 0; j < nvalues; j++) + { + values[j] = 0.0; + } + + for (d = 0; d < NDIM; d++) + { + bc_ilower[d] = ilower[d]; + bc_iupper[d] = iupper[d]; + } + stencil_indices[0] = 1; + for (d = 0; d < NDIM; d++) + { + /* lower boundary in dimension d */ + if (p[d] == 0) + { + bc_iupper[d] = ilower[d]; + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + bc_iupper[d] = iupper[d]; + } + stencil_indices[0]++; + + /* upper boundary in dimension d */ + if (p[d] == N-1) + { + bc_ilower[d] = iupper[d]; + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + bc_ilower[d] = ilower[d]; + } + stencil_indices[0]++; + } + + free(values); + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_StructMatrixAssemble(A); + + /* 5. Set up Struct Vectors for b and x */ + { + int nvalues = nvol; + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_StructVectorInitialize(b); + HYPRE_StructVectorInitialize(x); + + /* Set the values */ + for (i = 0; i < nvalues; i ++) + { + values[i] = 1.0; + } + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < nvalues; i ++) + { + values[i] = 0.0; + } + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + + free(values); + + /* This is a collective call finalizing the vector assembly. + The vector is now ``ready to be used'' */ + HYPRE_StructVectorAssemble(b); + HYPRE_StructVectorAssemble(x); + } + +#if 0 + HYPRE_StructMatrixPrint("ex17.out.A", A, 0); + HYPRE_StructVectorPrint("ex17.out.b", b, 0); + HYPRE_StructVectorPrint("ex17.out.x0", x, 0); +#endif + + /* 6. Set up and use a struct solver + (Solver options can be found in the Reference Manual.) */ + if (solver_id == 0) + { + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPCGSetMaxIter(solver, 100); + HYPRE_StructPCGSetTol(solver, 1.0e-06); + HYPRE_StructPCGSetTwoNorm(solver, 1); + HYPRE_StructPCGSetRelChange(solver, 0); + HYPRE_StructPCGSetPrintLevel(solver, 2); /* print each CG iteration */ + HYPRE_StructPCGSetLogging(solver, 1); + + /* No preconditioner */ + + HYPRE_StructPCGSetup(solver, A, b, x); + HYPRE_StructPCGSolve(solver, A, b, x); + + /* Get some info on the run */ + HYPRE_StructPCGGetNumIterations(solver, &num_iterations); + HYPRE_StructPCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_StructPCGDestroy(solver); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_StructGridDestroy(grid); + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex18.c b/3rd_party/hypre/src/examples/ex18.c new file mode 100644 index 000000000..11517afc1 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex18.c @@ -0,0 +1,442 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 18 + + Interface: SStructured interface (SStruct) + + Compile with: make ex18 + + Sample run: mpirun -np 16 ex18 -n 4 + + To see options: ex18 -help + + Description: This code solves an "NDIM-D Laplacian" using CG. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_ls.h" + +#define NDIM 4 +#define NPARTS 1 +#define NVARS 2 +#define NSTENC NVARS*(2*NDIM+1) + +int main (int argc, char *argv[]) +{ + int d, i, j; + int myid, num_procs; + int n, N, nvol, div, rem; + int p[NDIM], ilower[NDIM], iupper[NDIM]; + + int solver_id, object_type = HYPRE_SSTRUCT; + + HYPRE_SStructGrid grid; + HYPRE_SStructStencil stencil0, stencil1; + HYPRE_SStructGraph graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + HYPRE_SStructSolver solver; + + int num_iterations; + double final_res_norm; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set defaults */ + n = 4; + solver_id = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 4)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - CG (default)\n"); + printf(" 1 - GMRES\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + nvol = pow(n, NDIM); + + /* Figure out the processor grid (N x N x N x N). The local problem size for + the interior nodes is indicated by n (n x n x n x n). p indicates the + position in the processor grid. */ + N = pow(num_procs, 1.0/NDIM) + 1.0e-6; + div = pow(N, NDIM); + rem = myid; + if (num_procs != div) + { + printf("Num procs is not a perfect NDIM-th root!\n"); + MPI_Finalize(); + exit(1); + } + for (d = NDIM-1; d >= 0; d--) + { + div /= N; + p[d] = rem / div; + rem %= div; + } + + /* Figure out the extents of each processor's piece of the grid. */ + for (d = 0; d < NDIM; d++) + { + ilower[d] = p[d]*n; + iupper[d] = ilower[d] + n-1; + } + + /* 1. Set up a grid */ + { + int part = 0; + HYPRE_SStructVariable vartypes[NVARS] = {HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL}; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, NDIM, NPARTS, &grid); + + /* Add a new box to the grid */ + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + + /* Set the variable type and number of variables on each part. */ + HYPRE_SStructGridSetVariables(grid, part, NVARS, vartypes); + + /* The grid is now ready to use */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create two empty NDIM-D, NSTENC-pt stencil objects */ + HYPRE_SStructStencilCreate(NDIM, NSTENC, &stencil0); + HYPRE_SStructStencilCreate(NDIM, NSTENC, &stencil1); + + /* Define the geometry of the stencil */ + { + int entry, var0 = 0, var1 = 1; + int offset[NDIM]; + + entry = 0; + for (d = 0; d < NDIM; d++) + { + offset[d] = 0; + } + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + for (d = 0; d < NDIM; d++) + { + offset[d] = -1; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + offset[d] = 1; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + offset[d] = 0; + } + } + } + + /* 3. Set up the Graph */ + { + int part = 0; + int var0 = 0, var1 = 1; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* Set up the object type (see Matrix and VectorSetObjectType below) */ + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Set the stencil */ + HYPRE_SStructGraphSetStencil(graph, part, var0, stencil0); + HYPRE_SStructGraphSetStencil(graph, part, var1, stencil1); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up the Matrix */ + { + int part = 0; + int var0 = 0, var1 = 1; + int nentries = NSTENC/NVARS; + int nvalues = nentries*nvol; + double *values; + int stencil_indices[NSTENC]; + + /* Create an empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set up the object type */ + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Get ready to set values */ + HYPRE_SStructMatrixInitialize(A); + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Set intra-variable values; fix boundaries later */ + for (j = 0; j < nentries; j++) + { + stencil_indices[j] = 2*j; + } + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 1.1*(NSTENC/NVARS); /* Diagonal: Use absolute row sum */ + for (j = 1; j < nentries; j++) + { + values[i+j] = -1.0; + } + } + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var1, + nentries, stencil_indices, values); + + /* Set inter-variable values; fix boundaries later */ + for (j = 0; j < nentries; j++) + { + stencil_indices[j] = 2*j+1; + } + for (i = 0; i < nvalues; i += nentries) + { + values[i] = -0.1; + for (j = 1; j < nentries; j++) + { + values[i+j] = -0.1; + } + } + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var1, + nentries, stencil_indices, values); + + free(values); + } + + /* 5. Incorporate zero boundary conditions: go along each edge of the domain + and set the stencil entry that reaches to the boundary to zero.*/ + { + int part = 0; + int var0 = 0, var1 = 1; + int bc_ilower[NDIM]; + int bc_iupper[NDIM]; + int nentries = 1; + int nvalues = nentries*nvol/n; /* number of stencil entries times the + length of one side of my grid box */ + double *values; + int stencil_indices[1]; + + values = (double*) calloc(nvalues, sizeof(double)); + for (j = 0; j < nvalues; j++) + { + values[j] = 0.0; + } + + for (d = 0; d < NDIM; d++) + { + bc_ilower[d] = ilower[d]; + bc_iupper[d] = iupper[d]; + } + stencil_indices[0] = NVARS; + for (d = 0; d < NDIM; d++) + { + /* lower boundary in dimension d */ + if (p[d] == 0) + { + bc_iupper[d] = ilower[d]; + for (i = 0; i < NVARS; i++) + { + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var1, + nentries, stencil_indices, values); + stencil_indices[0]++; + } + bc_iupper[d] = iupper[d]; + } + else + { + stencil_indices[0] += NVARS; + } + + /* upper boundary in dimension d */ + if (p[d] == N-1) + { + bc_ilower[d] = iupper[d]; + for (i = 0; i < NVARS; i++) + { + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var1, + nentries, stencil_indices, values); + stencil_indices[0]++; + } + bc_ilower[d] = ilower[d]; + } + else + { + stencil_indices[0] += NVARS; + } + } + + free(values); + } + + /* The matrix is now ready to use */ + HYPRE_SStructMatrixAssemble(A); + + /* 6. Set up Vectors for b and x */ + { + int part = 0; + int var0 = 0, var1 = 1; + int nvalues = NVARS*nvol; + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Set up the object type */ + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + /* Set the values */ + for (i = 0; i < nvalues; i ++) + { + values[i] = 1.0; + } + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var0, values); + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var1, values); + + for (i = 0; i < nvalues; i ++) + { + values[i] = 0.0; + } + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var0, values); + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var1, values); + + free(values); + + /* The vector is now ready to use */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + +#if 0 + HYPRE_SStructMatrixPrint("ex18.out.A", A, 0); + HYPRE_SStructVectorPrint("ex18.out.b", b, 0); + HYPRE_SStructVectorPrint("ex18.out.x0", x, 0); +#endif + + /* 7. Set up and use a struct solver */ + if (solver_id == 0) + { + HYPRE_SStructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_SStructPCGSetMaxIter(solver, 100); + HYPRE_SStructPCGSetTol(solver, 1.0e-06); + HYPRE_SStructPCGSetTwoNorm(solver, 1); + HYPRE_SStructPCGSetRelChange(solver, 0); + HYPRE_SStructPCGSetPrintLevel(solver, 2); /* print each CG iteration */ + HYPRE_SStructPCGSetLogging(solver, 1); + + /* No preconditioner */ + + HYPRE_SStructPCGSetup(solver, A, b, x); + HYPRE_SStructPCGSolve(solver, A, b, x); + + /* Get some info on the run */ + HYPRE_SStructPCGGetNumIterations(solver, &num_iterations); + HYPRE_SStructPCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_SStructPCGDestroy(solver); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructStencilDestroy(stencil0); + HYPRE_SStructStencilDestroy(stencil1); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex18comp.c b/3rd_party/hypre/src/examples/ex18comp.c new file mode 100644 index 000000000..357d09db9 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex18comp.c @@ -0,0 +1,451 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 18comp + + Interface: SStructured interface (SStruct) + + Compile with: make ex18comp + + Sample run: mpirun -np 16 ex18comp -n 4 + + To see options: ex18comp -help + + Description: This code solves a complex "NDIM-D Laplacian" using CG. +*/ + +#include +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_ls.h" + +#define NDIM 4 +#define NPARTS 1 +#define NVARS 2 +#define NSTENC NVARS*(2*NDIM+1) + +int main (int argc, char *argv[]) +{ + int d, i, j; + int myid, num_procs; + int n, N, nvol, div, rem; + int p[NDIM], ilower[NDIM], iupper[NDIM]; + + int solver_id, object_type = HYPRE_SSTRUCT; + + HYPRE_SStructGrid grid; + HYPRE_SStructStencil stencil0, stencil1; + HYPRE_SStructGraph graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + HYPRE_SStructSolver solver; + + int num_iterations; + double final_res_norm; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set defaults */ + n = 4; + solver_id = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 4)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - CG (default)\n"); + printf(" 1 - GMRES\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + nvol = pow(n, NDIM); + + /* Figure out the processor grid (N x N x N x N). The local problem size for + the interior nodes is indicated by n (n x n x n x n). p indicates the + position in the processor grid. */ + N = pow(num_procs, 1.0/NDIM) + 1.0e-6; + div = pow(N, NDIM); + rem = myid; + if (num_procs != div) + { + printf("Num procs is not a perfect NDIM-th root!\n"); + MPI_Finalize(); + exit(1); + } + for (d = NDIM-1; d >= 0; d--) + { + div /= N; + p[d] = rem / div; + rem %= div; + } + + /* Figure out the extents of each processor's piece of the grid. */ + for (d = 0; d < NDIM; d++) + { + ilower[d] = p[d]*n; + iupper[d] = ilower[d] + n-1; + } + + /* 1. Set up a grid */ + { + int part = 0; + HYPRE_SStructVariable vartypes[NVARS] = {HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL}; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, NDIM, NPARTS, &grid); + + /* Add a new box to the grid */ + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + + /* Set the variable type and number of variables on each part. */ + HYPRE_SStructGridSetVariables(grid, part, NVARS, vartypes); + + /* The grid is now ready to use */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create two empty NDIM-D, NSTENC-pt stencil objects */ + HYPRE_SStructStencilCreate(NDIM, NSTENC, &stencil0); + HYPRE_SStructStencilCreate(NDIM, NSTENC, &stencil1); + + /* Define the geometry of the stencil */ + { + int entry, var0 = 0, var1 = 1; + int offset[NDIM]; + + entry = 0; + for (d = 0; d < NDIM; d++) + { + offset[d] = 0; + } + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + for (d = 0; d < NDIM; d++) + { + offset[d] = -1; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + offset[d] = 1; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var0); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var1); + entry++; + HYPRE_SStructStencilSetEntry(stencil0, entry, offset, var1); + HYPRE_SStructStencilSetEntry(stencil1, entry, offset, var0); + entry++; + offset[d] = 0; + } + } + } + + /* 3. Set up the Graph */ + { + int part = 0; + int var0 = 0, var1 = 1; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* Set up the object type (see Matrix and VectorSetObjectType below) */ + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Set the stencil */ + HYPRE_SStructGraphSetStencil(graph, part, var0, stencil0); + HYPRE_SStructGraphSetStencil(graph, part, var1, stencil1); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up the Matrix */ + { + int part = 0; + int var0 = 0, var1 = 1; + int nentries = NSTENC/NVARS; + int nvalues = nentries*nvol; + HYPRE_Complex *values; + int stencil_indices[NSTENC]; + + /* Create an empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set up the object type */ + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Get ready to set values */ + HYPRE_SStructMatrixInitialize(A); + + values = (HYPRE_Complex*) calloc(nvalues, sizeof(HYPRE_Complex)); + + /* Set intra-variable values; fix boundaries later */ + for (j = 0; j < nentries; j++) + { + stencil_indices[j] = 2*j; + } + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 1.1*(NSTENC/NVARS); /* Diagonal: Use absolute row sum */ + for (j = 1; j < nentries; j++) + { + values[i+j] = -1.0; + } + } + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var1, + nentries, stencil_indices, values); + + /* Set inter-variable values; fix boundaries later */ + for (j = 0; j < nentries; j++) + { + stencil_indices[j] = 2*j+1; + } + /* Add an imaginary component and ensure conjugate to below */ + for (i = 0; i < nvalues; i += nentries) + { + for (j = 0; j < nentries; j++) + { + values[i+j] =(-0.1 + (HYPRE_Complex)I*0.1); + } + } + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var0, + nentries, stencil_indices, values); + /* Add an imaginary component and ensure conjugate to above */ + for (i = 0; i < nvalues; i += nentries) + { + for (j = 0; j < nentries; j++) + { + values[i+j] =(HYPRE_Complex)(-0.1 - I*0.1); + } + } + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, var1, + nentries, stencil_indices, values); + + free(values); + } + + /* 5. Incorporate zero boundary conditions: go along each edge of the domain + and set the stencil entry that reaches to the boundary to zero.*/ + { + int part = 0; + int var0 = 0, var1 = 1; + int bc_ilower[NDIM]; + int bc_iupper[NDIM]; + int nentries = 1; + int nvalues = nentries*nvol/n; /* number of stencil entries times the + length of one side of my grid box */ + HYPRE_Complex *values; + int stencil_indices[1]; + + values = (HYPRE_Complex*) calloc(nvalues, sizeof(HYPRE_Complex)); + for (j = 0; j < nvalues; j++) + { + values[j] = 0.0; + } + + for (d = 0; d < NDIM; d++) + { + bc_ilower[d] = ilower[d]; + bc_iupper[d] = iupper[d]; + } + stencil_indices[0] = NVARS; + for (d = 0; d < NDIM; d++) + { + /* lower boundary in dimension d */ + if (p[d] == 0) + { + bc_iupper[d] = ilower[d]; + for (i = 0; i < NVARS; i++) + { + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var1, + nentries, stencil_indices, values); + stencil_indices[0]++; + } + bc_iupper[d] = iupper[d]; + } + else + { + stencil_indices[0] += NVARS; + } + + /* upper boundary in dimension d */ + if (p[d] == N-1) + { + bc_ilower[d] = iupper[d]; + for (i = 0; i < NVARS; i++) + { + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var0, + nentries, stencil_indices, values); + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var1, + nentries, stencil_indices, values); + stencil_indices[0]++; + } + bc_ilower[d] = ilower[d]; + } + else + { + stencil_indices[0] += NVARS; + } + } + + free(values); + } + + /* The matrix is now ready to use */ + HYPRE_SStructMatrixAssemble(A); + + /* 6. Set up Vectors for b and x */ + { + int part = 0; + int var0 = 0, var1 = 1; + int nvalues = NVARS*nvol; + HYPRE_Complex *values; + + values = (HYPRE_Complex*) calloc(nvalues, sizeof(HYPRE_Complex)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Set up the object type */ + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + /* Set the values */ + for (i = 0; i < nvalues; i ++) + { + values[i] = 1.0; + } + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var0, values); + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var1, values); + + for (i = 0; i < nvalues; i ++) + { + values[i] = 0.0; + } + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var0, values); + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var1, values); + + free(values); + + /* The vector is now ready to use */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + +#if 0 + HYPRE_SStructMatrixPrint("ex18comp.out.A", A, 0); + HYPRE_SStructVectorPrint("ex18comp.out.b", b, 0); + HYPRE_SStructVectorPrint("ex18comp.out.x0", x, 0); +#endif + + /* 7. Set up and use a struct solver */ + if (solver_id == 0) + { + HYPRE_SStructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_SStructPCGSetMaxIter(solver, 100); + HYPRE_SStructPCGSetTol(solver, 1.0e-06); + HYPRE_SStructPCGSetTwoNorm(solver, 1); + HYPRE_SStructPCGSetRelChange(solver, 0); + HYPRE_SStructPCGSetPrintLevel(solver, 2); /* print each CG iteration */ + HYPRE_SStructPCGSetLogging(solver, 1); + + /* No preconditioner */ + + HYPRE_SStructPCGSetup(solver, A, b, x); + HYPRE_SStructPCGSolve(solver, A, b, x); + + /* Get some info on the run */ + HYPRE_SStructPCGGetNumIterations(solver, &num_iterations); + HYPRE_SStructPCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_SStructPCGDestroy(solver); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructStencilDestroy(stencil0); + HYPRE_SStructStencilDestroy(stencil1); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex2.c b/3rd_party/hypre/src/examples/ex2.c new file mode 100644 index 000000000..23f358c05 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex2.c @@ -0,0 +1,485 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 2 + + Interface: Structured interface (Struct) + + Compile with: make ex2 + + Sample run: mpirun -np 2 ex2 + + Description: This is a two processor example and is similar to the previous + structured interface example (Example 1). However, in + this case the grid boxes are exactly those in the example + diagram in the struct interface chapter of the User's Manual. + (Processor 0 owns two boxes and processor 1 owns one box.) + The solver is PCG with SMG preconditioner. + + We recommend viewing example 1 before viewing this + example. +*/ + +#include + +/* Struct linear solvers header */ +#include "HYPRE_struct_ls.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i, j; + + int myid, num_procs; + + int vis = 0; + + HYPRE_StructGrid grid; + HYPRE_StructStencil stencil; + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs != 2) + { + if (myid == 0) printf("Must run with 2 processors!\n"); + MPI_Finalize(); + + return(0); + } + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* 1. Set up a grid */ + { + /* Create an empty 2D grid object */ + HYPRE_StructGridCreate(MPI_COMM_WORLD, 2, &grid); + + /* Processor 0 owns two boxes in the grid. */ + if (myid == 0) + { + /* Add a new box to the grid */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + HYPRE_StructGridSetExtents(grid, ilower, iupper); + } + + /* Add a new box to the grid */ + { + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + HYPRE_StructGridSetExtents(grid, ilower, iupper); + } + } + + /* Processor 1 owns one box in the grid. */ + else if (myid == 1) + { + /* Add a new box to the grid */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + HYPRE_StructGridSetExtents(grid, ilower, iupper); + } + } + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_StructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_StructStencilCreate(2, 5, &stencil); + + /* Define the geometry of the stencil. Each represents a + relative offset (in the index space). */ + { + int entry; + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + /* Assign each of the 5 stencil entries */ + for (entry = 0; entry < 5; entry++) + HYPRE_StructStencilSetElement(stencil, entry, offsets[entry]); + } + } + + /* 3. Set up a Struct Matrix */ + { + /* Create an empty matrix object */ + HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &A); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_StructMatrixInitialize(A); + + if (myid == 0) + { + /* Set the matrix coefficients for some set of stencil entries + over all the gridpoints in my first box (account for boundary + grid points later) */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nentries = 5; + int nvalues = 30; /* 6 grid points, each with 5 stencil entries */ + double values[30]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) /* label the stencil indices - + these correspond to the offsets + defined above */ + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + } + + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my second box */ + { + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + int nentries = 5; + int nvalues = 60; /* 12 grid points, each with 5 stencil entries */ + double values[60]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + } + } + else if (myid == 1) + { + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my box */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + int nentries = 5; + int nvalues = 80; /* 16 grid points, each with 5 stencil entries */ + double values[80]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + } + } + + /* For each box, set any coefficients that reach outside of the + boundary to 0 */ + if (myid == 0) + { + int maxnvalues = 6; + double values[6]; + + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + { + /* Values below our first AND second box */ + int ilower[2] = {-3, 1}; + int iupper[2] = { 2, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values to the left of our first box */ + int ilower[2] = {-3, 1}; + int iupper[2] = {-3, 2}; + + int stencil_indices[1] = {1}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values above our first box */ + int ilower[2] = {-3, 2}; + int iupper[2] = {-1, 2}; + + int stencil_indices[1] = {4}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values to the left of our second box (that do not border the + first box). */ + int ilower[2] = { 0, 3}; + int iupper[2] = { 0, 4}; + + int stencil_indices[1] = {1}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values above our second box */ + int ilower[2] = { 0, 4}; + int iupper[2] = { 2, 4}; + + int stencil_indices[1] = {4}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + } + else if (myid == 1) + { + int maxnvalues = 4; + double values[4]; + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + { + /* Values below our box */ + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values to the right of our box */ + int ilower[2] = { 6, 1}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {2}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + + { + /* Values above our box */ + int ilower[2] = { 3, 4}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {4}; + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 1, + stencil_indices, values); + } + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_StructMatrixAssemble(A); + } + + /* 4. Set up Struct Vectors for b and x */ + { + /* Create an empty vector object */ + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_StructVectorInitialize(b); + HYPRE_StructVectorInitialize(x); + + if (myid == 0) + { + /* Set the vector coefficients over the gridpoints in my first box */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nvalues = 6; /* 6 grid points */ + double values[6]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + } + + /* Set the vector coefficients over the gridpoints in my second box */ + { + int ilower[2] = { 0, 1}; + int iupper[2] = { 2, 4}; + + int nvalues = 12; /* 12 grid points */ + double values[12]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + } + } + else if (myid == 1) + { + /* Set the vector coefficients over the gridpoints in my box */ + { + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 4}; + + int nvalues = 16; /* 16 grid points */ + double values[16]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + } + } + + /* This is a collective call finalizing the vector assembly. + The vectors are now ``ready to be used'' */ + HYPRE_StructVectorAssemble(b); + HYPRE_StructVectorAssemble(x); + } + + + /* 5. Set up and use a solver (See the Reference Manual for descriptions + of all of the options.) */ + { + /* Create an empty PCG Struct solver */ + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set PCG parameters */ + HYPRE_StructPCGSetTol(solver, 1.0e-06); + HYPRE_StructPCGSetPrintLevel(solver, 2); + HYPRE_StructPCGSetMaxIter(solver, 50); + + /* Use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, 1); + HYPRE_StructSMGSetNumPostRelax(precond, 1); + + /* Set preconditioner and solve */ + HYPRE_StructPCGSetPrecond(solver, HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, precond); + HYPRE_StructPCGSetup(solver, A, b, x); + HYPRE_StructPCGSolve(solver, A, b, x); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex2.sh */ + if (vis) + { + GLVis_PrintStructGrid(grid, "vis/ex2.mesh", myid, NULL, NULL); + GLVis_PrintStructVector(x, "vis/ex2.sol", myid); + GLVis_PrintData("vis/ex2.data", myid, num_procs); + } + + /* Free memory */ + HYPRE_StructGridDestroy(grid); + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + HYPRE_StructPCGDestroy(solver); + HYPRE_StructSMGDestroy(precond); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex3.c b/3rd_party/hypre/src/examples/ex3.c new file mode 100644 index 000000000..b102b6631 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex3.c @@ -0,0 +1,458 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 3 + + Interface: Structured interface (Struct) + + Compile with: make ex3 + + Sample run: mpirun -np 16 ex3 -n 33 -solver 0 -v 1 1 + + To see options: ex3 -help + + Description: This code solves a system corresponding to a discretization + of the Laplace equation -Delta u = 1 with zero boundary + conditions on the unit square. The domain is split into + an N x N processor grid. Thus, the given number of processors + should be a perfect square. Each processor's piece of the + grid has n x n cells with n x n nodes connected by the + standard 5-point stencil. Note that the struct interface + assumes a cell-centered grid, and, therefore, the nodes are + not shared. This example demonstrates more features than the + previous two struct examples (Example 1 and Example 2). Two + solvers are available. + + To incorporate the boundary conditions, we do the following: + Let x_i and x_b be the interior and boundary parts of the + solution vector x. We can split the matrix A as + A = [A_ii A_ib; A_bi A_bb]. + Let u_0 be the Dirichlet B.C. We can simply say that x_b = u_0. + If b_i is the right-hand side, then we just need to solve in + the interior: + A_ii x_i = b_i - A_ib u_0. + For this partitcular example, u_0 = 0, so we are just solving + A_ii x_i = b_i. + + We recommend viewing examples 1 and 2 before viewing this + example. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_struct_ls.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i, j, k; + + int myid, num_procs; + + int n, N, pi, pj; + double h, h2; + int ilower[2], iupper[2]; + + int solver_id; + int n_pre, n_post; + + HYPRE_StructGrid grid; + HYPRE_StructStencil stencil; + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + int num_iterations; + double final_res_norm; + + int vis; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set defaults */ + n = 33; + solver_id = 0; + n_pre = 1; + n_post = 1; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-v") == 0 ) + { + arg_index++; + n_pre = atoi(argv[arg_index++]); + n_post = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 33)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - PCG with SMG precond (default)\n"); + printf(" 1 - SMG\n"); + printf(" -v : number of pre and post relaxations (default: 1 1)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Figure out the processor grid (N x N). The local problem + size for the interior nodes is indicated by n (n x n). + pi and pj indicate position in the processor grid. */ + N = sqrt(num_procs); + h = 1.0 / (N*n+1); /* note that when calculating h we must + remember to count the boundary nodes */ + h2 = h*h; + pj = myid / N; + pi = myid - pj*N; + + /* Figure out the extents of each processor's piece of the grid. */ + ilower[0] = pi*n; + ilower[1] = pj*n; + + iupper[0] = ilower[0] + n-1; + iupper[1] = ilower[1] + n-1; + + /* 1. Set up a grid */ + { + /* Create an empty 2D grid object */ + HYPRE_StructGridCreate(MPI_COMM_WORLD, 2, &grid); + + /* Add a new box to the grid */ + HYPRE_StructGridSetExtents(grid, ilower, iupper); + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_StructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_StructStencilCreate(2, 5, &stencil); + + /* Define the geometry of the stencil */ + { + int entry; + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + for (entry = 0; entry < 5; entry++) + HYPRE_StructStencilSetElement(stencil, entry, offsets[entry]); + } + } + + /* 3. Set up a Struct Matrix */ + { + int nentries = 5; + int nvalues = nentries*n*n; + double *values; + int stencil_indices[5]; + + /* Create an empty matrix object */ + HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &A); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_StructMatrixInitialize(A); + + values = (double*) calloc(nvalues, sizeof(double)); + + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + /* Set the standard stencil at each grid point, + we will fix the boundaries later */ + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, nentries, + stencil_indices, values); + + free(values); + } + + /* 4. Incorporate the zero boundary conditions: go along each edge of + the domain and set the stencil entry that reaches to the boundary to + zero.*/ + { + int bc_ilower[2]; + int bc_iupper[2]; + int nentries = 1; + int nvalues = nentries*n; /* number of stencil entries times the length + of one side of my grid box */ + double *values; + int stencil_indices[1]; + + values = (double*) calloc(nvalues, sizeof(double)); + for (j = 0; j < nvalues; j++) + values[j] = 0.0; + + /* Recall: pi and pj describe position in the processor grid */ + if (pj == 0) + { + /* Bottom row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 3; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + } + + if (pj == N-1) + { + /* upper row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + n-1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 4; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + } + + if (pi == 0) + { + /* Left row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 1; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + } + + if (pi == N-1) + { + /* Right row of grid points */ + bc_ilower[0] = pi*n + n-1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 2; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + } + + free(values); + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_StructMatrixAssemble(A); + + /* 5. Set up Struct Vectors for b and x */ + { + int nvalues = n*n; + double *values; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_StructVectorInitialize(b); + HYPRE_StructVectorInitialize(x); + + /* Set the values */ + for (i = 0; i < nvalues; i ++) + values[i] = h2; + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + + free(values); + + /* This is a collective call finalizing the vector assembly. + The vector is now ``ready to be used'' */ + HYPRE_StructVectorAssemble(b); + HYPRE_StructVectorAssemble(x); + } + + /* 6. Set up and use a struct solver + (Solver options can be found in the Reference Manual.) */ + if (solver_id == 0) + { + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPCGSetMaxIter(solver, 50 ); + HYPRE_StructPCGSetTol(solver, 1.0e-06 ); + HYPRE_StructPCGSetTwoNorm(solver, 1 ); + HYPRE_StructPCGSetRelChange(solver, 0 ); + HYPRE_StructPCGSetPrintLevel(solver, 2 ); /* print each CG iteration */ + HYPRE_StructPCGSetLogging(solver, 1); + + /* Use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMemoryUse(precond, 0); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, 1); + HYPRE_StructSMGSetNumPostRelax(precond, 1); + + /* Set the preconditioner and solve */ + HYPRE_StructPCGSetPrecond(solver, HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, precond); + HYPRE_StructPCGSetup(solver, A, b, x); + HYPRE_StructPCGSolve(solver, A, b, x); + + /* Get some info on the run */ + HYPRE_StructPCGGetNumIterations(solver, &num_iterations); + HYPRE_StructPCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_StructPCGDestroy(solver); + HYPRE_StructSMGDestroy(precond); + } + + if (solver_id == 1) + { + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructSMGSetMemoryUse(solver, 0); + HYPRE_StructSMGSetMaxIter(solver, 50); + HYPRE_StructSMGSetTol(solver, 1.0e-06); + HYPRE_StructSMGSetRelChange(solver, 0); + HYPRE_StructSMGSetNumPreRelax(solver, n_pre); + HYPRE_StructSMGSetNumPostRelax(solver, n_post); + /* Logging must be on to get iterations and residual norm info below */ + HYPRE_StructSMGSetLogging(solver, 1); + + /* Setup and solve */ + HYPRE_StructSMGSetup(solver, A, b, x); + HYPRE_StructSMGSolve(solver, A, b, x); + + /* Get some info on the run */ + HYPRE_StructSMGGetNumIterations(solver, &num_iterations); + HYPRE_StructSMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + + /* Clean up */ + HYPRE_StructSMGDestroy(solver); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex3.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int nvalues = n*n; + double *values = (double*) calloc(nvalues, sizeof(double)); + + /* get the local solution */ + HYPRE_StructVectorGetBoxValues(x, ilower, iupper, values); + + sprintf(filename, "%s.%06d", "vis/ex3.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution with global unknown numbers */ + k = 0; + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "%06d %.14e\n", pj*N*n*n+pi*n+j*N*n+i, values[k++]); + + fflush(file); + fclose(file); + free(values); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex3.mesh", N*n-1); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_StructGridDestroy(grid); + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex4.c b/3rd_party/hypre/src/examples/ex4.c new file mode 100644 index 000000000..ab01a2cf2 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex4.c @@ -0,0 +1,1168 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 4 + + Interface: Structured interface (Struct) + + Compile with: make ex4 + + Sample run: mpirun -np 16 ex4 -n 33 -solver 10 -K 3 -B 0 -C 1 -U0 2 -F 4 + + To see options: ex4 -help + + Description: This example differs from the previous structured example + (Example 3) in that a more sophisticated stencil and + boundary conditions are implemented. The method illustrated + here to implement the boundary conditions is much more general + than that in the previous example. Also symmetric storage is + utilized when applicable. + + This code solves the convection-reaction-diffusion problem + div (-K grad u + B u) + C u = F in the unit square with + boundary condition u = U0. The domain is split into N x N + processor grid. Thus, the given number of processors should + be a perfect square. Each processor has a n x n grid, with + nodes connected by a 5-point stencil. Note that the struct + interface assumes a cell-centered grid, and, therefore, the + nodes are not shared. + + To incorporate the boundary conditions, we do the following: + Let x_i and x_b be the interior and boundary parts of the + solution vector x. If we split the matrix A as + A = [A_ii A_ib; A_bi A_bb], + then we solve + [A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0]. + Note that this differs from the previous example in that we + are actually solving for the boundary conditions (so they + may not be exact as in ex3, where we only solved for the + interior). This approach is useful for more general types + of b.c. + + A number of solvers are available. More information can be + found in the Solvers and Preconditioners chapter of the + User's Manual. + + We recommend viewing examples 1, 2, and 3 before viewing this + example. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_krylov.h" +#include "HYPRE_struct_ls.h" + +#ifdef M_PI + #define PI M_PI +#else + #define PI 3.14159265358979 +#endif + +#include "vis.c" + +/* Macro to evaluate a function F in the grid point (i,j) */ +#define Eval(F,i,j) (F( (ilower[0]+(i))*h, (ilower[1]+(j))*h )) +#define bcEval(F,i,j) (F( (bc_ilower[0]+(i))*h, (bc_ilower[1]+(j))*h )) + +int optionK, optionB, optionC, optionU0, optionF; + +/* Diffusion coefficient */ +double K(double x, double y) +{ + switch (optionK) + { + case 0: + return 1.0; + case 1: + return x*x+exp(y); + case 2: + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25)) + return 100.0; + else + return 1.0; + case 3: + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)) < 0.0625) + return 10.0; + else + return 1.0; + default: + return 1.0; + } +} + +/* Convection vector, first component */ +double B1(double x, double y) +{ + switch (optionB) + { + case 0: + return 0.0; + case 1: + return -0.1; + case 2: + return 0.25; + case 3: + return 1.0; + default: + return 0.0; + } +} + +/* Convection vector, second component */ +double B2(double x, double y) +{ + switch (optionB) + { + case 0: + return 0.0; + case 1: + return 0.1; + case 2: + return -0.25; + case 3: + return 1.0; + default: + return 0.0; + } +} + +/* Reaction coefficient */ +double C(double x, double y) +{ + switch (optionC) + { + case 0: + return 0.0; + case 1: + return 10.0; + case 2: + return 100.0; + default: + return 0.0; + } +} + +/* Boundary condition */ +double U0(double x, double y) +{ + switch (optionU0) + { + case 0: + return 0.0; + case 1: + return (x+y)/100; + case 2: + return (sin(5*PI*x)+sin(5*PI*y))/1000; + default: + return 0.0; + } +} + +/* Right-hand side */ +double F(double x, double y) +{ + switch (optionF) + { + case 0: + return 1.0; + case 1: + return 0.0; + case 2: + return 2*PI*PI*sin(PI*x)*sin(PI*y); + case 3: + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25)) + return -1.0; + else + return 1.0; + case 4: + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)) < 0.0625) + return -1.0; + else + return 1.0; + default: + return 1.0; + } +} + +int main (int argc, char *argv[]) +{ + int i, j, k; + + int myid, num_procs; + + int n, N, pi, pj; + double h, h2; + int ilower[2], iupper[2]; + + int solver_id; + int n_pre, n_post; + int rap, relax, skip, sym; + int time_index; + + int num_iterations; + double final_res_norm; + + int vis; + + HYPRE_StructGrid grid; + HYPRE_StructStencil stencil; + HYPRE_StructMatrix A; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 33; + optionK = 0; + optionB = 0; + optionC = 0; + optionU0 = 0; + optionF = 0; + solver_id = 10; + n_pre = 1; + n_post = 1; + rap = 0; + relax = 1; + skip = 0; + sym = 0; + + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-K") == 0 ) + { + arg_index++; + optionK = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-B") == 0 ) + { + arg_index++; + optionB = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-C") == 0 ) + { + arg_index++; + optionC = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-U0") == 0 ) + { + arg_index++; + optionU0 = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-F") == 0 ) + { + arg_index++; + optionF = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-v") == 0 ) + { + arg_index++; + n_pre = atoi(argv[arg_index++]); + n_post = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rap") == 0 ) + { + arg_index++; + rap = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-relax") == 0 ) + { + arg_index++; + relax = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-skip") == 0 ) + { + arg_index++; + skip = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-sym") == 0 ) + { + arg_index++; + sym = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 8)\n"); + printf(" -K : choice for the diffusion coefficient (default: 1)\n"); + printf(" -B : choice for the convection vector (default: 0)\n"); + printf(" -C : choice for the reaction coefficient (default: 0)\n"); + printf(" -U0 : choice for the boundary condition (default: 0)\n"); + printf(" -F : choice for the right-hand side (default: 1) \n"); + printf(" -solver : solver ID\n"); + printf(" 0 - SMG \n"); + printf(" 1 - PFMG\n"); + printf(" 10 - CG with SMG precond (default)\n"); + printf(" 11 - CG with PFMG precond\n"); + printf(" 17 - CG with 2-step Jacobi\n"); + printf(" 18 - CG with diagonal scaling\n"); + printf(" 19 - CG\n"); + printf(" 30 - GMRES with SMG precond\n"); + printf(" 31 - GMRES with PFMG precond\n"); + printf(" 37 - GMRES with 2-step Jacobi\n"); + printf(" 38 - GMRES with diagonal scaling\n"); + printf(" 39 - GMRES\n"); + printf(" -v : number of pre and post relaxations\n"); + printf(" -rap : coarse grid operator type\n"); + printf(" 0 - Galerkin (default)\n"); + printf(" 1 - non-Galerkin ParFlow operators\n"); + printf(" 2 - Galerkin, general operators\n"); + printf(" -relax : relaxation type\n"); + printf(" 0 - Jacobi\n"); + printf(" 1 - Weighted Jacobi (default)\n"); + printf(" 2 - R/B Gauss-Seidel\n"); + printf(" 3 - R/B Gauss-Seidel (nonsymmetric)\n"); + printf(" -skip : skip levels in PFMG (0 or 1)\n"); + printf(" -sym : symmetric storage (1) or not (0)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Convection produces non-symmetric matrices */ + if (optionB && sym) + optionB = 0; + + /* Figure out the processor grid (N x N). The local + problem size is indicated by n (n x n). pi and pj + indicate position in the processor grid. */ + N = sqrt(num_procs); + h = 1.0 / (N*n-1); + h2 = h*h; + pj = myid / N; + pi = myid - pj*N; + + /* Define the nodes owned by the current processor (each processor's + piece of the global grid) */ + ilower[0] = pi*n; + ilower[1] = pj*n; + iupper[0] = ilower[0] + n-1; + iupper[1] = ilower[1] + n-1; + + /* 1. Set up a grid */ + { + /* Create an empty 2D grid object */ + HYPRE_StructGridCreate(MPI_COMM_WORLD, 2, &grid); + + /* Add a new box to the grid */ + HYPRE_StructGridSetExtents(grid, ilower, iupper); + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_StructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + if (sym == 0) + { + /* Define the geometry of the stencil */ + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_StructStencilCreate(2, 5, &stencil); + + /* Assign stencil entries */ + for (i = 0; i < 5; i++) + HYPRE_StructStencilSetElement(stencil, i, offsets[i]); + } + else /* Symmetric storage */ + { + /* Define the geometry of the stencil */ + int offsets[3][2] = {{0,0}, {1,0}, {0,1}}; + + /* Create an empty 2D, 3-pt stencil object */ + HYPRE_StructStencilCreate(2, 3, &stencil); + + /* Assign stencil entries */ + for (i = 0; i < 3; i++) + HYPRE_StructStencilSetElement(stencil, i, offsets[i]); + } + + /* 3. Set up Struct Vectors for b and x */ + { + double *values; + + /* Create an empty vector object */ + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_StructVectorInitialize(b); + HYPRE_StructVectorInitialize(x); + + values = (double*) calloc((n*n), sizeof(double)); + + /* Set the values of b in left-to-right, bottom-to-top order */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k++) + values[k] = h2 * Eval(F,i,j); + HYPRE_StructVectorSetBoxValues(b, ilower, iupper, values); + + /* Set x = 0 */ + for (i = 0; i < (n*n); i ++) + values[i] = 0.0; + HYPRE_StructVectorSetBoxValues(x, ilower, iupper, values); + + free(values); + + /* Assembling is postponed since the vectors will be further modified */ + } + + /* 4. Set up a Struct Matrix */ + { + /* Create an empty matrix object */ + HYPRE_StructMatrixCreate(MPI_COMM_WORLD, grid, stencil, &A); + + /* Use symmetric storage? */ + HYPRE_StructMatrixSetSymmetric(A, sym); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_StructMatrixInitialize(A); + + /* Set the stencil values in the interior. Here we set the values + at every node. We will modify the boundary nodes later. */ + if (sym == 0) + { + int stencil_indices[5] = {0, 1, 2, 3, 4}; /* labels correspond + to the offsets */ + double *values; + + values = (double*) calloc(5*(n*n), sizeof(double)); + + /* The order is left-to-right, bottom-to-top */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k+=5) + { + values[k+1] = - Eval(K,i-0.5,j) - Eval(B1,i-0.5,j); + + values[k+2] = - Eval(K,i+0.5,j) + Eval(B1,i+0.5,j); + + values[k+3] = - Eval(K,i,j-0.5) - Eval(B2,i,j-0.5); + + values[k+4] = - Eval(K,i,j+0.5) + Eval(B2,i,j+0.5); + + values[k] = h2 * Eval(C,i,j) + + Eval(K ,i-0.5,j) + Eval(K ,i+0.5,j) + + Eval(K ,i,j-0.5) + Eval(K ,i,j+0.5) + - Eval(B1,i-0.5,j) + Eval(B1,i+0.5,j) + - Eval(B2,i,j-0.5) + Eval(B2,i,j+0.5); + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 5, + stencil_indices, values); + + free(values); + } + else /* Symmetric storage */ + { + int stencil_indices[3] = {0, 1, 2}; + double *values; + + values = (double*) calloc(3*(n*n), sizeof(double)); + + /* The order is left-to-right, bottom-to-top */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k+=3) + { + values[k+1] = - Eval(K,i+0.5,j); + values[k+2] = - Eval(K,i,j+0.5); + values[k] = h2 * Eval(C,i,j) + + Eval(K,i+0.5,j) + Eval(K,i,j+0.5) + + Eval(K,i-0.5,j) + Eval(K,i,j-0.5); + } + + HYPRE_StructMatrixSetBoxValues(A, ilower, iupper, 3, + stencil_indices, values); + + free(values); + } + } + + /* 5. Set the boundary conditions, while eliminating the coefficients + reaching ouside of the domain boundary. We must modify the matrix + stencil and the corresponding rhs entries. */ + { + int bc_ilower[2]; + int bc_iupper[2]; + + int stencil_indices[5] = {0, 1, 2, 3, 4}; + double *values, *bvalues; + + int nentries; + if (sym == 0) + nentries = 5; + else + nentries = 3; + + values = (double*) calloc(nentries*n, sizeof(double)); + bvalues = (double*) calloc(n, sizeof(double)); + + /* The stencil at the boundary nodes is 1-0-0-0-0. Because + we have I x_b = u_0; */ + for (i = 0; i < nentries*n; i += nentries) + { + values[i] = 1.0; + for (j = 1; j < nentries; j++) + values[i+j] = 0.0; + } + + /* Processors at y = 0 */ + if (pj == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + /* Modify the matrix */ + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,0); + + HYPRE_StructVectorSetBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at y = 1 */ + if (pj == N-1) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + n-1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + /* Modify the matrix */ + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,0); + + HYPRE_StructVectorSetBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at x = 0 */ + if (pi == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + /* Modify the matrix */ + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,0,j); + + HYPRE_StructVectorSetBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at x = 1 */ + if (pi == N-1) + { + bc_ilower[0] = pi*n + n-1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + /* Modify the matrix */ + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,0,j); + + HYPRE_StructVectorSetBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Recall that the system we are solving is: + [A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0]. + This requires removing the connections between the interior + and boundary nodes that we have set up when we set the + 5pt stencil at each node. We adjust for removing + these connections by appropriately modifying the rhs. + For the symm ordering scheme, just do the top and right + boundary */ + + /* Processors at y = 0, neighbors of boundary nodes */ + if (pj == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + 1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 3; + + /* Modify the matrix */ + for (i = 0; i < n; i++) + bvalues[i] = 0.0; + + if (sym == 0) + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,-1) * (bcEval(K,i,-0.5)+bcEval(B2,i,-0.5)); + + if (pi == 0) + bvalues[0] = 0.0; + + if (pi == N-1) + bvalues[n-1] = 0.0; + + /* Note the use of AddToBoxValues (because we have already set values + at these nodes) */ + HYPRE_StructVectorAddToBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at x = 0, neighbors of boundary nodes */ + if (pi == 0) + { + bc_ilower[0] = pi*n + 1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 1; + + /* Modify the matrix */ + for (j = 0; j < n; j++) + bvalues[j] = 0.0; + + if (sym == 0) + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,-1,j) * (bcEval(K,-0.5,j)+bcEval(B1,-0.5,j)); + + if (pj == 0) + bvalues[0] = 0.0; + + if (pj == N-1) + bvalues[n-1] = 0.0; + + HYPRE_StructVectorAddToBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at y = 1, neighbors of boundary nodes */ + if (pj == N-1) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + (n-1) -1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + if (sym == 0) + stencil_indices[0] = 4; + else + stencil_indices[0] = 2; + + /* Modify the matrix */ + for (i = 0; i < n; i++) + bvalues[i] = 0.0; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,1) * (bcEval(K,i,0.5)+bcEval(B2,i,0.5)); + + if (pi == 0) + bvalues[0] = 0.0; + + if (pi == N-1) + bvalues[n-1] = 0.0; + + HYPRE_StructVectorAddToBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + /* Processors at x = 1, neighbors of boundary nodes */ + if (pi == N-1) + { + bc_ilower[0] = pi*n + (n-1) - 1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + if (sym == 0) + stencil_indices[0] = 2; + else + stencil_indices[0] = 1; + + /* Modify the matrix */ + for (j = 0; j < n; j++) + bvalues[j] = 0.0; + + HYPRE_StructMatrixSetBoxValues(A, bc_ilower, bc_iupper, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,1,j) * (bcEval(K,0.5,j)+bcEval(B1,0.5,j)); + + if (pj == 0) + bvalues[0] = 0.0; + + if (pj == N-1) + bvalues[n-1] = 0.0; + + HYPRE_StructVectorAddToBoxValues(b, bc_ilower, bc_iupper, bvalues); + } + + free(values); + free(bvalues); + } + + /* Finalize the vector and matrix assembly */ + HYPRE_StructMatrixAssemble(A); + HYPRE_StructVectorAssemble(b); + HYPRE_StructVectorAssemble(x); + + /* 6. Set up and use a solver */ + if (solver_id == 0) /* SMG */ + { + /* Start timing */ + time_index = hypre_InitializeTiming("SMG Setup"); + hypre_BeginTiming(time_index); + + /* Options and setup */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructSMGSetMemoryUse(solver, 0); + HYPRE_StructSMGSetMaxIter(solver, 50); + HYPRE_StructSMGSetTol(solver, 1.0e-06); + HYPRE_StructSMGSetRelChange(solver, 0); + HYPRE_StructSMGSetNumPreRelax(solver, n_pre); + HYPRE_StructSMGSetNumPostRelax(solver, n_post); + HYPRE_StructSMGSetPrintLevel(solver, 1); + HYPRE_StructSMGSetLogging(solver, 1); + HYPRE_StructSMGSetup(solver, A, b, x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("SMG Solve"); + hypre_BeginTiming(time_index); + + /* Solve */ + HYPRE_StructSMGSolve(solver, A, b, x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructSMGGetNumIterations(solver, &num_iterations); + HYPRE_StructSMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructSMGDestroy(solver); + } + + if (solver_id == 1) /* PFMG */ + { + /* Start timing */ + time_index = hypre_InitializeTiming("PFMG Setup"); + hypre_BeginTiming(time_index); + + /* Options and setup */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPFMGSetMaxIter(solver, 50); + HYPRE_StructPFMGSetTol(solver, 1.0e-06); + HYPRE_StructPFMGSetRelChange(solver, 0); + HYPRE_StructPFMGSetRAPType(solver, rap); + HYPRE_StructPFMGSetRelaxType(solver, relax); + HYPRE_StructPFMGSetNumPreRelax(solver, n_pre); + HYPRE_StructPFMGSetNumPostRelax(solver, n_post); + HYPRE_StructPFMGSetSkipRelax(solver, skip); + HYPRE_StructPFMGSetPrintLevel(solver, 1); + HYPRE_StructPFMGSetLogging(solver, 1); + HYPRE_StructPFMGSetup(solver, A, b, x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("PFMG Solve"); + hypre_BeginTiming(time_index); + + /* Solve */ + HYPRE_StructPFMGSolve(solver, A, b, x); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructPFMGGetNumIterations(solver, &num_iterations); + HYPRE_StructPFMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructPFMGDestroy(solver); + } + + /* Preconditioned CG */ + if ((solver_id > 9) && (solver_id < 20)) + { + time_index = hypre_InitializeTiming("PCG Setup"); + hypre_BeginTiming(time_index); + + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPCGSetMaxIter(solver, 200 ); + HYPRE_StructPCGSetTol(solver, 1.0e-06 ); + HYPRE_StructPCGSetTwoNorm(solver, 1 ); + HYPRE_StructPCGSetRelChange(solver, 0 ); + HYPRE_StructPCGSetPrintLevel(solver, 2 ); + + if (solver_id == 10) + { + /* use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMemoryUse(precond, 0); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, n_pre); + HYPRE_StructSMGSetNumPostRelax(precond, n_post); + HYPRE_StructSMGSetPrintLevel(precond, 0); + HYPRE_StructSMGSetLogging(precond, 0); + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, + precond); + } + + else if (solver_id == 11) + { + /* use symmetric PFMG as preconditioner */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructPFMGSetMaxIter(precond, 1); + HYPRE_StructPFMGSetTol(precond, 0.0); + HYPRE_StructPFMGSetZeroGuess(precond); + HYPRE_StructPFMGSetRAPType(precond, rap); + HYPRE_StructPFMGSetRelaxType(precond, relax); + HYPRE_StructPFMGSetNumPreRelax(precond, n_pre); + HYPRE_StructPFMGSetNumPostRelax(precond, n_post); + HYPRE_StructPFMGSetSkipRelax(precond, skip); + HYPRE_StructPFMGSetPrintLevel(precond, 0); + HYPRE_StructPFMGSetLogging(precond, 0); + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructPFMGSolve, + HYPRE_StructPFMGSetup, + precond); + } + + else if (solver_id == 17) + { + /* use two-step Jacobi as preconditioner */ + HYPRE_StructJacobiCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructJacobiSetMaxIter(precond, 2); + HYPRE_StructJacobiSetTol(precond, 0.0); + HYPRE_StructJacobiSetZeroGuess(precond); + HYPRE_StructPCGSetPrecond( solver, + HYPRE_StructJacobiSolve, + HYPRE_StructJacobiSetup, + precond); + } + + else if (solver_id == 18) + { + /* use diagonal scaling as preconditioner */ + precond = NULL; + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructDiagScale, + HYPRE_StructDiagScaleSetup, + precond); + } + + /* PCG Setup */ + HYPRE_StructPCGSetup(solver, A, b, x ); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + time_index = hypre_InitializeTiming("PCG Solve"); + hypre_BeginTiming(time_index); + + /* PCG Solve */ + HYPRE_StructPCGSolve(solver, A, b, x); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructPCGGetNumIterations( solver, &num_iterations ); + HYPRE_StructPCGGetFinalRelativeResidualNorm( solver, &final_res_norm ); + HYPRE_StructPCGDestroy(solver); + + if (solver_id == 10) + { + HYPRE_StructSMGDestroy(precond); + } + else if (solver_id == 11 ) + { + HYPRE_StructPFMGDestroy(precond); + } + else if (solver_id == 17) + { + HYPRE_StructJacobiDestroy(precond); + } + } + + /* Preconditioned GMRES */ + if ((solver_id > 29) && (solver_id < 40)) + { + time_index = hypre_InitializeTiming("GMRES Setup"); + hypre_BeginTiming(time_index); + + HYPRE_StructGMRESCreate(MPI_COMM_WORLD, &solver); + + /* Note that GMRES can be used with all the interfaces - not + just the struct. So here we demonstrate the + more generic GMRES interface functions. Since we have chosen + a struct solver then we must type cast to the more generic + HYPRE_Solver when setting options with these generic functions. + Note that one could declare the solver to be + type HYPRE_Solver, and then the casting would not be necessary.*/ + + HYPRE_GMRESSetMaxIter((HYPRE_Solver) solver, 500 ); + HYPRE_GMRESSetKDim((HYPRE_Solver) solver,30); + HYPRE_GMRESSetTol((HYPRE_Solver) solver, 1.0e-06 ); + HYPRE_GMRESSetPrintLevel((HYPRE_Solver) solver, 2 ); + HYPRE_GMRESSetLogging((HYPRE_Solver) solver, 1 ); + + if (solver_id == 30) + { + /* use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMemoryUse(precond, 0); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, n_pre); + HYPRE_StructSMGSetNumPostRelax(precond, n_post); + HYPRE_StructSMGSetPrintLevel(precond, 0); + HYPRE_StructSMGSetLogging(precond, 0); + HYPRE_StructGMRESSetPrecond(solver, + HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, + precond); + } + + else if (solver_id == 31) + { + /* use symmetric PFMG as preconditioner */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructPFMGSetMaxIter(precond, 1); + HYPRE_StructPFMGSetTol(precond, 0.0); + HYPRE_StructPFMGSetZeroGuess(precond); + HYPRE_StructPFMGSetRAPType(precond, rap); + HYPRE_StructPFMGSetRelaxType(precond, relax); + HYPRE_StructPFMGSetNumPreRelax(precond, n_pre); + HYPRE_StructPFMGSetNumPostRelax(precond, n_post); + HYPRE_StructPFMGSetSkipRelax(precond, skip); + HYPRE_StructPFMGSetPrintLevel(precond, 0); + HYPRE_StructPFMGSetLogging(precond, 0); + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructPFMGSolve, + HYPRE_StructPFMGSetup, + precond); + } + + else if (solver_id == 37) + { + /* use two-step Jacobi as preconditioner */ + HYPRE_StructJacobiCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructJacobiSetMaxIter(precond, 2); + HYPRE_StructJacobiSetTol(precond, 0.0); + HYPRE_StructJacobiSetZeroGuess(precond); + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructJacobiSolve, + HYPRE_StructJacobiSetup, + precond); + } + + else if (solver_id == 38) + { + /* use diagonal scaling as preconditioner */ + precond = NULL; + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructDiagScale, + HYPRE_StructDiagScaleSetup, + precond); + } + + /* GMRES Setup */ + HYPRE_StructGMRESSetup(solver, A, b, x ); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + time_index = hypre_InitializeTiming("GMRES Solve"); + hypre_BeginTiming(time_index); + + /* GMRES Solve */ + HYPRE_StructGMRESSolve(solver, A, b, x); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructGMRESGetNumIterations(solver, &num_iterations); + HYPRE_StructGMRESGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructGMRESDestroy(solver); + + if (solver_id == 30) + { + HYPRE_StructSMGDestroy(precond); + } + else if (solver_id == 31) + { + HYPRE_StructPFMGDestroy(precond); + } + else if (solver_id == 37) + { + HYPRE_StructJacobiDestroy(precond); + } + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex4.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int nvalues = n*n; + double *values = (double*) calloc(nvalues, sizeof(double)); + + /* get the local solution */ + HYPRE_StructVectorGetBoxValues(x, ilower, iupper, values); + + sprintf(filename, "%s.%06d", "vis/ex4.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution with global unknown numbers */ + k = 0; + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "%06d %.14e\n", pj*N*n*n+pi*n+j*N*n+i, values[k++]); + + fflush(file); + fclose(file); + free(values); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex4.mesh", N*n-1); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_StructGridDestroy(grid); + HYPRE_StructStencilDestroy(stencil); + HYPRE_StructMatrixDestroy(A); + HYPRE_StructVectorDestroy(b); + HYPRE_StructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex5.c b/3rd_party/hypre/src/examples/ex5.c new file mode 100644 index 000000000..7586f54b5 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex5.c @@ -0,0 +1,625 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 5 + + Interface: Linear-Algebraic (IJ) + + Compile with: make ex5 + + Sample run: mpirun -np 4 ex5 + + Description: This example solves the 2-D Laplacian problem with zero boundary + conditions on an n x n grid. The number of unknowns is N=n^2. + The standard 5-point stencil is used, and we solve for the + interior nodes only. + + This example solves the same problem as Example 3. Available + solvers are AMG, PCG, and PCG with AMG or Parasails + preconditioners. */ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_krylov.h" +#include "HYPRE.h" +#include "HYPRE_parcsr_ls.h" + +#include "vis.c" + +int hypre_FlexGMRESModifyPCAMGExample(void *precond_data, int iterations, + double rel_residual_norm); + + +int main (int argc, char *argv[]) +{ + int i; + int myid, num_procs; + int N, n; + + int ilower, iupper; + int local_size, extra; + + int solver_id; + int vis, print_system; + + double h, h2; + + HYPRE_IJMatrix A; + HYPRE_ParCSRMatrix parcsr_A; + HYPRE_IJVector b; + HYPRE_ParVector par_b; + HYPRE_IJVector x; + HYPRE_ParVector par_x; + + HYPRE_Solver solver, precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Default problem parameters */ + n = 33; + solver_id = 0; + vis = 0; + print_system = 0; + + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-print_system") == 0 ) + { + arg_index++; + print_system = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size in each direction (default: 33)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - AMG (default) \n"); + printf(" 1 - AMG-PCG\n"); + printf(" 8 - ParaSails-PCG\n"); + printf(" 50 - PCG\n"); + printf(" 61 - AMG-FlexGMRES\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf(" -print_system : print the matrix and rhs\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Preliminaries: want at least one processor per row */ + if (n*n < num_procs) n = sqrt(num_procs) + 1; + N = n*n; /* global number of rows */ + h = 1.0/(n+1); /* mesh size*/ + h2 = h*h; + + /* Each processor knows only of its own rows - the range is denoted by ilower + and upper. Here we partition the rows. We account for the fact that + N may not divide evenly by the number of processors. */ + local_size = N/num_procs; + extra = N - local_size*num_procs; + + ilower = local_size*myid; + ilower += hypre_min(myid, extra); + + iupper = local_size*(myid+1); + iupper += hypre_min(myid+1, extra); + iupper = iupper - 1; + + /* How many rows do I have? */ + local_size = iupper - ilower + 1; + + /* Create the matrix. + Note that this is a square matrix, so we indicate the row partition + size twice (since number of rows = number of cols) */ + HYPRE_IJMatrixCreate(MPI_COMM_WORLD, ilower, iupper, ilower, iupper, &A); + + /* Choose a parallel csr format storage (see the User's Manual) */ + HYPRE_IJMatrixSetObjectType(A, HYPRE_PARCSR); + + /* Initialize before setting coefficients */ + HYPRE_IJMatrixInitialize(A); + + /* Now go through my local rows and set the matrix entries. + Each row has at most 5 entries. For example, if n=3: + + A = [M -I 0; -I M -I; 0 -I M] + M = [4 -1 0; -1 4 -1; 0 -1 4] + + Note that here we are setting one row at a time, though + one could set all the rows together (see the User's Manual). + */ + { + int nnz; + double values[5]; + int cols[5]; + + for (i = ilower; i <= iupper; i++) + { + nnz = 0; + + /* The left identity block:position i-n */ + if ((i-n)>=0) + { + cols[nnz] = i-n; + values[nnz] = -1.0; + nnz++; + } + + /* The left -1: position i-1 */ + if (i%n) + { + cols[nnz] = i-1; + values[nnz] = -1.0; + nnz++; + } + + /* Set the diagonal: position i */ + cols[nnz] = i; + values[nnz] = 4.0; + nnz++; + + /* The right -1: position i+1 */ + if ((i+1)%n) + { + cols[nnz] = i+1; + values[nnz] = -1.0; + nnz++; + } + + /* The right identity block:position i+n */ + if ((i+n)< N) + { + cols[nnz] = i+n; + values[nnz] = -1.0; + nnz++; + } + + /* Set the values for row i */ + HYPRE_IJMatrixSetValues(A, 1, &nnz, &i, cols, values); + } + } + + /* Assemble after setting the coefficients */ + HYPRE_IJMatrixAssemble(A); + + /* Note: for the testing of small problems, one may wish to read + in a matrix in IJ format (for the format, see the output files + from the -print_system option). + In this case, one would use the following routine: + HYPRE_IJMatrixRead( , MPI_COMM_WORLD, + HYPRE_PARCSR, &A ); + = IJ.A.out to read in what has been printed out + by -print_system (processor numbers are omitted). + A call to HYPRE_IJMatrixRead is an *alternative* to the + following sequence of HYPRE_IJMatrix calls: + Create, SetObjectType, Initialize, SetValues, and Assemble + */ + + + /* Get the parcsr matrix object to use */ + HYPRE_IJMatrixGetObject(A, (void**) &parcsr_A); + + + /* Create the rhs and solution */ + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&b); + HYPRE_IJVectorSetObjectType(b, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(b); + + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&x); + HYPRE_IJVectorSetObjectType(x, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(x); + + /* Set the rhs values to h^2 and the solution to zero */ + { + double *rhs_values, *x_values; + int *rows; + + rhs_values = (double*) calloc(local_size, sizeof(double)); + x_values = (double*) calloc(local_size, sizeof(double)); + rows = (int*) calloc(local_size, sizeof(int)); + + for (i=0; i, MPI_COMM_WORLD, + HYPRE_PARCSR, &b ); + as an alternative to the + following sequence of HYPRE_IJVectors calls: + Create, SetObjectType, Initialize, SetValues, and Assemble + */ + HYPRE_IJVectorGetObject(b, (void **) &par_b); + + HYPRE_IJVectorAssemble(x); + HYPRE_IJVectorGetObject(x, (void **) &par_x); + + + /* Print out the system - files names will be IJ.out.A.XXXXX + and IJ.out.b.XXXXX, where XXXXX = processor id */ + if (print_system) + { + HYPRE_IJMatrixPrint(A, "IJ.out.A"); + HYPRE_IJVectorPrint(b, "IJ.out.b"); + } + + + /* Choose a solver and solve the system */ + + /* AMG */ + if (solver_id == 0) + { + int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_BoomerAMGCreate(&solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_BoomerAMGSetPrintLevel(solver, 3); /* print solve info + parameters */ + HYPRE_BoomerAMGSetOldDefault(solver); /* Falgout coarsening with modified classical interpolaiton */ + HYPRE_BoomerAMGSetRelaxType(solver, 3); /* G-S/Jacobi hybrid relaxation */ + HYPRE_BoomerAMGSetRelaxOrder(solver, 1); /* uses C/F relaxation */ + HYPRE_BoomerAMGSetNumSweeps(solver, 1); /* Sweeeps on each level */ + HYPRE_BoomerAMGSetMaxLevels(solver, 20); /* maximum number of levels */ + HYPRE_BoomerAMGSetTol(solver, 1e-7); /* conv. tolerance */ + + /* Now setup and solve! */ + HYPRE_BoomerAMGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_BoomerAMGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_BoomerAMGGetNumIterations(solver, &num_iterations); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver */ + HYPRE_BoomerAMGDestroy(solver); + } + /* PCG */ + else if (solver_id == 50) + { + int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* prints out the iteration info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver */ + HYPRE_ParCSRPCGDestroy(solver); + } + /* PCG with AMG preconditioner */ + else if (solver_id == 1) + { + int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now set up the AMG preconditioner and specify any parameters */ + HYPRE_BoomerAMGCreate(&precond); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetCoarsenType(precond, 6); + HYPRE_BoomerAMGSetOldDefault(precond); + HYPRE_BoomerAMGSetRelaxType(precond, 6); /* Sym G.S./Jacobi hybrid */ + HYPRE_BoomerAMGSetNumSweeps(precond, 1); + HYPRE_BoomerAMGSetTol(precond, 0.0); /* conv. tolerance zero */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); /* do only one iteration! */ + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSolve, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup, precond); + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver and preconditioner */ + HYPRE_ParCSRPCGDestroy(solver); + HYPRE_BoomerAMGDestroy(precond); + } + /* PCG with Parasails Preconditioner */ + else if (solver_id == 8) + { + int num_iterations; + double final_res_norm; + + int sai_max_levels = 1; + double sai_threshold = 0.1; + double sai_filter = 0.05; + int sai_sym = 1; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now set up the ParaSails preconditioner and specify any parameters */ + HYPRE_ParaSailsCreate(MPI_COMM_WORLD, &precond); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_ParaSailsSetParams(precond, sai_threshold, sai_max_levels); + HYPRE_ParaSailsSetFilter(precond, sai_filter); + HYPRE_ParaSailsSetSym(precond, sai_sym); + HYPRE_ParaSailsSetLogging(precond, 3); + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_ParaSailsSolve, + (HYPRE_PtrToSolverFcn) HYPRE_ParaSailsSetup, precond); + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destory solver and preconditioner */ + HYPRE_ParCSRPCGDestroy(solver); + HYPRE_ParaSailsDestroy(precond); + } + /* Flexible GMRES with AMG Preconditioner */ + else if (solver_id == 61) + { + int num_iterations; + double final_res_norm; + int restart = 30; + int modify = 1; + + + /* Create solver */ + HYPRE_ParCSRFlexGMRESCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_FlexGMRESSetKDim(solver, restart); + HYPRE_FlexGMRESSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_FlexGMRESSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_FlexGMRESSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_FlexGMRESSetLogging(solver, 1); /* needed to get run info later */ + + + /* Now set up the AMG preconditioner and specify any parameters */ + HYPRE_BoomerAMGCreate(&precond); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetCoarsenType(precond, 6); + HYPRE_BoomerAMGSetOldDefault(precond); + HYPRE_BoomerAMGSetRelaxType(precond, 6); /* Sym G.S./Jacobi hybrid */ + HYPRE_BoomerAMGSetNumSweeps(precond, 1); + HYPRE_BoomerAMGSetTol(precond, 0.0); /* conv. tolerance zero */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); /* do only one iteration! */ + + /* Set the FlexGMRES preconditioner */ + HYPRE_FlexGMRESSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSolve, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup, precond); + + + if (modify) + /* this is an optional call - if you don't call it, hypre_FlexGMRESModifyPCDefault + is used - which does nothing. Otherwise, you can define your own, similar to + the one used here */ + HYPRE_FlexGMRESSetModifyPC( solver, + (HYPRE_PtrToModifyPCFcn) hypre_FlexGMRESModifyPCAMGExample); + + + /* Now setup and solve! */ + HYPRE_ParCSRFlexGMRESSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRFlexGMRESSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_FlexGMRESGetNumIterations(solver, &num_iterations); + HYPRE_FlexGMRESGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destory solver and preconditioner */ + HYPRE_ParCSRFlexGMRESDestroy(solver); + HYPRE_BoomerAMGDestroy(precond); + + } + else + { + if (myid ==0) printf("Invalid solver id specified.\n"); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex5.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int nvalues = local_size; + int *rows = (int*) calloc(nvalues, sizeof(int)); + double *values = (double*) calloc(nvalues, sizeof(double)); + + for (i = 0; i < nvalues; i++) + rows[i] = ilower + i; + + /* get the local solution */ + HYPRE_IJVectorGetValues(x, nvalues, rows, values); + + sprintf(filename, "%s.%06d", "vis/ex5.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution */ + for (i = 0; i < nvalues; i++) + fprintf(file, "%.14e\n", values[i]); + + fflush(file); + fclose(file); + + free(rows); + free(values); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex5.mesh", n-1); + } + + /* Clean up */ + HYPRE_IJMatrixDestroy(A); + HYPRE_IJVectorDestroy(b); + HYPRE_IJVectorDestroy(x); + + /* Finalize MPI*/ + MPI_Finalize(); + + return(0); +} + +/*-------------------------------------------------------------------------- + hypre_FlexGMRESModifyPCAMGExample - + + This is an example (not recommended) + of how we can modify things about AMG that + affect the solve phase based on how FlexGMRES is doing...For + another preconditioner it may make sense to modify the tolerance.. + + *--------------------------------------------------------------------------*/ + +int hypre_FlexGMRESModifyPCAMGExample(void *precond_data, int iterations, + double rel_residual_norm) +{ + + + if (rel_residual_norm > .1) + { + HYPRE_BoomerAMGSetNumSweeps((HYPRE_Solver)precond_data, 10); + } + else + { + HYPRE_BoomerAMGSetNumSweeps((HYPRE_Solver)precond_data, 1); + } + + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex5big.c b/3rd_party/hypre/src/examples/ex5big.c new file mode 100644 index 000000000..f8e0b29ac --- /dev/null +++ b/3rd_party/hypre/src/examples/ex5big.c @@ -0,0 +1,585 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 5big + + Interface: Linear-Algebraic (IJ) + + Compile with: make ex5big + + Sample run: mpirun -np 4 ex5big + + Description: This example is a slight modification of Example 5 that + illustrates the 64-bit integer support in hypre needed to run + problems with more than 2B unknowns. + + Specifically, the changes compared to Example 5 are as follows: + + 1) All integer arguments to HYPRE functions should be declared + of type HYPRE_Int. + + 2) Variables of type HYPRE_Int are 64-bit integers, so they + should be printed in the %lld format (not %d). + + To enable the 64-bit integer support, you need to build hypre + with the --enable-bigint option of the configure script. We + recommend comparing this example with Example 5. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_krylov.h" +#include "HYPRE.h" +#include "HYPRE_parcsr_ls.h" + +int hypre_FlexGMRESModifyPCAMGExample(void *precond_data, int iterations, + double rel_residual_norm); + + +int main (int argc, char *argv[]) +{ + HYPRE_Int i; + int myid, num_procs; + int N, n; + + HYPRE_Int ilower, iupper; + HYPRE_Int local_size, extra; + + int solver_id; + int print_system; + + double h, h2; + + HYPRE_IJMatrix A; + HYPRE_ParCSRMatrix parcsr_A; + HYPRE_IJVector b; + HYPRE_ParVector par_b; + HYPRE_IJVector x; + HYPRE_ParVector par_x; + + HYPRE_Solver solver, precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Default problem parameters */ + n = 33; + solver_id = 0; + print_system = 0; + + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-print_system") == 0 ) + { + arg_index++; + print_system = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size in each direction (default: 33)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - AMG (default) \n"); + printf(" 1 - AMG-PCG\n"); + printf(" 8 - ParaSails-PCG\n"); + printf(" 50 - PCG\n"); + printf(" 61 - AMG-FlexGMRES\n"); + printf(" -print_system : print the matrix and rhs\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Preliminaries: want at least one processor per row */ + if (n*n < num_procs) n = sqrt(num_procs) + 1; + N = n*n; /* global number of rows */ + h = 1.0/(n+1); /* mesh size*/ + h2 = h*h; + + /* Each processor knows only of its own rows - the range is denoted by ilower + and upper. Here we partition the rows. We account for the fact that + N may not divide evenly by the number of processors. */ + local_size = N/num_procs; + extra = N - local_size*num_procs; + + ilower = local_size*myid; + ilower += hypre_min(myid, extra); + + iupper = local_size*(myid+1); + iupper += hypre_min(myid+1, extra); + iupper = iupper - 1; + + /* How many rows do I have? */ + local_size = iupper - ilower + 1; + + /* Create the matrix. + Note that this is a square matrix, so we indicate the row partition + size twice (since number of rows = number of cols) */ + HYPRE_IJMatrixCreate(MPI_COMM_WORLD, ilower, iupper, ilower, iupper, &A); + + /* Choose a parallel csr format storage (see the User's Manual) */ + HYPRE_IJMatrixSetObjectType(A, HYPRE_PARCSR); + + /* Initialize before setting coefficients */ + HYPRE_IJMatrixInitialize(A); + + /* Now go through my local rows and set the matrix entries. + Each row has at most 5 entries. For example, if n=3: + + A = [M -I 0; -I M -I; 0 -I M] + M = [4 -1 0; -1 4 -1; 0 -1 4] + + Note that here we are setting one row at a time, though + one could set all the rows together (see the User's Manual). + */ + { + HYPRE_Int nnz; + double values[5]; + HYPRE_Int cols[5]; + + for (i = ilower; i <= iupper; i++) + { + nnz = 0; + + /* The left identity block:position i-n */ + if ((i-n)>=0) + { + cols[nnz] = i-n; + values[nnz] = -1.0; + nnz++; + } + + /* The left -1: position i-1 */ + if (i%n) + { + cols[nnz] = i-1; + values[nnz] = -1.0; + nnz++; + } + + /* Set the diagonal: position i */ + cols[nnz] = i; + values[nnz] = 4.0; + nnz++; + + /* The right -1: position i+1 */ + if ((i+1)%n) + { + cols[nnz] = i+1; + values[nnz] = -1.0; + nnz++; + } + + /* The right identity block:position i+n */ + if ((i+n)< N) + { + cols[nnz] = i+n; + values[nnz] = -1.0; + nnz++; + } + + /* Set the values for row i */ + HYPRE_IJMatrixSetValues(A, 1, &nnz, &i, cols, values); + } + } + + /* Assemble after setting the coefficients */ + HYPRE_IJMatrixAssemble(A); + + /* Note: for the testing of small problems, one may wish to read + in a matrix in IJ format (for the format, see the output files + from the -print_system option). + In this case, one would use the following routine: + HYPRE_IJMatrixRead( , MPI_COMM_WORLD, + HYPRE_PARCSR, &A ); + = IJ.A.out to read in what has been printed out + by -print_system (processor numbers are omitted). + A call to HYPRE_IJMatrixRead is an *alternative* to the + following sequence of HYPRE_IJMatrix calls: + Create, SetObjectType, Initialize, SetValues, and Assemble + */ + + + /* Get the parcsr matrix object to use */ + HYPRE_IJMatrixGetObject(A, (void**) &parcsr_A); + + + /* Create the rhs and solution */ + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&b); + HYPRE_IJVectorSetObjectType(b, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(b); + + HYPRE_IJVectorCreate(MPI_COMM_WORLD, ilower, iupper,&x); + HYPRE_IJVectorSetObjectType(x, HYPRE_PARCSR); + HYPRE_IJVectorInitialize(x); + + /* Set the rhs values to h^2 and the solution to zero */ + { + double *rhs_values, *x_values; + HYPRE_Int *rows; + + rhs_values = (double*) calloc(local_size, sizeof(double)); + x_values = (double*) calloc(local_size, sizeof(double)); + rows = (HYPRE_Int*) calloc(local_size, sizeof(HYPRE_Int)); + + for (i=0; i, MPI_COMM_WORLD, + HYPRE_PARCSR, &b ); + as an alternative to the + following sequence of HYPRE_IJVectors calls: + Create, SetObjectType, Initialize, SetValues, and Assemble + */ + HYPRE_IJVectorGetObject(b, (void **) &par_b); + + HYPRE_IJVectorAssemble(x); + HYPRE_IJVectorGetObject(x, (void **) &par_x); + + + /* Print out the system - files names will be IJ.out.A.XXXXX + and IJ.out.b.XXXXX, where XXXXX = processor id */ + if (print_system) + { + HYPRE_IJMatrixPrint(A, "IJ.out.A"); + HYPRE_IJVectorPrint(b, "IJ.out.b"); + } + + + /* Choose a solver and solve the system */ + + /* AMG */ + if (solver_id == 0) + { + HYPRE_Int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_BoomerAMGCreate(&solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_BoomerAMGSetPrintLevel(solver, 3); /* print solve info + parameters */ + HYPRE_BoomerAMGSetOldDefault(solver); /* Falgout coarsening with modified classical interpolation */ + HYPRE_BoomerAMGSetRelaxType(solver, 3); /* G-S/Jacobi hybrid relaxation */ + HYPRE_BoomerAMGSetRelaxOrder(solver, 1); /* Uses C/F relaxation */ + HYPRE_BoomerAMGSetNumSweeps(solver, 1); /* Sweeeps on each level */ + HYPRE_BoomerAMGSetMaxLevels(solver, 20); /* maximum number of levels */ + HYPRE_BoomerAMGSetTol(solver, 1e-7); /* conv. tolerance */ + + /* Now setup and solve! */ + HYPRE_BoomerAMGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_BoomerAMGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_BoomerAMGGetNumIterations(solver, &num_iterations); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver */ + HYPRE_BoomerAMGDestroy(solver); + } + /* PCG */ + else if (solver_id == 50) + { + HYPRE_Int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* prints out the iteration info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver */ + HYPRE_ParCSRPCGDestroy(solver); + } + /* PCG with AMG preconditioner */ + else if (solver_id == 1) + { + HYPRE_Int num_iterations; + double final_res_norm; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now set up the AMG preconditioner and specify any parameters */ + HYPRE_BoomerAMGCreate(&precond); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetCoarsenType(precond, 6); + HYPRE_BoomerAMGSetOldDefault(precond); + HYPRE_BoomerAMGSetRelaxType(precond, 6); /* Sym G.S./Jacobi hybrid */ + HYPRE_BoomerAMGSetNumSweeps(precond, 1); + HYPRE_BoomerAMGSetTol(precond, 0.0); /* conv. tolerance zero */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); /* do only one iteration! */ + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSolve, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup, precond); + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destroy solver and preconditioner */ + HYPRE_ParCSRPCGDestroy(solver); + HYPRE_BoomerAMGDestroy(precond); + } + /* PCG with Parasails Preconditioner */ + else if (solver_id == 8) + { + HYPRE_Int num_iterations; + double final_res_norm; + + int sai_max_levels = 1; + double sai_threshold = 0.1; + double sai_filter = 0.05; + int sai_sym = 1; + + /* Create solver */ + HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_PCGSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_PCGSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_PCGSetTwoNorm(solver, 1); /* use the two norm as the stopping criteria */ + HYPRE_PCGSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_PCGSetLogging(solver, 1); /* needed to get run info later */ + + /* Now set up the ParaSails preconditioner and specify any parameters */ + HYPRE_ParaSailsCreate(MPI_COMM_WORLD, &precond); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_ParaSailsSetParams(precond, sai_threshold, sai_max_levels); + HYPRE_ParaSailsSetFilter(precond, sai_filter); + HYPRE_ParaSailsSetSym(precond, sai_sym); + HYPRE_ParaSailsSetLogging(precond, 3); + + /* Set the PCG preconditioner */ + HYPRE_PCGSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_ParaSailsSolve, + (HYPRE_PtrToSolverFcn) HYPRE_ParaSailsSetup, precond); + + /* Now setup and solve! */ + HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, par_x); + + + /* Run info - needed logging turned on */ + HYPRE_PCGGetNumIterations(solver, &num_iterations); + HYPRE_PCGGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destory solver and preconditioner */ + HYPRE_ParCSRPCGDestroy(solver); + HYPRE_ParaSailsDestroy(precond); + } + /* Flexible GMRES with AMG Preconditioner */ + else if (solver_id == 61) + { + HYPRE_Int num_iterations; + double final_res_norm; + int restart = 30; + int modify = 1; + + + /* Create solver */ + HYPRE_ParCSRFlexGMRESCreate(MPI_COMM_WORLD, &solver); + + /* Set some parameters (See Reference Manual for more parameters) */ + HYPRE_FlexGMRESSetKDim(solver, restart); + HYPRE_FlexGMRESSetMaxIter(solver, 1000); /* max iterations */ + HYPRE_FlexGMRESSetTol(solver, 1e-7); /* conv. tolerance */ + HYPRE_FlexGMRESSetPrintLevel(solver, 2); /* print solve info */ + HYPRE_FlexGMRESSetLogging(solver, 1); /* needed to get run info later */ + + + /* Now set up the AMG preconditioner and specify any parameters */ + HYPRE_BoomerAMGCreate(&precond); + HYPRE_BoomerAMGSetPrintLevel(precond, 1); /* print amg solution info */ + HYPRE_BoomerAMGSetCoarsenType(precond, 6); + HYPRE_BoomerAMGSetOldDefault(precond); + HYPRE_BoomerAMGSetRelaxType(precond, 6); /* Sym G.S./Jacobi hybrid */ + HYPRE_BoomerAMGSetNumSweeps(precond, 1); + HYPRE_BoomerAMGSetTol(precond, 0.0); /* conv. tolerance zero */ + HYPRE_BoomerAMGSetMaxIter(precond, 1); /* do only one iteration! */ + + /* Set the FlexGMRES preconditioner */ + HYPRE_FlexGMRESSetPrecond(solver, (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSolve, + (HYPRE_PtrToSolverFcn) HYPRE_BoomerAMGSetup, precond); + + + if (modify) + /* this is an optional call - if you don't call it, hypre_FlexGMRESModifyPCDefault + is used - which does nothing. Otherwise, you can define your own, similar to + the one used here */ + HYPRE_FlexGMRESSetModifyPC( solver, + (HYPRE_PtrToModifyPCFcn) hypre_FlexGMRESModifyPCAMGExample); + + + /* Now setup and solve! */ + HYPRE_ParCSRFlexGMRESSetup(solver, parcsr_A, par_b, par_x); + HYPRE_ParCSRFlexGMRESSolve(solver, parcsr_A, par_b, par_x); + + /* Run info - needed logging turned on */ + HYPRE_FlexGMRESGetNumIterations(solver, &num_iterations); + HYPRE_FlexGMRESGetFinalRelativeResidualNorm(solver, &final_res_norm); + if (myid == 0) + { + printf("\n"); + printf("Iterations = %lld\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Destory solver and preconditioner */ + HYPRE_ParCSRFlexGMRESDestroy(solver); + HYPRE_BoomerAMGDestroy(precond); + + } + else + { + if (myid ==0) printf("Invalid solver id specified.\n"); + } + + /* Clean up */ + HYPRE_IJMatrixDestroy(A); + HYPRE_IJVectorDestroy(b); + HYPRE_IJVectorDestroy(x); + + /* Finalize MPI*/ + MPI_Finalize(); + + return(0); +} + +/*-------------------------------------------------------------------------- + hypre_FlexGMRESModifyPCAMGExample - + + This is an example (not recommended) + of how we can modify things about AMG that + affect the solve phase based on how FlexGMRES is doing...For + another preconditioner it may make sense to modify the tolerance.. + + *--------------------------------------------------------------------------*/ + +int hypre_FlexGMRESModifyPCAMGExample(void *precond_data, int iterations, + double rel_residual_norm) +{ + + + if (rel_residual_norm > .1) + { + HYPRE_BoomerAMGSetNumSweeps((HYPRE_Solver)precond_data, 10); + } + else + { + HYPRE_BoomerAMGSetNumSweeps((HYPRE_Solver)precond_data, 1); + } + + + return 0; +} diff --git a/3rd_party/hypre/src/examples/ex5f.f b/3rd_party/hypre/src/examples/ex5f.f new file mode 100644 index 000000000..c6a470633 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex5f.f @@ -0,0 +1,466 @@ +! Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +! HYPRE Project Developers. See the top-level COPYRIGHT file for details. +! +! SPDX-License-Identifier: (Apache-2.0 OR MIT) + +! +! Example 5 +! +! Interface: Linear-Algebraic (IJ), Fortran (77) version +! +! Compile with: make ex5f +! +! Sample run: mpirun -np 4 ex5f +! +! Description: This example solves the 2-D +! Laplacian problem with zero boundary conditions +! on an nxn grid. The number of unknowns is N=n^2. +! The standard 5-point stencil is used, and we solve +! for the interior nodes only. +! +! This example solves the same problem as Example 3. +! Available solvers are AMG, PCG, and PCG with AMG, +! and PCG with ParaSails +! +! +! Notes: for PCG, GMRES and BiCGStab, precond_id means: +! 0 - do not set up a preconditioner +! 1 - set up a ds preconditioner +! 2 - set up an amg preconditioner +! 3 - set up a pilut preconditioner +! 4 - set up a ParaSails preconditioner +! + + program ex5f + + + implicit none + + include 'mpif.h' + + integer MAX_LOCAL_SIZE + integer HYPRE_PARCSR + + parameter (MAX_LOCAL_SIZE=123000) + +! the following is from HYPRE.c + parameter (HYPRE_PARCSR=5555) + + integer ierr + integer num_procs, myid + integer local_size, extra + integer n, solver_id, print_solution, ng + integer nnz, ilower, iupper, i + integer precond_id; + double precision h, h2 + double precision rhs_values(MAX_LOCAL_SIZE) + double precision x_values(MAX_LOCAL_SIZE) + integer rows(MAX_LOCAL_SIZE) + integer cols(5) + double precision values(5) + integer num_iterations + double precision final_res_norm, tol + + integer mpi_comm + + integer*8 parcsr_A + integer*8 A + integer*8 b + integer*8 x + integer*8 par_b + integer*8 par_x + integer*8 solver + integer*8 precond + +!----------------------------------------------------------------------- +! Initialize MPI +!----------------------------------------------------------------------- + + call MPI_INIT(ierr) + call MPI_COMM_RANK(MPI_COMM_WORLD, myid, ierr) + call MPI_COMM_SIZE(MPI_COMM_WORLD, num_procs, ierr) + mpi_comm = MPI_COMM_WORLD + +! Default problem parameters + n = 33 + solver_id = 0 + print_solution = 0 + tol = 1.0d-7 + +! The input section not implemented yet. + +! Preliminaries: want at least one processor per row + if ( n*n .lt. num_procs ) then + n = int(sqrt(real(num_procs))) + 1 + endif +! ng = global no. rows, h = mesh size + ng = n*n + h = 1.0d0/(n+1) + h2 = h*h + +! Each processor knows only of its own rows - the range is denoted by ilower +! and upper. Here we partition the rows. We account for the fact that +! N may not divide evenly by the number of processors. + local_size = ng/num_procs + extra = ng - local_size*num_procs + + ilower = local_size*myid + ilower = ilower + min(myid, extra) + + iupper = local_size*(myid+1) + iupper = iupper + min(myid+1, extra) + iupper = iupper - 1 + +! How many rows do I have? + local_size = iupper - ilower + 1 + +! Create the matrix. +! Note that this is a square matrix, so we indicate the row partition +! size twice (since number of rows = number of cols) + call HYPRE_IJMatrixCreate(mpi_comm, ilower, + 1 iupper, ilower, iupper, A, ierr) + + +! Choose a parallel csr format storage (see the User's Manual) + call HYPRE_IJMatrixSetObjectType(A, HYPRE_PARCSR, ierr) + +! Initialize before setting coefficients + call HYPRE_IJMatrixInitialize(A, ierr) + + +! Now go through my local rows and set the matrix entries. +! Each row has at most 5 entries. For example, if n=3: +! +! A = [M -I 0; -I M -I; 0 -I M] +! M = [4 -1 0; -1 4 -1; 0 -1 4] +! +! Note that here we are setting one row at a time, though +! one could set all the rows together (see the User's Manual). + + + do i = ilower, iupper + nnz = 1 + + +! The left identity block:position i-n + if ( (i-n) .ge. 0 ) then + cols(nnz) = i-n + values(nnz) = -1.0d0 + nnz = nnz + 1 + endif + +! The left -1: position i-1 + if ( mod(i,n).ne.0 ) then + cols(nnz) = i-1 + values(nnz) = -1.0d0 + nnz = nnz + 1 + endif + +! Set the diagonal: position i + cols(nnz) = i + values(nnz) = 4.0d0 + nnz = nnz + 1 + +! The right -1: position i+1 + if ( mod((i+1),n) .ne. 0 ) then + cols(nnz) = i+1 + values(nnz) = -1.0d0 + nnz = nnz + 1 + endif + +! The right identity block:position i+n + if ( (i+n) .lt. ng ) then + cols(nnz) = i+n + values(nnz) = -1.0d0 + nnz = nnz + 1 + endif + +! Set the values for row i + call HYPRE_IJMatrixSetValues( + 1 A, 1, nnz-1, i, cols, values, ierr) + + enddo + + +! Assemble after setting the coefficients + call HYPRE_IJMatrixAssemble(A, ierr) + +! Get parcsr matrix object + call HYPRE_IJMatrixGetObject(A, parcsr_A, ierr) + + +! Create the rhs and solution + call HYPRE_IJVectorCreate(mpi_comm, + 1 ilower, iupper, b, ierr) + call HYPRE_IJVectorSetObjectType(b, HYPRE_PARCSR, ierr) + call HYPRE_IJVectorInitialize(b, ierr) + + call HYPRE_IJVectorCreate(mpi_comm, + 1 ilower, iupper, x, ierr) + call HYPRE_IJVectorSetObjectType(x, HYPRE_PARCSR, ierr) + call HYPRE_IJVectorInitialize(x, ierr) + + +! Set the rhs values to h^2 and the solution to zero + do i = 1, local_size + rhs_values(i) = h2 + x_values(i) = 0.0 + rows(i) = ilower + i -1 + enddo + call HYPRE_IJVectorSetValues( + 1 b, local_size, rows, rhs_values, ierr) + call HYPRE_IJVectorSetValues( + 1 x, local_size, rows, x_values, ierr) + + + call HYPRE_IJVectorAssemble(b, ierr) + call HYPRE_IJVectorAssemble(x, ierr) + +! get the x and b objects + + call HYPRE_IJVectorGetObject(b, par_b, ierr) + call HYPRE_IJVectorGetObject(x, par_x, ierr) + + +! Choose a solver and solve the system + +! AMG + if ( solver_id .eq. 0 ) then + +! Create solver + call HYPRE_BoomerAMGCreate(solver, ierr) + + +! Set some parameters (See Reference Manual for more parameters) + +! print solve info + parameters + call HYPRE_BoomerAMGSetPrintLevel(solver, 3, ierr) +! old defaults, Falgout coarsening, mod. class. interpolation + call HYPRE_BoomerAMGSetOldDefault(solver, ierr) +! G-S/Jacobi hybrid relaxation + call HYPRE_BoomerAMGSetRelaxType(solver, 3, ierr) +! C/F relaxation + call HYPRE_BoomerAMGSetRelaxOrder(solver, 1, ierr) +! Sweeeps on each level + call HYPRE_BoomerAMGSetNumSweeps(solver, 1, ierr) +! maximum number of levels + call HYPRE_BoomerAMGSetMaxLevels(solver, 20, ierr) +! conv. tolerance + call HYPRE_BoomerAMGSetTol(solver, 1.0d-7, ierr) + +! Now setup and solve! + call HYPRE_BoomerAMGSetup( + 1 solver, parcsr_A, par_b, par_x, ierr) + call HYPRE_BoomerAMGSolve( + 1 solver, parcsr_A, par_b, par_x, ierr) + + +! Run info - needed logging turned on + call HYPRE_BoomerAMGGetNumIterations(solver, num_iterations, + 1 ierr) + call HYPRE_BoomerAMGGetFinalReltvRes(solver, final_res_norm, + 1 ierr) + + + if ( myid .eq. 0 ) then + print * + print '(A,I2)', " Iterations = ", num_iterations + print '(A,ES16.8)', + 1 " Final Relative Residual Norm = ", final_res_norm + print * + endif + +! Destroy solver + call HYPRE_BoomerAMGDestroy(solver, ierr) + +! PCG (with DS) + elseif ( solver_id .eq. 50 ) then + + +! Create solver + call HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, solver, ierr) + +! Set some parameters (See Reference Manual for more parameters) + call HYPRE_ParCSRPCGSetMaxIter(solver, 1000, ierr) + call HYPRE_ParCSRPCGSetTol(solver, 1.0d-7, ierr) + call HYPRE_ParCSRPCGSetTwoNorm(solver, 1, ierr) + call HYPRE_ParCSRPCGSetPrintLevel(solver, 2, ierr) + call HYPRE_ParCSRPCGSetLogging(solver, 1, ierr) + +! set ds (diagonal scaling) as the pcg preconditioner + precond_id = 1 + call HYPRE_ParCSRPCGSetPrecond(solver, precond_id, + 1 precond, ierr) + + + +! Now setup and solve! + call HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, + & par_x, ierr) + call HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, + & par_x, ierr) + + +! Run info - needed logging turned on + + call HYPRE_ParCSRPCGGetNumIterations(solver, num_iterations, + & ierr) + call HYPRE_ParCSRPCGGetFinalRelative(solver, final_res_norm, + & ierr) + + if ( myid .eq. 0 ) then + print * + print *, "Iterations = ", num_iterations + print *, "Final Relative Residual Norm = ", final_res_norm + print * + endif + +! Destroy solver + call HYPRE_ParCSRPCGDestroy(solver, ierr) + + +! PCG with AMG preconditioner + elseif ( solver_id == 1 ) then + +! Create solver + call HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, solver, ierr) + +! Set some parameters (See Reference Manual for more parameters) + call HYPRE_ParCSRPCGSetMaxIter(solver, 1000, ierr) + call HYPRE_ParCSRPCGSetTol(solver, 1.0d-7, ierr) + call HYPRE_ParCSRPCGSetTwoNorm(solver, 1, ierr) + call HYPRE_ParCSRPCGSetPrintLevel(solver, 2, ierr) + call HYPRE_ParCSRPCGSetLogging(solver, 1, ierr) + +! Now set up the AMG preconditioner and specify any parameters + + call HYPRE_BoomerAMGCreate(precond, ierr) + + +! Set some parameters (See Reference Manual for more parameters) + +! print less solver info since a preconditioner + call HYPRE_BoomerAMGSetPrintLevel(precond, 1, ierr); +! Falgout coarsening + call HYPRE_BoomerAMGSetCoarsenType(precond, 6, ierr) +! old defaults + call HYPRE_BoomerAMGSetOldDefault(precond, ierr) +! SYMMETRIC G-S/Jacobi hybrid relaxation + call HYPRE_BoomerAMGSetRelaxType(precond, 6, ierr) +! Sweeeps on each level + call HYPRE_BoomerAMGSetNumSweeps(precond, 1, ierr) +! conv. tolerance + call HYPRE_BoomerAMGSetTol(precond, 0.0d0, ierr) +! do only one iteration! + call HYPRE_BoomerAMGSetMaxIter(precond, 1, ierr) + +! set amg as the pcg preconditioner + precond_id = 2 + call HYPRE_ParCSRPCGSetPrecond(solver, precond_id, + 1 precond, ierr) + + +! Now setup and solve! + call HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, + 1 par_x, ierr) + call HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, + 1 par_x, ierr) + + +! Run info - needed logging turned on + + call HYPRE_ParCSRPCGGetNumIterations(solver, num_iterations, + 1 ierr) + call HYPRE_ParCSRPCGGetFinalRelative(solver, final_res_norm, + 1 ierr) + + if ( myid .eq. 0 ) then + print * + print *, "Iterations = ", num_iterations + print *, "Final Relative Residual Norm = ", final_res_norm + print * + endif + +! Destroy precond and solver + + call HYPRE_BoomerAMGDestroy(precond, ierr) + call HYPRE_ParCSRPCGDestroy(solver, ierr) + +! PCG with ParaSails + elseif ( solver_id .eq. 8 ) then + +! Create solver + call HYPRE_ParCSRPCGCreate(MPI_COMM_WORLD, solver, ierr) + +! Set some parameters (See Reference Manual for more parameters) + call HYPRE_ParCSRPCGSetMaxIter(solver, 1000, ierr) + call HYPRE_ParCSRPCGSetTol(solver, 1.0d-7, ierr) + call HYPRE_ParCSRPCGSetTwoNorm(solver, 1, ierr) + call HYPRE_ParCSRPCGSetPrintLevel(solver, 2, ierr) + call HYPRE_ParCSRPCGSetLogging(solver, 1, ierr) + +! Now set up the Parasails preconditioner and specify any parameters + call HYPRE_ParaSailsCreate(MPI_COMM_WORLD, precond,ierr) + call HYPRE_ParaSailsSetParams(precond, 0.1d0, 1, ierr) + call HYPRE_ParaSailsSetFilter(precond, 0.05d0, ierr) + call HYPRE_ParaSailsSetSym(precond, 1, ierr) + call HYPRE_ParaSailsSetLogging(precond, 3, ierr) + +! set parsails as the pcg preconditioner + precond_id = 4 + call HYPRE_ParCSRPCGSetPrecond(solver, precond_id, + 1 precond, ierr) + + +! Now setup and solve! + call HYPRE_ParCSRPCGSetup(solver, parcsr_A, par_b, + 1 par_x, ierr) + call HYPRE_ParCSRPCGSolve(solver, parcsr_A, par_b, + 1 par_x, ierr) + + +! Run info - needed logging turned on + + call HYPRE_ParCSRPCGGetNumIterations(solver, num_iterations, + 1 ierr) + call HYPRE_ParCSRPCGGetFinalRelative(solver, final_res_norm, + 1 ierr) + + if ( myid .eq. 0 ) then + print * + print *, "Iterations = ", num_iterations + print *, "Final Relative Residual Norm = ", final_res_norm + print * + endif + +! Destroy precond and solver + + call HYPRE_ParaSailsDestroy(precond, ierr) + call HYPRE_ParCSRPCGDestroy(solver, ierr) + + else + if ( myid .eq. 0 ) then + print *,'Invalid solver id specified' + stop + endif + endif + + + +! Print the solution + if ( print_solution .ne. 0 ) then + call HYPRE_IJVectorPrint(x, "ij.out.x", ierr) + endif + +! Clean up + + call HYPRE_IJMatrixDestroy(A, ierr) + call HYPRE_IJVectorDestroy(b, ierr) + call HYPRE_IJVectorDestroy(x, ierr) + + +! Finalize MPI + call MPI_Finalize(ierr) + + stop + end diff --git a/3rd_party/hypre/src/examples/ex6.c b/3rd_party/hypre/src/examples/ex6.c new file mode 100644 index 000000000..e2cc8714d --- /dev/null +++ b/3rd_party/hypre/src/examples/ex6.c @@ -0,0 +1,592 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 6 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex6 + + Sample run: mpirun -np 2 ex6 + + Description: This is a two processor example and is the same problem + as is solved with the structured interface in Example 2. + (The grid boxes are exactly those in the example + diagram in the struct interface chapter of the User's Manual. + Processor 0 owns two boxes and processor 1 owns one box.) + + This is the simplest sstruct example, and it demonstrates how + the semi-structured interface can be used for structured problems. + There is one part and one variable. The solver is PCG with SMG + preconditioner. We use a structured solver for this example. +*/ + +#include + +/* SStruct linear solvers headers */ +#include "HYPRE_sstruct_ls.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + + int vis = 0; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructStencil stencil; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + /* We are using struct solvers for this example */ + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + int object_type; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs != 2) + { + if (myid ==0) printf("Must run with 2 processors!\n"); + MPI_Finalize(); + + return(0); + } + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* 1. Set up the 2D grid. This gives the index space in each part. + Here we only use one part and one variable. (So the part id is 0 + and the variable id is 0) */ + { + int ndim = 2; + int nparts = 1; + int part = 0; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Set the extents of the grid - each processor sets its grid + boxes. Each part has its own relative index space numbering, + but in this example all boxes belong to the same part. */ + + /* Processor 0 owns two boxes in the grid. */ + if (myid == 0) + { + /* Add a new box to the grid */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Add a new box to the grid */ + { + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + } + + /* Processor 1 owns one box in the grid. */ + else if (myid == 1) + { + /* Add a new box to the grid */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + } + + /* Set the variable type and number of variables on each part. */ + { + int i; + int nvars = 1; + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_CELL}; + + for (i = 0; i< nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* Now the grid is ready to use */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencil(s) */ + { + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_SStructStencilCreate(2, 5, &stencil); + + /* Define the geometry of the stencil. Each represents a + relative offset (in the index space). */ + { + int entry; + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + int var = 0; + + /* Assign numerical values to the offsets so that we can + easily refer to them - the last argument indicates the + variable for which we are assigning this stencil - we are + just using one variable in this example so it is the first one (0) */ + for (entry = 0; entry < 5; entry++) + HYPRE_SStructStencilSetEntry(stencil, entry, offsets[entry], var); + } + } + + /* 3. Set up the Graph - this determines the non-zero structure + of the matrix and allows non-stencil relationships between the parts */ + { + int var = 0; + int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + object_type = HYPRE_STRUCT; + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Now we need to tell the graph which stencil to use for each + variable on each part (we only have one variable and one part) */ + HYPRE_SStructGraphSetStencil(graph, part, var, stencil); + + /* Here we could establish connections between parts if we + had more than one part using the graph. For example, we could + use HYPRE_GraphAddEntries() routine or HYPRE_GridSetNeighborBox() */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up a SStruct Matrix */ + { + int i,j; + int part = 0; + int var = 0; + + /* Create the empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set the object type (by default HYPRE_SSTRUCT). This determines the + data structure used to store the matrix. If you want to use unstructured + solvers, e.g. BoomerAMG, the object type should be HYPRE_PARCSR. + If the problem is purely structured (with one part), you may want to use + HYPRE_STRUCT to access the structured solvers. Here we have a purely + structured example. */ + object_type = HYPRE_STRUCT; + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Get ready to set values */ + HYPRE_SStructMatrixInitialize(A); + + /* Each processor must set the stencil values for their boxes on each part. + In this example, we only set stencil entries and therefore use + HYPRE_SStructMatrixSetBoxValues. If we need to set non-stencil entries, + we have to use HYPRE_SStructMatrixSetValues (shown in a later example). */ + + if (myid == 0) + { + /* Set the matrix coefficients for some set of stencil entries + over all the gridpoints in my first box (account for boundary + grid points later) */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nentries = 5; + int nvalues = 30; /* 6 grid points, each with 5 stencil entries */ + double values[30]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) /* label the stencil indices - + these correspond to the offsets + defined above */ + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my second box */ + { + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + int nentries = 5; + int nvalues = 60; /* 12 grid points, each with 5 stencil entries */ + double values[60]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + } + else if (myid == 1) + { + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my box */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + int nentries = 5; + int nvalues = 80; /* 16 grid points, each with 5 stencil entries */ + double values[80]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + } + + /* For each box, set any coefficients that reach ouside of the + boundary to 0 */ + if (myid == 0) + { + int maxnvalues = 6; + double values[6]; + + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + { + /* Values below our first AND second box */ + int ilower[2] = {-3, 1}; + int iupper[2] = { 2, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values to the left of our first box */ + int ilower[2] = {-3, 1}; + int iupper[2] = {-3, 2}; + + int stencil_indices[1] = {1}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values above our first box */ + int ilower[2] = {-3, 2}; + int iupper[2] = {-1, 2}; + + int stencil_indices[1] = {4}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values to the left of our second box (that do not border the + first box). */ + int ilower[2] = { 0, 3}; + int iupper[2] = { 0, 4}; + + int stencil_indices[1] = {1}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values above our second box */ + int ilower[2] = { 0, 4}; + int iupper[2] = { 2, 4}; + + int stencil_indices[1] = {4}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + } + else if (myid == 1) + { + int maxnvalues = 4; + double values[4]; + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + { + /* Values below our box */ + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values to the right of our box */ + int ilower[2] = { 6, 1}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {2}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values above our box */ + int ilower[2] = { 3, 4}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {4}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_SStructMatrixAssemble(A); + } + + + /* 5. Set up SStruct Vectors for b and x */ + { + int i; + + /* We have one part and one variable. */ + int part = 0; + int var = 0; + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* As with the matrix, set the object type for the vectors + to be the struct type */ + object_type = HYPRE_STRUCT; + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + if (myid == 0) + { + /* Set the vector coefficients over the gridpoints in my first box */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nvalues = 6; /* 6 grid points */ + double values[6]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + /* Set the vector coefficients over the gridpoints in my second box */ + { + int ilower[2] = { 0, 1}; + int iupper[2] = { 2, 4}; + + int nvalues = 12; /* 12 grid points */ + double values[12]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + } + else if (myid == 1) + { + /* Set the vector coefficients over the gridpoints in my box */ + { + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 4}; + + int nvalues = 16; /* 16 grid points */ + double values[16]; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + } + + /* This is a collective call finalizing the vector assembly. + The vectors are now ``ready to be used'' */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + + /* 6. Set up and use a solver (See the Reference Manual for descriptions + of all of the options.) */ + { + HYPRE_StructMatrix sA; + HYPRE_StructVector sb; + HYPRE_StructVector sx; + + /* Because we are using a struct solver, we need to get the + object of the matrix and vectors to pass in to the struct solvers */ + HYPRE_SStructMatrixGetObject(A, (void **) &sA); + HYPRE_SStructVectorGetObject(b, (void **) &sb); + HYPRE_SStructVectorGetObject(x, (void **) &sx); + + /* Create an empty PCG Struct solver */ + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set PCG parameters */ + HYPRE_StructPCGSetTol(solver, 1.0e-06); + HYPRE_StructPCGSetPrintLevel(solver, 2); + HYPRE_StructPCGSetMaxIter(solver, 50); + + /* Create the Struct SMG solver for use as a preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + + /* Set SMG parameters */ + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, 1); + HYPRE_StructSMGSetNumPostRelax(precond, 1); + + /* Set preconditioner and solve */ + HYPRE_StructPCGSetPrecond(solver, HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, precond); + HYPRE_StructPCGSetup(solver, sA, sb, sx); + HYPRE_StructPCGSolve(solver, sA, sb, sx); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex6.sh */ + if (vis) + { + GLVis_PrintSStructGrid(grid, "vis/ex6.mesh", myid, NULL, NULL); + GLVis_PrintSStructVector(x, 0, "vis/ex6.sol", myid); + GLVis_PrintData("vis/ex6.data", myid, num_procs); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + HYPRE_StructPCGDestroy(solver); + HYPRE_StructSMGDestroy(precond); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex7.c b/3rd_party/hypre/src/examples/ex7.c new file mode 100644 index 000000000..2373ae8b2 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex7.c @@ -0,0 +1,1270 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 7 + + Interface: SStructured interface (SStruct) + + Compile with: make ex7 + + Sample run: mpirun -np 16 ex7 -n 33 -solver 10 -K 3 -B 0 -C 1 -U0 2 -F 4 + + To see options: ex7 -help + + Description: This example uses the sstruct interface to solve the same + problem as was solved in Example 4 with the struct interface. + Therefore, there is only one part and one variable. + + This code solves the convection-reaction-diffusion problem + div (-K grad u + B u) + C u = F in the unit square with + boundary condition u = U0. The domain is split into N x N + processor grid. Thus, the given number of processors should + be a perfect square. Each processor has a n x n grid, with + nodes connected by a 5-point stencil. We use cell-centered + variables, and, therefore, the nodes are not shared. + + To incorporate the boundary conditions, we do the following: + Let x_i and x_b be the interior and boundary parts of the + solution vector x. If we split the matrix A as + A = [A_ii A_ib; A_bi A_bb], + then we solve + [A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0]. + Note that this differs from Example 3 in that we + are actually solving for the boundary conditions (so they + may not be exact as in ex3, where we only solved for the + interior). This approach is useful for more general types + of b.c. + + As in the previous example (Example 6), we use a structured + solver. A number of structured solvers are available. + More information can be found in the Solvers and Preconditioners + chapter of the User's Manual. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_krylov.h" +#include "HYPRE_sstruct_ls.h" + +#ifdef M_PI + #define PI M_PI +#else + #define PI 3.14159265358979 +#endif + +#include "vis.c" + +/* Macro to evaluate a function F in the grid point (i,j) */ +#define Eval(F,i,j) (F( (ilower[0]+(i))*h, (ilower[1]+(j))*h )) +#define bcEval(F,i,j) (F( (bc_ilower[0]+(i))*h, (bc_ilower[1]+(j))*h )) + +int optionK, optionB, optionC, optionU0, optionF; + +/* Diffusion coefficient */ +double K(double x, double y) +{ + switch (optionK) + { + case 0: + return 1.0; + case 1: + return x*x+exp(y); + case 2: + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25)) + return 100.0; + else + return 1.0; + case 3: + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)) < 0.0625) + return 10.0; + else + return 1.0; + default: + return 1.0; + } +} + +/* Convection vector, first component */ +double B1(double x, double y) +{ + switch (optionB) + { + case 0: + return 0.0; + case 1: + return -0.1; + case 2: + return 0.25; + case 3: + return 1.0; + default: + return 0.0; + } +} + +/* Convection vector, second component */ +double B2(double x, double y) +{ + switch (optionB) + { + case 0: + return 0.0; + case 1: + return 0.1; + case 2: + return -0.25; + case 3: + return 1.0; + default: + return 0.0; + } +} + +/* Reaction coefficient */ +double C(double x, double y) +{ + switch (optionC) + { + case 0: + return 0.0; + case 1: + return 10.0; + case 2: + return 100.0; + default: + return 0.0; + } +} + +/* Boundary condition */ +double U0(double x, double y) +{ + switch (optionU0) + { + case 0: + return 0.0; + case 1: + return (x+y)/100; + case 2: + return (sin(5*PI*x)+sin(5*PI*y))/1000; + default: + return 0.0; + } +} + +/* Right-hand side */ +double F(double x, double y) +{ + switch (optionF) + { + case 0: + return 1.0; + case 1: + return 0.0; + case 2: + return 2*PI*PI*sin(PI*x)*sin(PI*y); + case 3: + if ((fabs(x-0.5) < 0.25) && (fabs(y-0.5) < 0.25)) + return -1.0; + else + return 1.0; + case 4: + if (((x-0.5)*(x-0.5)+(y-0.5)*(y-0.5)) < 0.0625) + return -1.0; + else + return 1.0; + default: + return 1.0; + } +} + +int main (int argc, char *argv[]) +{ + int i, j, k; + + int myid, num_procs; + + int n, N, pi, pj; + double h, h2; + int ilower[2], iupper[2]; + + int solver_id; + int n_pre, n_post; + int rap, relax, skip, sym; + int time_index; + + int object_type; + + int num_iterations; + double final_res_norm; + + int vis; + + HYPRE_SStructGrid grid; + HYPRE_SStructStencil stencil; + HYPRE_SStructGraph graph; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + /* We are using struct solvers for this example */ + HYPRE_StructSolver solver; + HYPRE_StructSolver precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set default parameters */ + n = 33; + optionK = 0; + optionB = 0; + optionC = 0; + optionU0 = 0; + optionF = 0; + solver_id = 10; + n_pre = 1; + n_post = 1; + rap = 0; + relax = 1; + skip = 0; + sym = 0; + + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-K") == 0 ) + { + arg_index++; + optionK = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-B") == 0 ) + { + arg_index++; + optionB = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-C") == 0 ) + { + arg_index++; + optionC = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-U0") == 0 ) + { + arg_index++; + optionU0 = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-F") == 0 ) + { + arg_index++; + optionF = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-v") == 0 ) + { + arg_index++; + n_pre = atoi(argv[arg_index++]); + n_post = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-rap") == 0 ) + { + arg_index++; + rap = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-relax") == 0 ) + { + arg_index++; + relax = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-skip") == 0 ) + { + arg_index++; + skip = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-sym") == 0 ) + { + arg_index++; + sym = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 8)\n"); + printf(" -K : choice for the diffusion coefficient (default: 1)\n"); + printf(" -B : choice for the convection vector (default: 0)\n"); + printf(" -C : choice for the reaction coefficient (default: 0)\n"); + printf(" -U0 : choice for the boundary condition (default: 0)\n"); + printf(" -F : choice for the right-hand side (default: 1) \n"); + printf(" -solver : solver ID\n"); + printf(" 0 - SMG \n"); + printf(" 1 - PFMG\n"); + printf(" 10 - CG with SMG precond (default)\n"); + printf(" 11 - CG with PFMG precond\n"); + printf(" 17 - CG with 2-step Jacobi\n"); + printf(" 18 - CG with diagonal scaling\n"); + printf(" 19 - CG\n"); + printf(" 30 - GMRES with SMG precond\n"); + printf(" 31 - GMRES with PFMG precond\n"); + printf(" 37 - GMRES with 2-step Jacobi\n"); + printf(" 38 - GMRES with diagonal scaling\n"); + printf(" 39 - GMRES\n"); + printf(" -v : number of pre and post relaxations\n"); + printf(" -rap : coarse grid operator type\n"); + printf(" 0 - Galerkin (default)\n"); + printf(" 1 - non-Galerkin ParFlow operators\n"); + printf(" 2 - Galerkin, general operators\n"); + printf(" -relax : relaxation type\n"); + printf(" 0 - Jacobi\n"); + printf(" 1 - Weighted Jacobi (default)\n"); + printf(" 2 - R/B Gauss-Seidel\n"); + printf(" 3 - R/B Gauss-Seidel (nonsymmetric)\n"); + printf(" -skip : skip levels in PFMG (0 or 1)\n"); + printf(" -sym : symmetric storage (1) or not (0)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Convection produces non-symmetric matrices */ + if (optionB && sym) + optionB = 0; + + /* Figure out the processor grid (N x N). The local + problem size is indicated by n (n x n). pi and pj + indicate position in the processor grid. */ + N = sqrt(num_procs); + h = 1.0 / (N*n-1); + h2 = h*h; + pj = myid / N; + pi = myid - pj*N; + + /* Define the nodes owned by the current processor (each processor's + piece of the global grid) */ + ilower[0] = pi*n; + ilower[1] = pj*n; + iupper[0] = ilower[0] + n-1; + iupper[1] = ilower[1] + n-1; + + /* 1. Set up a 2D grid */ + { + int ndim = 2; + int nparts = 1; + int nvars = 1; + int part = 0; + int i; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Add a new box to the grid */ + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + + /* Set the variable type for each part */ + { + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_CELL}; + + for (i = 0; i< nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencil */ + { + int ndim = 2; + int var = 0; + + if (sym == 0) + { + /* Define the geometry of the stencil */ + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + + /* Create an empty 2D, 5-pt stencil object */ + HYPRE_SStructStencilCreate(ndim, 5, &stencil); + + /* Assign stencil entries */ + for (i = 0; i < 5; i++) + HYPRE_SStructStencilSetEntry(stencil, i, offsets[i], var); + } + else /* Symmetric storage */ + { + /* Define the geometry of the stencil */ + int offsets[3][2] = {{0,0}, {1,0}, {0,1}}; + + /* Create an empty 2D, 3-pt stencil object */ + HYPRE_SStructStencilCreate(ndim, 3, &stencil); + + /* Assign stencil entries */ + for (i = 0; i < 3; i++) + HYPRE_SStructStencilSetEntry(stencil, i, offsets[i], var); + } + } + + /* 3. Set up the Graph - this determines the non-zero structure + of the matrix */ + { + int var = 0; + int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + object_type = HYPRE_STRUCT; + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Now we need to tell the graph which stencil to use for each + variable on each part (we only have one variable and one part)*/ + HYPRE_SStructGraphSetStencil(graph, part, var, stencil); + + /* Here we could establish connections between parts if we + had more than one part. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up SStruct Vectors for b and x */ + { + double *values; + + /* We have one part and one variable. */ + int part = 0; + int var = 0; + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Set the object type (by default HYPRE_SSTRUCT). This determines the + data structure used to store the matrix. If you want to use unstructured + solvers, e.g. BoomerAMG, the object type should be HYPRE_PARCSR. + If the problem is purely structured (with one part), you may want to use + HYPRE_STRUCT to access the structured solvers. Here we have a purely + structured example. */ + object_type = HYPRE_STRUCT; + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + values = (double*) calloc((n*n), sizeof(double)); + + /* Set the values of b in left-to-right, bottom-to-top order */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k++) + values[k] = h2 * Eval(F,i,j); + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + /* Set x = 0 */ + for (i = 0; i < (n*n); i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + + free(values); + + /* Assembling is postponed since the vectors will be further modified */ + } + + /* 4. Set up a SStruct Matrix */ + { + /* We have one part and one variable. */ + int part = 0; + int var = 0; + + /* Create an empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Use symmetric storage? The function below is for symmetric stencil entries + (use HYPRE_SStructMatrixSetNSSymmetric for non-stencil entries) */ + HYPRE_SStructMatrixSetSymmetric(A, part, var, var, sym); + + /* As with the vectors, set the object type for the vectors + to be the struct type */ + object_type = HYPRE_STRUCT; + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Set the stencil values in the interior. Here we set the values + at every node. We will modify the boundary nodes later. */ + if (sym == 0) + { + int stencil_indices[5] = {0, 1, 2, 3, 4}; /* labels correspond + to the offsets */ + double *values; + + values = (double*) calloc(5*(n*n), sizeof(double)); + + /* The order is left-to-right, bottom-to-top */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k+=5) + { + values[k+1] = - Eval(K,i-0.5,j) - Eval(B1,i-0.5,j); + + values[k+2] = - Eval(K,i+0.5,j) + Eval(B1,i+0.5,j); + + values[k+3] = - Eval(K,i,j-0.5) - Eval(B2,i,j-0.5); + + values[k+4] = - Eval(K,i,j+0.5) + Eval(B2,i,j+0.5); + + values[k] = h2 * Eval(C,i,j) + + Eval(K ,i-0.5,j) + Eval(K ,i+0.5,j) + + Eval(K ,i,j-0.5) + Eval(K ,i,j+0.5) + - Eval(B1,i-0.5,j) + Eval(B1,i+0.5,j) + - Eval(B2,i,j-0.5) + Eval(B2,i,j+0.5); + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 5, + stencil_indices, values); + + free(values); + } + else /* Symmetric storage */ + { + int stencil_indices[3] = {0, 1, 2}; + double *values; + + values = (double*) calloc(3*(n*n), sizeof(double)); + + /* The order is left-to-right, bottom-to-top */ + for (k = 0, j = 0; j < n; j++) + for (i = 0; i < n; i++, k+=3) + { + values[k+1] = - Eval(K,i+0.5,j); + values[k+2] = - Eval(K,i,j+0.5); + values[k] = h2 * Eval(C,i,j) + + Eval(K,i+0.5,j) + Eval(K,i,j+0.5) + + Eval(K,i-0.5,j) + Eval(K,i,j-0.5); + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 3, + stencil_indices, values); + + free(values); + } + } + + /* 5. Set the boundary conditions, while eliminating the coefficients + reaching ouside of the domain boundary. We must modify the matrix + stencil and the corresponding rhs entries. */ + { + int bc_ilower[2]; + int bc_iupper[2]; + + int stencil_indices[5] = {0, 1, 2, 3, 4}; + double *values, *bvalues; + + int nentries; + + /* We have one part and one variable. */ + int part = 0; + int var = 0; + + if (sym == 0) + nentries = 5; + else + nentries = 3; + + values = (double*) calloc(nentries*n, sizeof(double)); + bvalues = (double*) calloc(n, sizeof(double)); + + /* The stencil at the boundary nodes is 1-0-0-0-0. Because + we have I x_b = u_0; */ + for (i = 0; i < nentries*n; i += nentries) + { + values[i] = 1.0; + for (j = 1; j < nentries; j++) + values[i+j] = 0.0; + } + + /* Processors at y = 0 */ + if (pj == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + /* Modify the matrix */ + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,0); + + HYPRE_SStructVectorSetBoxValues(b, part, bc_ilower, + bc_iupper, var, bvalues); + } + + /* Processors at y = 1 */ + if (pj == N-1) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + n-1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + /* Modify the matrix */ + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,0); + + HYPRE_SStructVectorSetBoxValues(b, part, bc_ilower, bc_iupper, var, bvalues); + } + + /* Processors at x = 0 */ + if (pi == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + /* Modify the matrix */ + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,0,j); + + HYPRE_SStructVectorSetBoxValues(b, part, bc_ilower, bc_iupper, + var, bvalues); + } + + /* Processors at x = 1 */ + if (pi == N-1) + { + bc_ilower[0] = pi*n + n-1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + /* Modify the matrix */ + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + /* Put the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,0,j); + + HYPRE_SStructVectorSetBoxValues(b, part, bc_ilower, bc_iupper, + var, bvalues); + } + + /* Recall that the system we are solving is: + [A_ii 0; 0 I] [x_i ; x_b] = [b_i - A_ib u_0; u_0]. + This requires removing the connections between the interior + and boundary nodes that we have set up when we set the + 5pt stencil at each node. We adjust for removing + these connections by appropriately modifying the rhs. + For the symm ordering scheme, just do the top and right + boundary */ + + /* Processors at y = 0, neighbors of boundary nodes */ + if (pj == 0) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + 1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 3; + + /* Modify the matrix */ + for (i = 0; i < n; i++) + bvalues[i] = 0.0; + + if (sym == 0) + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,-1) * (bcEval(K,i,-0.5)+bcEval(B2,i,-0.5)); + + if (pi == 0) + bvalues[0] = 0.0; + + if (pi == N-1) + bvalues[n-1] = 0.0; + + /* Note the use of AddToBoxValues (because we have already set values + at these nodes) */ + HYPRE_SStructVectorAddToBoxValues(b, part, bc_ilower, bc_iupper, + var, bvalues); + } + + /* Processors at x = 0, neighbors of boundary nodes */ + if (pi == 0) + { + bc_ilower[0] = pi*n + 1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 1; + + /* Modify the matrix */ + for (j = 0; j < n; j++) + bvalues[j] = 0.0; + + if (sym == 0) + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,-1,j) * (bcEval(K,-0.5,j)+bcEval(B1,-0.5,j)); + + if (pj == 0) + bvalues[0] = 0.0; + + if (pj == N-1) + bvalues[n-1] = 0.0; + + HYPRE_SStructVectorAddToBoxValues(b, part, bc_ilower, bc_iupper, var, bvalues); + } + + /* Processors at y = 1, neighbors of boundary nodes */ + if (pj == N-1) + { + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + (n-1) -1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + if (sym == 0) + stencil_indices[0] = 4; + else + stencil_indices[0] = 2; + + /* Modify the matrix */ + for (i = 0; i < n; i++) + bvalues[i] = 0.0; + + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, var, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (i = 0; i < n; i++) + bvalues[i] = bcEval(U0,i,1) * (bcEval(K,i,0.5)+bcEval(B2,i,0.5)); + + if (pi == 0) + bvalues[0] = 0.0; + + if (pi == N-1) + bvalues[n-1] = 0.0; + + HYPRE_SStructVectorAddToBoxValues(b, part, bc_ilower, bc_iupper, + var, bvalues); + } + + /* Processors at x = 1, neighbors of boundary nodes */ + if (pi == N-1) + { + bc_ilower[0] = pi*n + (n-1) - 1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + if (sym == 0) + stencil_indices[0] = 2; + else + stencil_indices[0] = 1; + + /* Modify the matrix */ + for (j = 0; j < n; j++) + bvalues[j] = 0.0; + + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, 1, + stencil_indices, bvalues); + + /* Eliminate the boundary conditions in b */ + for (j = 0; j < n; j++) + bvalues[j] = bcEval(U0,1,j) * (bcEval(K,0.5,j)+bcEval(B1,0.5,j)); + + if (pj == 0) + bvalues[0] = 0.0; + + if (pj == N-1) + bvalues[n-1] = 0.0; + + HYPRE_SStructVectorAddToBoxValues(b, part, bc_ilower, bc_iupper, var, bvalues); + } + + free(values); + free(bvalues); + } + + /* Finalize the vector and matrix assembly */ + HYPRE_SStructMatrixAssemble(A); + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + + /* 6. Set up and use a solver */ + { + HYPRE_StructMatrix sA; + HYPRE_StructVector sb; + HYPRE_StructVector sx; + + /* Because we are using a struct solver, we need to get the + object of the matrix and vectors to pass in to the struct solvers */ + + HYPRE_SStructMatrixGetObject(A, (void **) &sA); + HYPRE_SStructVectorGetObject(b, (void **) &sb); + HYPRE_SStructVectorGetObject(x, (void **) &sx); + + if (solver_id == 0) /* SMG */ + { + /* Start timing */ + time_index = hypre_InitializeTiming("SMG Setup"); + hypre_BeginTiming(time_index); + + /* Options and setup */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructSMGSetMemoryUse(solver, 0); + HYPRE_StructSMGSetMaxIter(solver, 50); + HYPRE_StructSMGSetTol(solver, 1.0e-06); + HYPRE_StructSMGSetRelChange(solver, 0); + HYPRE_StructSMGSetNumPreRelax(solver, n_pre); + HYPRE_StructSMGSetNumPostRelax(solver, n_post); + HYPRE_StructSMGSetPrintLevel(solver, 1); + HYPRE_StructSMGSetLogging(solver, 1); + HYPRE_StructSMGSetup(solver, sA, sb, sx); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("SMG Solve"); + hypre_BeginTiming(time_index); + + /* Solve */ + HYPRE_StructSMGSolve(solver, sA, sb, sx); + hypre_EndTiming(time_index); + /* Finalize current timing */ + + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructSMGGetNumIterations(solver, &num_iterations); + HYPRE_StructSMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructSMGDestroy(solver); + } + + if (solver_id == 1) /* PFMG */ + { + /* Start timing */ + time_index = hypre_InitializeTiming("PFMG Setup"); + hypre_BeginTiming(time_index); + + /* Options and setup */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPFMGSetMaxIter(solver, 50); + HYPRE_StructPFMGSetTol(solver, 1.0e-06); + HYPRE_StructPFMGSetRelChange(solver, 0); + HYPRE_StructPFMGSetRAPType(solver, rap); + HYPRE_StructPFMGSetRelaxType(solver, relax); + HYPRE_StructPFMGSetNumPreRelax(solver, n_pre); + HYPRE_StructPFMGSetNumPostRelax(solver, n_post); + HYPRE_StructPFMGSetSkipRelax(solver, skip); + HYPRE_StructPFMGSetPrintLevel(solver, 1); + HYPRE_StructPFMGSetLogging(solver, 1); + HYPRE_StructPFMGSetup(solver, sA, sb, sx); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Start timing again */ + time_index = hypre_InitializeTiming("PFMG Solve"); + hypre_BeginTiming(time_index); + + /* Solve */ + HYPRE_StructPFMGSolve(solver, sA, sb, sx); + + /* Finalize current timing */ + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructPFMGGetNumIterations(solver, &num_iterations); + HYPRE_StructPFMGGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructPFMGDestroy(solver); + } + + /* Preconditioned CG */ + if ((solver_id > 9) && (solver_id < 20)) + { + time_index = hypre_InitializeTiming("PCG Setup"); + hypre_BeginTiming(time_index); + + HYPRE_StructPCGCreate(MPI_COMM_WORLD, &solver); + HYPRE_StructPCGSetMaxIter(solver, 200 ); + HYPRE_StructPCGSetTol(solver, 1.0e-06 ); + HYPRE_StructPCGSetTwoNorm(solver, 1 ); + HYPRE_StructPCGSetRelChange(solver, 0 ); + HYPRE_StructPCGSetPrintLevel(solver, 2 ); + + if (solver_id == 10) + { + /* use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMemoryUse(precond, 0); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, n_pre); + HYPRE_StructSMGSetNumPostRelax(precond, n_post); + HYPRE_StructSMGSetPrintLevel(precond, 0); + HYPRE_StructSMGSetLogging(precond, 0); + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, + precond); + } + + else if (solver_id == 11) + { + /* use symmetric PFMG as preconditioner */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructPFMGSetMaxIter(precond, 1); + HYPRE_StructPFMGSetTol(precond, 0.0); + HYPRE_StructPFMGSetZeroGuess(precond); + HYPRE_StructPFMGSetRAPType(precond, rap); + HYPRE_StructPFMGSetRelaxType(precond, relax); + HYPRE_StructPFMGSetNumPreRelax(precond, n_pre); + HYPRE_StructPFMGSetNumPostRelax(precond, n_post); + HYPRE_StructPFMGSetSkipRelax(precond, skip); + HYPRE_StructPFMGSetPrintLevel(precond, 0); + HYPRE_StructPFMGSetLogging(precond, 0); + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructPFMGSolve, + HYPRE_StructPFMGSetup, + precond); + } + + else if (solver_id == 17) + { + /* use two-step Jacobi as preconditioner */ + HYPRE_StructJacobiCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructJacobiSetMaxIter(precond, 2); + HYPRE_StructJacobiSetTol(precond, 0.0); + HYPRE_StructJacobiSetZeroGuess(precond); + HYPRE_StructPCGSetPrecond( solver, + HYPRE_StructJacobiSolve, + HYPRE_StructJacobiSetup, + precond); + } + + else if (solver_id == 18) + { + /* use diagonal scaling as preconditioner */ + precond = NULL; + HYPRE_StructPCGSetPrecond(solver, + HYPRE_StructDiagScale, + HYPRE_StructDiagScaleSetup, + precond); + } + + /* PCG Setup */ + HYPRE_StructPCGSetup(solver, sA, sb, sx ); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + time_index = hypre_InitializeTiming("PCG Solve"); + hypre_BeginTiming(time_index); + + /* PCG Solve */ + HYPRE_StructPCGSolve(solver, sA, sb, sx); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructPCGGetNumIterations( solver, &num_iterations ); + HYPRE_StructPCGGetFinalRelativeResidualNorm( solver, &final_res_norm ); + HYPRE_StructPCGDestroy(solver); + + if (solver_id == 10) + { + HYPRE_StructSMGDestroy(precond); + } + else if (solver_id == 11 ) + { + HYPRE_StructPFMGDestroy(precond); + } + else if (solver_id == 17) + { + HYPRE_StructJacobiDestroy(precond); + } + } + + /* Preconditioned GMRES */ + if ((solver_id > 29) && (solver_id < 40)) + { + time_index = hypre_InitializeTiming("GMRES Setup"); + hypre_BeginTiming(time_index); + + HYPRE_StructGMRESCreate(MPI_COMM_WORLD, &solver); + + /* Note that GMRES can be used with all the interfaces - not + just the struct. So here we demonstrate the + more generic GMRES interface functions. Since we have chosen + a struct solver then we must type cast to the more generic + HYPRE_Solver when setting options with these generic functions. + Note that one could declare the solver to be + type HYPRE_Solver, and then the casting would not be necessary.*/ + + HYPRE_GMRESSetMaxIter((HYPRE_Solver) solver, 500 ); + HYPRE_GMRESSetKDim((HYPRE_Solver) solver,30); + HYPRE_GMRESSetTol((HYPRE_Solver) solver, 1.0e-06 ); + HYPRE_GMRESSetPrintLevel((HYPRE_Solver) solver, 2 ); + HYPRE_GMRESSetLogging((HYPRE_Solver) solver, 1 ); + + if (solver_id == 30) + { + /* use symmetric SMG as preconditioner */ + HYPRE_StructSMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructSMGSetMemoryUse(precond, 0); + HYPRE_StructSMGSetMaxIter(precond, 1); + HYPRE_StructSMGSetTol(precond, 0.0); + HYPRE_StructSMGSetZeroGuess(precond); + HYPRE_StructSMGSetNumPreRelax(precond, n_pre); + HYPRE_StructSMGSetNumPostRelax(precond, n_post); + HYPRE_StructSMGSetPrintLevel(precond, 0); + HYPRE_StructSMGSetLogging(precond, 0); + HYPRE_StructGMRESSetPrecond(solver, + HYPRE_StructSMGSolve, + HYPRE_StructSMGSetup, + precond); + } + + else if (solver_id == 31) + { + /* use symmetric PFMG as preconditioner */ + HYPRE_StructPFMGCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructPFMGSetMaxIter(precond, 1); + HYPRE_StructPFMGSetTol(precond, 0.0); + HYPRE_StructPFMGSetZeroGuess(precond); + HYPRE_StructPFMGSetRAPType(precond, rap); + HYPRE_StructPFMGSetRelaxType(precond, relax); + HYPRE_StructPFMGSetNumPreRelax(precond, n_pre); + HYPRE_StructPFMGSetNumPostRelax(precond, n_post); + HYPRE_StructPFMGSetSkipRelax(precond, skip); + HYPRE_StructPFMGSetPrintLevel(precond, 0); + HYPRE_StructPFMGSetLogging(precond, 0); + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructPFMGSolve, + HYPRE_StructPFMGSetup, + precond); + } + + else if (solver_id == 37) + { + /* use two-step Jacobi as preconditioner */ + HYPRE_StructJacobiCreate(MPI_COMM_WORLD, &precond); + HYPRE_StructJacobiSetMaxIter(precond, 2); + HYPRE_StructJacobiSetTol(precond, 0.0); + HYPRE_StructJacobiSetZeroGuess(precond); + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructJacobiSolve, + HYPRE_StructJacobiSetup, + precond); + } + + else if (solver_id == 38) + { + /* use diagonal scaling as preconditioner */ + precond = NULL; + HYPRE_StructGMRESSetPrecond( solver, + HYPRE_StructDiagScale, + HYPRE_StructDiagScaleSetup, + precond); + } + + /* GMRES Setup */ + HYPRE_StructGMRESSetup(solver, sA, sb, sx ); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Setup phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + time_index = hypre_InitializeTiming("GMRES Solve"); + hypre_BeginTiming(time_index); + + /* GMRES Solve */ + HYPRE_StructGMRESSolve(solver, sA, sb, sx); + + hypre_EndTiming(time_index); + hypre_PrintTiming("Solve phase times", MPI_COMM_WORLD); + hypre_FinalizeTiming(time_index); + hypre_ClearTiming(); + + /* Get info and release memory */ + HYPRE_StructGMRESGetNumIterations(solver, &num_iterations); + HYPRE_StructGMRESGetFinalRelativeResidualNorm(solver, &final_res_norm); + HYPRE_StructGMRESDestroy(solver); + + if (solver_id == 30) + { + HYPRE_StructSMGDestroy(precond); + } + else if (solver_id == 31) + { + HYPRE_StructPFMGDestroy(precond); + } + else if (solver_id == 37) + { + HYPRE_StructJacobiDestroy(precond); + } + } + + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex7.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int part = 0, var = 0; + int nvalues = n*n; + double *values = (double*) calloc(nvalues, sizeof(double)); + + /* get all local data (including a local copy of the shared values) */ + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, values); + + sprintf(filename, "%s.%06d", "vis/ex7.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution with global unknown numbers */ + k = 0; + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "%06d %.14e\n", pj*N*n*n+pi*n+j*N*n+i, values[k++]); + + fflush(file); + fclose(file); + free(values); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex7.mesh", N*n-1); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", num_iterations); + printf("Final Relative Residual Norm = %e\n", final_res_norm); + printf("\n"); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex8.c b/3rd_party/hypre/src/examples/ex8.c new file mode 100644 index 000000000..ae2861e93 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex8.c @@ -0,0 +1,767 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 8 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex8 + + Sample run: mpirun -np 2 ex8 + + Description: This is a two processor example which solves a similar + problem to the one in Example 2, and Example 6 (The grid + boxes are exactly those in the example diagram in the + struct interface chapter of the User's Manual.) + + The difference with the previous examples is that we use + three parts, two with a 5-point and one with a 9-point + discretization stencil. The solver is PCG with split-SMG + preconditioner. +*/ + +#include + +/* SStruct linear solvers headers */ +#include "HYPRE_sstruct_ls.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int myid, num_procs; + + int vis = 0; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructStencil stencil_5pt; + HYPRE_SStructStencil stencil_9pt; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + HYPRE_SStructSolver solver; + HYPRE_SStructSolver precond; + + int object_type; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + if (num_procs != 2) + { + if (myid ==0) printf("Must run with 2 processors!\n"); + MPI_Finalize(); + + return(0); + } + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* 1. Set up the 2D grid. This gives the index space in each part. + We have one variable in each part. */ + { + int ndim = 2; + int nparts = 3; + int part; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Set the extents of the grid - each processor sets its grid + boxes. Each part has its own relative index space numbering. */ + + /* Processor 0 owns two boxes - one in part 0 and one in part 1. */ + if (myid == 0) + { + /* Add the first box to the grid in part 0 */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + part = 0; + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + + /* Add the second box to the grid in part 1 */ + { + /* For convenience we use the same index space across all + parts, but this is not a requirement. For example, on this + part we could have used ilower=[23,24] and iupper=[25,27]. */ + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + part = 1; + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + } + + /* Processor 1 owns one box in part 2. */ + else if (myid == 1) + { + /* Add a new box to the grid in part 2 */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + part = 2; + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + } + } + + /* Set the variable type and number of variables on each part. */ + { + int i; + int nvars = 1; + HYPRE_SStructVariable vartypes[1] = {HYPRE_SSTRUCT_VARIABLE_CELL}; + + for (i = 0; i< nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* Now we need to set the spatial relation between each of the parts. + Since we have the same types of variables on both parts, we can + use HYPRE_GridSetNeighborPart(). Each processor calls this function + for each part on which it owns boxes that border a different part. */ + + if (myid == 0) + { + /* Relation between part 0 and part 1 on processor 0 */ + { + int part = 0; + int nbor_part = 1; + /* Cells just outside of the boundary of part 0 in + its coordinates */ + int b_ilower[2] = {0,1}, b_iupper[2] = {0,2}; + /* The same cells in part 1's coordinates. Since we use the same + index space across all parts, the coordinates coincide. */ + int nbor_ilower[2] = {0,1}, nbor_iupper[2] = {0,2}; + /* These parts have the same orientation, so no + rotation is necessary */ + int index_map[2] = {0,1}; + /* These parts map increasing values to increasing values + for both variables (note: if decreasing maps to increasing, use -1)*/ + int index_dir[2] = {1,1}; + + HYPRE_SStructGridSetNeighborPart(grid, part, b_ilower, b_iupper, + nbor_part, nbor_ilower, nbor_iupper, + index_map, index_dir); + } + + /* Relation between part 1 and part 0 on processor 0 */ + { + int part = 1; + int nbor_part = 0; + /* Cells just outside of the boundary of part 1 in + its coordinates */ + int b_ilower[2] = {-1,1}, b_iupper[2] = {-1,2}; + /* The same cells in part 0's coordinates. Since we use the same + index space across all parts, the coordinates coincide. */ + int nbor_ilower[2] = {-1,1}, nbor_iupper[2] = {-1,2}; + /* These parts have the same orientation, so no + rotation is necessary */ + int index_map[2] = {0,1}; + /* These parts map increasing values to increasing values + for both variables (note: if decreasing maps to increasing, use -1)*/ + int index_dir[2] = {1,1}; + + HYPRE_SStructGridSetNeighborPart(grid, part, b_ilower, b_iupper, + nbor_part, nbor_ilower, nbor_iupper, + index_map, index_dir); + } + + /* Relation between part 1 and part 2 on processor 0 */ + { + int part = 1; + int nbor_part = 2; + /* Cells just outside of the boundary of part 1 in + its coordinates */ + int b_ilower[2] = {3,1}, b_iupper[2] = {3,4}; + /* The same cells in part 2's coordinates. Since we use the same + index space across all parts, the coordinates coincide. */ + int nbor_ilower[2] = {3,1}, nbor_iupper[2] = {3,4}; + /* These parts have the same orientation, so no + rotation is necessary */ + int index_map[2] = {0,1}; + /* These parts map increasing values to increasing values + for both variables (note: if decreasing maps to increasing, use -1)*/ + int index_dir[2] = {1,1}; + + HYPRE_SStructGridSetNeighborPart(grid, part, b_ilower, b_iupper, + nbor_part, nbor_ilower, nbor_iupper, + index_map, index_dir); + } + } + else if (myid == 1) + { + /* Relation between part 2 and part 1 on processor 1 */ + { + int part = 2; + int nbor_part = 1; + /* Cells just outside of the boundary of part 2 in + its coordinates */ + int b_ilower[2] = {2,1}, b_iupper[2] = {2,4}; + /* The same cells in part 1's coordinates. Since we use the same + index space across all parts, the coordinates coincide. */ + int nbor_ilower[2] = {2,1}, nbor_iupper[2] = {2,4}; + /* These parts have the same orientation, so no + rotation is necessary */ + int index_map[2] = {0,1}; + /* These parts map increasing values to increasing values + for both variables (note: if decreasing maps to increasing, use -1)*/ + int index_dir[2] = {1,1}; + + HYPRE_SStructGridSetNeighborPart(grid, part, b_ilower, b_iupper, + nbor_part, nbor_ilower, nbor_iupper, + index_map, index_dir); + } + } + + /* Now the grid is ready to use */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencils */ + { + int ndim = 2; + int var = 0; + int entry; + + /* the 5-pt stencil in 2D */ + { + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + int stencil_size = 5; + + HYPRE_SStructStencilCreate(ndim, stencil_size, &stencil_5pt); + + for (entry = 0; entry < 5; entry++) + HYPRE_SStructStencilSetEntry(stencil_5pt, entry, offsets[entry], var); + } + + /* the 9-pt stencil in 2D */ + { + int offsets[9][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}, + {-1,-1}, {1,-1}, {1,1}, {-1,1}}; + int stencil_size = 9; + HYPRE_SStructStencilCreate(ndim, stencil_size, &stencil_9pt); + + for (entry = 0; entry < stencil_size; entry++) + HYPRE_SStructStencilSetEntry(stencil_9pt, entry, offsets[entry], var); + } + } + + /* 3. Set up the Graph - this determines the non-zero structure + of the matrix and allows non-stencil relationships between the parts */ + { + int var = 0; + int part; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + object_type = HYPRE_SSTRUCT; + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Use the 5-pt stencil on part 0 */ + part = 0; + HYPRE_SStructGraphSetStencil(graph, part, var, stencil_5pt); + + /* Use the 9-pt stencil on part 1 */ + part = 1; + HYPRE_SStructGraphSetStencil(graph, part, var, stencil_9pt); + + /* Use the 5-pt stencil on part 2 */ + part = 2; + HYPRE_SStructGraphSetStencil(graph, part, var, stencil_5pt); + + /* Since we have only stencil connections between parts, we don't need to + call HYPRE_SStructGraphAddEntries. */ + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up a SStruct Matrix */ + { + int i,j; + int part; + int var = 0; + + /* Create the empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set the object type (by default HYPRE_SSTRUCT). This determines the + data structure used to store the matrix. If you want to use unstructured + solvers, e.g. BoomerAMG, the object type should be HYPRE_PARCSR. + If the problem is purely structured (with one part), you may want to use + HYPRE_STRUCT to access the structured solvers. Since we have two parts + with different stencils, we set the object type to HYPRE_SSTRUCT. */ + object_type = HYPRE_SSTRUCT; + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Get ready to set values */ + HYPRE_SStructMatrixInitialize(A); + + /* Each processor must set the stencil values for their boxes on each part. + In this example, we only set stencil entries and therefore use + HYPRE_SStructMatrixSetBoxValues. If we need to set non-stencil entries, + we have to use HYPRE_SStructMatrixSetValues. */ + + if (myid == 0) + { + /* Set the matrix coefficients for some set of stencil entries + over all the gridpoints in my first box (account for boundary + grid points later) */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nentries = 5; + int nvalues = 30; /* 6 grid points, each with 5 stencil entries */ + double values[30]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) /* label the stencil indices - + these correspond to the offsets + defined above */ + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + part = 0; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my second box */ + { + int ilower[2] = {0, 1}; + int iupper[2] = {2, 4}; + + int nentries = 9; + int nvalues = 108; /* 12 grid points, each with 5 stencil entries */ + double values[108]; + + int stencil_indices[9]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 8./3.; + for (j = 1; j < nentries; j++) + values[i+j] = -1./3.; + } + + part = 1; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + } + else if (myid == 1) + { + /* Set the matrix coefficients for some set of stencil entries + over the gridpoints in my box */ + { + int ilower[2] = {3, 1}; + int iupper[2] = {6, 4}; + + int nentries = 5; + int nvalues = 80; /* 16 grid points, each with 5 stencil entries */ + double values[80]; + + int stencil_indices[5]; + for (j = 0; j < nentries; j++) + stencil_indices[j] = j; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 4.0; + for (j = 1; j < nentries; j++) + values[i+j] = -1.0; + } + + part = 2; + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + } + + /* Modify the 9-pt stencil on the boundary between parts to ensure + symmetry and good global approximation. */ + if (myid == 0) + { + int nentries = 6; + int nvalues = 24; /* 4 grid points, each with 6 stencil entries */ + double values[24]; + + part = 1; + + for (i = 0; i < nvalues; i += nentries) + { + values[i] = 10./3.; + values[i+1] = -1.; + values[i+2] = -2./3.; + values[i+3] = -2./3.; + values[i+4] = 0.0; + values[i+5] = 0.0; + } + + { + /* Values to the right of the second box */ + int ilower[2] = { 2, 1}; + int iupper[2] = { 2, 4}; + + int stencil_indices[6] = {0,2,3,4,6,7}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + + { + /* Values to the left of the second box */ + int ilower[2] = { 0, 1}; + int iupper[2] = { 0, 4}; + + int stencil_indices[6] = {0,1,3,4,5,8}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + stencil_indices, values); + } + } + + /* For each box, set any coefficients that reach ouside of the + boundary to 0 */ + if (myid == 0) + { + int maxnvalues = 9; + double values[9]; + + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + part = 0; + + { + /* Values below our first box */ + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values to the left of our first box */ + int ilower[2] = {-3, 1}; + int iupper[2] = {-3, 2}; + + int stencil_indices[1] = {1}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values above our first box */ + int ilower[2] = {-3, 2}; + int iupper[2] = {-1, 2}; + + int stencil_indices[1] = {4}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + part = 1; + + { + /* Values below our second box */ + int ilower[2] = { 0, 1}; + int iupper[2] = { 2, 1}; + + int stencil_indices[3] = {3,5,6}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 3, + stencil_indices, values); + } + + { + /* Values to the left of our second box (that do not border the + first box). */ + int ilower[2] = { 0, 3}; + int iupper[2] = { 0, 4}; + + int stencil_indices[3] = {1,5,8}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 3, + stencil_indices, values); + } + + { + /* Values above our second box */ + int ilower[2] = { 0, 4}; + int iupper[2] = { 2, 4}; + + int stencil_indices[3] = {4,7,8}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 3, + stencil_indices, values); + } + } + else if (myid == 1) + { + int maxnvalues = 4; + double values[4]; + + for (i = 0; i < maxnvalues; i++) + values[i] = 0.0; + + part = 2; + + { + /* Values below our box */ + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 1}; + + int stencil_indices[1] = {3}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values to the right of our box */ + int ilower[2] = { 6, 1}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {2}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + + { + /* Values above our box */ + int ilower[2] = { 3, 4}; + int iupper[2] = { 6, 4}; + + int stencil_indices[1] = {4}; + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, 1, + stencil_indices, values); + } + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_SStructMatrixAssemble(A); + } + + /* 5. Set up SStruct Vectors for b and x */ + { + int i; + int part; + int var = 0; + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* As with the matrix, set the object type for the vectors + to be the sstruct type */ + object_type = HYPRE_SSTRUCT; + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + if (myid == 0) + { + /* Set the vector coefficients over the gridpoints in my first box */ + { + int ilower[2] = {-3, 1}; + int iupper[2] = {-1, 2}; + + int nvalues = 6; /* 6 grid points */ + double values[6]; + + part = 0; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + + /* Set the vector coefficients over the gridpoints in my second box */ + { + int ilower[2] = { 0, 1}; + int iupper[2] = { 2, 4}; + + int nvalues = 12; /* 12 grid points */ + double values[12]; + + part = 1; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + } + else if (myid == 1) + { + /* Set the vector coefficients over the gridpoints in my box */ + { + int ilower[2] = { 3, 1}; + int iupper[2] = { 6, 4}; + + int nvalues = 16; /* 16 grid points */ + double values[16]; + + part = 2; + + for (i = 0; i < nvalues; i ++) + values[i] = 1.0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + } + } + + /* This is a collective call finalizing the vector assembly. + The vectors are now ``ready to be used'' */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + + /* 6. Set up and use a solver (See the Reference Manual for descriptions + of all of the options.) */ + { + /* Create an empty PCG Struct solver */ + HYPRE_SStructPCGCreate(MPI_COMM_WORLD, &solver); + + /* Set PCG parameters */ + HYPRE_SStructPCGSetTol(solver, 1.0e-6 ); + HYPRE_SStructPCGSetPrintLevel(solver, 2); + HYPRE_SStructPCGSetMaxIter(solver, 50); + + /* Create a split SStruct solver for use as a preconditioner */ + HYPRE_SStructSplitCreate(MPI_COMM_WORLD, &precond); + HYPRE_SStructSplitSetMaxIter(precond, 1); + HYPRE_SStructSplitSetTol(precond, 0.0); + HYPRE_SStructSplitSetZeroGuess(precond); + + /* Set the preconditioner type to split-SMG */ + HYPRE_SStructSplitSetStructSolver(precond, HYPRE_SMG); + + /* Set preconditioner and solve */ + HYPRE_SStructPCGSetPrecond(solver, HYPRE_SStructSplitSolve, + HYPRE_SStructSplitSetup, precond); + HYPRE_SStructPCGSetup(solver, A, b, x); + HYPRE_SStructPCGSolve(solver, A, b, x); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex8.sh */ + if (vis) + { + GLVis_PrintSStructGrid(grid, "vis/ex8.mesh", myid, NULL, NULL); + GLVis_PrintSStructVector(x, 0, "vis/ex8.sol", myid); + GLVis_PrintData("vis/ex8.data", myid, num_procs); + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil_5pt); + HYPRE_SStructStencilDestroy(stencil_9pt); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + HYPRE_SStructPCGDestroy(solver); + HYPRE_SStructSplitDestroy(precond); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/ex9.c b/3rd_party/hypre/src/examples/ex9.c new file mode 100644 index 000000000..4983f8699 --- /dev/null +++ b/3rd_party/hypre/src/examples/ex9.c @@ -0,0 +1,786 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/* + Example 9 + + Interface: Semi-Structured interface (SStruct) + + Compile with: make ex9 + + Sample run: mpirun -np 16 ex9 -n 33 -solver 0 -v 1 1 + + To see options: ex9 -help + + Description: This code solves a system corresponding to a discretization + of the biharmonic problem treated as a system of equations + on the unit square. Specifically, instead of solving + Delta^2(u) = f with zero boundary conditions for u and + Delta(u), we solve the system A x = b, where + + A = [ Delta -I ; 0 Delta], x = [ u ; v] and b = [ 0 ; f] + + The corresponding boundary conditions are u = 0 and v = 0. + + The domain is split into an N x N processor grid. Thus, the + given number of processors should be a perfect square. + Each processor's piece of the grid has n x n cells with n x n + nodes. We use cell-centered variables, and, therefore, the + nodes are not shared. Note that we have two variables, u and + v, and need only one part to describe the domain. We use the + standard 5-point stencil to discretize the Laplace operators. + The boundary conditions are incorporated as in Example 3. + + We recommend viewing Examples 3, 6 and 7 before this example. +*/ + +#include +#include "_hypre_utilities.h" +#include "HYPRE_sstruct_ls.h" +#include "HYPRE_krylov.h" + +#include "vis.c" + +int main (int argc, char *argv[]) +{ + int i, j; + + int myid, num_procs; + + int n, N, pi, pj; + double h, h2; + int ilower[2], iupper[2]; + + int solver_id; + int n_pre, n_post; + + int vis; + int object_type; + + HYPRE_SStructGrid grid; + HYPRE_SStructGraph graph; + HYPRE_SStructStencil stencil_v; + HYPRE_SStructStencil stencil_u; + HYPRE_SStructMatrix A; + HYPRE_SStructVector b; + HYPRE_SStructVector x; + + /* sstruct solvers */ + HYPRE_SStructSolver solver; + HYPRE_SStructSolver precond; + + /* parcsr solvers */ + HYPRE_Solver par_solver; + HYPRE_Solver par_precond; + + /* Initialize MPI */ + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &myid); + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + /* Set defaults */ + n = 33; + solver_id = 0; + n_pre = 1; + n_post = 1; + vis = 0; + + /* Parse command line */ + { + int arg_index = 0; + int print_usage = 0; + + while (arg_index < argc) + { + if ( strcmp(argv[arg_index], "-n") == 0 ) + { + arg_index++; + n = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-solver") == 0 ) + { + arg_index++; + solver_id = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-v") == 0 ) + { + arg_index++; + n_pre = atoi(argv[arg_index++]); + n_post = atoi(argv[arg_index++]); + } + else if ( strcmp(argv[arg_index], "-vis") == 0 ) + { + arg_index++; + vis = 1; + } + else if ( strcmp(argv[arg_index], "-help") == 0 ) + { + print_usage = 1; + break; + } + else + { + arg_index++; + } + } + + if ((print_usage) && (myid == 0)) + { + printf("\n"); + printf("Usage: %s []\n", argv[0]); + printf("\n"); + printf(" -n : problem size per processor (default: 33)\n"); + printf(" -solver : solver ID\n"); + printf(" 0 - GMRES with sysPFMG precond (default)\n"); + printf(" 1 - sysPFMG\n"); + printf(" 2 - GMRES with AMG precond\n"); + printf(" 3 - AMG\n"); + printf(" -v : number of pre and post relaxations for SysPFMG (default: 1 1)\n"); + printf(" -vis : save the solution for GLVis visualization\n"); + printf("\n"); + } + + if (print_usage) + { + MPI_Finalize(); + return (0); + } + } + + /* Figure out the processor grid (N x N). The local problem + size for the interior nodes is indicated by n (n x n). + pi and pj indicate position in the processor grid. */ + N = sqrt(num_procs); + h = 1.0 / (N*n+1); /* note that when calculating h we must + remember to count the boundary nodes */ + h2 = h*h; + pj = myid / N; + pi = myid - pj*N; + + /* Figure out the extents of each processor's piece of the grid. */ + ilower[0] = pi*n; + ilower[1] = pj*n; + + iupper[0] = ilower[0] + n-1; + iupper[1] = ilower[1] + n-1; + + /* 1. Set up a grid - we have one part and two variables */ + { + int nparts = 1; + int part = 0; + int ndim = 2; + + /* Create an empty 2D grid object */ + HYPRE_SStructGridCreate(MPI_COMM_WORLD, ndim, nparts, &grid); + + /* Add a new box to the grid */ + HYPRE_SStructGridSetExtents(grid, part, ilower, iupper); + + /* Set the variable type and number of variables on each part.*/ + { + int i; + int nvars = 2; + HYPRE_SStructVariable vartypes[2] = {HYPRE_SSTRUCT_VARIABLE_CELL, + HYPRE_SSTRUCT_VARIABLE_CELL }; + + for (i = 0; i< nparts; i++) + HYPRE_SStructGridSetVariables(grid, i, nvars, vartypes); + } + + /* This is a collective call finalizing the grid assembly. + The grid is now ``ready to be used'' */ + HYPRE_SStructGridAssemble(grid); + } + + /* 2. Define the discretization stencils */ + { + int entry; + int stencil_size; + int var; + int ndim = 2; + + /* Stencil object for variable u (labeled as variable 0) */ + { + int offsets[6][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}, {0,0}}; + stencil_size = 6; + + HYPRE_SStructStencilCreate(ndim, stencil_size, &stencil_u); + + /* The first 5 entries are for the u-u connections */ + var = 0; /* connect to variable 0 */ + for (entry = 0; entry < stencil_size-1 ; entry++) + HYPRE_SStructStencilSetEntry(stencil_u, entry, offsets[entry], var); + + /* The last entry is for the u-v connection */ + var = 1; /* connect to variable 1 */ + entry = 5; + HYPRE_SStructStencilSetEntry(stencil_u, entry, offsets[entry], var); + } + + /* Stencil object for variable v (variable 1) */ + { + int offsets[5][2] = {{0,0}, {-1,0}, {1,0}, {0,-1}, {0,1}}; + stencil_size = 5; + + HYPRE_SStructStencilCreate(ndim, stencil_size, &stencil_v); + + /* These are all v-v connections */ + var = 1; /* Connect to variable 1 */ + for (entry = 0; entry < stencil_size; entry++) + HYPRE_SStructStencilSetEntry(stencil_v, entry, offsets[entry], var); + } + } + + /* 3. Set up the Graph - this determines the non-zero structure + of the matrix and allows non-stencil relationships between the parts. */ + { + int var; + int part = 0; + + /* Create the graph object */ + HYPRE_SStructGraphCreate(MPI_COMM_WORLD, grid, &graph); + + /* See MatrixSetObjectType below */ + if (solver_id > 1 && solver_id < 4) + { + object_type = HYPRE_PARCSR; + } + else + { + object_type = HYPRE_SSTRUCT; + } + HYPRE_SStructGraphSetObjectType(graph, object_type); + + /* Assign the u-stencil we created to variable u (variable 0) */ + var = 0; + HYPRE_SStructGraphSetStencil(graph, part, var, stencil_u); + + /* Assign the v-stencil we created to variable v (variable 1) */ + var = 1; + HYPRE_SStructGraphSetStencil(graph, part, var, stencil_v); + + /* Assemble the graph */ + HYPRE_SStructGraphAssemble(graph); + } + + /* 4. Set up the SStruct Matrix */ + { + int nentries; + int nvalues; + int var; + int part = 0; + + /* Create an empty matrix object */ + HYPRE_SStructMatrixCreate(MPI_COMM_WORLD, graph, &A); + + /* Set the object type (by default HYPRE_SSTRUCT). This determines the + data structure used to store the matrix. If you want to use + unstructured solvers, e.g. BoomerAMG, the object type should be + HYPRE_PARCSR. If the problem is purely structured (with one part), you + may want to use HYPRE_STRUCT to access the structured solvers. */ + HYPRE_SStructMatrixSetObjectType(A, object_type); + + /* Indicate that the matrix coefficients are ready to be set */ + HYPRE_SStructMatrixInitialize(A); + + /* Each processor must set the stencil values for their boxes on each part. + In this example, we only set stencil entries and therefore use + HYPRE_SStructMatrixSetBoxValues. If we need to set non-stencil entries, + we have to use HYPRE_SStructMatrixSetValues. */ + + /* First set the u-stencil entries. Note that + HYPRE_SStructMatrixSetBoxValues can only set values corresponding + to stencil entries for the same variable. Therefore, we must set the + entries for each variable within a stencil with separate function calls. + For example, below the u-u connections and u-v connections are handled + in separate calls. */ + { + int i, j; + double *u_values; + int u_v_indices[1] = {5}; + int u_u_indices[5] = {0, 1, 2, 3, 4}; + + var = 0; /* Set values for the u connections */ + + /* First the u-u connections */ + nentries = 5; + nvalues = nentries*n*n; + u_values = (double*) calloc(nvalues, sizeof(double)); + + for (i = 0; i < nvalues; i += nentries) + { + u_values[i] = 4.0; + for (j = 1; j < nentries; j++) + u_values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + u_u_indices, u_values); + free(u_values); + + /* Next the u-v connections */ + nentries = 1; + nvalues = nentries*n*n; + u_values = (double*) calloc(nvalues, sizeof(double)); + + for (i = 0; i < nvalues; i++) + { + u_values[i] = -h2; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + u_v_indices, u_values); + + free(u_values); + } + + /* Now set the v-stencil entries */ + { + int i, j; + double *v_values; + int v_v_indices[5] = {0, 1, 2, 3, 4}; + + var = 1; /* the v connections */ + + /* the v-v connections */ + nentries = 5; + nvalues = nentries*n*n; + v_values = (double*) calloc(nvalues, sizeof(double)); + + for (i = 0; i < nvalues; i += nentries) + { + v_values[i] = 4.0; + for (j = 1; j < nentries; j++) + v_values[i+j] = -1.0; + } + + HYPRE_SStructMatrixSetBoxValues(A, part, ilower, iupper, + var, nentries, + v_v_indices, v_values); + + free(v_values); + + /* There are no v-u connections to set */ + } + } + + /* 5. Incorporate the zero boundary conditions: go along each edge of + the domain and set the stencil entry that reaches to the boundary + to zero.*/ + { + int bc_ilower[2]; + int bc_iupper[2]; + int nentries = 1; + int nvalues = nentries*n; /* number of stencil entries times the length + of one side of my grid box */ + int var; + double *values; + int stencil_indices[1]; + + int part = 0; + + values = (double*) calloc(nvalues, sizeof(double)); + for (j = 0; j < nvalues; j++) + values[j] = 0.0; + + /* Recall: pi and pj describe position in the processor grid */ + if (pj == 0) + { + /* Bottom row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 3; + + /* Need to do this for u and for v */ + var = 0; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + var = 1; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + } + + if (pj == N-1) + { + /* upper row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n + n-1; + + bc_iupper[0] = bc_ilower[0] + n-1; + bc_iupper[1] = bc_ilower[1]; + + stencil_indices[0] = 4; + + /* Need to do this for u and for v */ + var = 0; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + var = 1; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + } + + if (pi == 0) + { + /* Left row of grid points */ + bc_ilower[0] = pi*n; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 1; + + /* Need to do this for u and for v */ + var = 0; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + var = 1; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + } + + if (pi == N-1) + { + /* Right row of grid points */ + bc_ilower[0] = pi*n + n-1; + bc_ilower[1] = pj*n; + + bc_iupper[0] = bc_ilower[0]; + bc_iupper[1] = bc_ilower[1] + n-1; + + stencil_indices[0] = 2; + + /* Need to do this for u and for v */ + var = 0; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + + var = 1; + HYPRE_SStructMatrixSetBoxValues(A, part, bc_ilower, bc_iupper, + var, nentries, + stencil_indices, values); + } + + free(values); + } + + /* This is a collective call finalizing the matrix assembly. + The matrix is now ``ready to be used'' */ + HYPRE_SStructMatrixAssemble(A); + + /* 5. Set up SStruct Vectors for b and x */ + { + int nvalues = n*n; + double *values; + int part = 0; + int var; + + values = (double*) calloc(nvalues, sizeof(double)); + + /* Create an empty vector object */ + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &b); + HYPRE_SStructVectorCreate(MPI_COMM_WORLD, grid, &x); + + /* Set the object type for the vectors + to be the same as was already set for the matrix */ + HYPRE_SStructVectorSetObjectType(b, object_type); + HYPRE_SStructVectorSetObjectType(x, object_type); + + /* Indicate that the vector coefficients are ready to be set */ + HYPRE_SStructVectorInitialize(b); + HYPRE_SStructVectorInitialize(x); + + /* Set the values for b */ + for (i = 0; i < nvalues; i ++) + values[i] = h2; + var = 1; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + for (i = 0; i < nvalues; i ++) + values[i] = 0.0; + var = 0; + HYPRE_SStructVectorSetBoxValues(b, part, ilower, iupper, var, values); + + /* Set the values for the initial guess */ + var = 0; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + + var = 1; + HYPRE_SStructVectorSetBoxValues(x, part, ilower, iupper, var, values); + + free(values); + + /* This is a collective call finalizing the vector assembly. + The vector is now ``ready to be used'' */ + HYPRE_SStructVectorAssemble(b); + HYPRE_SStructVectorAssemble(x); + } + + /* 6. Set up and use a solver + (Solver options can be found in the Reference Manual.) */ + { + double final_res_norm; + int its; + + HYPRE_ParCSRMatrix par_A; + HYPRE_ParVector par_b; + HYPRE_ParVector par_x; + + /* If we are using a parcsr solver, we need to get the object for the + matrix and vectors. */ + if (object_type == HYPRE_PARCSR) + { + HYPRE_SStructMatrixGetObject(A, (void **) &par_A); + HYPRE_SStructVectorGetObject(b, (void **) &par_b); + HYPRE_SStructVectorGetObject(x, (void **) &par_x); + } + + if (solver_id ==0 ) /* GMRES with SysPFMG - the default*/ + { + HYPRE_SStructGMRESCreate(MPI_COMM_WORLD, &solver); + + /* GMRES parameters */ + HYPRE_SStructGMRESSetMaxIter(solver, 50 ); + HYPRE_SStructGMRESSetTol(solver, 1.0e-06 ); + HYPRE_SStructGMRESSetPrintLevel(solver, 2 ); /* print each GMRES + iteration */ + HYPRE_SStructGMRESSetLogging(solver, 1); + + /* use SysPFMG as precondititioner */ + HYPRE_SStructSysPFMGCreate(MPI_COMM_WORLD, &precond); + + /* Set sysPFMG parameters */ + HYPRE_SStructSysPFMGSetTol(precond, 0.0); + HYPRE_SStructSysPFMGSetMaxIter(precond, 1); + HYPRE_SStructSysPFMGSetNumPreRelax(precond, n_pre); + HYPRE_SStructSysPFMGSetNumPostRelax(precond, n_post); + HYPRE_SStructSysPFMGSetPrintLevel(precond, 0); + HYPRE_SStructSysPFMGSetZeroGuess(precond); + + /* Set the preconditioner*/ + HYPRE_SStructGMRESSetPrecond(solver, HYPRE_SStructSysPFMGSolve, + HYPRE_SStructSysPFMGSetup, precond); + /* do the setup */ + HYPRE_SStructGMRESSetup(solver, A, b, x); + + /* do the solve */ + HYPRE_SStructGMRESSolve(solver, A, b, x); + + /* get some info */ + HYPRE_SStructGMRESGetFinalRelativeResidualNorm(solver, + &final_res_norm); + HYPRE_SStructGMRESGetNumIterations(solver, &its); + + /* clean up */ + HYPRE_SStructGMRESDestroy(solver); + } + else if (solver_id == 1) /* SysPFMG */ + { + HYPRE_SStructSysPFMGCreate(MPI_COMM_WORLD, &solver); + + /* Set sysPFMG parameters */ + HYPRE_SStructSysPFMGSetTol(solver, 1.0e-6); + HYPRE_SStructSysPFMGSetMaxIter(solver, 50); + HYPRE_SStructSysPFMGSetNumPreRelax(solver, n_pre); + HYPRE_SStructSysPFMGSetNumPostRelax(solver, n_post); + HYPRE_SStructSysPFMGSetPrintLevel(solver, 0); + HYPRE_SStructSysPFMGSetLogging(solver, 1); + + /* do the setup */ + HYPRE_SStructSysPFMGSetup(solver, A, b, x); + + /* do the solve */ + HYPRE_SStructSysPFMGSolve(solver, A, b, x); + + /* get some info */ + HYPRE_SStructSysPFMGGetFinalRelativeResidualNorm(solver, + &final_res_norm); + HYPRE_SStructSysPFMGGetNumIterations(solver, &its); + + /* clean up */ + HYPRE_SStructSysPFMGDestroy(solver); + } + else if (solver_id == 2) /* GMRES with AMG */ + { + HYPRE_ParCSRGMRESCreate(MPI_COMM_WORLD, &par_solver); + + /* set the GMRES paramaters */ + HYPRE_GMRESSetKDim(par_solver, 5); + HYPRE_GMRESSetMaxIter(par_solver, 100); + HYPRE_GMRESSetTol(par_solver, 1.0e-06); + HYPRE_GMRESSetPrintLevel(par_solver, 2); + HYPRE_GMRESSetLogging(par_solver, 1); + + /* use BoomerAMG as preconditioner */ + HYPRE_BoomerAMGCreate(&par_precond); + HYPRE_BoomerAMGSetCoarsenType(par_precond, 6); + HYPRE_BoomerAMGSetOldDefault(par_precond); + HYPRE_BoomerAMGSetStrongThreshold(par_precond, 0.25); + HYPRE_BoomerAMGSetTol(par_precond, 0.0); + HYPRE_BoomerAMGSetPrintLevel(par_precond, 1); + HYPRE_BoomerAMGSetPrintFileName(par_precond, "ex9.out.log"); + HYPRE_BoomerAMGSetMaxIter(par_precond, 1); + + /* set the preconditioner */ + HYPRE_ParCSRGMRESSetPrecond(par_solver, + HYPRE_BoomerAMGSolve, + HYPRE_BoomerAMGSetup, + par_precond); + + /* do the setup */ + HYPRE_ParCSRGMRESSetup(par_solver, par_A, par_b, par_x); + + /* do the solve */ + HYPRE_ParCSRGMRESSolve(par_solver, par_A, par_b, par_x); + + /* get some info */ + HYPRE_GMRESGetNumIterations(par_solver, &its); + HYPRE_GMRESGetFinalRelativeResidualNorm(par_solver, + &final_res_norm); + /* clean up */ + HYPRE_ParCSRGMRESDestroy(par_solver); + HYPRE_BoomerAMGDestroy(par_precond); + } + else if (solver_id == 3) /* AMG */ + { + HYPRE_BoomerAMGCreate(&par_solver); + HYPRE_BoomerAMGSetCoarsenType(par_solver, 6); + HYPRE_BoomerAMGSetOldDefault(par_solver); + HYPRE_BoomerAMGSetStrongThreshold(par_solver, 0.25); + HYPRE_BoomerAMGSetTol(par_solver, 1.9e-6); + HYPRE_BoomerAMGSetPrintLevel(par_solver, 1); + HYPRE_BoomerAMGSetPrintFileName(par_solver, "ex9.out.log"); + HYPRE_BoomerAMGSetMaxIter(par_solver, 50); + + /* do the setup */ + HYPRE_BoomerAMGSetup(par_solver, par_A, par_b, par_x); + + /* do the solve */ + HYPRE_BoomerAMGSolve(par_solver, par_A, par_b, par_x); + + /* get some info */ + HYPRE_BoomerAMGGetNumIterations(par_solver, &its); + HYPRE_BoomerAMGGetFinalRelativeResidualNorm(par_solver, + &final_res_norm); + /* clean up */ + HYPRE_BoomerAMGDestroy(par_solver); + } + else + { + if (myid ==0) printf("\n ERROR: Invalid solver id specified.\n"); + } + + /* Gather the solution vector. This needs to be done if: + (1) the object type is parcsr OR + (2) any one of the variables is NOT cell-centered */ + if (object_type == HYPRE_PARCSR) + { + HYPRE_SStructVectorGather(x); + } + + /* Save the solution for GLVis visualization, see vis/glvis-ex7.sh */ + if (vis) + { + FILE *file; + char filename[255]; + + int k, part = 0, var; + int nvalues = n*n; + double *values = (double*) calloc(nvalues, sizeof(double)); + + /* save local solution for variable u */ + var = 0; + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, values); + + sprintf(filename, "%s.%06d", "vis/ex9-u.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution with global unknown numbers */ + k = 0; + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "%06d %.14e\n", pj*N*n*n+pi*n+j*N*n+i, values[k++]); + + fflush(file); + fclose(file); + + /* save local solution for variable v */ + var = 1; + HYPRE_SStructVectorGetBoxValues(x, part, ilower, iupper, + var, values); + + sprintf(filename, "%s.%06d", "vis/ex9-v.sol", myid); + if ((file = fopen(filename, "w")) == NULL) + { + printf("Error: can't open output file %s\n", filename); + MPI_Finalize(); + exit(1); + } + + /* save solution with global unknown numbers */ + k = 0; + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "%06d %.14e\n", pj*N*n*n+pi*n+j*N*n+i, values[k++]); + + fflush(file); + fclose(file); + + free(values); + + /* save global finite element mesh */ + if (myid == 0) + GLVis_PrintGlobalSquareMesh("vis/ex9.mesh", N*n-1); + } + + if (myid == 0) + { + printf("\n"); + printf("Iterations = %d\n", its); + printf("Final Relative Residual Norm = %g\n", final_res_norm); + printf("\n"); + } + } + + /* Free memory */ + HYPRE_SStructGridDestroy(grid); + HYPRE_SStructStencilDestroy(stencil_v); + HYPRE_SStructStencilDestroy(stencil_u); + HYPRE_SStructGraphDestroy(graph); + HYPRE_SStructMatrixDestroy(A); + HYPRE_SStructVectorDestroy(b); + HYPRE_SStructVectorDestroy(x); + + /* Finalize MPI */ + MPI_Finalize(); + + return (0); +} diff --git a/3rd_party/hypre/src/examples/vis.c b/3rd_party/hypre/src/examples/vis.c new file mode 100644 index 000000000..c68a9e4dc --- /dev/null +++ b/3rd_party/hypre/src/examples/vis.c @@ -0,0 +1,819 @@ +/* Save a structured n x n mesh of square elements on the unit square into a + GLVis mesh file with the given name. */ +void GLVis_PrintGlobalSquareMesh(const char *meshfile, int n) +{ + FILE *file; + + int Dim = 2; + int NumOfVertices = (n+1)*(n+1); + int NumOfElements = n*n; + + int i, j; + double x, y; + double h = 1.0/n; + + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", Dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", NumOfElements); + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "1 3 %d %d %d %d\n", i + j*(n+1), i + 1 +j*(n+1), + i + 1 + (j+1)*(n+1), i + (j+1)*(n+1)); + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", NumOfVertices); + fprintf(file, "%d\n", Dim); + for (j = 0; j < n+1; j++) + for (i = 0; i < n+1; i++) + { + x = i*h; + y = j*h; + fprintf(file, "%.14e %.14e\n", x, y); + } + + fflush(file); + fclose(file); +} + +/* Save a structured nx x ny mesh of square elements of size h, globally + translated by (x0,y0), into a GLVis mesh file with the given prefix. */ +void GLVis_PrintLocalSquareMesh(const char *meshfile_prefix, int nx, int ny, + double h, double x0, double y0, int myid) +{ + FILE *file; + char meshfile[255]; + + int Dim = 2; + int NumOfVertices = (nx+1)*(ny+1); + int NumOfElements = nx*ny; + + int i, j; + double x, y; + + sprintf(meshfile, "%s.%06d", meshfile_prefix, myid); + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", Dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", NumOfElements); + for (j = 0; j < ny; j++) + for (i = 0; i < nx; i++) + fprintf(file, "1 3 %d %d %d %d\n", i + j*(nx+1), i + 1 +j*(nx+1), + i + 1 + (j+1)*(nx+1), i + (j+1)*(nx+1)); + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", NumOfVertices); + fprintf(file, "%d\n", Dim); + for (j = 0; j < ny+1; j++) + for (i = 0; i < nx+1; i++) + { + x = x0+i*h; + y = y0+j*h; + fprintf(file, "%.14e %.14e\n", x, y); + } + + fflush(file); + fclose(file); +} + +/* Save a structured n x n mesh of gamma-angled rhombuses, globally rotated by + angle gamma*myid, into a GLVis mesh file with the given prefix. */ +void GLVis_PrintLocalRhombusMesh(const char *meshfile_prefix, + int n, int myid, double gamma) +{ + FILE *file; + char meshfile[255]; + + int Dim = 2; + int NumOfVertices = (n+1)*(n+1); + int NumOfElements = n*n; + + int i, j; + double x, y; + double h = 1.0/n; + + double rho = gamma*myid; + double sg = sin(gamma); + double cg = cos(gamma); + double sr = sin(rho); + double cr = cos(rho); + + sprintf(meshfile, "%s.%06d", meshfile_prefix, myid); + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", Dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", NumOfElements); + for (j = 0; j < n; j++) + for (i = 0; i < n; i++) + fprintf(file, "1 3 %d %d %d %d\n", i + j*(n+1), i + 1 +j*(n+1), + i + 1 + (j+1)*(n+1), i + (j+1)*(n+1)); + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", NumOfVertices); + fprintf(file, "%d\n", Dim); + for (j = 0; j < n+1; j++) + for (i = 0; i < n+1; i++) + { + x = i*h + cg*j*h; + y = sg*j*h; + fprintf(file, "%.14e %.14e\n", cr*x - sr*y, sr*x + cr*y); + } + + fflush(file); + fclose(file); +} + +/* Save a structured nx x ny x nz mesh of cubic elements of size h, globally + translated by (x0,y0,z0), into a GLVis mesh file with the given prefix. */ +void GLVis_PrintLocalCubicMesh(const char *meshfile_prefix, + int nx, int ny, int nz, double h, + double x0, double y0, double z0, int myid) +{ + FILE *file; + char meshfile[255]; + + int Dim = 3; + int NumOfVertices = (nx+1)*(ny+1)*(nz+1); + int NumOfElements = nx*ny*nz; + + int i, j, k; + double x, y, z; + + sprintf(meshfile, "%s.%06d", meshfile_prefix, myid); + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", Dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", NumOfElements); + for (k = 0; k < nz; k++) + for (j = 0; j < ny; j++) + for (i = 0; i < nx; i++) + fprintf(file, "1 5 %d %d %d %d %d %d %d %d\n", + i + j*(nx+1) + k*(nx+1)*(ny+1), + i + 1 +j*(nx+1) + k*(nx+1)*(ny+1), + i + 1 + (j+1)*(nx+1) + k*(nx+1)*(ny+1), + i + (j+1)*(nx+1) + k*(nx+1)*(ny+1), + i + j*(nx+1) + (k+1)*(nx+1)*(ny+1), + i + 1 +j*(nx+1) + (k+1)*(nx+1)*(ny+1), + i + 1 + (j+1)*(nx+1) + (k+1)*(nx+1)*(ny+1), + i + (j+1)*(nx+1) + (k+1)*(nx+1)*(ny+1)); + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", NumOfVertices); + fprintf(file, "%d\n", Dim); + for (k = 0; k < nz+1; k++) + for (j = 0; j < ny+1; j++) + for (i = 0; i < nx+1; i++) + { + x = x0+i*h; + y = y0+j*h; + z = z0+k*h; + fprintf(file, "%.14e %.14e %.14e\n", x, y, z); + } + + fflush(file); + fclose(file); +} + +#include "HYPRE_sstruct_mv.h" +#include "_hypre_sstruct_mv.h" + +/* Save a GLVis mesh file with the given prefix corresponding to the input + SStruct grid assuming that the cells in each part are the same. The optional + trans and origin parameters specify the coordinate transformation for each + part, relative to a square Cartesian grid. */ +void GLVis_PrintSStructGrid(HYPRE_SStructGrid grid, + const char *meshfile_prefix, int myid, + double *trans, double *origin) +{ + FILE *file; + char meshfile[255]; + + int dim = ((hypre_SStructGrid *)grid)->ndim; + int cellNV = (dim == 2) ? 4 : 8; + int elemid = 2*dim-1; + int nvert, nelem; + + hypre_StructGrid *part; + int p, nparts = ((hypre_SStructGrid *)grid)->nparts; + int given_trans = (trans != NULL && origin != NULL); + double *T = trans, *O = origin; + + hypre_BoxArray *boxes; + hypre_Box *box; + int b, ncells; + + nvert = nelem = 0; + for (p = 0; p < nparts; p++) + { + part = ((hypre_SStructGrid *)grid)->pgrids[p]->sgrids[0]; + boxes = hypre_StructGridBoxes(part); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + ncells = hypre_BoxVolume(box); + nvert += ncells*cellNV; + nelem += ncells; + } + } + + { + int i, j, k, v, vert; + double x0, y0, z0, h; + + sprintf(meshfile, "%s.%06d", meshfile_prefix, myid); + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", nelem); + + vert = 0; + for (p = 0; p < nparts; p++) + { + part = ((hypre_SStructGrid *)grid)->pgrids[p]->sgrids[0]; + boxes = hypre_StructGridBoxes(part); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + for (k = hypre_BoxIMinD(box,2); k <= hypre_BoxIMaxD(box,2); k++) + for (j = hypre_BoxIMinD(box,1); j <= hypre_BoxIMaxD(box,1); j++) + for (i = hypre_BoxIMinD(box,0); i <= hypre_BoxIMaxD(box,0); i++) + { + fprintf(file, "1 %d ", elemid); + for (v = 0; v < cellNV; v++, vert++) + fprintf(file, "%d ", vert); + fprintf(file, "\n"); + } + } + } + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", nvert); + fprintf(file, "%d\n", dim); + + for (p = 0; p < nparts; p++) + { + part = ((hypre_SStructGrid *)grid)->pgrids[p]->sgrids[0]; + x0 = y0 = z0 = 0; + h = 1.0; + boxes = hypre_StructGridBoxes(part); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + for (k = hypre_BoxIMinD(box,2); k <= hypre_BoxIMaxD(box,2); k++) + for (j = hypre_BoxIMinD(box,1); j <= hypre_BoxIMaxD(box,1); j++) + for (i = hypre_BoxIMinD(box,0); i <= hypre_BoxIMaxD(box,0); i++) + if (dim == 2) + { + if (!given_trans) + { + fprintf(file, "%.14e %.14e \n", x0+i*h, y0+j*h); + fprintf(file, "%.14e %.14e \n", x0+(i+1)*h, y0+j*h); + fprintf(file, "%.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h); + fprintf(file, "%.14e %.14e \n", x0+i*h, y0+(j+1)*h); + } + else + { + fprintf(file, "%.14e %.14e \n", + T[0]*i+T[1]*j+O[0], + T[2]*i+T[3]*j+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*(i+1)+T[1]*j+O[0], + T[2]*(i+1)+T[3]*j+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+O[0], + T[2]*(i+1)+T[3]*(j+1)+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*i+T[1]*(j+1)+O[0], + T[2]*i+T[3]*(j+1)+O[1]); + } + } + else + { + if (!given_trans) + { + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+j*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+j*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+(j+1)*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+j*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+j*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+(j+1)*h, z0+(k+1)*h); + } + else + { + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*j+T[2]*k+O[0], + T[3]*i+T[4]*j+T[5]*k+O[1], + T[6]*i+T[7]*j+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*j+T[2]*k+O[0], + T[3]*(i+1)+T[4]*j+T[5]*k+O[1], + T[6]*(i+1)+T[7]*j+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+T[2]*k+O[0], + T[3]*(i+1)+T[4]*(j+1)+T[5]*k+O[1], + T[6]*(i+1)+T[7]*(j+1)+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*(j+1)+T[2]*k+O[0], + T[3]*i+T[4]*(j+1)+T[5]*k+O[1], + T[6]*i+T[7]*(j+1)+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*j+T[2]*(k+1)+O[0], + T[3]*i+T[4]*j+T[5]*(k+1)+O[1], + T[6]*i+T[7]*j+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*j+T[2]*(k+1)+O[0], + T[3]*(i+1)+T[4]*j+T[5]*(k+1)+O[1], + T[6]*(i+1)+T[7]*j+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+T[2]*(k+1)+O[0], + T[3]*(i+1)+T[4]*(j+1)+T[5]*(k+1)+O[1], + T[6]*(i+1)+T[7]*(j+1)+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*(j+1)+T[2]*(k+1)+O[0], + T[3]*i+T[4]*(j+1)+T[5]*(k+1)+O[1], + T[6]*i+T[7]*(j+1)+T[8]*(k+1)+O[2]); + } + } + } + + if (given_trans) + { + T += dim*dim; + O += dim; + } + } + + fflush(file); + fclose(file); + } +} + +/* Save a GLVis grid function (in a file with the given prefix) corresponding to + the values of the input SStruct vector restricted to the specified SStruct + variable. Currently only CELL and NODE variable types are supported. */ +void GLVis_PrintSStructVector(HYPRE_SStructVector sol, + int var, + const char *solfile_prefix, + int myid) +{ + FILE *file; + char solfile[255]; + + hypre_SStructGrid *grid = ((hypre_SStructVector*)sol)->grid; + int dim = grid->ndim; + + hypre_StructGrid *part; + int p, nparts = grid->nparts; + hypre_BoxArray *boxes; + hypre_Box *box; + int b; + + int i, j, k, ni, nj, nk; + double *values; + int ilower[3], iupper[3]; + + HYPRE_SStructVariable vartype = grid->pgrids[0]->vartypes[var]; + + char fe_coll[100]; + int var_off; + + sprintf(solfile, "%s.%06d", solfile_prefix, myid); + if ((file = fopen(solfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", solfile); + exit(1); + } + + /* set the finite element collection based on variable type */ + switch (vartype) + { + case HYPRE_SSTRUCT_VARIABLE_CELL: + sprintf(fe_coll, "%s", "Local_L2_2D_P0"); + var_off = 0; + break; + case HYPRE_SSTRUCT_VARIABLE_NODE: + sprintf(fe_coll, "%s", "Local_H1_2D_P1"); + var_off = 1; + break; + default: + printf("Error: unsuported variable type\n"); + exit(1); + } + + /* grid function header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: %s\n", fe_coll); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* extract and save the vector values on each cell */ + for (p = 0; p < nparts; p++) + { + part = grid->pgrids[p]->sgrids[0]; + boxes = hypre_StructGridBoxes(part); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + ni = hypre_BoxSizeD(box,0); + nj = hypre_BoxSizeD(box,1); + nk = hypre_BoxSizeD(box,2); + + ilower[0] = hypre_BoxIMinD(box,0) - var_off; + ilower[1] = hypre_BoxIMinD(box,1) - var_off; + iupper[0] = hypre_BoxIMaxD(box,0); + iupper[1] = hypre_BoxIMaxD(box,1); + + if (dim == 2) + values = (double*) malloc((ni+var_off)*(nj+var_off)*sizeof(double)); + else + { + values = (double*) malloc((ni+var_off)*(nj+var_off)*(nk+var_off)*sizeof(double)); + ilower[2] = hypre_BoxIMinD(box,2) - var_off; + iupper[2] = hypre_BoxIMaxD(box,2); + } + + HYPRE_SStructVectorGetBoxValues(sol, p, ilower, iupper, var, values); + + if (vartype == HYPRE_SSTRUCT_VARIABLE_CELL) + { + for (k = 0; k < nk; k++) + for (j = 0; j < nj; j++) + for (i = 0; i < ni; i++) + fprintf(file, "%.14e\n", values[i + j*ni]); + } + else if (vartype == HYPRE_SSTRUCT_VARIABLE_NODE) + { + if (dim == 2) + { + for (j = 0; j < nj; j++) + for (i = 0; i < ni; i++) + { + fprintf(file, "%.14e\n", values[i + j*(ni+1)]); + fprintf(file, "%.14e\n", values[i+1 + j*(ni+1)]); + fprintf(file, "%.14e\n", values[i+1 + (j+1)*(ni+1)]); + fprintf(file, "%.14e\n", values[i + (j+1)*(ni+1)]); + } + } + else + { + for (k = 0; k < nk; k++) + for (j = 0; j < nj; j++) + for (i = 0; i < ni; i++) + { + fprintf(file, "%.14e\n", values[i + j*(ni+1) + k*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i+1 + j*(ni+1) + k*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i+1 + (j+1)*(ni+1) + k*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i + (j+1)*(ni+1) + k*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i + j*(ni+1) + (k+1)*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i+1 + j*(ni+1) + (k+1)*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i+1 + (j+1)*(ni+1) + (k+1)*(ni+1)*(nj+1)]); + fprintf(file, "%.14e\n", values[i + (j+1)*(ni+1) + (k+1)*(ni+1)*(nj+1)]); + } + } + } + + free(values); + } + } + + fflush(file); + fclose(file); +} + +/* Save a GLVis mesh file with the given prefix corresponding to the input + Struct grid assuming that the cells are the same. The optional trans and + origin parameters specify a coordinate transformation, relative to a square + Cartesian grid. */ +void GLVis_PrintStructGrid(HYPRE_StructGrid Grid, + const char *meshfile_prefix, int myid, + double *trans, double *origin) +{ + FILE *file; + char meshfile[255]; + + hypre_StructGrid *grid = (hypre_StructGrid *)Grid; + int dim = grid->ndim; + int cellNV = (dim == 2) ? 4 : 8; + int elemid = 2*dim-1; + int nvert, nelem; + + int given_trans = (trans != NULL && origin != NULL); + double *T = trans, *O = origin; + + hypre_BoxArray *boxes; + hypre_Box *box; + int b, ncells; + + nvert = nelem = 0; + boxes = hypre_StructGridBoxes(grid); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + ncells = hypre_BoxVolume(box); + nvert += ncells*cellNV; + nelem += ncells; + } + + { + int i, j, k, v, vert; + double x0, y0, z0, h; + + sprintf(meshfile, "%s.%06d", meshfile_prefix, myid); + if ((file = fopen(meshfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", meshfile); + exit(1); + } + + /* mesh header */ + fprintf(file, "MFEM mesh v1.0\n"); + fprintf(file, "\ndimension\n"); + fprintf(file, "%d\n", dim); + + /* mesh elements */ + fprintf(file, "\nelements\n"); + fprintf(file, "%d\n", nelem); + + vert = 0; + + boxes = hypre_StructGridBoxes(grid); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + for (k = hypre_BoxIMinD(box,2); k <= hypre_BoxIMaxD(box,2); k++) + for (j = hypre_BoxIMinD(box,1); j <= hypre_BoxIMaxD(box,1); j++) + for (i = hypre_BoxIMinD(box,0); i <= hypre_BoxIMaxD(box,0); i++) + { + fprintf(file, "1 %d ", elemid); + for (v = 0; v < cellNV; v++, vert++) + fprintf(file, "%d ", vert); + fprintf(file, "\n"); + } + } + + /* boundary will be generated by GLVis */ + fprintf(file, "\nboundary\n"); + fprintf(file, "0\n"); + + /* mesh vertices */ + fprintf(file, "\nvertices\n"); + fprintf(file, "%d\n", nvert); + fprintf(file, "%d\n", dim); + + x0 = y0 = z0 = 0; + h = 1.0; + boxes = hypre_StructGridBoxes(grid); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + for (k = hypre_BoxIMinD(box,2); k <= hypre_BoxIMaxD(box,2); k++) + for (j = hypre_BoxIMinD(box,1); j <= hypre_BoxIMaxD(box,1); j++) + for (i = hypre_BoxIMinD(box,0); i <= hypre_BoxIMaxD(box,0); i++) + if (dim == 2) + { + if (!given_trans) + { + fprintf(file, "%.14e %.14e \n", x0+i*h, y0+j*h); + fprintf(file, "%.14e %.14e \n", x0+(i+1)*h, y0+j*h); + fprintf(file, "%.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h); + fprintf(file, "%.14e %.14e \n", x0+i*h, y0+(j+1)*h); + } + else + { + fprintf(file, "%.14e %.14e \n", + T[0]*i+T[1]*j+O[0], + T[2]*i+T[3]*j+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*(i+1)+T[1]*j+O[0], + T[2]*(i+1)+T[3]*j+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+O[0], + T[2]*(i+1)+T[3]*(j+1)+O[1]); + fprintf(file, "%.14e %.14e \n", + T[0]*i+T[1]*(j+1)+O[0], + T[2]*i+T[3]*(j+1)+O[1]); + } + } + else + { + if (!given_trans) + { + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+j*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+j*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+(j+1)*h, z0+k*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+j*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+j*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+(i+1)*h, y0+(j+1)*h, z0+(k+1)*h); + fprintf(file, "%.14e %.14e %.14e \n", x0+i*h, y0+(j+1)*h, z0+(k+1)*h); + } + else + { + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*j+T[2]*k+O[0], + T[3]*i+T[4]*j+T[5]*k+O[1], + T[6]*i+T[7]*j+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*j+T[2]*k+O[0], + T[3]*(i+1)+T[4]*j+T[5]*k+O[1], + T[6]*(i+1)+T[7]*j+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+T[2]*k+O[0], + T[3]*(i+1)+T[4]*(j+1)+T[5]*k+O[1], + T[6]*(i+1)+T[7]*(j+1)+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*(j+1)+T[2]*k+O[0], + T[3]*i+T[4]*(j+1)+T[5]*k+O[1], + T[6]*i+T[7]*(j+1)+T[8]*k+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*j+T[2]*(k+1)+O[0], + T[3]*i+T[4]*j+T[5]*(k+1)+O[1], + T[6]*i+T[7]*j+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*j+T[2]*(k+1)+O[0], + T[3]*(i+1)+T[4]*j+T[5]*(k+1)+O[1], + T[6]*(i+1)+T[7]*j+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*(i+1)+T[1]*(j+1)+T[2]*(k+1)+O[0], + T[3]*(i+1)+T[4]*(j+1)+T[5]*(k+1)+O[1], + T[6]*(i+1)+T[7]*(j+1)+T[8]*(k+1)+O[2]); + fprintf(file, "%.14e %.14e %.14e \n", + T[0]*i+T[1]*(j+1)+T[2]*(k+1)+O[0], + T[3]*i+T[4]*(j+1)+T[5]*(k+1)+O[1], + T[6]*i+T[7]*(j+1)+T[8]*(k+1)+O[2]); + } + } + + if (given_trans) + { + T += dim*dim; + O += dim; + } + } + + fflush(file); + fclose(file); + } +} + +/* Save a Q0 GLVis grid function (in a file with the given prefix) corresponding + to the values of the input Struct vector. */ +void GLVis_PrintStructVector(HYPRE_StructVector sol, + const char *solfile_prefix, + int myid) +{ + FILE *file; + char solfile[255]; + + hypre_StructGrid *grid = ((hypre_StructVector*)sol)->grid; + int dim = grid->ndim; + + hypre_BoxArray *boxes; + hypre_Box *box; + int b; + + int i, j, k, ni, nj, nk; + double *values; + int ilower[3], iupper[3]; + + sprintf(solfile, "%s.%06d", solfile_prefix, myid); + if ((file = fopen(solfile, "w")) == NULL) + { + printf("Error: can't open output file %s\n", solfile); + exit(1); + } + + /* grid function header */ + fprintf(file, "FiniteElementSpace\n"); + fprintf(file, "FiniteElementCollection: Local_L2_2D_P0\n"); + fprintf(file, "VDim: 1\n"); + fprintf(file, "Ordering: 0\n\n"); + + /* extract and save the vector values on each cell */ + boxes = hypre_StructGridBoxes(grid); + for (b = 0; b < hypre_BoxArraySize(boxes); b++) + { + box = hypre_BoxArrayBox(boxes, b); + ni = hypre_BoxSizeD(box,0); + nj = hypre_BoxSizeD(box,1); + nk = hypre_BoxSizeD(box,2); + + ilower[0] = hypre_BoxIMinD(box,0); + ilower[1] = hypre_BoxIMinD(box,1); + iupper[0] = hypre_BoxIMaxD(box,0); + iupper[1] = hypre_BoxIMaxD(box,1); + + if (dim == 2) + values = (double*) malloc(ni*nj*sizeof(double)); + else + { + values = (double*) malloc(ni*nj*nk*sizeof(double)); + ilower[2] = hypre_BoxIMinD(box,2); + iupper[2] = hypre_BoxIMaxD(box,2); + } + + HYPRE_StructVectorGetBoxValues(sol, ilower, iupper, values); + + for (k = 0; k < nk; k++) + for (j = 0; j < nj; j++) + for (i = 0; i < ni; i++) + fprintf(file, "%.14e\n", values[i + j*ni]); + + free(values); + } + + fflush(file); + fclose(file); +} + +/* Save additional data needed for GLVis visualization (e.g. the number of + processors in the run). */ +void GLVis_PrintData(const char *datafile, int myid, int num_procs) +{ + FILE *file; + + if (myid == 0) + { + file = fopen(datafile, "w"); + fprintf(file, "np %d\n", num_procs); + fflush(file); + fclose(file); + } +} diff --git a/3rd_party/hypre/src/examples/vis/Makefile b/3rd_party/hypre/src/examples/vis/Makefile new file mode 100644 index 000000000..cca1c93b5 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/Makefile @@ -0,0 +1,7 @@ +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +clean: + rm -f *mesh* *sol* *data* diff --git a/3rd_party/hypre/src/examples/vis/README b/3rd_party/hypre/src/examples/vis/README new file mode 100644 index 000000000..d33b497e4 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/README @@ -0,0 +1,82 @@ +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +This directory contains scripts that use the GLVis visualization tool to plot +the numerical results for most of the hypre example codes. + +To use the scripts, one first needs to download and install the latest versions +of GLVis and the MFEM finite element library (you will need versions 2.0, or +later) from the following websites: + +http://glvis.org +http://mfem.org + +See http://mfem.org/building for building instructions. + +Assuming that the "glvis" binary is in your $PATH, you can use it to examine +the results from hypre's example codes as follows: + +1) Run an example with the "-vis" option, e.g. + +mpirun -np 16 ex4 -n 10 -U0 2 -K 2 -F 4 -vis + + This will save a number of files describing the solution and the mesh with + the example name prefix in the vis/ sub-directory. These files can be cleaned + by "make clean" in the examples/ or the vis/ directories. + +2) Execute the corresponding "glvis" shell script from the vis/ directory, e.g. + +vis/glvis-ex4.sh + + The scripts can be run from either examples/ or the vis/ directories and will + check if the solution data exists before visualizing it. Note that some of + the scripts, such as glvis-ex3.sh, use pre-processing to merge the parallel + results into serial form, while others (e.g. most of the examples based on + finite elements) directly visualize parallel data. + +3) Interact with the solution in the GLVis window using the mouse buttons and + command keystrokes documented in the GLVis README file. Some of the more + frequently used keystrokes are as follows: + + h - Prints a short help message in the terminal + m - Toggle mesh level lines + a - Toggle bounding box axes + c - Display/Hide the colorbar + p - Cycle through color palettes + t - Cycle materials and lights + r - Reset the plot to 3D view + R - Cycle through different 2D projections + j - Turn on/off perspective + l - Turn on/off light + g - Toggle background color + A - Turn on/off OpenGL anti-aliasing/multi-sampling + q - Exit + +The glvis binary can also be used directly to visualize the solutions of the +finite element-based examples 13, 14, 15 and 16. For example: + +mpirun -np 6 ex13 -n 30 -vis +glvis -np 6 -m vis/ex13.mesh -g vis/ex13.sol + +Pressing the keys "AbjRl************" in the GLVis interactive window will now +produce the same result as running vis/glvis-ex13.sh. + +In this parallel finite element mode, one can use the F11/F12 and the "b" keys +to shrink/zoom or plot the boundary of the processor subdomains. Individual +processor subdomains with their solutions can also be visualized directly, and +the command keystrokes can be passed with the -k option: + +glvis -m vis/ex13.mesh.000005 -g vis/ex13.sol.000005 -k "Aaammcb" + +Various finite elements are directly supported in GLVis, such as high-order +nodal elements: + +mpirun -np 4 ex16 -n 2 -vis +vis/glvis-ex16.sh + +or Nedelec (edge) elements: + +mpirun -np 8 ex15 -a 1 -vis +vis/glvis-ex15.sh diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex1.sh b/3rd_party/hypre/src/examples/vis/glvis-ex1.sh new file mode 100755 index 000000000..745c0e268 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex1.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex1 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex10.sh b/3rd_party/hypre/src/examples/vis/glvis-ex10.sh new file mode 100755 index 000000000..99a56ea3b --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex10.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex10 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys + diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex11.sh b/3rd_party/hypre/src/examples/vis/glvis-ex11.sh new file mode 100755 index 000000000..a046b97ad --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex11.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex11 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $sol +echo "FiniteElementCollection: H1_2D_P1" >> $sol +echo "VDim: 1" >> $sol +echo "Ordering: 0" >> $sol +echo "" >> $sol +find $dir -name "$ex.sol.??????" | sort | xargs cat >> $sol + +glvis -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex12.sh b/3rd_party/hypre/src/examples/vis/glvis-ex12.sh new file mode 100755 index 000000000..d69875d75 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex12.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex12 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex13.sh b/3rd_party/hypre/src/examples/vis/glvis-ex13.sh new file mode 100755 index 000000000..1d3dd0f60 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex13.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex13 +dir=`basename \`pwd\`` +keys=AbjRl************ + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys + diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex14.sh b/3rd_party/hypre/src/examples/vis/glvis-ex14.sh new file mode 100755 index 000000000..276be08ae --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex14.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex14 +dir=`basename \`pwd\`` +keys=AbjRl************ + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys + diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex15.sh b/3rd_party/hypre/src/examples/vis/glvis-ex15.sh new file mode 100755 index 000000000..6187e0001 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex15.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex15 +dir=`basename \`pwd\`` +keys=AaamcVooof + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys + diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex16.sh b/3rd_party/hypre/src/examples/vis/glvis-ex16.sh new file mode 100755 index 000000000..c71a820f2 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex16.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex16 +dir=`basename \`pwd\`` +keys=Aaamciiii + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys + diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex2.sh b/3rd_party/hypre/src/examples/vis/glvis-ex2.sh new file mode 100755 index 000000000..6a2375c63 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex2.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex2 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex3.sh b/3rd_party/hypre/src/examples/vis/glvis-ex3.sh new file mode 100755 index 000000000..ca6f51b3a --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex3.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex3 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $sol +echo "FiniteElementCollection: H1_2D_P1" >> $sol +echo "VDim: 1" >> $sol +echo "Ordering: 0" >> $sol +echo "" >> $sol +find $dir -name "$ex.sol.??????" | sort | xargs cat | sort | awk '{ print $2 }' >> $sol + +glvis -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex4.sh b/3rd_party/hypre/src/examples/vis/glvis-ex4.sh new file mode 100755 index 000000000..25e524cfa --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex4.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex4 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $sol +echo "FiniteElementCollection: H1_2D_P1" >> $sol +echo "VDim: 1" >> $sol +echo "Ordering: 0" >> $sol +echo "" >> $sol +find $dir -name "$ex.sol.??????" | sort | xargs cat | sort | awk '{ print $2 }' >> $sol + +glvis -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex5.sh b/3rd_party/hypre/src/examples/vis/glvis-ex5.sh new file mode 100755 index 000000000..42c65ed5c --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex5.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex5 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $sol +echo "FiniteElementCollection: H1_2D_P1" >> $sol +echo "VDim: 1" >> $sol +echo "Ordering: 0" >> $sol +echo "" >> $sol +find $dir -name "$ex.sol.??????" | sort | xargs cat >> $sol + +glvis -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex6.sh b/3rd_party/hypre/src/examples/vis/glvis-ex6.sh new file mode 100755 index 000000000..e806c7d08 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex6.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex6 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex7.sh b/3rd_party/hypre/src/examples/vis/glvis-ex7.sh new file mode 100755 index 000000000..37f566d48 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex7.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex7 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $sol +echo "FiniteElementCollection: H1_2D_P1" >> $sol +echo "VDim: 1" >> $sol +echo "Ordering: 0" >> $sol +echo "" >> $sol +find $dir -name "$ex.sol.??????" | sort | xargs cat | sort | awk '{ print $2 }' >> $sol + +glvis -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex8.sh b/3rd_party/hypre/src/examples/vis/glvis-ex8.sh new file mode 100755 index 000000000..4ecef3577 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex8.sh @@ -0,0 +1,30 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex8 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + sol=$ex.sol +else + dir=vis + mesh=vis/$ex.mesh + sol=vis/$ex.sol +fi + +if [ ! -e $mesh.000000 ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +np=`cat $dir/$ex.data | head -n 1 | awk '{ print $2 }'` + +glvis -np $np -m $mesh -g $sol -k $keys diff --git a/3rd_party/hypre/src/examples/vis/glvis-ex9.sh b/3rd_party/hypre/src/examples/vis/glvis-ex9.sh new file mode 100755 index 000000000..4f49f6128 --- /dev/null +++ b/3rd_party/hypre/src/examples/vis/glvis-ex9.sh @@ -0,0 +1,46 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + + +ex=ex9 +dir=`basename \`pwd\`` +keys=Aaamc + +if [ "$dir" = "vis" ]; then + dir=. + mesh=$ex.mesh + solu=$ex-u.sol + solv=$ex-v.sol +else + dir=vis + mesh=vis/$ex.mesh + solu=vis/$ex-u.sol + solv=vis/$ex-v.sol +fi + +if [ ! -e $mesh ] +then + echo "Can't find visualization data for $ex!" + exit +fi + +echo "FiniteElementSpace" > $solu +echo "FiniteElementCollection: H1_2D_P1" >> $solu +echo "VDim: 1" >> $solu +echo "Ordering: 0" >> $solu +echo "" >> $solu +find $dir -name "$ex-u.sol.??????" | sort | xargs cat | sort | awk '{ print $2 }' >> $solu + +glvis -m $mesh -g $solu -k $keys & + +echo "FiniteElementSpace" > $solv +echo "FiniteElementCollection: H1_2D_P1" >> $solv +echo "VDim: 1" >> $solv +echo "Ordering: 0" >> $solv +echo "" >> $solv +find $dir -name "$ex-v.sol.??????" | sort | xargs cat | sort | awk '{ print $2 }' >> $solv + +glvis -m $mesh -g $solv -k $keys diff --git a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h index 8dd79e8df..152a0190d 100644 --- a/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h +++ b/3rd_party/hypre/src/parcsr_ls/HYPRE_parcsr_ls.h @@ -73,6 +73,8 @@ typedef HYPRE_Int (*HYPRE_PtrToModifyPCFcn)(HYPRE_Solver, **/ /*@{*/ +#pragma GCC visibility push(default) + /** * Create a solver object. **/ @@ -3705,6 +3707,8 @@ HYPRE_MGRGetNumIterations( HYPRE_Solver solver, HYPRE_Int *num_iterations ); HYPRE_Int HYPRE_MGRGetFinalRelativeResidualNorm( HYPRE_Solver solver, HYPRE_Real *res_norm ); +#pragma GCC visibility pop + /*@}*/ /*-------------------------------------------------------------------------- @@ -3815,6 +3819,8 @@ GenerateCoordinates(MPI_Comm comm, /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ +#pragma GCC visibility push(default) + /* * (Optional) Switches on use of Jacobi interpolation after computing * an original interpolation @@ -3861,6 +3867,8 @@ HYPRE_Int HYPRE_BoomerAMGSetCRUseCG(HYPRE_Solver solver, HYPRE_Int HYPRE_BoomerAMGSetISType(HYPRE_Solver solver, HYPRE_Int IS_type); +#pragma GCC visibility pop + /*-------------------------------------------------------------------------- *--------------------------------------------------------------------------*/ @@ -3903,7 +3911,6 @@ hypre_ParCSRMultiVectorPrint(void *x_, const char *fileName); void * hypre_ParCSRMultiVectorRead(MPI_Comm comm, void *ii_, const char *fileName); - /*@}*/ /*-------------------------------------------------------------------------- diff --git a/3rd_party/hypre/src/utilities/version b/3rd_party/hypre/src/utilities/version new file mode 100755 index 000000000..b328ce443 --- /dev/null +++ b/3rd_party/hypre/src/utilities/version @@ -0,0 +1,46 @@ +#!/bin/sh +# Copyright 1998-2019 Lawrence Livermore National Security, LLC and other +# HYPRE Project Developers. See the top-level COPYRIGHT file for details. +# +# SPDX-License-Identifier: (Apache-2.0 OR MIT) + +#============================================================================= +# This script prints the hypre version number, date, and time. +# It currently inspects the 'configure' file for this info. +#============================================================================= + +case $1 in + -h|-help) + echo + echo "$0 [options]" + echo " -h|-help - prints usage information" + echo " -number - prints the version number" + echo " -date - prints the version day" + echo " -time - prints the version day and time" + echo + exit;; +esac + +# NOTE: In order to call this script from other directories, +# we need to get the path info from the command line +VPATH=`dirname $0` +VFILE="${VPATH}/../configure" +NUMBER=`grep "HYPRE_VERSION=" $VFILE | cut -d= -f 2 | sed 's/"//g'` +DATE=`grep "HYPRE_DATE=" $VFILE | cut -d= -f 2 | sed 's/"//g'` +TIME=`grep "HYPRE_TIME=" $VFILE | cut -d= -f 2 | sed 's/"//g'` + +# this is the no-option print line +VPRINT=`echo hypre Version $NUMBER Date: $DATE` + +# this defines the print lines for the various options +case $1 in + -number) + VPRINT=$NUMBER;; + -date) + VPRINT=$DATE;; + -time) + VPRINT=$TIME;; +esac + +# print the version information +echo $VPRINT diff --git a/3rd_party/nek5000/core/DPROCMAP b/3rd_party/nek5000/core/DPROCMAP index fefab4f83..a81a8e5ca 100644 --- a/3rd_party/nek5000/core/DPROCMAP +++ b/3rd_party/nek5000/core/DPROCMAP @@ -6,3 +6,8 @@ integer dProcmapWin common /cbpmwd/ dProcmapWin(2*lelt) + + parameter (lcr = lelt) ! remote elements + parameter (lc = lelt+lcr+8-mod(lelt+lcr,8)) ! multiple of 8 + integer cache + common /cbpmca/ cache(lc,3) diff --git a/3rd_party/nek5000/core/PARDICT b/3rd_party/nek5000/core/PARDICT index 7eba9a015..a4821a2fa 100644 --- a/3rd_party/nek5000/core/PARDICT +++ b/3rd_party/nek5000/core/PARDICT @@ -4,7 +4,7 @@ c Note: Keys have to be in captial letters c integer PARDICT_NKEYS - parameter(PARDICT_NKEYS = 103) + parameter(PARDICT_NKEYS = 104) character*132 pardictkey(PARDICT_NKEYS) data @@ -108,3 +108,4 @@ c & pardictkey(101)/ 'PRESSURE:SOLVER' / & pardictkey(102)/ 'MESH:PARTITIONER' / & pardictkey(103)/ 'MESH:CONNECTIVITYTOL' / + & pardictkey(104)/ 'SCALAR%%:ADVECTION' / diff --git a/3rd_party/nek5000/core/RESTART b/3rd_party/nek5000/core/RESTART index 8835153b8..206ab08d2 100644 --- a/3rd_party/nek5000/core/RESTART +++ b/3rd_party/nek5000/core/RESTART @@ -31,10 +31,10 @@ c logical ifgetx ,ifgetu ,ifgetp ,ifgett ,ifgtps (ldimt1),ifgtim $ ,ifgetxr,ifgetur,ifgetpr,ifgettr,ifgtpsr(ldimt1),ifgtimr - $ ,if_byte_sw,ifgetz,ifgetw,ifdiro + $ ,if_byte_sw,ifgetz,ifgetw,ifdiro,ifgfldr common /cmfi_l/ ifgetx,ifgetu,ifgetp,ifgett,ifgtps,ifgtim $ ,ifgetxr,ifgetur,ifgetpr,ifgettr,ifgtpsr,ifgtimr - $ ,if_byte_sw,ifgetz,ifgetw,ifdiro + $ ,if_byte_sw,ifgetz,ifgetw,ifdiro,ifgfldr integer fid0,fid0r,pid0,pid1,pid0r,pid1r,pid00 common /cmfi_p/ fid0,fid0r,pid0,pid1,pid0r,pid1r,pid00 diff --git a/3rd_party/nek5000/core/dprocmap.f b/3rd_party/nek5000/core/dprocmap.f index dd2c26859..8dd546749 100644 --- a/3rd_party/nek5000/core/dprocmap.f +++ b/3rd_party/nek5000/core/dprocmap.f @@ -30,8 +30,9 @@ subroutine dProcmapInit() if (ierr .ne. 0 ) call exitti('MPI_Win_allocate failed!$',0) #endif - - dProcmapCache = .false. + + call dProcMapClearCache() + dProcmapCache = .true. return end @@ -75,12 +76,6 @@ subroutine dProcmapGet(ibuf,ieg) integer*8 disp - ! local cache - parameter (lcr = lelt) ! remote elements - parameter (lc = lelt+lcr+8-mod(lelt+lcr,8)) ! multiple of 8 - integer cache(lc,3) - save cache - save icalld data icalld /0/ @@ -95,7 +90,7 @@ subroutine dProcmapGet(ibuf,ieg) ii = lsearch_ur(cache(1,3),lc,ieg) if (ii.gt.lc) call exitti('lsearch_ur returns invalid index$',ii) if (ii.gt.0 .and. ii.ne.lelt+lcr) then ! cache hit -c write(6,*) nid, 'cache hit ', 'ieg:', ieg + !write(6,*) nid, 'cache hit ', 'ieg:', ieg ibuf(1) = cache(ii,1) ibuf(2) = cache(ii,2) else @@ -204,4 +199,18 @@ integer function gllel(ieg) gllel = ibuf(1) end +c----------------------------------------------------------------------- + subroutine dProcMapClearCache + + include 'SIZE' + include 'PARALLEL' + include 'DPROCMAP' + + call ifill(cache,-1,size(cache)) + itmp = gllnid(0) ! reset last element cache + itmp = gllel(0) ! reset last element cache + + end +c----------------------------------------------------------------------- + #endif diff --git a/3rd_party/nek5000/core/drive1.f b/3rd_party/nek5000/core/drive1.f index fa30ff869..28db2c86c 100644 --- a/3rd_party/nek5000/core/drive1.f +++ b/3rd_party/nek5000/core/drive1.f @@ -332,6 +332,7 @@ subroutine nek_end include 'SIZE' include 'TOTAL' + include 'DPROCMAP' if(instep.ne.0) call runstat @@ -341,6 +342,11 @@ subroutine nek_end c call fgslib_crs_free(xxth(1)) c endif +#ifdef DPROCMAP +#ifdef MPI + call MPI_Win_free(dProcmapH, ierr) +#endif +#endif call in_situ_end() call exitt0() diff --git a/3rd_party/nek5000/core/gfldr.f b/3rd_party/nek5000/core/gfldr.f index dc66f7615..683819534 100644 --- a/3rd_party/nek5000/core/gfldr.f +++ b/3rd_party/nek5000/core/gfldr.f @@ -22,6 +22,7 @@ subroutine gfldr(sourcefld) character*1 hdr(iHeaderSize) integer*8 dtmp8 + integer*8 i8glsum,nfail,nfail_sum logical if_byte_swap_test real*4 bytetest @@ -111,35 +112,73 @@ subroutine gfldr(sourcefld) & nels,nxf,nyf,nzf,bb_t, & nhash,nhash,nmax,tol) + + ! locate points (iel,iproc,r,s,t) + nfail = 0 + toldist = 5e-6 + if(wdsizr.eq.8) toldist = 5e-14 + + ntot = lx1*ly1*lz1*nelt + call fgslib_findpts(inth_gfldr, + & grcode,1, + & gproc,1, + & gelid,1, + & grst,ldim, + & gdist,1, + & xm1,1, + & ym1,1, + & zm1,1,ntot) + + do i=1,ntot + if(grcode(i).eq.1 .and. sqrt(gdist(i)).gt.toldist) + & nfail = nfail + 1 + if(grcode(i).eq.2) nfail = nfail + 1 + enddo + + nfail_sum = i8glsum(nfail,1) + if(nfail_sum.gt.0) then + if(nio.eq.0) write(6,*) + & ' WARNING: Unable to find all mesh points in source fld ', + & nfail_sum + endif + ! read source fields and interpolate if(ifgetur) then - if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading vel' - ntot = nx1*ny1*nz1*nelv - call gfldr_getfld(vx,vy,vz,ntot,ldim,ifldpos+1) + if(.not.ifgfldr.or.ifgetu) then !skip if this is a restart call and the scalar isn't requested + if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading vel' + ntot = nx1*ny1*nz1*nelv + call gfldr_getfld(vx,vy,vz,ntot,ldim,ifldpos+1) + endif ifldpos = ifldpos + ldim endif if(ifgetpr) then - if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading pr' - ntot = nx1*ny1*nz1*nelv - call gfldr_getfld(pm1,dum,dum,ntot,1,ifldpos+1) + if(.not.ifgfldr.or.ifgetp) then !skip if this is a restart call and the scalar isn't requested + if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading pr' + ntot = nx1*ny1*nz1*nelv + call gfldr_getfld(pm1,dum,dum,ntot,1,ifldpos+1) + if (ifaxis) call axis_interp_ic(pm1) + call map_pm1_to_pr(pm1,1) + endif ifldpos = ifldpos + 1 - if (ifaxis) call axis_interp_ic(pm1) - call map_pm1_to_pr(pm1,1) endif if(ifgettr .and. ifheat) then - if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading temp' - ntot = nx1*ny1*nz1*nelfld(2) - call gfldr_getfld(t(1,1,1,1,1),dum,dum,ntot,1,ifldpos+1) + if(.not.ifgfldr.or.ifgett) then !skip if this is a restart call and the scalar isn't requested + if(nid.eq.0 .and. loglevel.gt.2) write(6,*) 'reading temp' + ntot = nx1*ny1*nz1*nelfld(2) + call gfldr_getfld(t(1,1,1,1,1),dum,dum,ntot,1,ifldpos+1) + endif ifldpos = ifldpos + 1 endif do i = 1,ldimt-1 - if(ifgtpsr(i)) then - if(nid.eq.0 .and. loglevel.gt.2) - $ write(6,*) 'reading scalar',i - ntot = nx1*ny1*nz1*nelfld(i+2) - call gfldr_getfld(t(1,1,1,1,i+1),dum,dum,ntot,1,ifldpos+1) - ifldpos = ifldpos + 1 - endif + if(ifgtpsr(i)) then + if(.not.ifgfldr.or.ifgtps(i)) then !skip if this is a restart call and the scalar isn't requested + if(nid.eq.0 .and. loglevel.gt.2) + $ write(6,*) 'reading scalar',i + ntot = nx1*ny1*nz1*nelfld(i+2) + call gfldr_getfld(t(1,1,1,1,i+1),dum,dum,ntot,1,ifldpos+1) + endif + ifldpos = ifldpos + 1 + endif enddo call byte_close_mpi(fldh_gfldr,ierr) @@ -266,39 +305,6 @@ subroutine gfldr_intp(fieldout,nout,fieldin,iffpts) real fieldout(nout) real fieldin (*) - logical iffpts - - integer*8 i8glsum,nfail,nfail_sum - - if(iffpts) then ! locate points (iel,iproc,r,s,t) - nfail = 0 - toldist = 5e-6 - if(wdsizr.eq.8) toldist = 5e-14 - - ntot = lx1*ly1*lz1*nelt - call fgslib_findpts(inth_gfldr, - & grcode,1, - & gproc,1, - & gelid,1, - & grst,ldim, - & gdist,1, - & xm1,1, - & ym1,1, - & zm1,1,ntot) - - do i=1,ntot - if(grcode(i).eq.1 .and. sqrt(gdist(i)).gt.toldist) - & nfail = nfail + 1 - if(grcode(i).eq.2) nfail = nfail + 1 - enddo - - nfail_sum = i8glsum(nfail,1) - if(nfail_sum.gt.0) then - if(nio.eq.0) write(6,*) - & ' WARNING: Unable to find all mesh points in source fld ', - & nfail_sum - endif - endif ! evaluate inut field at given points npt = nout @@ -312,5 +318,14 @@ subroutine gfldr_intp(fieldout,nout,fieldin,iffpts) return end +c----------------------------------------------------------------------- +#else + subroutine gfldr(sourcefld) + character sourcefld*(*) + + call exitti("MPIIO needed for gfldr!$",0) + + return + end #endif diff --git a/3rd_party/nek5000/core/ic.f b/3rd_party/nek5000/core/ic.f index 15b0cb7df..391c8ce76 100644 --- a/3rd_party/nek5000/core/ic.f +++ b/3rd_party/nek5000/core/ic.f @@ -492,7 +492,12 @@ subroutine restart(nfiles) if (p67.eq.6.0) then do ifile=1,nfiles call sioflag(ndumps,fname,initc(ifile)) - call mfi(fname,ifile) + if(ifgfldr) then + call gfldr(fname) + else + call mfi(fname,ifile) + endif + ifgfldr=.false. !avoid interfering with future gfldr calls enddo call bcast(time,wdsize)! Sync time across processors return @@ -1002,6 +1007,7 @@ subroutine sioflag(ndumps,fname,rsopts) 100 continue ifgtim=.true. ndumps=0 + ifgfldr=.false. C C Check for default case - just a filename given, no i/o options specified C @@ -1038,6 +1044,11 @@ subroutine sioflag(ndumps,fname,rsopts) C Parse field specifications. + IGO=INDX_CUT(RSOPT,'INT',3) + IF (IGO.NE.0) THEN + ifgfldr=.TRUE. + ENDIF + IXO=INDX_CUT(RSOPT,'X',1) IF (IXO.NE.0) THEN ifdeft=.false. @@ -1080,8 +1091,10 @@ subroutine sioflag(ndumps,fname,rsopts) C If no fields were explicitly specified, assume getting all fields. if (ifdeft) then - IFGETX=.TRUE. - IF (IF3D) IFGETZ=.TRUE. + if(.not.ifgfldr) then + IFGETX=.TRUE. + IF (IF3D) IFGETZ=.TRUE. + endif IFANYC=.FALSE. DO 400 I=1,NFIELD IF (IFADVC(I)) IFANYC=.TRUE. @@ -1097,6 +1110,9 @@ subroutine sioflag(ndumps,fname,rsopts) 410 continue endif + if(ifgfldr.and.ifgetx) + & call exitti('"X" and "INT" restart options incompatible!$',0) + return END c----------------------------------------------------------------------- diff --git a/3rd_party/nek5000/core/makefile.template b/3rd_party/nek5000/core/makefile.template index 0e5dd56e8..65325998d 100644 --- a/3rd_party/nek5000/core/makefile.template +++ b/3rd_party/nek5000/core/makefile.template @@ -1,10 +1,10 @@ -OBJDIR=obj BINNAME=nek5000 -LIB=$(OBJDIR)/libnek5000.a CASENAME= CASEDIR= S= OPT_INCDIR=./ +OBJDIR=$(CASEDIR)/obj +LIB=$(OBJDIR)/libnek5000.a LD= FC= @@ -64,10 +64,11 @@ ifeq ($(MPI),0) COMM_MPI := ${COMM_MPI} mpi_dummy.o endif -DUMMY:= $(shell cp $S/core/PARALLEL.default $S/core/PARALLEL 2>/dev/null) +$(info $(shell mkdir -p $(OBJDIR))) +DUMMY:= $(shell cp $S/core/PARALLEL.default $(OBJDIR)/PARALLEL 2>/dev/null) ifeq ($(DPROCMAP),1) CORE := ${CORE} dprocmap.o - DUMMY:= $(shell cp $S/core/PARALLEL.dprocmap $S/core/PARALLEL 2>/dev/null) + DUMMY:= $(shell cp $S/core/PARALLEL.dprocmap $(OBJDIR)/PARALLEL 2>/dev/null) endif ifneq ($(VISIT),0) @@ -84,9 +85,11 @@ endif TMP1 = $(CORE) $(MXM) $(USR) $(COMM_MPI) $(VISITO) -OPT_INCDIR += $S/core/experimental - -INCLUDES:= $(foreach dir,$(OPT_INCDIR),-I$(dir)) +INCLUDES_DIR += $(OPT_INCDIR) +INCLUDES_DIR += $(S)/core/experimental +INCLUDES_DIR += $(OBJDIR) +INCLUDES:= $(foreach dir,$(INCLUDES_DIR),-I$(dir)) +#$(info $(shell echo "INCLUDES: $(INCLUDES)")) NOBJS = $(patsubst %,$(OBJDIR)/%,$(TMP1)) USRF = $(OBJDIR)/${CASENAME}.o @@ -95,15 +98,13 @@ L0 = $(G) -O0 L2 = $(G) -O2 L3 = $(G) -O3 -FL0 = $(L0) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) -FL2 = $(L2) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) -FL3 = $(L3) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) - -cFL0 = $(L0) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) -cFL2 = $(L2) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) -cFL3 = $(L3) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) +FL0 = $(L0) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) +FL2 = $(L2) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) +FL3 = $(L3) $(FFLAGS) $(PPS_F) -I$(CASEDIR) -I$S/core $(INCLUDES) -$(info $(shell mkdir -p $(OBJDIR))) +cFL0 = $(L0) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) +cFL2 = $(L2) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) +cFL3 = $(L3) $(CFLAGS) $(PPS_C) -I$S/core $(INCLUDES) ################################################################################ all : nek5000 @@ -134,7 +135,7 @@ nek5000: ${LIB} $(USRF) lib: ${LIB} ${LIB}: $(NOBJS) - @$(AR) cr ./${LIB} $(NOBJS) + @$(AR) cr ${LIB} $(NOBJS) @ranlib ${LIB} @if test -f ${LIB}; then \ printf "done\n"; \ diff --git a/3rd_party/nek5000/core/map2.f b/3rd_party/nek5000/core/map2.f index 4b3d856df..05532d861 100644 --- a/3rd_party/nek5000/core/map2.f +++ b/3rd_party/nek5000/core/map2.f @@ -227,10 +227,9 @@ subroutine get_vert_map(nlv) enddo neliv = j - algo = 0 ! 0 - Lanczos, 1 - MG nel = neliv call fpartMesh(eid8,vtx8,xyz,lelt,nel,nlv,nekcomm, - $ meshPartitioner,algo,loglevel,ierr) + $ meshPartitioner,0,loglevel,ierr) call err_chk(ierr,'partMesh fluid failed!$') nelv = nel @@ -281,10 +280,9 @@ subroutine get_vert_map(nlv) enddo nelit = j - algo = 0 ! 0 - Lanczos, 1 - MG nel = nelit call fpartMesh(eid8,vtx8,xyz,lelt,nel,nlv,nekcomm, - $ meshPartitioner,algo,loglevel,ierr) + $ 2,0,loglevel,ierr) call err_chk(ierr,'partMesh solid failed!$') nelt = nelv + nel @@ -302,6 +300,7 @@ subroutine get_vert_map(nlv) endif #ifdef DPROCMAP + call dProcMapClearCache() do i = 1,nelt ieg = lglel(i) if (ieg.lt.1 .or. ieg.gt.nelgt) @@ -557,7 +556,6 @@ subroutine set_proc_map() ! setup gllnid + gllel #if defined(DPROCMAP) call dProcmapInit() - dProcmapCache = .false. #endif nelB = igl_running_sum(nelt) - nelt do i = 1,nelt @@ -592,10 +590,6 @@ subroutine set_proc_map() ! get element-proc mapping call get_map() - itmp = gllnid(0) ! reset last element cache - itmp = gllel(0) ! reset last element cache - dProcmapCache = .true. - #if !defined(DPROCMAP) IEL=0 CALL IZERO(GLLEL,NELGT) diff --git a/3rd_party/nek5000/core/postpro.f b/3rd_party/nek5000/core/postpro.f index 2048455b3..41b2d401b 100644 --- a/3rd_party/nek5000/core/postpro.f +++ b/3rd_party/nek5000/core/postpro.f @@ -564,7 +564,7 @@ subroutine map2reg(ur,n,u,nel) c----------------------------------------------------------------------- subroutine map2reg_2di_e(uf,n,uc,m) ! Fine, uniform pt - real uf(n,n),uc(m,m) + real uf(n,n),uc(m,m),j,jt parameter (l=50) common /cmap2d/ j(l*l),jt(l*l),w(l*l),z(l) @@ -582,7 +582,8 @@ subroutine map2reg_2di_e(uf,n,uc,m) ! Fine, uniform pt call zuni (w,n) call gen_int_gz(j,jt,w,n,z,m) - + mo = m + no = n endif call mxm(j,n,uc,m,w ,m) @@ -593,7 +594,7 @@ subroutine map2reg_2di_e(uf,n,uc,m) ! Fine, uniform pt c----------------------------------------------------------------------- subroutine map2reg_3di_e(uf,n,uc,m) ! Fine, uniform pt - real uf(n,n,n),uc(m,m,m) + real uf(n,n,n),uc(m,m,m),j,jt parameter (l=16) common /cmap3d/ j(l*l),jt(l*l),v(l*l*l),w(l*l*l),z(l) @@ -611,7 +612,8 @@ subroutine map2reg_3di_e(uf,n,uc,m) ! Fine, uniform pt call zuni (w,n) call gen_int_gz(j,jt,w,n,z,m) - + mo = m + no = n endif mm = m*m diff --git a/3rd_party/nek5000/core/reader_re2.f b/3rd_party/nek5000/core/reader_re2.f index bf1452983..c79ba4af2 100644 --- a/3rd_party/nek5000/core/reader_re2.f +++ b/3rd_party/nek5000/core/reader_re2.f @@ -202,7 +202,7 @@ subroutine readp_re2_curve(ifbswap,ifread) re2off_b = re2off_b + nrg*4*lrs4 if (.not.ifread) return - if(nio.eq.0) write(6,*) 'reading curved sides ' + if(nio.eq.0) write(6,'(A,I10)') ' reading curved sides ', nrg nwds4r = nr*lrs4 call byte_set_view(lre2off_b,fh_re2) @@ -306,7 +306,9 @@ subroutine readp_re2_bc(cbl,bl,ifbswap,ifread) re2off_b = re2off_b + nrg*4*lrs4 if (.not.ifread) return - if(nio.eq.0) write(6,*) 'reading bc for ifld',ifield + if(nio.eq.0) write(6,'(A,I10,AI3)') + $ ' reading boundary faces ', nrg, + $ ' for ifield ', ifield nwds4r = nr*lrs4 call byte_set_view(lre2off_b,fh_re2) diff --git a/3rd_party/nek5000_gslib/src/gs.c b/3rd_party/nek5000_gslib/src/gs.c index ce797553c..b1a9aa7a7 100644 --- a/3rd_party/nek5000_gslib/src/gs.c +++ b/3rd_party/nek5000_gslib/src/gs.c @@ -1391,9 +1391,8 @@ static uint local_setup(struct gs_data *gsh, const struct array *nz) } static void gs_setup_aux(struct gs_data *gsh, const slong *id, uint n, - int unique, gs_method _method, int verbose) + int unique, gs_method method, int verbose) { - gs_method method = gs_pairwise; static setup_fun *const remote_setup[] = { &auto_setup, &pw_setup, &cr_setup, &allreduce_setup }; diff --git a/3rd_party/nek5000_parRSB/examples/ethier.re2 b/3rd_party/nek5000_parRSB/examples/ethier.re2 new file mode 100644 index 000000000..5cd214ffa Binary files /dev/null and b/3rd_party/nek5000_parRSB/examples/ethier.re2 differ diff --git a/3rd_party/nek5000_parRSB/examples/gencon.c b/3rd_party/nek5000_parRSB/examples/gencon.c new file mode 100644 index 000000000..44c6afc65 --- /dev/null +++ b/3rd_party/nek5000_parRSB/examples/gencon.c @@ -0,0 +1,123 @@ +/* + * Generate connectivity (.co2) from Nek5000 mesh (.re2) file. + */ +#include + +static int test_parcon(unsigned int neltp, long long *vlp, char *name, + MPI_Comm comm) { + unsigned int nelt; + int nv; + long long *vls = NULL; + int err = parrsb_read_mesh(&nelt, &nv, &vls, NULL, NULL, NULL, name, + MPI_COMM_WORLD, 2); + assert(neltp == nelt); + + uint size = nelt * nv; + slong *minp = tcalloc(slong, size); + slong *maxp = tcalloc(slong, size); + + buffer buf; + buffer_init(&buf, 1024); + + struct comm c; + comm_init(&c, comm); + + struct gs_data *gsh = gs_setup(vls, size, &c, 0, gs_pairwise, 0); + + uint i; + for (i = 0; i < size; i++) + minp[i] = maxp[i] = vlp[i]; + + gs(minp, gs_long, gs_min, 0, gsh, &buf); + gs(maxp, gs_long, gs_max, 0, gsh, &buf); + + for (i = 0; i < size; i++) + if (minp[i] != maxp[i]) { + err = 1; + break; + } + + gs_free(gsh); + gsh = gs_setup(vlp, size, &c, 0, gs_pairwise, 0); + + for (i = 0; i < size; i++) + minp[i] = maxp[i] = vls[i]; + + gs(minp, gs_long, gs_min, 0, gsh, &buf); + gs(maxp, gs_long, gs_max, 0, gsh, &buf); + + for (i = 0; i < size; i++) + if (minp[i] != maxp[i]) { + err = 1; + break; + } + + int np; + MPI_Comm_size(MPI_COMM_WORLD, &np); + if (np == 1) { + for (i = 0; i < size; i++) + if (vls[i] != vlp[i]) { + err = 1; + break; + } + } + + gs_free(gsh); + comm_free(&c); + buffer_free(&buf); + + if (minp != NULL) + free(minp); + if (maxp != NULL) + free(maxp); + if (vls != NULL) + free(vls); + + return err; +} + +int main(int argc, char *argv[]) { + MPI_Init(&argc, &argv); + + parrsb_input *in = parrsb_parse_input(argc, argv); + + /* Read the geometry from the .re2 file */ + unsigned int nelt, nbcs; + double *coord = NULL; + long long *bcs = NULL; + int nv; + int err = parrsb_read_mesh(&nelt, &nv, NULL, &coord, &nbcs, &bcs, in->mesh, + MPI_COMM_WORLD, 1); + parrsb_check_error(err, MPI_COMM_WORLD); + + /* Find connectivity */ + long long *vl = (long long *)calloc(nelt * nv, sizeof(long long)); + int ndim = nv == 8 ? 3 : 2; + err |= parrsb_find_conn(vl, coord, nelt, ndim, bcs, nbcs, in->tol, + MPI_COMM_WORLD, 0); + parrsb_check_error(err, MPI_COMM_WORLD); + + /* Test if the relevant env. variable is set */ + if (in->test == 1) + err |= test_parcon(nelt, vl, in->mesh, MPI_COMM_WORLD); + parrsb_check_error(err, MPI_COMM_WORLD); + + /* Write connectivity to .co2 file */ + if (in->dump == 1) + err |= parrsb_dump_con(vl, nelt, nv, in->mesh, MPI_COMM_WORLD); + parrsb_check_error(err, MPI_COMM_WORLD); + + /* Free resources */ + if (vl != NULL) + free(vl); + if (coord != NULL) + free(coord); + if (bcs != NULL) + free(bcs); + if (in != NULL) + free(in); + + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/nek5000_parRSB/examples/genmap.c b/3rd_party/nek5000_parRSB/examples/genmap.c new file mode 100644 index 000000000..94c89e60b --- /dev/null +++ b/3rd_party/nek5000_parRSB/examples/genmap.c @@ -0,0 +1,93 @@ +/* + * Generate partitions (.ma2) from Nek5000's mesh (.re2) file. + */ +#include + +static int test_parrsb() {} + +int main(int argc, char *argv[]) { + MPI_Init(&argc, &argv); + + int id; + MPI_Comm_rank(MPI_COMM_WORLD, &id); + + parrsb_input *in = parrsb_parse_input(argc, argv); + + int active = 0; + if (id < in->nactive) + active = 1; + + MPI_Comm comm; + MPI_Comm_split(MPI_COMM_WORLD, active, id, &comm); + + /* Read the geometry from the .re2 file */ + unsigned int nelt, nbcs; + double *coord = NULL; + long long *bcs = NULL; + int nv; + int err = 0; + if (active == 1) + err = parrsb_read_mesh(&nelt, &nv, NULL, &coord, &nbcs, &bcs, in->mesh, + comm, 1); + parrsb_check_error(err, comm); + + /* Find connectivity */ + long long *vl = (long long *)calloc(nelt * nv, sizeof(long long)); + int ndim = nv == 8 ? 3 : 2; + if (active == 1) + err |= parrsb_find_conn(vl, coord, nelt, ndim, bcs, nbcs, in->tol, comm, 0); + parrsb_check_error(err, comm); + + int nss[6]; + if (active == 1 && in->nactive > 1) { + if (id == 0) + printf("Partition statistics before RSB:\n"); + parrsb_print_part_stat(vl, nelt, nv, comm); + parrsb_get_part_stat(NULL, NULL, &nss[0], NULL, vl, nelt, nv, comm); + } + + /* Partition the mesh */ + parrsb_options options = parrsb_default_options; + int *part = (int *)calloc(nelt, sizeof(int)); + if (active == 1) + err |= parrsb_part_mesh(part, NULL, vl, coord, nelt, nv, options, comm); + parrsb_check_error(err, comm); + + /* Redistribute data */ + if (active == 1) + err |= parrsb_distribute_elements(&nelt, &vl, &coord, part, nv, comm); + parrsb_check_error(err, comm); + + if (active == 1 && in->nactive > 1) { + if (id == 0) + printf("Partition statistics after RSB:\n"); + parrsb_print_part_stat(vl, nelt, nv, comm); + parrsb_get_part_stat(NULL, NULL, &nss[3], NULL, vl, nelt, nv, comm); + } + + if (active == 1 && in->test && in->nactive > 1) + err |= nss[2] < nss[5]; + parrsb_check_error(err, comm); + + /* Write partition to .ma2 file */ + if (active == 1 && in->dump == 1) + err |= parrsb_dump_map(in->mesh, nelt, nv, vl, part, comm); + parrsb_check_error(err, comm); + + /* Free resources */ + if (part != NULL) + free(part); + if (vl != NULL) + free(vl); + if (coord != NULL) + free(coord); + if (bcs != NULL) + free(bcs); + if (in != NULL) + free(in); + + MPI_Comm_free(&comm); + MPI_Finalize(); + + return 0; +} diff --git a/3rd_party/occa/examples/cpp/06_shared_memory/reductionWithAtomics.okl b/3rd_party/occa/examples/cpp/06_shared_memory/reductionWithAtomics.okl new file mode 100644 index 000000000..086f7eb94 --- /dev/null +++ b/3rd_party/occa/examples/cpp/06_shared_memory/reductionWithAtomics.okl @@ -0,0 +1,8 @@ + +@kernel void reductionWithAtomics(const int entries, + const float *vec, + float *sum) { + for (int i = 0; i < entries; ++i; @tile(16, @outer, @inner)) { + @atomic *sum += vec[i]; + } +} diff --git a/3rd_party/occa/examples/cpp/06_shared_memory/reduction.okl b/3rd_party/occa/examples/cpp/06_shared_memory/reductionWithSharedMemory.okl similarity index 78% rename from 3rd_party/occa/examples/cpp/06_shared_memory/reduction.okl rename to 3rd_party/occa/examples/cpp/06_shared_memory/reductionWithSharedMemory.okl index 184650c59..37758a0c3 100644 --- a/3rd_party/occa/examples/cpp/06_shared_memory/reduction.okl +++ b/3rd_party/occa/examples/cpp/06_shared_memory/reductionWithSharedMemory.okl @@ -1,3 +1,4 @@ + @kernel void reductionWithSharedMemory(const int entries, const float *vec, float *blockSum) { @@ -28,11 +29,3 @@ } } } - -@kernel void reductionWithAtomics(const int entries, - const float *vec, - float *sum) { - for (int i = 0; i < entries; ++i; @tile(16, @outer, @inner)) { - @atomic *sum += vec[i]; - } -} diff --git a/3rd_party/occa/src/occa/internal/lang/modes/cuda.cpp b/3rd_party/occa/src/occa/internal/lang/modes/cuda.cpp index ab25286bf..2bf97080b 100644 --- a/3rd_party/occa/src/occa/internal/lang/modes/cuda.cpp +++ b/3rd_party/occa/src/occa/internal/lang/modes/cuda.cpp @@ -129,11 +129,13 @@ namespace occa { // Set kernel qualifiers vartype_t &vartype = kernelSmnt.function().returnType; + const bool addLaunchBounds = !settings.get("okl/no_launch_bounds", false); dim kernelInnerDims = innerDims(kernelSmnt); if (!success) return; + int kernelInnerDim = kernelInnerDims[0]; for(int i=1; i < kernelInnerDims.dims; i++) kernelInnerDim *= kernelInnerDims[i]; - if(kernelInnerDim) { + if(kernelInnerDim && addLaunchBounds) { const std::string s = "__launch_bounds__(" + std::to_string(kernelInnerDim) + ")"; qualifier_t *boundQualifier = new qualifier_t(s, qualifierType::custom); vartype.qualifiers.addFirst(vartype.origin(), diff --git a/3rd_party/occa/src/occa/internal/lang/modes/opencl.cpp b/3rd_party/occa/src/occa/internal/lang/modes/opencl.cpp index b137b7d9a..332759a26 100644 --- a/3rd_party/occa/src/occa/internal/lang/modes/opencl.cpp +++ b/3rd_party/occa/src/occa/internal/lang/modes/opencl.cpp @@ -281,12 +281,13 @@ namespace occa { migrateLocalDecls((functionDeclStatement&) *smnt); if (!success) return; + const bool addLaunchBounds = !settings.get("okl/no_launch_bounds", false); dim kernelInnerDims = innerDims((functionDeclStatement&) *smnt); if (!success) return; int kernelInnerDim = kernelInnerDims[0]; for(int i=1; i < kernelInnerDims.dims; i++) kernelInnerDim *= kernelInnerDims[i]; - if(kernelInnerDim) { + if(kernelInnerDim && addLaunchBounds) { std::string s = "__attribute__((work_group_size_hint("; s += std::to_string(kernelInnerDims[0]); for(int i=1; i < 3; i++) { diff --git a/3rd_party/occa/src/occa/internal/modes/cuda/device.cpp b/3rd_party/occa/src/occa/internal/modes/cuda/device.cpp index e7afb39a0..9df0c8289 100644 --- a/3rd_party/occa/src/occa/internal/modes/cuda/device.cpp +++ b/3rd_party/occa/src/occa/internal/modes/cuda/device.cpp @@ -281,8 +281,11 @@ namespace occa { sys::addCompilerLibraryFlags(compilerFlags); } - //---[ PTX Check Command ]-------- std::stringstream command; + +#if 0 + + //---[ PTX Check Command ]-------- if (allProps.has("compiler_env_script")) { command << allProps["compiler_env_script"] << " && "; } @@ -314,13 +317,13 @@ namespace occa { #endif io::sync(ptxBinaryFilename); - //================================ +#endif //---[ Compiling Command ]-------- command.str(""); command << allProps["compiler"] << ' ' << compilerFlags - << " -ptx" + << " -fatbin -Xptxas -v" #if (OCCA_OS == OCCA_WINDOWS_OS) << " -D OCCA_OS=OCCA_WINDOWS_OS -D _MSC_VER=1800" #endif diff --git a/CMakeLists.txt b/CMakeLists.txt index 571d58c72..789adcd5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,19 @@ -cmake_minimum_required(VERSION 3.11) -project(NekRS LANGUAGES C CXX Fortran VERSION 21.1) +cmake_minimum_required(VERSION 3.13) +project(NekRS LANGUAGES C CXX Fortran VERSION 22.0.0) if(${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) MESSAGE(FATAL_ERROR "Error: In-place builds are not supported. Please create a separate build directory") endif(${CMAKE_BINARY_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}) +include(CheckCCompilerFlag) include(FortranCInterface) include(ExternalProject) include(FetchContent) +include(config/utils.cmake) #set_property(GLOBAL PROPERTY RULE_MESSAGES OFF) set(CMAKE_INSTALL_MESSAGE NEVER) -set(CMAKE_VERBOSE_MAKEFILE OFF) +#set(CMAKE_VERBOSE_MAKEFILE ON) set(FETCHCONTENT_QUIET on) set(FETCHCONTENT_UPDATES_DISCONNECTED on) @@ -20,11 +22,6 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) set(CMAKE_SKIP_BUILD_RPATH FALSE) #set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/occa/lib") -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/elliptic") -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/amgSolver/parAlmond") -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/libparanumal") -set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/gatherScatter") ############################################################################### # Configure Options @@ -32,7 +29,6 @@ set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${CMAKE_INSTALL_PREFIX}/gatherSc set(NEK5000_PPLIST "PARRSB DPROCMAP" CACHE STRING "Preprocessor macros for Nek5000") - set(NEKINTERFACEDIR "${CMAKE_CURRENT_SOURCE_DIR}/src/nekInterface/" CACHE PATH "Directory for Nek5000 inteface source files") @@ -43,42 +39,14 @@ set(OCCA_CUDA_COMPILER_FLAGS "-O3 --fmad=true" CACHE STRING "CUDA flags for OCCA set(OCCA_HIP_COMPILER_FLAGS "-O3 -ffp-contract=fast" CACHE STRING "HIP flags for OCCA JIT compile") set(OCCA_OPENCL_COMPILER_FLAGS "-cl-std=CL2.0 -cl-mad-enable -cl-no-signed-zeros" CACHE STRING "OPENCL flags for OCCA JIT compile") -# From OCCA's CMakeLists. All default to ON, but are verified in config tests -# - ENABLE_OPENMP -# - ENABLE_CUDA -# - ENABLE_OPENCL -# - ENABLE_HIP -# - ENABLE_METAL -# - ENABLE_MPI - set(ENABLE_METAL OFF CACHE BOOL "Enable OCCA Metal support") set(ENABLE_AMGX OFF CACHE BOOL "Enable NVIDIA AMGX support") -set(GPU_MPI "0" CACHE STRING "GPU aware MPI") +set(GPU_MPI "1" CACHE STRING "GPU aware MPI") ############################################################################### # Check compiler/MPI vendor, version and Fortran/C compatibility # ############################################################################### -macro(check_compiler_id compiler) - if(NOT "${compiler}" STREQUAL "GNU") - message(FATAL_ERROR "NekRS only supports GNU compilers") - endif() -endmacro() - -macro(check_compiler_version compiler_version) - if(${compiler_version} LESS 6.2) - message(FATAL_ERROR "NekRS only supports GNU compiler versions >= 6.2") - endif() -endmacro() - -check_compiler_id("${CMAKE_CXX_COMPILER_ID}") -check_compiler_id("${CMAKE_C_COMPILER_ID}") -check_compiler_id("${CMAKE_Fortran_COMPILER_ID}") - -check_compiler_version("${CMAKE_CXX_COMPILER_VERSION}") -check_compiler_version("${CMAKE_C_COMPILER_VERSION}") -check_compiler_version("${CMAKE_Fortran_COMPILER_VERSION}") - ## Sanity checks for MPI set(MPI_C_COMPILER ${CMAKE_C_COMPILER}) set(MPI_CXX_COMPILER ${CMAKE_CXX_COMPILER}) @@ -106,11 +74,13 @@ set (CMAKE_CXX_STANDARD 14) set(CMAKE_POSITION_INDEPENDENT_CODE on) -find_package(OpenMP) -if (OPENMP_FOUND) +find_package(OpenMP REQUIRED) +if(OpenMP_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + if(OpenMP_CXX_INCLUDE_DIR) + include_directories(SYSTEM ${OpenMP_CXX_INCLUDE_DIRS}) + endif() endif() # For CMake targets, CMAKE__FLAGS, CMAKE__FLAGS_, and @@ -148,6 +118,17 @@ if ("${GIT_COMMIT_HASH}" STREQUAL "") set(GIT_COMMIT_HASH "no sha") endif() +# avoid hijacked symbols with the same name +set(BSYMBOLIC_FLAG " -Wl,-Bsymbolic -Wl,-Bsymbolic-functions") +list(APPEND CMAKE_REQUIRED_LIBRARIES ${BSYMBOLIC_FLAG}) +check_c_compiler_flag("" Allowed_LD_Flag_BSYMBOLIC) +list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES ${BSYMBOLIC_FLAG}) +if(NOT Allowed_LD_Flag_BSYMBOLIC OR APPLE) + set(BSYMBOLIC_FLAG "") +else() +# set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${BSYMBOLIC_FLAG}") +endif() + ############################################################################### # Build OCCA # ############################################################################### @@ -174,45 +155,59 @@ include(config/hypre.cmake) # Build Nek5000 dependencies # ############################################################################### +macro(check_fcompiler_id compiler) + if(NOT "${compiler}" STREQUAL "GNU") + message(FATAL_ERROR "NekRS only supports gfortran") + endif() +endmacro() + +check_fcompiler_id("${CMAKE_Fortran_COMPILER_ID}") include(config/nek5000.cmake) ############################################################################### -# Definitions for libP, gslib, and blaslapack +# Build NekRS ############################################################################### - +include(config/bench.cmake) include(config/mesh.cmake) include(config/elliptic.cmake) include(config/gslib.cmake) include(config/blaslapack.cmake) -############################################################################### -# Build NekRS -############################################################################### - set(SRC src/lib/nekrs.cpp src/io/writeFld.cpp src/io/utils.cpp - src/core/utils/inipp.cpp - src/core/utils/mysort.cpp - src/core/utils/parallelSort.cpp - src/core/utils/tinyexpr.c - src/core/setupAide.cpp - src/core/cfl.cpp + src/utils/inipp.cpp + src/utils/mysort.cpp + src/utils/parallelSort.cpp + src/utils/tinyexpr.c + src/utils/setupAide.cpp + src/core/numberActiveFields.cpp + src/core/printHeader.cpp + src/timeStepper/cfl.cpp src/regularization/filter.cpp src/regularization/avm.cpp - src/core/bcMap.cpp + src/bdry/bcMap.cpp src/core/compileKernels.cpp - src/core/setup.cpp + src/setup/setup.cpp + src/core/alignment.cpp + src/core/registerNrsKernels.cpp + src/bdry/createEToBV.cpp + src/bdry/applyDirichlet.cpp src/timeStepper/timeStepper.cpp - src/lns/tombo.cpp - src/lns/constantFlowRate.cpp - src/cds/cds.cpp - src/core/parReader.cpp + src/timeStepper/subCycling.cpp + src/navierStokes/tombo.cpp + src/navierStokes/constantFlowRate.cpp + src/cds/cdsSolve.cpp + src/cds/registerCdsKernels.cpp + src/io/parReader.cpp + src/io/re2Reader.cpp src/core/configReader.cpp src/core/timer.cpp src/core/platform.cpp + src/core/flopCounter.cpp + src/core/kernelRequestManager.cpp src/core/device.cpp src/linAlg/linAlg.cpp src/linAlg/matrixConditionNumber.cpp @@ -220,12 +215,16 @@ set(SRC src/linAlg/matrixEig.cpp src/linAlg/matrixTranspose.cpp src/linAlg/matrixRightSolve.cpp - src/plugins/avg.cpp + src/linAlg/registerLinAlgKernels.cpp + src/plugins/tavg.cpp src/plugins/velRecycling.cpp src/plugins/RANSktau.cpp src/plugins/lowMach.cpp src/udf/udf.cpp + src/udf/compileUDFKernels.cpp src/nekInterface/nekInterfaceAdapter.cpp + src/postProcessing/planarAvg.cpp + ${BENCH_SOURCES} ${MESH_SOURCES} ${PARALMOND_SOURCES} ${ELLIPTIC_SOURCES} @@ -234,7 +233,7 @@ set(SRC ) set_property( - SOURCE src/lib/nekrs.cpp + SOURCE src/core/printHeader.cpp APPEND PROPERTY COMPILE_DEFINITIONS GITCOMMITHASH="${GIT_COMMIT_HASH}" NEKRS_VERSION=${PROJECT_VERSION_MAJOR} @@ -243,8 +242,10 @@ set_property( ) add_library(nekrs-lib SHARED ${SRC}) + +add_dependencies(nekrs-lib HYPRE_BUILD) set_target_properties(nekrs-lib PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs) -target_link_libraries(nekrs-lib PUBLIC libocca PRIVATE HYPRE gs ${GSLIB}) +target_link_libraries(nekrs-lib PUBLIC libocca OpenMP::OpenMP_CXX PRIVATE gs ${GSLIB}) target_compile_definitions(nekrs-lib PUBLIC -DDOGS="${CMAKE_INSTALL_PREFIX}/gatherScatter" @@ -253,16 +254,25 @@ target_compile_definitions(nekrs-lib PUBLIC target_include_directories(nekrs-lib PUBLIC ${CMAKE_CURRENT_BINARY_DIR} + src + src/setup + src/bdry src/core - src/core/utils + src/utils src/lib src/io src/udf src/regularization src/linAlg src/timeStepper - src/lns + src/navierStokes src/cds + src/postProcessing + ${BENCH_SOURCE_DIR} + ${BENCH_SOURCE_DIR}/core + ${BENCH_SOURCE_DIR}/fdm + ${BENCH_SOURCE_DIR}/axHelm + ${BENCH_SOURCE_DIR}/advsub ${MESH_SOURCE_DIR} ${NEKINTERFACEDIR} ${OGS_SOURCE_DIR}/include @@ -272,26 +282,31 @@ target_include_directories(nekrs-lib ${ELLIPTIC_SOURCE_DIR}/amgSolver/hypre ${ELLIPTIC_SOURCE_DIR}/amgSolver/amgx ${PARALMOND_SOURCE_DIR} - ${HYPRE_SOURCE_DIR}/src - ${HYPRE_SOURCE_DIR}/src/utilities - ${HYPRE_SOURCE_DIR}/src/seq_mv - ${HYPRE_SOURCE_DIR}/src/parcsr_mv - ${HYPRE_SOURCE_DIR}/src/parcsr_ls - ${HYPRE_SOURCE_DIR}/src/IJ_mv - ${HYPRE_SOURCE_DIR}/src/multivector - ${HYPRE_SOURCE_DIR}/src/krylov - ${HYPRE_BINARY_DIR}) + ${HYPRE_INCLUDE_DIR} +) if(ENABLE_AMGX AND ENABLE_CUDA) target_compile_definitions(nekrs-lib PUBLIC -DENABLE_AMGX) - target_link_libraries(nekrs-lib PUBLIC amgxsh) - target_include_directories(nekrs-lib PUBLIC 3rd_party/AMGX/base/include) + target_link_libraries(nekrs-lib PUBLIC amgxsh amgx) + target_include_directories(nekrs-lib PUBLIC 3rd_party/AMGX/base/include 3rd_party/AMGX/core/include) endif() add_executable(nekrs-bin src/main.cpp) target_include_directories(nekrs-bin PRIVATE src/lib) set_target_properties(nekrs-bin PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs) -target_link_libraries(nekrs-bin nekrs-lib) +target_link_libraries(nekrs-bin PRIVATE nekrs-lib) + +add_executable(axhelm-bin src/bench/axHelm/main.cpp) +set_target_properties(axhelm-bin PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs-bench-axhelm) +target_link_libraries(axhelm-bin PRIVATE nekrs-lib) + +add_executable(advsub-bin src/bench/advsub/main.cpp) +set_target_properties(advsub-bin PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs-bench-advsub) +target_link_libraries(advsub-bin PRIVATE nekrs-lib) + +add_executable(fdm-bin src/bench/fdm/main.cpp) +set_target_properties(fdm-bin PROPERTIES LINKER_LANGUAGE CXX OUTPUT_NAME nekrs-bench-fdm) +target_link_libraries(fdm-bin PRIVATE nekrs-lib) if(ENABLE_CUDA) set(BACKEND_DEFAULT "CUDA" CACHE STRING "Default occa mode") @@ -305,27 +320,8 @@ endif() ### Install # ################################################################################# -# Generate nekrs.conf # -set(MPI_COMPILER_CMDLINE "") -_MPI_check_compiler("CXX" "-show" MPI_COMPILER_CMDLINE MPI_COMPILER_RETURN) -if (NOT MPI_COMPILER_RETURN EQUAL 0) - _MPI_check_compiler("CXX" "-showme" MPI_COMPILER_CMDLINE MPI_COMPILER_RETURN) -endif() -if (NOT MPI_COMPILER_RETURN EQUAL 0) - _MPI_check_compiler("CXX" "-craype-verbose" MPI_COMPILER_CMDLINE MPI_COMPILER_RETURN) -endif() -if (MPI_COMPILER_RETURN EQUAL 0 AND MPI_COMPILER_CMDLINE) - separate_arguments(MPI_COMPILER_CMDLINE) - list(GET MPI_COMPILER_CMDLINE 0 _MPI_UNDERLYING_COMPILER) - find_program(MPI_UNDERLYING_COMPILER ${_MPI_UNDERLYING_COMPILER}) - if(EXISTS ${MPI_UNDERLYING_COMPILER}) - message(NOTICE "-- Found MPI_UNDERLYING_COMPILER: ${MPI_UNDERLYING_COMPILER}") - else() - message(FATAL_ERROR "Cannot find ${MPI_UNDERLYING_COMPILER}") - endif() -else() - message(FATAL_ERROR "Cannot identify underlying compiler used by ${CMAKE_CXX_COMPILER}") -endif() +# Generate nekrs.conf +__MPI_underlying_compiler("CXX" MPI_UNDERLYING_COMPILER) configure_file( ${CMAKE_CURRENT_LIST_DIR}/config/nekrs.conf.in @@ -333,7 +329,7 @@ configure_file( @ONLY) #install nekRS -install(TARGETS nekrs-lib nekrs-bin +install(TARGETS nekrs-lib nekrs-bin axhelm-bin advsub-bin fdm-bin RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib @@ -342,7 +338,7 @@ install(TARGETS nekrs-lib nekrs-bin # Trailing slash prevents parent directory from being copied install(DIRECTORY scripts/ DESTINATION bin FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) -install(DIRECTORY okl/ DESTINATION okl FILES_MATCHING REGEX "\.okl$|\.c$") +install(DIRECTORY okl/ DESTINATION okl FILES_MATCHING REGEX "\.okl$|\.c|\.h$") install(DIRECTORY src/ DESTINATION include FILES_MATCHING REGEX "\.hpp$|\.h$|\.tpp$") install(FILES src/udf/CMakeLists.txt DESTINATION udf) install(DIRECTORY src/nekInterface/ DESTINATION nekInterface REGEX "\.hpp$|\.cpp$" EXCLUDE) @@ -353,7 +349,7 @@ configure_file(${CMAKE_CURRENT_LIST_DIR}/config/install_examples.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/install_examples.cmake @ONLY) install(SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/install_examples.cmake) -install(FILES src/core/parHelp.txt DESTINATION include) +install(FILES src/io/parHelp.txt DESTINATION include) message(NOTICE "") message(NOTICE "----------------- Summary -----------------") diff --git a/LICENSE b/LICENSE index 1e5e18767..aac30d4e5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2019-2021, UCHICAGO ARGONNE, LLC. +Copyright (c) 2019-2022, UCHICAGO ARGONNE, LLC. The UChicago Argonne, LLC as Operator of Argonne National Laboratory holds copyright in the Software. The copyright holder diff --git a/README.md b/README.md index 585701e3f..6cd6f53a6 100644 --- a/README.md +++ b/README.md @@ -5,57 +5,57 @@ [![Build Status](https://travis-ci.com/Nek5000/nekRS.svg?branch=master)](https://travis-ci.com/Nek5000/nekRS) [![License](https://img.shields.io/badge/License-BSD%203--Clause-orange.svg)](https://opensource.org/licenses/BSD-3-Clause) -**nekRS** is an open-source Navier Stokes solver based on the spectral element method targeting classical processors and hardware accelerators like GPUs. The code started as an early fork of [libParanumal](https://github.com/paranumal/libparanumal) tailored to our needs. For API portable programming [OCCA](https://github.com/libocca/occa) is used. +**nekRS** is an open-source Navier Stokes solver based on the spectral element method targeting classical processors and accelerators like GPUs. The code started as an fork of [libParanumal](https://github.com/paranumal/libparanumal) in 2019. For API portable programming [OCCA](https://github.com/libocca/occa) is used. Capabilities: * Incompressible and low Mach-number Navier-Stokes + scalar transport * CG-SEM using curvilinear conformal hexaheadral elements -* Adaptive 3rd/2nd order semi-implicit time integration + operator integration factor splitting -* MPI+X hybrid parallelism supporting CUDA, HIP, OPENCL and CPU -* Interface to [Nek5000](https://github.com/Nek5000/Nek5000) +* Variable time step 2nd/3rd order semi-implicit time integration +* MPI+X hybrid parallelism supporting CPU, CUDA, HIP, and OPENCL +* Various boundary conditions * Conjugate fluid-solid heat transfer * LES and RANS turbulence models -* ALE formulation for moving mesh support +* Arbitrary-Lagrangian-Eulerian moving mesh * VisIt & Paraview support for data analysis and visualization +* Interface to [Nek5000](https://github.com/Nek5000/Nek5000) ## Build Instructions Requirements: -* POSIX compilant OS -* GNU compiler collection version 6.2 or later +* Linux, Mac OS X (Microsoft Windows is not supported) +* C++14/C99 compatible compilers + GNU Fortran * MPI-3.1 or later -* CMake version 3.11 or later -* bash +* CMake version 3.13 (AMGx requires >=3.18) or later -Download the latest release tarball (recommended) +Download the latest release available under ```sh -wget https://github.com/Nek5000/nekRS/archive/refs/tags/v21.1.tar.gz -tar -zxf v21.1.tar.gz +https://github.com/Nek5000/nekRS/archive/refs/tags/v22.0.tar.gz ``` or clone our GitHub repository: ```sh -git clone https://github.com/Nek5000/nekRS.git +https://github.com/Nek5000/nekRS.git ``` -The git master branch always points to the latest release, the next branch -is a special development branch that contains all the new features that will be released at the next release cycle. +The `master` branch always points to the latest stable release while `next` +provides an early preview of the next upcoming release (do not use in a production environment). # -To build and install the code just run: +To build and install the code run: ```sh ./nrsconfig cmake --build ./build --target install -j8 ``` +Please delete instead of overwriting your old build and install directory before updating. Build settings can be customized by environment variables. After installation you may want to adjust `$NEKRS_HOME/nekrs.conf` to your environment. ## Setting the Enviroment -Assuming you run bash and your install directory is $HOME/.local/nekrs, +Assuming you run `bash` and your install directory is $HOME/.local/nekrs, add the following line to your $HOME/.bash_profile: ```sh @@ -66,29 +66,28 @@ then type `source $HOME/.bash_profile` in the current terminal window. ## Run the Code -We try hard not to break userland but the code is evolving quickly so things might change from one version to another without being backward compatible (see release notes). +We try hard not to break userland but the code is evolving quickly so things might change from one version to another without being backward compatible. Please consult `RELEASE.md` before using the code. ```sh cd $NEKRS_HOME/examples/turbPipePeriodic -nrsmpi turbPipe 2 # run on two MPI ranks +mpirun -np 2 nekrs --setup turbPipe.par ``` -Note, `nrsmpi/nrsbmpi` are just basic launch scripts. Please check `bin` for more examples. +For convenience we provide various launch scripts in the `bin` directory. ## Documentation -For documentation, see our [readthedocs page](https://nekrs.readthedocs.io/en/latest/). +For documentation, see our [readthedocs page](https://nekrs.readthedocs.io/en/latest/). For now it's just a dummy. We hope to improve documentation to make it more useable for new users. ## Discussion Group Please visit [GitHub Discussions](https://github.com/Nek5000/nekRS/discussions). Here we help, find solutions, share ideas, and follow discussions. -## Reporting Bugs -nekRS is hosted on GitHub and all bugs are reported and tracked through the [Issues](https://github.com/Nek5000/nekRS/issues) feature. If you are having trouble installing the code or getting your model to run properly, you should first vist our discussion group. - ## Contributing -Our project is hosted on [GitHub](https://github.com/Nek5000/nekRS) and everbody is welcome to become a part of it. -For more details see CONTRIBUTING.md. +Our project is hosted on [GitHub](https://github.com/Nek5000/nekRS). To learn how to contribute, see `CONTRIBUTING.md`. + +## Reporting Bugs +All bugs are reported and tracked through [Issues](https://github.com/Nek5000/nekRS/issues). If you are having trouble installing the code or getting your model to run properly, you should first vist our discussion group. ## License -nekRS is released under the BSD 3-clause license (see LICENSE file). +nekRS is released under the BSD 3-clause license (see `LICENSE` file). All new contributions must be made under the BSD 3-clause license. ## Acknowledgment diff --git a/RELEASE.md b/RELEASE.md index 22b81af25..6beffde74 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,46 @@ +# Release v22.0 + +## What is new? + +* Multi-session (uncoupled) support +* Support unaligned symmetry boundary condition +* Support (unaligned) traction boundary condition +* Better performance on AMD MI-GPUs +* FLOP counters +* Various bug fixes + +## Good to know + +* OpenCL support is now disabled by default + +## Breaking Changes + +* [udf] Rename `udfBuildKernel` => `oudfBuildKernel` +* [par] Separate details of coarse grid discretization from coarse grid solver + e.g., `coarseSolver = SEMFEM+AmgX` is replaced by + `coarseSolver = AmgX` and `coarseGridDiscretization = SEMFEM` +* [par] Remove `preconditioner=semg` and `preconditioner=semg_amg` +* [udf] Rename plug-in name `avg` => `tavg` +* [udf] Rename `udf.converged` => `udf.timeStepConverged` +* [nrsconfig] Rename env-var `AMGX_ENABLE` => `ENABLE_AMGX` + +## Known Bugs / Restrictions + +* Mesh solver does not support CHT and unaligned sym/shl BCs +* [729](https://github.com/Nek5000/Nek5000/issues/759) +* [300](https://github.com/Nek5000/nekRS/issues/300) +* [258](https://github.com/Nek5000/nekRS/issues/258) + +## Thanks to our Contributors + +@tcew, @kris-rowe, @aprilnovak + +We are grateful to all who added new features, filed issues or helped resolve them, +asked and answered questions, and were part of inspiring discussions. + +A special shout out to Tim Warburton at VT for tuning some critical kernels. + + # Release v21.1 ## What is new? diff --git a/config/bench.cmake b/config/bench.cmake new file mode 100644 index 000000000..6f9ad974b --- /dev/null +++ b/config/bench.cmake @@ -0,0 +1,8 @@ +set(BENCH_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/bench) + +set(BENCH_SOURCES + ${BENCH_SOURCE_DIR}/fdm/benchmarkFDM.cpp + ${BENCH_SOURCE_DIR}/axHelm/benchmarkAx.cpp + ${BENCH_SOURCE_DIR}/advsub/benchmarkAdvsub.cpp + ${BENCH_SOURCE_DIR}/core/kernelBenchmarker.cpp +) \ No newline at end of file diff --git a/config/elliptic.cmake b/config/elliptic.cmake index b14d61a90..744e6d81e 100644 --- a/config/elliptic.cmake +++ b/config/elliptic.cmake @@ -4,15 +4,18 @@ set(ELLIPTIC_SOURCES ${ELLIPTIC_SOURCE_DIR}/linearSolver/PCG.cpp ${ELLIPTIC_SOURCE_DIR}/linearSolver/PGMRES.cpp ${ELLIPTIC_SOURCE_DIR}/amgSolver/amgx/amgx.c + ${ELLIPTIC_SOURCE_DIR}/ellipticApplyMask.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildSEMFEM.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildContinuous.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildContinuousGalerkin.cpp - ${ELLIPTIC_SOURCE_DIR}/ellipticJacobi.cpp - ${ELLIPTIC_SOURCE_DIR}/ellipticKernelInfo.cpp + ${ELLIPTIC_SOURCE_DIR}/ellipticUpdateJacobi.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticSEMFEM.cpp + ${ELLIPTIC_SOURCE_DIR}/registerEllipticKernels.cpp + ${ELLIPTIC_SOURCE_DIR}/registerEllipticPreconditionerKernels.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildPreconditionerKernels.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildMultigridLevelFine.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticBuildMultigridLevel.cpp + ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridUpdateLambda.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridLevel.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridLevelSetup.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticMultiGridSchwarz.cpp @@ -20,9 +23,10 @@ set(ELLIPTIC_SOURCES ${ELLIPTIC_SOURCE_DIR}/ellipticOperator.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticPreconditioner.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticPreconditionerSetup.cpp - ${ELLIPTIC_SOURCE_DIR}/ellipticResidualProjection.cpp + ${ELLIPTIC_SOURCE_DIR}/ellipticSolutionProjection.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticSolve.cpp - ${ELLIPTIC_SOURCE_DIR}/ellipticSolveSetup.cpp + ${ELLIPTIC_SOURCE_DIR}/ellipticOgs.cpp + ${ELLIPTIC_SOURCE_DIR}/ellipticSetup.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticUpdatePCG.cpp ${ELLIPTIC_SOURCE_DIR}/ellipticZeroMean.cpp) diff --git a/config/hypre.cmake b/config/hypre.cmake index 24abde66e..7dc6c5df5 100644 --- a/config/hypre.cmake +++ b/config/hypre.cmake @@ -1,16 +1,28 @@ set(HYPRE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party/hypre) -# * These two variables are significant to HYPRE's CMakeLists, not our own -# HYPRE's CMakeLists leak some variables into parent project, and this is a workaround -set(HYPRE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} CACHE PATH "" FORCE) -set(HYPRE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "" FORCE) - -set(HYPRE_ENABLE_SINGLE OFF CACHE BOOL "" FORCE) -set(HYPRE_ENABLE_MIXEDINT ON CACHE BOOL "" FORCE) - -add_subdirectory(${HYPRE_SOURCE_DIR}/src) -get_property(HYPRE_BINARY_DIR TARGET HYPRE PROPERTY BINARY_DIR) - -# This conflicts with the stdlib "version" header... -file(REMOVE ${HYPRE_SOURCE_DIR}/src/utilities/version) +file(MAKE_DIRECTORY ${CMAKE_INSTALL_PREFIX}/include) +set(HYPRE_INCLUDE_DIR ${CMAKE_INSTALL_PREFIX}/include) +set(HYPRE_C_FLAGS "${CMAKE_C_FLAGS_RELWITHDEBINFO}") +set(HYPRE_CXX_FLAGS "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") +set(HYPRE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}) +ExternalProject_Add( + HYPRE_BUILD + SOURCE_DIR ${HYPRE_SOURCE_DIR} + SOURCE_SUBDIR "src" + BUILD_ALWAYS ON + CMAKE_ARGS -DHYPRE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + -DHYPRE_ENABLE_SHARED=ON + -DHYPRE_ENABLE_MIXEDINT=ON + -DHYPRE_ENABLE_BIGINT=ON + -DHYPRE_ENABLE_SINGLE=OFF + -DCMAKE_CXX_VISIBILITY_PRESET=hidden + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_PREFIX}/lib + -DCMAKE_C_VISIBILITY_PRESET=hidden + -DCMAKE_BUILD_TYPE=RelWithDebInfo + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_SHARED_LINKER_FLAGS=${HYPRE_SHARED_LINKER_FLAGS} + -DCMAKE_CXX_FLAGS_RELWITHDEBINFO=${HYPRE_CXX_FLAGS} + -DCMAKE_C_FLAGS_RELWITHDEBINFO=${HYPRE_C_FLAGS} +) diff --git a/config/mesh.cmake b/config/mesh.cmake index f92be009e..7fe824699 100644 --- a/config/mesh.cmake +++ b/config/mesh.cmake @@ -2,6 +2,8 @@ set(MESH_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/mesh) set(MESH_SOURCES ${MESH_SOURCE_DIR}/meshSetup.cpp + ${MESH_SOURCE_DIR}/meshAvgBoundaryValue.cpp + ${MESH_SOURCE_DIR}/registerMeshKernels.cpp ${MESH_SOURCE_DIR}/meshNekReader.cpp ${MESH_SOURCE_DIR}/meshPhysicalNodesHex3D.cpp ${MESH_SOURCE_DIR}/meshGlobalIds.cpp @@ -9,7 +11,6 @@ set(MESH_SOURCES ${MESH_SOURCE_DIR}/meshBasisHex3D.cpp ${MESH_SOURCE_DIR}/meshApplyElementMatrix.cpp ${MESH_SOURCE_DIR}/meshConnect.cpp - ${MESH_SOURCE_DIR}/meshConnectBoundary.cpp ${MESH_SOURCE_DIR}/meshConnectFaceNodes3D.cpp ${MESH_SOURCE_DIR}/meshConnectPeriodicFaceNodes3D.cpp ${MESH_SOURCE_DIR}/meshFree.cpp @@ -23,4 +24,6 @@ set(MESH_SOURCES ${MESH_SOURCE_DIR}/meshParallelConsecutiveGlobalNumbering.cpp ${MESH_SOURCE_DIR}/meshParallelGatherScatterSetup.cpp ${MESH_SOURCE_DIR}/meshSurfaceGeometricFactorsHex3D.cpp + ${MESH_SOURCE_DIR}/meshComputeInvLMM.cpp + ${MESH_SOURCE_DIR}/meshSolve.cpp ${MESH_SOURCE_DIR}/meshParallelConnectOpt.cpp) diff --git a/config/nek5000.cmake b/config/nek5000.cmake index a1d10db0f..2ec0d58b4 100644 --- a/config/nek5000.cmake +++ b/config/nek5000.cmake @@ -20,12 +20,6 @@ endif() set(NEK5000_SOURCE_DIR ${nek5000_content_SOURCE_DIR}) -if (USE_PARRSB) - install(FILES ${NEK5000_SOURCE_DIR}/core/PARALLEL.dprocmap DESTINATION ${NEK5000_SOURCE_DIR}/core RENAME "PARALLEL") -else() - install(FILES ${NEK5000_SOURCE_DIR}/core/PARALLEL.default DESTINATION ${NEK5000_SOURCE_DIR}/core RENAME "PARALLEL") -endif() - # blasLapack # ========== @@ -83,23 +77,35 @@ set(PARRSB_LIB_DIR ${PARRSB_DIR}/../lib) # so we use the helper script (run_config.sh) to set the environment # variables based on the command-line arguments -if(CMAKE_C_COMPILER_ID STREQUAL "GNU") - set(FPIC_FLAG "-fPIC") +set(FPIC_FLAG "-fPIC") - set(MCMODEL_FLAG "-mcmodel=medium -mlarge-data-threshold=0") - CHECK_C_COMPILER_FLAG("${MCMODEL_FLAG}" COMPILER_C_SUPPORTS_MCMODEL_MEDIUM) - if(NOT COMPILER_C_SUPPORTS_MCMODEL_MEDIUM OR APPLE) - set(MCMODEL_FLAG "") - endif() +set(MCMODEL_FLAG "-mcmodel=medium") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") + set(MCMODEL_FLAG "-mcmodel=large") +endif() - CHECK_C_COMPILER_FLAG("${MCMODEL_FLAG} ${MCMODEL_FLAG}" COMPILER_C_SUPPORTS_MCMODEL_MEDIUM_FPIC) - if(NOT COMPILER_C_SUPPORTS_MCMODEL_MEDIUM_FPIC) - set(MCMODEL_FLAG "") - endif() +CHECK_C_COMPILER_FLAG("${MCMODEL_FLAG}" COMPILER_C_SUPPORTS_MCMODEL) +if(NOT COMPILER_C_SUPPORTS_MCMODEL OR APPLE) + set(MCMODEL_FLAG "") +endif() + +if(NOT MCMODEL_FLAG STREQUAL "") + CHECK_C_COMPILER_FLAG("${FPIC_FLAG} ${MCMODEL_FLAG}" COMPILER_C_SUPPORTS_MCMODEL_FPIC) + if(NOT COMPILER_C_SUPPORTS_MCMODEL_FPIC) + set(MCMODEL_FLAG "") + endif() endif() include(CheckFortranCompilerFlag) if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + if(MCMODEL_FLAG STREQUAL "-mcmodel=medium") + set(LARGE_DATA_THRES_FLAG "-mlarge-data-threshold=0") + CHECK_Fortran_COMPILER_FLAG("${MCMODEL_FLAG} ${LARGE_DATA_THRES_FLAG}" COMPILER_Fortran_SUPPORTS_LARGE_DATA_THRES) + if(NOT COMPILER_Fortran_SUPPORTS_LARGE_DATA_THRES) + set(LARGE_DATA_THRES_FLAG "") + endif() + endif() + CHECK_Fortran_COMPILER_FLAG("-fcray-pointer" COMPILER_Fortran_SUPPORTS_CRAYPTR) if(COMPILER_Fortran_SUPPORTS_CRAYPTR) set(CRAYPTR_FLAG "-fcray-pointer") @@ -108,16 +114,20 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") endif() endif() +string(REGEX REPLACE "-O[2,3,fast]" "" _EXTERNAL_C_FLAGS ${EXTERNAL_C_FLAGS}) +string(REGEX REPLACE "-O[2,3,fast]" "" _EXTERNAL_Fortran_FLAGS ${EXTERNAL_Fortran_FLAGS}) + ExternalProject_Add( nek5000_deps SOURCE_DIR ${NEK5000_SOURCE_DIR} CONFIGURE_COMMAND "" BUILD_COMMAND ${CMAKE_CURRENT_LIST_DIR}/run_nekconfig.sh + "LDFLAGS=${BSYMBOLIC_FLAG}" "CC=${CMAKE_C_COMPILER}" - "CFLAGS=${EXTERNAL_C_FLAGS} ${FPIC_FLAG} ${MCMODEL_FLAG}" + "CFLAGS=${_EXTERNAL_C_FLAGS} ${FPIC_FLAG} ${MCMODEL_FLAG} ${LARGE_DATA_THRES_FLAG}" "FC=${CMAKE_Fortran_COMPILER}" - "FFLAGS=${EXTERNAL_Fortran_FLAGS} ${FPIC_FLAG} ${MCMODEL_FLAG} ${CRAYPTR_FLAG}" + "FFLAGS=${_EXTERNAL_Fortran_FLAGS} ${FPIC_FLAG} ${MCMODEL_FLAG} ${LARGE_DATA_THRES_FLAG} ${CRAYPTR_FLAG}" "NEK5000_SOURCE_DIR=${NEK5000_SOURCE_DIR}" "PPLIST=${NEK5000_PPLIST}" INSTALL_COMMAND "" diff --git a/config/utils.cmake b/config/utils.cmake new file mode 100644 index 000000000..4241bcb37 --- /dev/null +++ b/config/utils.cmake @@ -0,0 +1,53 @@ +function (__MPI_find_compiler LANG QUERY_FLAG OUTPUT_VARIABLE) + separate_arguments(_MPI_COMPILER_WRAPPER_OPTIONS NATIVE_COMMAND "${QUERY_FLAG}") + set(DUMMYSRC "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx") + file(WRITE ${DUMMYSRC} "int main() { return 0; }\n") + execute_process( + COMMAND ${MPI_${LANG}_COMPILER} ${_MPI_COMPILER_WRAPPER_OPTIONS} ${DUMMYSRC} + OUTPUT_VARIABLE WRAPPER_OUTPUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE WRAPPER_OUTPUT ERROR_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE WRAPPER_RETURN) + # Some compiler wrappers will yield spurious zero return values, for example + # Intel MPI tolerates unknown arguments and if the MPI wrappers loads a shared + # library that has invalid or missing version information there would be warning + # messages emitted by ld.so in the compiler output. In either case, we'll treat + # the output as invalid. + if("${WRAPPER_OUTPUT}" MATCHES "undefined reference|unrecognized|need to set|no version information available|command not found") + set(WRAPPER_RETURN 255) + endif() + # Ensure that no error output might be passed upwards. + if(NOT WRAPPER_RETURN EQUAL 0) + unset(WRAPPER_OUTPUT) + else() + # Strip leading whitespace + string(REGEX REPLACE "^ +" "" WRAPPER_OUTPUT "${WRAPPER_OUTPUT}") + endif() + + unset(UNDERLYING_COMPILER) + if(WRAPPER_OUTPUT) + separate_arguments(WRAPPER_OUTPUT) + list(GET WRAPPER_OUTPUT 0 WRAPPER_OUTPUT_0) + find_program(UNDERLYING_COMPILER ${WRAPPER_OUTPUT_0}) + message("-- Found MPI_UNDERLYING_COMPILER: ${UNDERLYING_COMPILER}") + if(NOT EXISTS UNDERLYING_COMPILER) + unset(UNDERLYING_COMPILER) + endif() + endif() + + set(${OUTPUT_VARIABLE} "${UNDERLYING_COMPILER}" PARENT_SCOPE) +endfunction() + +function (__MPI_underlying_compiler LANG OUTPUT_VARIABLE) + foreach (flag IN ITEMS "show" "showme" "craype-verbose") + __MPI_find_compiler("CXX" "-${flag}" COMPILER) + if(COMPILER) + break() + endif() + endforeach() + + if(NOT COMPILER) + message(FATAL_ERROR "Cannot identify underlying compiler used by ${MPI_${LANG}_COMPILER}") + endif() + + set(${OUTPUT_VARIABLE} "${COMPILER}" PARENT_SCOPE) +endfunction() diff --git a/examples/channel/CASEDATA b/examples/channel/CASEDATA new file mode 100644 index 000000000..45f0a1739 --- /dev/null +++ b/examples/channel/CASEDATA @@ -0,0 +1,3 @@ + real p_rot + + common /casevars/ p_rot diff --git a/examples/channel/channel.oudf b/examples/channel/channel.oudf index 92a6ec4e5..e69de29bb 100644 --- a/examples/channel/channel.oudf +++ b/examples/channel/channel.oudf @@ -1 +0,0 @@ -// Boundary conditions diff --git a/examples/channel/channel.par b/examples/channel/channel.par index c1e5675ca..abd9c25e6 100644 --- a/examples/channel/channel.par +++ b/examples/channel/channel.par @@ -14,6 +14,10 @@ stressFormulation = true # required for variable viscosity residualTol = 1e-08 [VELOCITY] +boundaryTypeMap = sym residualTol = 1e-12 density = 1.0 viscosity = 0.1 + +[CASEDATA] +p_rot = 0.7853981633974483 # \pi / 4 diff --git a/examples/channel/channel.udf b/examples/channel/channel.udf index 2b35a5630..c37025658 100644 --- a/examples/channel/channel.udf +++ b/examples/channel/channel.udf @@ -7,6 +7,7 @@ static occa::memory o_nekFU; static occa::memory o_nekMue; +static dfloat P_ROT; /* User Functions */ @@ -33,6 +34,11 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) { options.getArgs("CI-MODE", ciMode); if (ciMode) ciSetup(comm, options); + platform->par->extract("casedata", "p_rot", P_ROT); + if (platform->options.compareArgs("BUILD ONLY", "FALSE")) { + double* const nek_cb_scnrs = (double*) nek::ptr("cb_scnrs"); + nek_cb_scnrs[0] = P_ROT; + } } void UDF_Setup(nrs_t *nrs) diff --git a/examples/channel/channel.usr b/examples/channel/channel.usr index e4ed5542a..51ef10359 100644 --- a/examples/channel/channel.usr +++ b/examples/channel/channel.usr @@ -13,14 +13,15 @@ c----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' include 'NEKUSE' + include 'CASEDATA' common /exavel/ uxex (lx1,ly1,lz1,lelv),uyex (lx1,ly1,lz1,lelv) $ ,uzex (lx1,ly1,lz1,lelv) ie = gllel(ieg) - ux = uxex(ix,iy,iz,ie) - uy = uyex(ix,iy,iz,ie) + ux = uxex(ix,iy,iz,ie)*cos(P_ROT) - uyex(ix,iy,iz,ie)*sin(P_ROT) + uy = uxex(ix,iy,iz,ie)*sin(P_ROT) + uyex(ix,iy,iz,ie)*cos(P_ROT) uz = uzex(ix,iy,iz,ie) return @@ -29,6 +30,7 @@ c----------------------------------------------------------------------- subroutine userchk include 'SIZE' include 'TOTAL' + include 'CASEDATA' common /cforce/ forcx(lx1,ly1,lz1,lelv),forcy(lx1,ly1,lz1,lelv) $ ,forcz(lx1,ly1,lz1,lelv) @@ -59,24 +61,28 @@ c----------------------------------------------------------------------- nrs_scptr(5) = loc(err(1)) do i=1,n - diffx(i,1,1,1) = abs(vx(i,1,1,1)-uxex(i,1,1,1)) - if (uxex(i,1,1,1).gt.1e-14) diffx(i,1,1,1) = + vxr = vx(i,1,1,1) * cos(P_ROT) + vy(i,1,1,1) * sin(P_ROT) + vyr =-vx(i,1,1,1) * sin(P_ROT) + vy(i,1,1,1) * cos(P_ROT) + diffx(i,1,1,1) = abs(vxr -uxex(i,1,1,1)) + if (uxex(i,1,1,1).gt.1e-14) diffx(i,1,1,1) = $ diffx(i,1,1,1)/uxex(i,1,1,1) - diffy(i,1,1,1) = abs(vy(i,1,1,1)-uyex(i,1,1,1)) - if (uyex(i,1,1,1).gt.1e-14) diffy(i,1,1,1) = + diffy(i,1,1,1) = abs(vyr -uyex(i,1,1,1)) + if (uyex(i,1,1,1).gt.1e-14) diffy(i,1,1,1) = $ diffy(i,1,1,1)/uyex(i,1,1,1) enddo err(1) = glmax(diffx,n) err(2) = glmax(diffy,n) - if (nio.eq.0) - $ write(6,'(1p2e13.4,A)') err(1),err(2),' Linf VX VY' + if (nio.eq.0) + $ write(6,'(i6, 1p2e13.4,A)') istep, err(1),err(2),' Linf VX VY' -c if(ifoutfld) then -c call outpost2(diffx,diffy,diffz,mul,t,0,'dif') -c call outpost2(mul_dx,mul_dy,mul_dz,mul,t,0,'mul') -c endif + !if(ifoutfld) then + ! call outpost2(diffx,diffy,diffz,mul,t,0,'dif') + ! call outpost2(mul_dx,mul_dy,mul_dz,mul,t,0,'mul') + ! call outpost2(forcx,forcy,forcz,mul,t,0,'for') + ! call outpost2(uxex,uyex,uzex,mul,t,0,'exa') + !endif return end @@ -84,6 +90,15 @@ c----------------------------------------------------------------------- subroutine usrdat ! This routine to modify element vertices include 'SIZE' ! _before_ mesh is generated, which include 'TOTAL' ! guarantees GLL mapping of mesh. + integer e + + do e=1,nelv ! Rescale mesh to [0,2]^2 x [0,0.1] + do i=1,2**ndim ! Assumes original domain in .re2 file on [0,1]^3 + xc(i,e) = 2.0*xc(i,e) + yc(i,e) = 2.0*yc(i,e) + zc(i,e) = 0.1*zc(i,e) + enddo + enddo return end @@ -92,17 +107,38 @@ c----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' + include 'CASEDATA' - call rescale_x(xm1,0.0,2.0) - call rescale_x(ym1,0.0,2.0) - call rescale_x(zm1,0.0,0.1) + common /scnrs/ sc_nrs(10) + real sc_nrs + + P_ROT = sc_nrs(1) do iel=1,nelt do ifc=5,6 cbc(ifc,iel,1) = 'P ' enddo + do ifc=1,2*ndim + if(cbc(ifc,iel,1) .eq. 'SYM' ) then + boundaryID(ifc, iel) = 1 + endif + enddo + enddo + + call get_mol_visc + call get_forcing + call get_analsol + + ntot = nx1*ny1*nz1*nelt + do i=1,ntot + xpt = xm1(i,1,1,1) + ypt = ym1(i,1,1,1) + + xm1(i,1,1,1) = xpt * cos(P_ROT) - ypt * sin(P_ROT) + ym1(i,1,1,1) = xpt * sin(P_ROT) + ypt * cos(P_ROT) enddo + return end c----------------------------------------------------------------------- @@ -126,10 +162,6 @@ c----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' - call get_mol_visc - call get_forcing - call get_analsol - return end C----------------------------------------------------------------------- @@ -198,6 +230,7 @@ c----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' + include 'CASEDATA' common /cforce/ forcx(lx1,ly1,lz1,lelv),forcy(lx1,ly1,lz1,lelv) $ ,forcz(lx1,ly1,lz1,lelv) @@ -205,6 +238,8 @@ c----------------------------------------------------------------------- $ ,mul_dy(lx1,ly1,lz1,lelv), mul_dz(lx1,ly1,lz1,lelv) real mul, mul_dx, mul_dy, mul_dz + real Fx, Fy + n=nx1*ny1*nz1*nelv U0 = 1. @@ -230,8 +265,10 @@ c----------------------------------------------------------------------- yterm = (5.*pi**2*U0/2.) * sin(pi*x+phx) * cos(2.*pi*y+phy) zterm = 0. - forcx(i,1,1,1) = -xterm * valmu - (- Uvec*dmudx + Vvec * dmudy) - forcy(i,1,1,1) = -yterm * valmu - (+ Vvec*dmudx + Uvec * dmudy) + Fx = -xterm * valmu - (- Uvec*dmudx + Vvec * dmudy) + Fy = -yterm * valmu - (+ Vvec*dmudx + Uvec * dmudy) + forcx(i,1,1,1) = Fx * cos(P_ROT) - Fy * sin(P_ROT) + forcy(i,1,1,1) = Fx * sin(P_ROT) + Fy * cos(P_ROT) forcz(i,1,1,1) = -zterm * valmu enddo diff --git a/examples/channel/ci.inc b/examples/channel/ci.inc index 0bcbafe87..3ae569775 100644 --- a/examples/channel/ci.inc +++ b/examples/channel/ci.inc @@ -24,6 +24,13 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("PRESSURE SOLVER TOLERANCE", std::string("1e-08")); options.setArgs("STRESSFORMULATION", "TRUE"); options.setArgs("ADVECTION TYPE", "CONVECTIVE"); + + if(ciMode == 1){ + platform->par->set("casedata", "p_rot", 0.0); + } + if(ciMode == 2){ + platform->par->set("casedata", "p_rot", 0.7853981633974483); + } } void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) @@ -37,8 +44,8 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) double *err = (double *) nek::scPtr(5); - double vxErr = abs((err[0] - 9.5942E-09)/err[0]); - double vyErr = abs((err[1] - 2.3012E-07)/err[1]); + const double vxErr = abs((err[0] - 9.5942E-09)/err[0]); + const double vyErr = abs((err[1] - 2.3012E-07)/err[1]); if (rank == 0) printf("relative error to target: vx=%g vy=%g\n", diff --git a/examples/conj_ht/conj_ht.udf b/examples/conj_ht/conj_ht.udf index 8ce27e8f8..f02bb4ffd 100644 --- a/examples/conj_ht/conj_ht.udf +++ b/examples/conj_ht/conj_ht.udf @@ -51,7 +51,7 @@ void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S, void UDF_LoadKernels(occa::properties& kernelInfo) { - cFillKernel = udfBuildKernel(kernelInfo, "cFill"); + cFillKernel = oudfBuildKernel(kernelInfo, "cFill"); } void UDF_Setup0(MPI_Comm comm, setupAide &options) diff --git a/examples/ethier/ci.inc b/examples/ethier/ci.inc index 6b9c3d778..2fad1f2ac 100644 --- a/examples/ethier/ci.inc +++ b/examples/ethier/ci.inc @@ -1,5 +1,8 @@ #include #include +#include +#include +#include static int ciMode = 0; @@ -71,6 +74,7 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("TARGET CFL", "0.5"); } if (ciMode == 9) { + options.setArgs("ADVECTION TYPE", "CONVECTIVE"); options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); options.setArgs("SUBCYCLING STEPS", std::string("1")); options.setArgs("PRESSURE INITIAL GUESS", "PROJECTION-ACONJ"); @@ -80,6 +84,19 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); options.setArgs("PRESSURE INITIAL GUESS", "PROJECTION-ACONJ"); } + if (ciMode == 11) { + options.setArgs("PRESSURE MULTIGRID SMOOTHER", "DAMPEDJACOBI,CHEBYSHEV"); + options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); + options.setArgs("SUBCYCLING STEPS", std::string("1")); + options.setArgs("PRESSURE INITIAL GUESS", "PROJECTION-ACONJ"); + } + if (ciMode == 12) { + options.setArgs("SCALAR00 SOLVER", "NONE"); + } + if (ciMode == 13) { + options.setArgs("SCALAR00 SOLVER", "NONE"); + options.setArgs("VELOCITY SOLVER", "NONE"); + } options.setArgs("TIME INTEGRATOR", "TOMBO3"); options.setArgs("VELOCITY SOLVER TOLERANCE", std::string("1e-12")); @@ -89,6 +106,67 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("VARIABLEPROPERTIES", "FALSE"); } +void surfaceArea(nrs_t* nrs, int BID, dfloat& SA) +{ + auto* mesh = nrs->meshV; + + SA = 0.0; + + for(int elem = 0; elem < mesh->Nelements; ++elem){ + for(int face = 0; face < mesh->Nfaces; ++face){ + for(int fp = 0; fp < mesh->Nfp; ++fp){ + const auto base = mesh->Nsgeo * (mesh->Nfaces * mesh->Nfp * elem + mesh->Nfp * face + fp); + /* volume index of face node */ + const auto n = mesh->faceNodes[face * mesh->Nfp + fp]; + const auto WsJ = mesh->sgeo[base + WSJID]; + if(mesh->EToB[elem * mesh->Nfaces + face] == BID){ + SA += WsJ; + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE, &SA, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); +} + +void ciTestAvgBoundary(nrs_t* nrs) +{ + const int rank = platform->comm.mpiRank; + auto * mesh = nrs->meshV; + const int BID = 1; + const auto testTol = 50. * std::numeric_limits::epsilon(); + + constexpr int N = 5; + + dfloat SA = 0.0; + + surfaceArea(nrs, BID, SA); + + // surface area of unit sphere + const auto SAref = 4. * M_PI; + if(std::abs((SA - SAref)/SAref) > 0.02){ + if(rank == 0){ + std::cout << "ciTestAvgBoundary: surface area of unit sphere is " << SAref << " instead of " << SA << std::endl; + } + FAIL; + } + + occa::memory o_fld = platform->device.malloc(N * nrs->fieldOffset * sizeof(dfloat)); + platform->linAlg->fill(N*nrs->fieldOffset, SA, o_fld); + + std::array results; + mesh->avgBoundaryValue(BID, N, nrs->fieldOffset, o_fld, results.data()); + + for(auto&& v : results){ + if(std::abs(v - SA) > testTol){ + if(rank == 0){ + std::cout << "ciTestAvgBoundary: FAILED" << std::endl; + } + FAIL; + } + } +} + dfloat sum(dfloat const * const array, const int size, MPI_Comm comm) { dfloat sumr = 0.0; @@ -183,9 +261,11 @@ void ciTestLinAlg(const int N) void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) { const int rank = platform->comm.mpiRank; - if(tstep == 1 && ciMode != 7){ + if(tstep == 1 && ciMode != 7 && ciMode != 13){ int NiterP = nrs->pSolver->Niter; - const int expectedNiterP = 6; + + // mode 11 uses Chebyshev+Jacobi smoothing + const int expectedNiterP = (ciMode == 11) ? 13 : 6; const int pIterErr = abs(NiterP - expectedNiterP); if(pIterErr >= 2) { if(rank==0){ @@ -195,8 +275,10 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) } } if (!nrs->lastStep) return; - + ciTestAvgBoundary(nrs); + + ciTestLinAlg(1); ciTestLinAlg(BLOCKSIZE / 16); ciTestLinAlg(BLOCKSIZE / 8); ciTestLinAlg(BLOCKSIZE / 4); @@ -220,15 +302,19 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) int NiterU; - if(platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) + if(platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")){ NiterU = nrs->uvwSolver->Niter; - else + } + else if(nrs->uSolver){ NiterU = nrs->uSolver->Niter; + } else { + NiterU = -1; + } - int NiterP = nrs->pSolver->Niter; + int NiterP = nrs->pSolver ? nrs->pSolver->Niter : -1; - int NiterS01 = nrs->cds->solver[0]->Niter; - int NiterS02 = nrs->cds->solver[1]->Niter; + int NiterS01 = platform->options.compareArgs("SCALAR00 SOLVER", "NONE") ? -1 : nrs->cds->solver[0]->Niter; + int NiterS02 = platform->options.compareArgs("SCALAR01 SOLVER", "NONE") ? -1 : nrs->cds->solver[1]->Niter; int s01IterErr, s02IterErr; switch (ciMode) { @@ -241,7 +327,9 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) s01IterErr = abs(NiterS01 - 10); s02IterErr = abs(NiterS02 - 10); break; - case 2 : velIterErr = abs(NiterU - 10); + case 2 : + case 11: + velIterErr = abs(NiterU - 10); s1Err = abs((err[2] - 6.11E-12)/err[2]); s2Err = abs((err[3] - 6.84E-12)/err[3]); pIterErr = abs(NiterP - 1); @@ -307,12 +395,91 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) s01IterErr = abs(NiterS01 - 10); s02IterErr = abs(NiterS02 - 10); break; - + case 9 : velIterErr = abs(NiterU - 10); + s1Err = abs((err[2] - 1.03E-11)/err[2]); + s2Err = abs((err[3] - 1.34E-11)/err[3]); + pIterErr = abs(NiterP - 1); + vxErr = abs((err[0] - 2.78E-10)/err[0]); + prErr = abs((err[1] - 6.98E-10)/err[1]); + s01IterErr = abs(NiterS01 - 10); + s02IterErr = abs(NiterS02 - 10); + break; + case 10 : velIterErr = abs(NiterU - 10); + s1Err = abs((err[2] - 1.03E-11)/err[2]); + s2Err = abs((err[3] - 1.34E-11)/err[3]); + pIterErr = abs(NiterP - 1); + vxErr = abs((err[0] - 2.78E-10)/err[0]); + prErr = abs((err[1] - 6.98E-10)/err[1]); + s01IterErr = abs(NiterS01 - 10); + s02IterErr = abs(NiterS02 - 10); + break; + case 12 : velIterErr = abs(NiterU - 10); + s1Err = abs((err[2] - 2.506E-3)/err[2]); + s2Err = abs((err[3] - 1.34E-11)/err[3]); + pIterErr = abs(NiterP - 4); + vxErr = abs((err[0] - 2.78E-10)/err[0]); + prErr = abs((err[1] - 6.98E-10)/err[1]); + s01IterErr = abs(NiterS01 + 1); + s02IterErr = abs(NiterS02 - 10); + break; + case 13 : velIterErr = abs(NiterU +1); + s1Err = abs((err[2] - 2.506E-3)/err[2]); + s2Err = abs((err[3] - 1.52E-5)/err[3]); + pIterErr = abs(NiterP + 1); + vxErr = abs((err[0] - 2.5E-3)/err[0]); + prErr = abs((err[1] - 2.364E-2)/err[1]); + s01IterErr = abs(NiterS01 + 1); + s02IterErr = abs(NiterS02 - 10); + break; } + // on ci modes 12, 13, confirm that the correct solvers are present + if(ciMode == 12){ + bool correct = true; + correct &= (nrs->pSolver != nullptr); + correct &= (nrs->uSolver != nullptr || nrs->uvwSolver != nullptr); + correct &= (nrs->cds->solver[0] == nullptr); + correct &= (nrs->cds->solver[1] != nullptr); + if(!correct){ + if(platform->comm.mpiRank == 0){ + std::cout << "Unexpected solver configuration for CI mode 12" << std::endl; + } + FAIL; + } + } + if(ciMode == 13){ + bool correct = true; + correct &= (nrs->pSolver == nullptr); + correct &= (nrs->uSolver == nullptr && nrs->uvwSolver == nullptr); + correct &= (nrs->cds->solver[0] == nullptr); + correct &= (nrs->cds->solver[1] != nullptr); + if(!correct){ + if(platform->comm.mpiRank == 0){ + std::cout << "Unexpected solver configuration for CI mode 13" << std::endl; + } + FAIL; + } + + // mode 13 requires using ethierScalar.par as the input + std::string casename = platform->options.getArgs("CASENAME"); + if(casename != "ethierScalar"){ + if(platform->comm.mpiRank == 0){ + std::cout << "Unexpected input file for CI mode 13" << std::endl; + } + FAIL; + } + + if(nrs->flow){ + if(platform->comm.mpiRank == 0){ + std::cout << "nrs->flow = true for CI mode 13" << std::endl; + } + FAIL; + } + } + if (ciMode == 4){ - dfloat memoryUsage = platform->device.memoryAllocated()/1e9; - const dfloat expectedMemoryUsage = 0.0450654; + dfloat memoryUsage = platform->device.occaDevice().memoryAllocated()/1e9; + const dfloat expectedMemoryUsage = 0.0555274; if(memoryUsage > 1.01 * expectedMemoryUsage && platform->comm.mpiCommSize == 2) { if(platform->comm.mpiRank == 0){ @@ -346,8 +513,25 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) if (rank == 0) printf("relative error to target: vx=%g pr=%g s1=%g s2=%g velIter=%d pIter=%d s01Iter=%d s02Iter=%d\n", vxErr, prErr, s1Err, s2Err, velIterErr, pIterErr, s01IterErr, s02IterErr); + + std::vector relErrors = {vxErr, prErr, s1Err, s2Err}; + std::vector absErrors = {abs(err[0]), abs(err[1]), abs(err[2]), abs(err[3])}; + bool passTest = true; + const double absTol = 5e-11; // values below this threshold are ignored + for(size_t i = 0; i < relErrors.size(); ++i){ + + const double absError = absErrors[i]; + const double relError = relErrors[i]; + if(absError > absTol){ + passTest &= relError < EPS; + } + } + + passTest &= velIterErr <= 1; + passTest &= pIterErr <= 2; + passTest &= s01IterErr <= 1; + passTest &= s02IterErr <= 1; - (vxErr < EPS && prErr < EPS && s1Err < EPS && s2Err < EPS && - velIterErr <= 1 && pIterErr <= 2 && s01IterErr <= 1 && s02IterErr <= 1) ? (PASS) : (FAIL); + passTest ? (PASS) : (FAIL); } diff --git a/examples/ethier/ethier.udf b/examples/ethier/ethier.udf index 522d03ed5..1322796b8 100644 --- a/examples/ethier/ethier.udf +++ b/examples/ethier/ethier.udf @@ -6,6 +6,7 @@ #include "udf.hpp" #include "ci.inc" +#include static dfloat P_U0; static dfloat P_V0; @@ -36,6 +37,10 @@ void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_FS) void UDF_LoadKernels(occa::properties& kernelInfo) { + // called from all ranks, so MPI collectives are O.K. + int maxRank = platform->comm.mpiRank; + MPI_Allreduce(MPI_IN_PLACE, &maxRank, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + setupAide &options = platform->options; dfloat mue, rho; @@ -51,9 +56,9 @@ void UDF_LoadKernels(occa::properties& kernelInfo) kernelInfo["defines/p_omega"] = P_OMEGA; kernelInfo["defines/p_pi"] = M_PI; - dpdxKernel = udfBuildKernel(kernelInfo, "dpdx"); - exactUVWPKernel = udfBuildKernel(kernelInfo, "exactUVWP"); - userMeshVelocityKernel = udfBuildKernel(kernelInfo, "userMeshVelocity"); + dpdxKernel = oudfBuildKernel(kernelInfo, "dpdx"); + exactUVWPKernel = oudfBuildKernel(kernelInfo, "exactUVWP"); + userMeshVelocityKernel = oudfBuildKernel(kernelInfo, "userMeshVelocity"); } void UDF_Setup0(MPI_Comm comm, setupAide &options) @@ -98,8 +103,11 @@ void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) if (tstep <= 5) { exactUVWPKernel(mesh->Nlocal, time, mesh->o_x, mesh->o_y, mesh->o_z, nrs->fieldOffset, nrs->o_P, nrs->o_U); - ellipticZeroMean(nrs->pSolver, nrs->o_P); - cds->o_S.copyFrom(nrs->o_U, mesh->Nlocal*sizeof(dfloat), 0*cds->fieldOffset[0]*sizeof(dfloat)); + if (nrs->pSolver) + ellipticZeroMean(nrs->pSolver, nrs->o_P); + if (nrs->Nscalar > 0) + cds->o_S.copyFrom(nrs->o_U, mesh->Nlocal*sizeof(dfloat), 0*cds->fieldOffset[0]*sizeof(dfloat)); + if (nrs->Nscalar > 1) cds->o_S.copyFrom(nrs->o_U, mesh->Nlocal*sizeof(dfloat), 1*cds->fieldOffset[0]*sizeof(dfloat)); } diff --git a/examples/ethier/ethier.usr b/examples/ethier/ethier.usr index 3b37e6d22..2bade3822 100644 --- a/examples/ethier/ethier.usr +++ b/examples/ethier/ethier.usr @@ -115,7 +115,7 @@ c----------------------------------------------------------------------- do iel=1,nelt do ifc=1,2*ndim - if (cbc(ifc,iel,1) .eq. 'v ') boundaryID(ifc,iel) = 1 + if (cbc(ifc,iel,1) .eq. 'EXO') boundaryID(ifc,iel) = 1 enddo enddo diff --git a/examples/ethier/ethierScalar.par b/examples/ethier/ethierScalar.par new file mode 100644 index 000000000..e7f4f7952 --- /dev/null +++ b/examples/ethier/ethierScalar.par @@ -0,0 +1,39 @@ +[GENERAL] +#verbose = true +polynomialOrder = 9 +#startFrom = "restart.fld" +stopAt = numSteps +numSteps = 100 +dt = 2e-03 +timeStepper = tombo3 +writeControl = runTime +writeInterval = 0.1 + +usr = "ethier.usr" +oudf = "ethier.oudf" +udf = "ethier.udf" + +[MESH] +file = "ethier.re2" + +[TEMPERATURE] +boundaryTypeMap = inlet +residualTol = 1e-12 +rhoCp = 1.0 +conductivity = -100 + +[SCALAR01] +boundaryTypeMap = flux +residualTol = 1e-12 +rho = 1.0 +diffusivity = -100 + +[CASEDATA] +P_U0 = 0.5 +P_V0 = 0.1 +P_W0 = 0.2 +P_A0 = 0.025 +P_D0 = 0.5 +P_OMEGA = 15.0 +P_AMP = 1.5 + diff --git a/examples/gabls1/gabls.oudf b/examples/gabls1/gabls.oudf index d9be4aa39..92ed97761 100644 --- a/examples/gabls1/gabls.oudf +++ b/examples/gabls1/gabls.oudf @@ -1,8 +1,48 @@ +// Boundary conditions +// ============================================================================= +void velocityDirichletConditions(bcData *bc) +{ + bc->u = 1.0; + bc->v = 0; + bc->w = 0; +} + +void velocityNeumannConditions(bcData *bc) +{ + const dfloat invMagUWall = 1/bc->wrk[9*bc->fieldOffset + 1]; + const dfloat uTau = bc->wrk[9*bc->fieldOffset + 3]; + + const dfloat ut1x = bc->u*bc->t1x + bc->v*bc->t1y + bc->w*bc->t1z; + const dfloat ut2x = bc->u*bc->t2x + bc->v*bc->t2y + bc->w*bc->t2z; + + bc->trn = 0; + bc->tr1 = -ut1x*invMagUWall * uTau*uTau; + bc->tr2 = -ut2x*invMagUWall * uTau*uTau; +} + +void scalarNeumannConditions(bcData *bc) +{ + bc->flux = p_cond * p_TsGrad; + if(bc->id == 2) { + const dfloat thts = bc->wrk[9*bc->fieldOffset + 0]; + const dfloat dtempWall = fabs(bc->wrk[9*bc->fieldOffset + 2] - (thts - 1)); + const dfloat uTau = bc->wrk[9*bc->fieldOffset + 3]; + const dfloat thetaTau = bc->wrk[9*bc->fieldOffset + 4]; + const dfloat dtemp = bc->s - (thts - 1); + + bc->flux = 0; + if(dtempWall > 0) bc->flux = -dtemp/dtempWall * uTau*thetaTau; + } +} + +// Kernels +// ============================================================================= @kernel void coriolis(const dlong Ntotal, const dlong offset, @restrict const dfloat * Y, @restrict const dfloat * S, - @restrict const dfloat * U, + @restrict const dfloat * U, + @restrict const dfloat * divTAU, @restrict dfloat * OUT) { for(dlong n=0;nu = 1.0; - bc->v = 0; - bc->w = 0; +@kernel void visMF(const dlong Ntotal, + const dlong offset, + const dfloat scale, + @restrict const dfloat * DDYAVG, + @restrict dfloat * OUT) +{ + for(dlong n=0;ns = p_T0s; - bc->s += p_sCoolRate * bc->time - 1.0; + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_U[3][p_Nq][p_Nq]; + @shared dfloat s_V[3][p_Nq][p_Nq]; + @shared dfloat s_W[3][p_Nq][p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + @exclusive dfloat r_div[3][p_Nq]; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const int id = i + j * p_Nq; + s_D[j][i] = D[id]; + } + } + @barrier("local"); + + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + r_div[0][k] = 0.; + r_div[1][k] = 0.; + r_div[2][k] = 0.; + } + } + } + @barrier("local"); + + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + const dfloat vis = VISMF[id]; + + s_U[0][j][i] = JW * drdy * AVG[id + 0*offset] * vis; + s_V[0][j][i] = JW * dsdy * AVG[id + 0*offset] * vis; + s_W[0][j][i] = JW * dtdy * AVG[id + 0*offset] * vis; + + s_U[1][j][i] = JW * drdy * AVG[id + 1*offset] * vis; + s_V[1][j][i] = JW * dsdy * AVG[id + 1*offset] * vis; + s_W[1][j][i] = JW * dtdy * AVG[id + 1*offset] * vis; + + s_U[2][j][i] = JW * drdy * AVG[id + 2*offset] * vis; + s_V[2][j][i] = JW * dsdy * AVG[id + 2*offset] * vis; + s_W[2][j][i] = JW * dtdy * AVG[id + 2*offset] * vis; + + } + } + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + #pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[n][i]; + const dfloat Ds = s_D[n][j]; + const dfloat Dt = s_D[k][n]; + + r_div[0][k] += Dr * s_U[0][j][n]; + r_div[0][k] += Ds * s_V[0][n][i]; + r_div[0][n] += Dt * s_W[0][j][i]; + + r_div[1][k] += Dr * s_U[1][j][n]; + r_div[1][k] += Ds * s_V[1][n][i]; + r_div[1][n] += Dt * s_W[1][j][i]; + + r_div[2][k] += Dr * s_U[2][j][n]; + r_div[2][k] += Ds * s_V[2][n][i]; + r_div[2][n] += Dt * s_W[2][j][i]; + } + } + } + @barrier("local"); + } //k loop + + //write out + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat invlmm = invLMM[id]; + TAU[id + 0 * offset] = r_div[0][k] * invlmm; + TAU[id + 1 * offset] = r_div[1][k] * invlmm; + TAU[id + 2 * offset] = r_div[2][k] * invlmm; + } + } + } + } } -void scalarNeumannConditions(bcData *bc) +@kernel void gradY(const dlong Nelements, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + const dlong offset, + @restrict const dfloat* invLMM, + @restrict const dfloat* P, + @restrict dfloat* gradP) { - bc->flux = p_cond*p_TsGrad; + for(dlong e = 0; e < Nelements; e++; @outer(0)) { + @shared dfloat s_P1[p_Nq][p_Nq]; + @shared dfloat s_P2[p_Nq][p_Nq]; + @shared dfloat s_P3[p_Nq][p_Nq]; + + @exclusive dfloat s_P1loc[p_Nq]; + @exclusive dfloat s_P2loc[p_Nq]; + @exclusive dfloat s_P3loc[p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k){ + for(int j = 0; j < p_Nq; ++j; @inner(1)){ + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + s_P1[j][i] = P[id + 0*offset]; + s_P2[j][i] = P[id + 1*offset]; + s_P3[j][i] = P[id + 2*offset]; + + if (k == 0) + s_D[j][i] = D[j * p_Nq + i]; + if(k == 0){ + #pragma unroll p_Nq + for(int l = 0 ; l < p_Nq; ++l){ + const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + s_P1loc[l] = P[other_id + 0*offset]; + s_P2loc[l] = P[other_id + 1*offset]; + s_P3loc[l] = P[other_id + 2*offset]; + } + } + } + } + + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + + // compute 'r' and 's' derivatives of (q_m) at node n + dfloat dp1dr = 0., dp1ds = 0., dp1dt = 0.; + dfloat dp2dr = 0., dp2ds = 0., dp2dt = 0.; + dfloat dp3dr = 0., dp3ds = 0., dp3dt = 0.; + + #pragma unroll p_Nq + for(int n = 0; n < p_Nq; ++n) { + const dfloat Dr = s_D[i][n]; + const dfloat Ds = s_D[j][n]; + const dfloat Dt = s_D[k][n]; + + dp1dr += Dr * s_P1[j][n]; + dp1ds += Ds * s_P1[n][i]; + dp1dt += Dt * s_P1loc[n]; + + dp2dr += Dr * s_P2[j][n]; + dp2ds += Ds * s_P2[n][i]; + dp2dt += Dt * s_P2loc[n]; + + dp3dr += Dr * s_P3[j][n]; + dp3ds += Ds * s_P3[n][i]; + dp3dt += Dt * s_P3loc[n]; + } + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat w = invLMM[id] * vgeo[gid + p_JWID * p_Np]; + gradP[id + 0 * offset] = w * (drdy * dp1dr + dsdy * dp1ds + dtdy * dp1dt); + gradP[id + 1 * offset] = w * (drdy * dp2dr + dsdy * dp2ds + dtdy * dp2dt); + gradP[id + 2 * offset] = w * (drdy * dp3dr + dsdy * dp3ds + dtdy * dp3dt); + } + } + @barrier("local"); + } // k-loop + } +} + +@kernel void avgBIDValue(const dlong Nelements, + const dlong BID, + const dlong fieldOffset, + const dlong offset, + @restrict const dfloat *sgeo, + @restrict const dlong *EToB, + @restrict const dlong *vmapM, + @restrict const dfloat *field, + @restrict dfloat *sum) { + + for (dlong e = 0; e < Nelements; e++; @outer(0)) { + @shared dfloat s_sum[3][p_blockSize]; + @shared dfloat s_area[p_blockSize]; + + for (int f = 0; f < p_Nfaces; f++) { + @barrier("local"); + + for (int m = 0; m < p_blockSize; ++m; @inner(0)) { + s_sum[0][m] = 0.0; + s_sum[1][m] = 0.0; + s_sum[2][m] = 0.0; + s_area[m] = 0.0; + if (m < p_Nfp) { + if(EToB[f + p_Nfaces * e] == BID) { + const int n = m + f * p_Nfp; + const int sk = e * p_Nfp * p_Nfaces + n; + const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; + const dlong idM = vmapM[sk]; + s_sum[0][m] = field[idM + 0*fieldOffset] * WsJ; + s_sum[1][m] = field[idM + 1*fieldOffset] * WsJ; + s_sum[2][m] = field[idM + 2*fieldOffset] * WsJ; + s_area[m] = WsJ; + } + } + } + + // compute reduction + @barrier("local"); + +#if p_blockSize > 512 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 512) { + s_sum[0][t] += s_sum[0][t + 512]; + s_sum[1][t] += s_sum[1][t + 512]; + s_sum[2][t] += s_sum[2][t + 512]; + s_area[t] += s_area[t + 512]; + } + } + @barrier("local"); +#endif + +#if p_blockSize > 256 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 256) { + s_sum[0][t] += s_sum[0][t + 256]; + s_sum[1][t] += s_sum[1][t + 256]; + s_sum[2][t] += s_sum[2][t + 256]; + s_area[t] += s_area[t + 256]; + } + } + @barrier("local"); +#endif + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 128) { + s_sum[0][t] += s_sum[0][t + 128]; + s_sum[1][t] += s_sum[1][t + 128]; + s_sum[2][t] += s_sum[2][t + 128]; + s_area[t] += s_area[t + 128]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 64) { + s_sum[0][t] += s_sum[0][t + 64]; + s_sum[1][t] += s_sum[1][t + 64]; + s_sum[2][t] += s_sum[2][t + 64]; + s_area[t] += s_area[t + 64]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 32) { + s_sum[0][t] += s_sum[0][t + 32]; + s_sum[1][t] += s_sum[1][t + 32]; + s_sum[2][t] += s_sum[2][t + 32]; + s_area[t] += s_area[t + 32]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 16) { + s_sum[0][t] += s_sum[0][t + 16]; + s_sum[1][t] += s_sum[1][t + 16]; + s_sum[2][t] += s_sum[2][t + 16]; + s_area[t] += s_area[t + 16]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 8) { + s_sum[0][t] += s_sum[0][t + 8]; + s_sum[1][t] += s_sum[1][t + 8]; + s_sum[2][t] += s_sum[2][t + 8]; + s_area[t] += s_area[t + 8]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 4) { + s_sum[0][t] += s_sum[0][t + 4]; + s_sum[1][t] += s_sum[1][t + 4]; + s_sum[2][t] += s_sum[2][t + 4]; + s_area[t] += s_area[t + 4]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 2) { + s_sum[0][t] += s_sum[0][t + 2]; + s_sum[1][t] += s_sum[1][t + 2]; + s_sum[2][t] += s_sum[2][t + 2]; + s_area[t] += s_area[t + 2]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 1) { + sum[f + p_Nfaces * e + 0*offset] = s_sum[0][0] + s_sum[0][1]; + sum[f + p_Nfaces * e + 1*offset] = s_sum[1][0] + s_sum[1][1]; + sum[f + p_Nfaces * e + 2*offset] = s_sum[2][0] + s_sum[2][1]; + sum[f + p_Nfaces * e + 3*offset] = s_area[0] + s_area[1]; + } + } + } + } } diff --git a/examples/gabls1/gabls.par b/examples/gabls1/gabls.par index fb14e2802..7c9ab2dba 100644 --- a/examples/gabls1/gabls.par +++ b/examples/gabls1/gabls.par @@ -1,39 +1,49 @@ [GENERAL] -#verbose = true -polynomialOrder = 9 -#startFrom = "restart.fld" +polynomialOrder = 8 +cubaturePolynomialOrder = 10 + +#startFrom = "r.fld" + stopAt = endTime endTime = 2600 -dt = 1.0e-2 -#dt = targetCFL=2 + max=5e-2 - +dt = targetCFL=2 + max=2e-2 timeStepper = tombo2 -subCyclingSteps = 1 writeControl = runTime -writeInterval = 100 +writeInterval = 100 + +regularization = hpfrt + nModes=3 + scalingCoeff=10 -regularization = hpfrt + nModes=1 + scalingCoeff=10 +[PROBLEMTYPE] +stressFormulation = true [PRESSURE] +solver = pfcg residualTol = 1e-04 +preconditioner = multigrid+coarse +smootherType = JAC+Chebyshev+degree=2 +pMultigridCoarsening = 8,4,1 [VELOCITY] -boundaryTypeMap = wall, inlet +boundaryTypeMap = v, shl residualTol = 1e-06 density = 1.0 -viscosity = -50e3 +viscosity = 1/50e6 [TEMPERATURE] -boundaryTypeMap = inlet, flux +boundaryTypeMap = flux, flux residualTol = 1e-06 rhoCp = 1.0 -conductivity = -50e3 +conductivity = 1/50e6 + +[BOOMERAMG] +iterations = 1 [CASEDATA] #/* latitude north */ nlat = 73.0 + #/* geostrophic wind speed [m/s] */ uref = 8.0 @@ -47,11 +57,17 @@ scr = -0.25 #/* inversion layer strength [k/m]*/ ilstrength = 0.01 +z1 = 1.0 +z0 = 0.1 + number_elements_x = 16 -number_elements_y = 16 -number_elements_z = 16 +number_elements_y = 16 +number_elements_z = 16 xlen = 4.0 ylen = 4.0 zlen = 4.0 -betam = 2.5 +betam = 4.8 +betah = 7.8 +kappa = 0.4 +zwall = 0.01 # 1.0/lref diff --git a/examples/gabls1/gabls.udf b/examples/gabls1/gabls.udf index 3a06e589b..5644437d4 100644 --- a/examples/gabls1/gabls.udf +++ b/examples/gabls1/gabls.udf @@ -1,12 +1,12 @@ -// -// nekRS User Defined File -// #include #include "udf.hpp" -/* User Functions */ +occa::kernel coriolisKernel; +occa::kernel gradYKernel; +occa::kernel divStressKernel; +occa::kernel visMFKernel; +occa::kernel avgBIDValueKernel; -occa::kernel coriolisKernel; /* latitude north */ static dfloat NLAT; /* geostrophic wind speed [m/s] */ @@ -29,20 +29,165 @@ static dfloat XLEN; static dfloat YLEN; static dfloat ZLEN; +static dfloat Z0; +static dfloat Z1; + +static dfloat SCOOLRATE; +static dfloat T0S; +static dfloat GACC; + static dfloat BETAM; +static dfloat BETAH; +static dfloat KAPPA; +static dfloat ZWALL; + +static dfloat maxDt0 = -1; + +void avgBoundaryValue(nrs_t *nrs, int BID, int nflds, occa::memory o_flds, dfloat *avgs) +{ + mesh_t* mesh = nrs->meshV; + const auto Nelements = mesh->Nelements; + const auto Nfaces = mesh->Nfaces; + const auto offset = Nfaces * Nelements; + + static dfloat *sum; + static dfloat *sumFace; + static dfloat *area; + static occa::memory o_sumFace; + static occa::memory o_area; + + if (!o_sumFace.isInitialized()) { + o_sumFace = platform->device.malloc((nflds+1) * Nfaces * Nelements * sizeof(dfloat)); + sumFace = (dfloat *) calloc((nflds+1) * Nfaces * Nelements, sizeof(dfloat)); + sum = (dfloat *) calloc(nflds+1, sizeof(dfloat)); + } + + avgBIDValueKernel(Nelements, + BID, + nrs->fieldOffset, + offset, + mesh->o_sgeo, + mesh->o_EToB, + mesh->o_vmapM, + o_flds, + o_sumFace); + o_sumFace.copyTo(sumFace, (nflds+1) * Nfaces * Nelements * sizeof(dfloat)); + + for (int j = 0; j < nflds+1; ++j) { + sum[j] = 0; + for (int i = 0; i < Nfaces * Nelements; ++i) { + sum[j] += sumFace[i + j*offset]; + } + } + + MPI_Allreduce(MPI_IN_PLACE, sum, nflds+1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + + const auto invArea = 1/sum[nflds]; + for (int i = 0; i < nflds; ++i) avgs[i] = sum[i] * invArea; +} + + +void findUThetaTau(dfloat uWall, dfloat tempWall, + dfloat& rLength, dfloat& uTau, dfloat& thetaTau) +{ + const auto tol = 1e-8; + const auto brat = BETAH/BETAM; + const auto z1z0 = Z1/Z0; + const auto UmN = KAPPA*uWall/log(z1z0); + const auto ThN = KAPPA*tempWall/log(z1z0); + const auto RiBulk = tempWall*ZWALL*GACC/uWall/uWall; + const auto RiCrit = RiBulk*BETAM*BETAM/BETAH; + + uTau = 0; + thetaTau = 0; + rLength = 1e5; + + if(uWall < tol || RiCrit >= 1) return; + + if(fabs(brat - 1) < tol) { + uTau = UmN * (1 - BETAM*RiBulk); + thetaTau = ThN * (1 - BETAM*RiBulk); + rLength = KAPPA*GACC*thetaTau/uTau/uTau; + } else { // find the 2 roots utau1 and utau2 + const auto Det = sqrt(1 + 4*RiBulk*(BETAH - BETAM)); + if(Det < 0) { + if (platform->comm.mpiRank == 0) std::cout << "findUThetaTau: Det < 0!\n"; + ABORT(EXIT_FAILURE); + } + const auto utau1 = (0.5*(2*brat - 1)*UmN/(brat - 1)) * (1 - Det/(2*brat - 1)); + const auto utau2 = (0.5*(2*brat - 1)*UmN/(brat - 1)) * (1 + Det/(2*brat - 1)); + auto thetau1 = 0.0; + auto thetau2 = 0.0; + if(RiBulk > tol) { + const auto scale = 1/BETAM/KAPPA/GACC/ZWALL; + thetau1 = scale * utau1 * (KAPPA*uWall - utau1*log(z1z0)); + thetau2 = scale * utau2 * (KAPPA*uWall - utau2*log(z1z0)); + } + if(utau1 >= 0 && thetau1 >= 0) { + uTau = utau1; + thetaTau = thetau1; + rLength = KAPPA*GACC*thetaTau/uTau/uTau; + } + } +} + +void average(nrs_t *nrs, occa::memory o_avg) +{ + mesh_t* mesh = nrs->meshV; + const auto fieldOffsetByte = nrs->fieldOffset * sizeof(dfloat); + + // (y) + auto o_uAvg = o_avg.slice(0*fieldOffsetByte, fieldOffsetByte); + o_uAvg.copyFrom(nrs->o_U, fieldOffsetByte, 0, 0*fieldOffsetByte); + + // (y) + auto o_wAvg = o_avg.slice(1*fieldOffsetByte, fieldOffsetByte); + o_wAvg.copyFrom(nrs->o_U, fieldOffsetByte, 0, 2*fieldOffsetByte); + + // (y) + auto o_tempAvg = o_avg.slice(2*fieldOffsetByte, fieldOffsetByte); + o_tempAvg.copyFrom(nrs->cds->o_S, fieldOffsetByte, 0, 0*fieldOffsetByte); + + // d/dy(y) + auto o_ddyAvg = o_avg.slice(3*fieldOffsetByte, 3*fieldOffsetByte); + gradYKernel( + mesh->Nelements, + mesh->o_vgeo, + mesh->o_D, + nrs->fieldOffset, + mesh->o_invLMM, + o_avg, + o_ddyAvg); + oogs::startFinish(o_ddyAvg, 3, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); + + postProcessing::planarAvg(nrs, "x", NUMBER_ELEMENTS_X, NUMBER_ELEMENTS_Y, NUMBER_ELEMENTS_Z, 6, o_avg); + postProcessing::planarAvg(nrs, "z", NUMBER_ELEMENTS_X, NUMBER_ELEMENTS_Y, NUMBER_ELEMENTS_Z, 6, o_avg); +} + void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU) { mesh_t* mesh = nrs->meshV; + const auto fieldOffsetByte = nrs->fieldOffset * sizeof(dfloat); + auto o_divTau = nrs->o_usrwrk.slice(6*fieldOffsetByte, 3*fieldOffsetByte); coriolisKernel( mesh->Nlocal, nrs->fieldOffset, nrs->meshV->o_y, nrs->cds->o_S, nrs->o_U, + o_divTau, o_FU); } +void userq(nrs_t *nrs, dfloat time, occa::memory o_S, occa::memory o_SU) +{ + mesh_t* mesh = nrs->meshV; + const auto fieldOffsetByte = nrs->fieldOffset * sizeof(dfloat); + auto o_divTau = nrs->o_usrwrk.slice(8*fieldOffsetByte, fieldOffsetByte); + platform->linAlg->axpby(mesh->Nlocal, -1.0, o_divTau, 1.0, o_SU, 0, 0); +} + /* UDF Functions */ @@ -57,14 +202,22 @@ void UDF_LoadKernels(occa::properties& kernelInfo) kernelInfo["defines/p_fcor"] = 2*omega*LREF/UREF * sin(NLAT*M_PI/180); kernelInfo["defines/p_bcor"] = 2*omega*LREF/UREF * cos(NLAT*M_PI/180); - kernelInfo["defines/p_gacc"] = LREF*g/(UREF*UREF); + + GACC = LREF*g/(UREF*UREF); + kernelInfo["defines/p_gacc"] = GACC; kernelInfo["defines/p_cond"] = cond; kernelInfo["defines/p_TsGrad"] = ILSTRENGTH * LREF/TREF; - kernelInfo["defines/p_T0s"] = TS0/TREF; - kernelInfo["defines/p_sCoolRate"] = SCR * LREF/TREF/UREF/3600; + T0S = TS0/TREF; + kernelInfo["defines/p_T0s"] = T0S; + SCOOLRATE = SCR * LREF/TREF/UREF/3600; + kernelInfo["defines/p_sCoolRate"] = SCOOLRATE; kernelInfo["defines/p_YLEN"] = YLEN; - coriolisKernel = udfBuildKernel(kernelInfo, "coriolis"); + coriolisKernel = oudfBuildKernel(kernelInfo, "coriolis"); + gradYKernel = oudfBuildKernel(kernelInfo, "gradY"); + divStressKernel = oudfBuildKernel(kernelInfo, "divStress"); + visMFKernel = oudfBuildKernel(kernelInfo, "visMF"); + avgBIDValueKernel = oudfBuildKernel(kernelInfo, "avgBIDValue"); } void UDF_Setup0(MPI_Comm comm, setupAide &options) { @@ -81,7 +234,14 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) platform->par->extract("casedata","xlen",XLEN); platform->par->extract("casedata","ylen",YLEN); platform->par->extract("casedata","zlen",ZLEN); + + platform->par->extract("casedata","z0",Z0); + platform->par->extract("casedata","z1",Z1); + platform->par->extract("casedata","betam",BETAM); + platform->par->extract("casedata","betah",BETAH); + platform->par->extract("casedata","kappa",KAPPA); + platform->par->extract("casedata","zwall",ZWALL); if (platform->options.compareArgs("BUILD ONLY", "FALSE")) { double* const nek_cb_scnrs = (double*) nek::ptr("cb_scnrs"); @@ -105,12 +265,89 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) void UDF_Setup(nrs_t *nrs) { + nrs->o_usrwrk = platform->device.malloc((9*nrs->fieldOffset + 5)*sizeof(dfloat)); udf.uEqnSource = &userf; + udf.sEqnSource = &userq; + + platform->options.setArgs("VERBOSE SOLVER INFO", "TRUE"); + platform->options.setArgs("RUNTIME STATISTICS FREQUENCY", "5000"); + + platform->options.getArgs("MAX DT", maxDt0); + if (maxDt0 > 0) { + const double maxDt = 0.25*maxDt0; + if(platform->comm.mpiRank == 0) printf("\n" "reseting MAX DT = %g\n", maxDt); + platform->options.setArgs("MAX DT", to_string_f(maxDt)); + } } void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) { + mesh_t* mesh = nrs->meshV; + const auto fieldOffsetByte = nrs->fieldOffset * sizeof(dfloat); + + { + static int calld = 0; + if(maxDt0 > 0 && time > 100 && !calld) { + if(platform->comm.mpiRank == 0) printf("restore original MAX DT = %g\n", maxDt0); + platform->options.setArgs("MAX DT", to_string_f(maxDt0)); + calld++; + } + } + + auto o_avg = nrs->o_usrwrk.slice(0*fieldOffsetByte, 6*fieldOffsetByte); + average(nrs, o_avg); + + const int bId = 2; + dfloat avgWallValues[3]; + avgBoundaryValue(nrs, bId, 3, o_avg, avgWallValues); + + dfloat tauWall[5]; + const dfloat thts = tauWall[0] = T0S + SCOOLRATE*time; + + const auto uAvgWall = avgWallValues[0]; + const auto wAvgWall = avgWallValues[1]; + const auto magUWall = tauWall[1] = sqrt(uAvgWall*uAvgWall + wAvgWall*wAvgWall); + const auto tempAvgWall = tauWall[2] = avgWallValues[2]; + + dfloat rLength; + const dfloat diffTempWall = fabs(tempAvgWall - (thts - 1.)); + findUThetaTau(magUWall, diffTempWall, rLength, tauWall[3], tauWall[4]); + nrs->o_usrwrk.copyFrom(tauWall, sizeof(tauWall), 9*fieldOffsetByte); + + const auto factru = (1. + BETAM*ZWALL*rLength)/KAPPA/ZWALL; + const auto scale = 1./(factru*factru); + auto o_ddyAvg = o_avg.slice(3*fieldOffsetByte, 3*fieldOffsetByte); + visMFKernel( + mesh->Nlocal, + nrs->fieldOffset, + scale, + o_ddyAvg, + platform->o_mempool.slice0); +#if 0 + platform->lingAlg->fill(mesh->Nlocal, 0.0, platform->o_mempool.slice0); +#endif + + auto o_divTau = nrs->o_usrwrk.slice(6*fieldOffsetByte, 3*fieldOffsetByte); + divStressKernel( + mesh->Nelements, + nrs->fieldOffset, + mesh->o_invLMM, + mesh->o_vgeo, + mesh->o_D, + platform->o_mempool.slice0, + o_ddyAvg, + o_divTau); + oogs::startFinish(o_divTau, 3, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); + if (nrs->isOutputStep) { + if(platform->comm.mpiRank == 0) { + printf("uAvgWall: %g wAvgWall: %g magUWall: %g tempAvgWall: %g\n" + "uTau: %g thetaTau: %g rLength: %g\n", + uAvgWall, wAvgWall, tauWall[1], tempAvgWall, + tauWall[3], tauWall[4], rLength); + } + + //writeFld("avg", time, 1, 1, &o_NULL, &o_NULL, &o_avg, 6); nek::ocopyToNek(time, tstep); nek::userchk(); nek::ocopyFromNek(time); diff --git a/examples/gabls1/gabls.usr b/examples/gabls1/gabls.usr index 4edaeada6..33585f502 100644 --- a/examples/gabls1/gabls.usr +++ b/examples/gabls1/gabls.usr @@ -1,45 +1,6 @@ #define INTP_NMAX 200 /* number of sample points for 1D profile */ #define PI (4.*atan(1.)) -c----------------------------------------------------------------------- - subroutine uservp (ix,iy,iz,ieg) - include 'SIZE' - include 'TOTAL' - include 'NEKUSE' - - return - end -c----------------------------------------------------------------------- - subroutine userf (ix,iy,iz,ieg) - include 'SIZE' - include 'TOTAL' - include 'NEKUSE' - include 'CASEDATA' - - common /WFPAR/ gacc - $ ,thts0, thts, ts_rate, ts_grad - $ ,fcor, bcor - - - yarge = YLEN*(YLEN-y) - factor = exp(-yarge) - - ffx = fcor * uz + (1.-ux)*factor - ffy = + bcor * uz + temp*gacc - ffz = -fcor * (ux-1.) - bcor * uy + (0.-uz)*factor - - return - end -c----------------------------------------------------------------------- - subroutine userq (ix,iy,iz,ieg) - include 'SIZE' - include 'TOTAL' - include 'NEKUSE' - - qvol = 0.0 - - return - end c----------------------------------------------------------------------- subroutine userchk include 'SIZE' @@ -208,38 +169,6 @@ c average horizontally nu and interpolate call flush(58) endif - return - end -c----------------------------------------------------------------------- - subroutine userbc (ix,iy,iz,iside,ieg) - include 'SIZE' - include 'TOTAL' - include 'NEKUSE' - - common /WFPAR/ gacc - $ ,thts0, thts, ts_rate, ts_grad - $ ,fcor, bcor - - real usn(3), tsn(3), bsn(3) - character*3 cbv3, cbt3 - - ie = gllel(ieg) - - cbv3 = cbc(iside,ie,1) - cbt3 = cbc(iside,ie,2) - - if(ifield.eq.1) then - ux = 1.0 - uy = 0.0 - uz = 0.0 - elseif(ifield.eq.2) then - if(cbt3.eq.'f ') then - flux = param(8)*ts_grad - elseif(cbt3.eq.'t ') then - temp = thts0 + ts_rate*time - 1. - endif - endif - return end c----------------------------------------------------------------------- @@ -256,29 +185,24 @@ c----------------------------------------------------------------------- real kx, ky ux = 1.0 +c if(y.le.1.) ux = y*(2.-y) + uy = 0.0 uz = 0.0 - eps = 1.e-2 - epsT= 0.1/TREF - kx = 23 - kz = 13 - - alpha = kx * 2*PI/XLEN - beta = kz * 2*PI/ZLEN - - ! add perturbation to trigger turbulence -c if(y.le.0.5) then - ux = ux + eps*beta * sin(alpha*x)*cos(beta*z) - uy = uy + eps * sin(alpha*x)*sin(beta*z) - uz = uz - eps*alpha * cos(alpha*x)*sin(beta*z) -c endif - + ! thin boundary layer at the lower wall if(y.le.1) then - temp = (thts0 - 1.) + kx = 23 + kz = 13 + alpha = kx * 2*PI/XLEN + beta = kz * 2*PI/ZLEN + + gamma = 5e-6 ! initial thickness + epsT = 0.1/tref + temp = (thts0 - 1.) if(y.le.0.5) temp = temp + epsT*sin(alpha*x)*sin(beta*z) else - temp =(thts0 - 1.) + ts_grad * (y-1.) + temp = (thts0 - 1.) + ts_grad * (y-1.) endif return @@ -287,6 +211,7 @@ c----------------------------------------------------------------------- subroutine usrdat ! This routine to modify element vertices include 'SIZE' ! _before_ mesh is generated, which include 'TOTAL' ! guarantees GLL mapping of mesh. + include 'CASEDATA' return end @@ -297,7 +222,7 @@ c----------------------------------------------------------------------- include 'CASEDATA' common /scnrs/ sc_nrs(20) - real sc_nrs + real sc_nrs nlat = sc_nrs(1) uref = sc_nrs(2) @@ -315,19 +240,13 @@ c----------------------------------------------------------------------- betam = sc_nrs(14) call rescale_x(xm1, 0.0, XLEN) - call rescale_x(zm1, 0.0, ZLEN) - -c call rescale_x(ym1, 0.0, 1.0) -c do i=1,nx1*ny1*nz1*nelt -c ym1(i,1,1,1) = tanh(BETAM*(ym1(i,1,1,1)-1))/tanh(BETAM) + 1 -c enddo - call rescale_x(ym1, 0.0, YLEN) + call rescale_x(zm1, 0.0, ZLEN) do iel=1,nelt do ifc=1,2*ndim - if (cbc(ifc,iel,1) .eq. 'W ') boundaryID(ifc,iel) = 1 - if (cbc(ifc,iel,1) .eq. 'v ') boundaryID(ifc,iel) = 2 + if (cbc(ifc,iel,1) .eq. 'W ') boundaryID(ifc,iel) = 2 + if (cbc(ifc,iel,1) .eq. 'v ') boundaryID(ifc,iel) = 1 cbc(ifc,iel,2) = cbc(ifc,iel,1) if (cbc(ifc,iel,1) .eq. 'W ') cbc(ifc,iel,2) = 't ' diff --git a/examples/kershaw/README.md b/examples/kershaw/README.md new file mode 100644 index 000000000..41cbef6c2 --- /dev/null +++ b/examples/kershaw/README.md @@ -0,0 +1,111 @@ +# Kershaw BP5 and BPS5 + +## Performance Results + +### NVIDIA V100 +``` +BPS5 +solve time: 0.521428s + preconditioner 0.402965s + smoother 0.251669s + coarse grid 0.103442s +iterations: 31 +throughput: 2.17135e+08 (DOF x iter)/s +throughput: 7.00436e+06 DOF/s +FLOPS/s: 4.90859e+11 + +BP5 +solve time: 1.47599s +throughput: 2.47446e+09 (DOF x iter)/s +FLOPS/s: 4.69537e+11 +``` + +### NVIDIA A100 +``` +BPS5 +solve time: 0.296066s + preconditioner 0.224607s + smoother 0.154094s + coarse grid 0.0426403s +iterations: 31 +throughput: 3.82415e+08 (DOF x iter)/s +throughput: 1.2336e+07 DOF/s +FLOPS/s: 8.64493e+11 + +BP5 +solve time: 0.895191s +throughput: 4.07987e+09 (DOF x iter)/s +FLOPS/s: 7.7417e+11 +``` + +### NVIDIA A100, 2 GPU +``` +BPS5 +solve time: 0.199604s + preconditioner 0.161264s + smoother 0.112551s + coarse grid 0.0285382s +iterations: 31 +throughput: 5.67224e+08 (DOF x iter)/s +throughput: 1.82976e+07 DOF/s +FLOPS/s: 1.28228e+12 + +BP5 +solve time: 0.513648s +throughput: 7.11045e+09 (DOF x iter)/s +FLOPS/s: 1.34923e+12 +``` + +### AMD MI100 +``` +BPS5 +solve time: 0.508355s + preconditioner 0.389843s + smoother 0.28957s + coarse grid 0.051806s +iterations: 31 +throughput: 2.22719e+08 (DOF x iter)/s +throughput: 7.18447e+06 DOF/s +FLOPS/s: 5.03481e+11 + +BP5 +solve time: 1.41609s +throughput: 2.57912e+09 (DOF x iter)/s +FLOPS/s: 4.89396e+11 +``` + +### AMD MI250X/1 +``` +BPS5 +solve time: 0.440587s + preconditioner 0.335607s + smoother 0.242305s + coarse grid 0.0527129s +iterations: 31 +throughput: 2.56976e+08 (DOF x iter)/s +throughput: 8.28954e+06 DOF/s +FLOPS/s: 5.80923e+11 + +BP5 +solve time: 1.17002s +throughput: 3.12153e+09 (DOF x iter)/s +FLOPS/s: 5.92321e+11 +``` + +### AMD MI250X +``` +BPS5 +solve time: 0.333465s + preconditioner 0.274027s + smoother 0.189027s + coarse grid 0.0548005s +iterations: 31 +throughput: 3.39527e+08 (DOF x iter)/s +throughput: 1.09525e+07 DOF/s +FLOPS/s: 7.67539e+11 + +BP5 +solve time: 0.74642s +throughput: 4.89304e+09 (DOF x iter)/s +FLOPS/s: 9.28472e+11 +``` diff --git a/examples/kershaw/kershaw.par b/examples/kershaw/kershaw.par index 69a79994c..2640812e9 100644 --- a/examples/kershaw/kershaw.par +++ b/examples/kershaw/kershaw.par @@ -1,36 +1,43 @@ [GENERAL] #verbose = true polynomialOrder = 7 +dealiasing = false +timeStepper = tombo1 stopAt = numSteps numSteps = 0 [PRESSURE] -solver = pfgmres+nVector=20 +solver = pfgmres+nVector=30 maxIterations = 200 residualTol = 1e-8+relative +# p-MG, smooth on coarsest level rather than solve +#preconditioner = multigrid + # p-MG multiplicative V-cycle, coarse grid solve preconditioner = multigrid+coarse -coarseSolver = FEM+BoomerAMG #+Galerkin -smootherType = RAS+Chebyshev+degree=2 #+minEigenvalueBoundFactor=0.1+maxEigenvalueBoundFactor=1.1 - -# p-MG, smooth on coarsest level rather than solve -#preconditioner = pmg -#coarseSolver = none +#coarseSolver = BoomerAMG +#coarseGridDiscretization = FEM #+Galerkin # p-MG, solve coarsest level using SEMFEM discretization #preconditioner = pmg+coarse -#coarseSolver = SEMFEM+AmgX+FP32 +#coarseSolver = AmgX+FP32 +#coarseGridDiscretization = SEMFEM # p-MG additive V-cycle, overlapping coarse grid solve # requires smootherType = #preconditioner = multigrid+additive+overlapCrsSolve+coarse -#smootherType = ASM -pMultigridCoarsening = 7,3,1 +smootherType = RAS+Chebyshev+degree=2 #+minEigenvalueBoundFactor=0.1+maxEigenvalueBoundFactor=1.1 +#pMultigridCoarsening = 7,5,3,1 + +# SEMFEM +#preconditioner = semfem+AMGX+FP32 + initialGuess = previous -[VELOCITY] +[VELOCITY] # dummy +preconditioner = none density = 1.0 viscosity = 1.0 @@ -47,4 +54,5 @@ viscosity = 1.0 #configFile = "amgx.json" [CASEDATA] -P_EPS = 0.3 \ No newline at end of file +P_EPS = 0.3 +gsOverlap = 1 diff --git a/examples/kershaw/kershaw.udf b/examples/kershaw/kershaw.udf index 85c88cd79..7fc038ff8 100644 --- a/examples/kershaw/kershaw.udf +++ b/examples/kershaw/kershaw.udf @@ -1,4 +1,5 @@ #include +#include #include "udf.hpp" /* UDF Functions */ @@ -15,6 +16,10 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) double* const nek_cb_scnrs = (double*) nek::ptr("cb_scnrs"); nek_cb_scnrs[0] = P_EPS; } + + int gsOverlap = 1; + platform->par->extract("casedata", "gsoverlap", gsOverlap); + if(!gsOverlap) platform->options.setArgs("GS OVERLAP", "FALSE"); } void UDF_Setup(nrs_t *nrs) @@ -33,47 +38,95 @@ void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) platform->linAlg->fillKernel(mesh->Nlocal, 1.0, nrs->o_ellipticCoeff); platform->o_mempool.slice0.copyFrom(platform->mempool.slice0, mesh->Nlocal * sizeof(dfloat)); - // warm-up - ellipticSolve(nrs->pSolver, platform->o_mempool.slice0, nrs->o_P); - platform->timer.reset("pressureSolve"); + if(platform->comm.mpiRank == 0) + std::cout << "\nrunning benchmarks\n"; - const int Nrep = 10; - if(platform->comm.mpiRank == 0) printf("solving "); - for (int i = 0; i < Nrep; i++) { - platform->linAlg->fillKernel(mesh->Nlocal, 0.0, nrs->o_P); - platform->o_mempool.slice0.copyFrom(platform->mempool.slice0, mesh->Nlocal * sizeof(dfloat)); + for(std::string benchmark : {"BPS5", "BP5"}){ - platform->timer.tic("pressureSolve", 1); - ellipticSolve(nrs->pSolver, platform->o_mempool.slice0, nrs->o_P); - platform->timer.toc("pressureSolve"); - - if(platform->comm.mpiRank == 0) { - printf("."); - fflush(stdout); + MPI_Barrier(platform->comm.mpiComm); + if(platform->comm.mpiRank == 0) + std::cout << "\n" << benchmark << "\n"; + + int Nrep = 0; + if(benchmark == "BP5"){ + Nrep = 25; + nrs->pSolver->options.setArgs("PRECONDITIONER", "NONE"); + nrs->pSolver->options.setArgs("KRYLOV SOLVER", "PCG"); + nrs->pSolver->options.setArgs("MAXIMUM ITERATIONS", "1000"); + nrs->pSolver->options.setArgs("SOLVER TOLERANCE", to_string_f(1e-15)); + ellipticPreconditionerSetup(nrs->pSolver, nrs->pSolver->ogs); } - } - if(platform->comm.mpiRank == 0) printf("\n"); - - hlong Ntotal = mesh->Nelements * (mesh->N * mesh->N * mesh->N); - MPI_Allreduce(MPI_IN_PLACE, &Ntotal, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - const double solveTime = platform->timer.query("pressureSolve", "DEVICE:MAX")/Nrep; - const double precoTime = platform->timer.query("pressure preconditioner", "DEVICE:MAX")/Nrep; - const double smootherTime = platform->timer.query("pressure preconditioner smoother", "DEVICE:MAX")/Nrep; - const double crsTime = [&](){ - const double tBoomerAMG = platform->timer.query("BoomerAMGSolve", "HOST:MAX")/Nrep; - const double tAMGX = platform->timer.query("AmgXSolve", "DEVICE:MAX")/Nrep; - const double tCrsSEMFEM = platform->timer.query("Coarse SEMFEM Solve", "DEVICE:MAX")/Nrep; - const double maxTime = tBoomerAMG > tAMGX ? tBoomerAMG : tAMGX; - return maxTime > tCrsSEMFEM ? maxTime : tCrsSEMFEM; - }(); - if(platform->comm.mpiRank == 0) { + if(benchmark == "BPS5"){ + Nrep = 50; + nrs->pSolver->options.setArgs("SOLVER TOLERANCE", to_string_f(1e-8)); + nrs->pSolver->options.setArgs("LINEAR SOLVER STOPPING CRITERION", "RELATIVE"); + } + + platform->timer.reset("pressureSolve"); + platform->timer.reset("pressure preconditioner"); + platform->timer.reset("pressure preconditioner smoother"); + platform->timer.reset("coarseSolve"); + platform->flopCounter->clear(); + + std::vector eTime; + std::vector precoTime; + std::vector smootherTime; + std::vector crsTime; + + for (int i = 0; i < Nrep; i++) { + platform->linAlg->fillKernel(mesh->Nlocal, 0.0, nrs->o_P); + platform->o_mempool.slice0.copyFrom(platform->mempool.slice0, mesh->Nlocal * sizeof(dfloat)); + + platform->timer.tic("pressureSolve", 1); + ellipticSolve(nrs->pSolver, platform->o_mempool.slice0, nrs->o_P); + platform->timer.toc("pressureSolve"); + + eTime.push_back(platform->timer.query("pressureSolve", "DEVICE:MAX")); + precoTime.push_back(platform->timer.query("pressure preconditioner", "DEVICE:MAX")); + smootherTime.push_back(platform->timer.query("pressure preconditioner smoother", "DEVICE:MAX")); + crsTime.push_back(platform->timer.query("coarseSolve", "DEVICE:MAX")); + + platform->timer.reset("pressureSolve"); + platform->timer.reset("pressure preconditioner"); + platform->timer.reset("pressure preconditioner smoother"); + platform->timer.reset("coarseSolve"); + + if(platform->comm.mpiRank == 0) { + printf("."); + fflush(stdout); + } + } + if(platform->comm.mpiRank == 0) printf("\n"); + + hlong Ntotal = mesh->Nelements * (mesh->N * mesh->N * mesh->N); + MPI_Allreduce(MPI_IN_PLACE, &Ntotal, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); + + const auto minEntry = std::distance(eTime.begin(), std::min_element(eTime.begin(), eTime.end())); + const double solveTime = eTime.at(minEntry); + const double solveTimeMax = *std::max_element(eTime.begin(), eTime.end()); + const double solveTimeAvg = std::accumulate(eTime.begin(), eTime.end(), 0.0) / Nrep; const double throughput = (double)Ntotal/solveTime; - printf("iterations: %d\n", nrs->pSolver->Niter); - printf("avg solve time: %g s\n", solveTime); - printf(" preconditioner %g s\n", precoTime); - if(smootherTime > 0) printf(" smoother %g s\n", smootherTime); - if(crsTime > 0) printf(" coarse grid %g s\n", crsTime); - printf("avg throughput: %g DOF/s\n", throughput); - printf("avg throughput: %g (DOF x iter)/s\n", (double) nrs->pSolver->Niter * throughput); + const double flopsPerSecond = (platform->flopCounter->get(platform->comm.mpiComm)/Nrep)/solveTime; + + if(platform->comm.mpiRank == 0) { + printf("repetitions: %d\n", Nrep); + printf("solve time: min: %gs avg: %gs max: %gs\n", solveTime, solveTimeAvg, solveTimeMax); + if(precoTime.at(minEntry) > 0) printf(" preconditioner %gs\n", precoTime.at(minEntry)); + if(smootherTime.at(minEntry) > 0) printf(" smoother %gs\n", smootherTime.at(minEntry)); + if(crsTime.at(minEntry) > 0) printf(" coarse grid %gs\n", crsTime.at(minEntry)); + printf("iterations: %d\n", nrs->pSolver->Niter); + printf("throughput: %g (DOF x iter)/s\n", (double) nrs->pSolver->Niter * throughput); + if(benchmark == "BPS5") { + printf("throughput: %g DOF/s\n", throughput); + int nRestart = 0; + nrs->pSolver->options.getArgs("PGMRES RESTART", nRestart); + if(nrs->pSolver->Niter == nRestart) + printf("WARNING: maximum iterations reached!"); + } + if(!nrs->pSolver->options.compareArgs("PRECONDITIONER", "SEMFEM")){ + printf("FLOPS/s: %g \n", flopsPerSecond); + } + } } + } diff --git a/examples/ktauChannel/channel.udf b/examples/ktauChannel/channel.udf index 51586b779..29c7788f1 100644 --- a/examples/ktauChannel/channel.udf +++ b/examples/ktauChannel/channel.udf @@ -46,8 +46,8 @@ void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S, void UDF_LoadKernels(occa::properties& kernelInfo) { - userfKernel = udfBuildKernel(kernelInfo, "cfill"); - scalarScaledAddKernel = udfBuildKernel(kernelInfo, "scalarScaledAdd"); + userfKernel = oudfBuildKernel(kernelInfo, "cfill"); + scalarScaledAddKernel = oudfBuildKernel(kernelInfo, "scalarScaledAdd"); RANSktau::buildKernel(kernelInfo); } diff --git a/examples/lowMach/lowMach.udf b/examples/lowMach/lowMach.udf index a325a785d..e25db1fbe 100644 --- a/examples/lowMach/lowMach.udf +++ b/examples/lowMach/lowMach.udf @@ -40,8 +40,8 @@ void uservp(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_S, void UDF_LoadKernels(occa::properties& kernelInfo) { kernelInfo["defines/p_DELTA"] = P_DELTA; - userQKernel = udfBuildKernel(kernelInfo, "userQ"); - userVpKernel = udfBuildKernel(kernelInfo, "userVp"); + userQKernel = oudfBuildKernel(kernelInfo, "userQ"); + userVpKernel = oudfBuildKernel(kernelInfo, "userVp"); lowMach::buildKernel(kernelInfo); } diff --git a/examples/mv_cyl/CASEDATA b/examples/mv_cyl/CASEDATA index de28fc73e..6ec99e40f 100644 --- a/examples/mv_cyl/CASEDATA +++ b/examples/mv_cyl/CASEDATA @@ -1,5 +1,7 @@ real xlen, ylen, zlen, - $ p_gamma, p_delta, p_omega, p_amp + $ p_gamma, p_delta, p_omega, p_amp, + $ p_rot common /casevars/ xlen, ylen, zlen, - $ p_gamma, p_delta, p_omega, p_amp + $ p_gamma, p_delta, p_omega, p_amp, + $ p_rot diff --git a/examples/mv_cyl/ci.inc b/examples/mv_cyl/ci.inc index c657a45af..1545f4db5 100644 --- a/examples/mv_cyl/ci.inc +++ b/examples/mv_cyl/ci.inc @@ -25,6 +25,8 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("PRESSURE SOLVER TOLERANCE", std::string("1e-10")); options.setArgs("SCALAR00 SOLVER TOLERANCE", std::string("1e-12")); options.setArgs("MESH SOLVER", "USER"); + + platform->par->set("casedata", "p_rot", 0.0); if(ciMode == 2){ options.setArgs("SUBCYCLING STEPS", std::string("1")); @@ -40,6 +42,14 @@ void ciSetup(MPI_Comm comm, setupAide &options) options.setArgs("MESH SOLVER", "ELASTICITY"); options.setArgs("MESH SOLVER TOLERANCE", "1e-6"); } + if(ciMode == 5){ + platform->par->set("casedata", "p_rot", 0.5); + } + if(ciMode == 6){ + options.setArgs("MESH SOLVER", "ELASTICITY"); + options.setArgs("MESH SOLVER TOLERANCE", "1e-12"); + platform->par->set("casedata", "p_rot", 0.5); + } } void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) @@ -60,18 +70,18 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) { expectedErr[0] = 0.2465620E-06; expectedErr[1] = 0.6938894E-16; - expectedErr[2] = 0.3044039E-05; - expectedErr[3] = 0.2144404E-05; + expectedErr[2] = 0.48E-05; + expectedErr[3] = 0.295E-05; expectedErr[4] = 0.2465620E-05; - expectedErr[5] = 0.1080412E-05; + expectedErr[5] = 0.125E-05; expectedPIter = 6; } if(ciMode == 2) { expectedErr[0] = 0.2465620E-06; expectedErr[1] = 0.6938894E-16; - expectedErr[2] = 0.3044039E-05; - expectedErr[3] = 0.2144404E-05; + expectedErr[2] = 0.48E-05; + expectedErr[3] = 0.295E-05; expectedErr[4] = 0.2465620E-05; expectedErr[5] = 0.2443839E-05; expectedPIter = 7; @@ -80,8 +90,8 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) { expectedErr[0] = 0.2465620E-06; expectedErr[1] = 0.6938894E-16; - expectedErr[2] = 0.3044039E-05; - expectedErr[3] = 0.1966487E-05; + expectedErr[2] = 0.48E-05; + expectedErr[3] = 0.295E-05; expectedErr[4] = 0.2465620E-05; expectedErr[5] = 0.2443839E-05; } @@ -89,13 +99,33 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) { expectedErr[0] = 0.2465620E-06; expectedErr[1] = 0.6938894E-16; - expectedErr[2] = 0.3044039E-05; - expectedErr[3] = 0.1966487E-05; + expectedErr[2] = 0.48E-05; + expectedErr[3] = 0.295E-05; expectedErr[4] = 0.2465620E-05; expectedErr[5] = 0.2443839E-05; } + if(ciMode == 5) + { + expectedErr[0] = 0.2465620E-06; + expectedErr[1] = 0.6938894E-16; + expectedErr[2] = 0.48E-05; + expectedErr[3] = 0.295E-05; + expectedErr[4] = 0.2465620E-05; + expectedErr[5] = 0.125E-05; + expectedPIter = 6; + } + if(ciMode == 6) + { + expectedErr[0] = 0.2465620E-06; + expectedErr[1] = 0.6938894E-16; + expectedErr[2] = 0.3E-05; + expectedErr[3] = 0.21E-05; + expectedErr[4] = 0.2465620E-05; + expectedErr[5] = 0.108E-05; + expectedPIter = 6; + } - const dfloat floor = 1e-12; // values below this threshold are ignored + const dfloat absTol = 1e-8; // values below this threshold are ignored const int expectedVIter = 11; double vxErr, prErr; @@ -108,10 +138,10 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) if(rank == 0) printf("relative error to target: "); for(int i = 0 ; i < Nfields; ++i){ - if(abs(err[i]) > floor){ + if(abs(err[i]) > absTol){ const dfloat relErr = abs(err[i]-expectedErr[i])/expectedErr[i]; if(rank == 0) printf("err[%d]=%g ", i, relErr); - if(relErr > EPS) passed = false; + if(relErr > EPS && abs(expectedErr[i]) < abs(err[i])) passed = false; } } velIterErr = abs(expectedVIter - nrs->uvwSolver->Niter); @@ -126,11 +156,6 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) const int meshIterErr = abs(nrs->meshSolver->Niter - expectedMeshIter); if(meshIterErr >= 3) passed = false; } - if(ciMode == 4) - { - const int meshIterThreshold = 200; - if(nrs->meshSolver->Niter >= meshIterThreshold) passed = false; - } passed ? (PASS) : (FAIL); } diff --git a/examples/mv_cyl/mv_cyl.oudf b/examples/mv_cyl/mv_cyl.oudf index 3314070a4..289c3d848 100644 --- a/examples/mv_cyl/mv_cyl.oudf +++ b/examples/mv_cyl/mv_cyl.oudf @@ -1,9 +1,20 @@ // Boundary conditions void velocityDirichletConditions(bcData *bc) { - bc->u = 0.0; - bc->v = p_AMP*sin(p_OMEGA*bc->time); - bc->w = 0.0; + const dfloat u = 0.0; + const dfloat v = p_AMP*sin(p_OMEGA*bc->time); + const dfloat w = 0.0; + + bc->u = u * cos(p_ROT) - v * sin(p_ROT); + bc->v = u * sin(p_ROT) + v * cos(p_ROT); + bc->w = w; +} + +void meshVelocityDirichletConditions(bcData *bc) +{ + bc->meshu = bc->u; + bc->meshv = bc->v; + bc->meshw = bc->w; } @kernel void userVp(const dlong Nelements, @@ -26,22 +37,26 @@ void velocityDirichletConditions(bcData *bc) } @kernel void userMeshVelocity(const dlong Nelements, const dlong offset, - const dfloat ymin, - const dfloat ymax, + const dfloat hmin, + const dfloat hmax, const dfloat time, - @restrict const dfloat * y, + @restrict const dfloat * h, @restrict dfloat * W) { for(dlong e=0;epar->extract("casedata", "p_delta",P_DELTA); platform->par->extract("casedata", "p_omega",P_OMEGA); platform->par->extract("casedata", "p_amp",P_AMP); + platform->par->extract("casedata", "p_rot",P_ROT); if (platform->options.compareArgs("BUILD ONLY", "FALSE")) { double* const nek_cb_scnrs = (double*) nek::ptr("cb_scnrs"); nek_cb_scnrs[0] = XLEN; @@ -80,6 +83,7 @@ void UDF_Setup0(MPI_Comm comm, setupAide &options) nek_cb_scnrs[4] = P_DELTA; nek_cb_scnrs[5] = P_OMEGA; nek_cb_scnrs[6] = P_AMP; + nek_cb_scnrs[7] = P_ROT; } } @@ -97,14 +101,24 @@ void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) cds_t* cds = nrs->cds; linAlg_t* linAlg = platform->linAlg; if(platform->options.compareArgs("MESH SOLVER", "USER")){ - const dfloat ymin = linAlg->min(mesh->Nlocal, mesh->o_y, platform->comm.mpiComm); - const dfloat ymax = linAlg->max(mesh->Nlocal, mesh->o_y, platform->comm.mpiComm); + + // rotate back into reference frame + platform->linAlg->axpbyz( + mesh->Nlocal, + -1.0 * std::sin(P_ROT), + mesh->o_x, + std::cos(P_ROT), + mesh->o_y, + platform->o_mempool.slice0); + + const dfloat hmin = linAlg->min(mesh->Nlocal, platform->o_mempool.slice0, platform->comm.mpiComm); + const dfloat hmax = linAlg->max(mesh->Nlocal, platform->o_mempool.slice0, platform->comm.mpiComm); userMeshVelocityKernel(mesh->Nelements, nrs->fieldOffset, - ymin, - ymax, + hmin, + hmax, time, - mesh->o_y, + platform->o_mempool.slice0, mesh->o_U); } diff --git a/examples/mv_cyl/mv_cyl.usr b/examples/mv_cyl/mv_cyl.usr index ded9547f2..1801d713b 100644 --- a/examples/mv_cyl/mv_cyl.usr +++ b/examples/mv_cyl/mv_cyl.usr @@ -80,13 +80,19 @@ C======================================================================= INCLUDE 'NEKUSE' include 'CASEDATA' + real uxt, uyt + omega = P_OMEGA amp = P_AMP IF (IFIELD .eq. 1) THEN - UX = 0.0 - UY = amp*sin(omega*time) ! piston + uxt = 0.0 + uyt = amp * sin(omega * time) + + UX = uxt * cos(P_ROT) - uyt * sin(P_ROT) + UY = uxt * sin(P_ROT) + uyt * cos(P_ROT) UZ = 0.0 + ENDIF return @@ -141,11 +147,23 @@ C======================================================================= P_DELTA = sc_nrs(5) P_OMEGA = sc_nrs(6) P_AMP = sc_nrs(7) + P_ROT = sc_nrs(8) call rescale_x(xm1, 0.0,XLEN) call rescale_x(ym1,-1.0,-1.0+YLEN) call rescale_x(zm1, 0.0,ZLEN) + ! rotate coordinates + ntot = nx1*ny1*nz1*nelt + do i=1,ntot + xpt = xm1(i,1,1,1) + ypt = ym1(i,1,1,1) + + xm1(i,1,1,1) = xpt * cos(P_ROT) - ypt * sin(P_ROT) + ym1(i,1,1,1) = xpt * sin(P_ROT) + ypt * cos(P_ROT) + enddo + + do iel=1,nelt do ifc=5,6 cbc(ifc,iel,1) = 'P ' @@ -200,22 +218,37 @@ C======================================================================= include 'TOTAL' include 'CASEDATA' + real ht(lx1, ly1, lz1, lelt) + real xp, yp + + real hmin, hmax, hlen, h, hsc + real wxt, wyt + common /c_mybc/ omega, v_piston n = nx1*ny1*nz1*nelv - ymin = glmin(ym1,n) - ymax = glmax(ym1,n) ! this is the separation between moving and nonmvg - ylen = ymin-ymax + do i=1,n + xp = xm1(i,1,1,1) + yp = ym1(i,1,1,1) + ht(i,1,1,1) = -xp * sin(P_ROT) + yp * cos(P_ROT) + enddo + + hmin = glmin(ht,n) + hmax = glmax(ht,n) ! this is the separation between moving and nonmvg + hlen = hmin-hmax omega = P_OMEGA amp = P_AMP v_piston = amp*sin(omega*time_) do i=1,n - yy = ym1(i,1,1,1) - ysc = (yy-ymax)/ylen - wx(i,1,1,1) = 0 - wy(i,1,1,1) = ysc*v_piston + h = ht(i,1,1,1) + hsc = (h-hmax)/hlen + wxt = 0.0 + wyt = hsc * v_piston + + wx(i,1,1,1) = wxt * cos(P_ROT) - wyt * sin(P_ROT) + wy(i,1,1,1) = wxt * sin(P_ROT) + wyt * cos(P_ROT) if (if3d) wz(i,1,1,1) = 0 enddo @@ -252,6 +285,7 @@ c real tavg, tex, var real err(6) save err + real ht(lx1, ly1, lz1, lelt) COMMON /NRSSCPTR/ nrs_scptr(1) integer*8 nrs_scptr @@ -261,7 +295,12 @@ c ntotv = nxyz*nelv termV = -1.*glcflux(vx,vy,vz) - ypist = glmin(ym1,ntotv) + do i = 1,ntotv + xp = xm1(i,1,1,1) + yp = ym1(i,1,1,1) + ht(i,1,1,1) = -xp * sin(P_ROT) + yp * cos(P_ROT) + enddo + ypist = glmin(ht,ntotv) call exact_sol (time,volex,vpex,pex,dpdtex,qtlex,ypex) tavg = glsc2(t, bm1, ntotv) / volvm1 diff --git a/examples/mv_cyl/mv_cyl_derived_bc.oudf b/examples/mv_cyl/mv_cyl_derived_bc.oudf new file mode 100644 index 000000000..1a712c005 --- /dev/null +++ b/examples/mv_cyl/mv_cyl_derived_bc.oudf @@ -0,0 +1,56 @@ +// Boundary conditions +void velocityDirichletConditions(bcData *bc) +{ + const dfloat u = 0.0; + const dfloat v = p_AMP*sin(p_OMEGA*bc->time); + const dfloat w = 0.0; + + bc->u = u * cos(p_ROT) - v * sin(p_ROT); + bc->v = u * sin(p_ROT) + v * cos(p_ROT); + bc->w = w; +} + +@kernel void userVp(const dlong Nelements, + const dlong uOffset, + const dlong sOffset, + const double p0th, + @restrict const dfloat * TEMP, + @restrict dfloat * UPROP, + @restrict dfloat * SPROP) +{ + for(dlong e=0;etrn = 0.0; + bc->tr1 = -3.0 * p_visc; + bc->tr2 = 0.0; +} diff --git a/examples/shlChannel/channel.par b/examples/shlChannel/channel.par new file mode 100644 index 000000000..54bb2ec10 --- /dev/null +++ b/examples/shlChannel/channel.par @@ -0,0 +1,25 @@ +[GENERAL] +polynomialOrder = 7 +stopAt = numSteps +numSteps = 100 +dt = 1e-03 +timeStepper = tombo2 +subcyclingSteps = 2 + +writeControl = steps +writeInterval = 100 + +[PROBLEMTYPE] +stressFormulation = true # required for variable viscosity/rotated shl/sym boundaries + +[PRESSURE] +residualTol = 1e-08 + +[VELOCITY] +boundaryTypeMap = shl, sym +residualTol = 1e-12 +density = 1.0 +viscosity = -100.0 + +[CASEDATA] +P_ROT = 0.5 diff --git a/examples/shlChannel/channel.re2 b/examples/shlChannel/channel.re2 new file mode 100644 index 000000000..81cb3844c Binary files /dev/null and b/examples/shlChannel/channel.re2 differ diff --git a/examples/shlChannel/channel.udf b/examples/shlChannel/channel.udf new file mode 100644 index 000000000..9b2f12bbe --- /dev/null +++ b/examples/shlChannel/channel.udf @@ -0,0 +1,54 @@ +// +// nekRS User Defined File +// +#include +#include "udf.hpp" +#include "ci.inc" + +/* User Functions */ + +static occa::kernel userfKernel; +static dfloat P_ROT; +static dfloat visc; + +void userf(nrs_t *nrs, dfloat time, occa::memory o_U, occa::memory o_FU) +{ + mesh_t* mesh = nrs->meshV; + const dfloat DP = 3.0 * visc; + userfKernel(mesh->Nlocal, nrs->fieldOffset, P_ROT, DP, o_FU); +} + +void UDF_LoadKernels(occa::properties& kernelInfo) +{ + platform->options.getArgs("VISCOSITY", visc); + kernelInfo["defines/p_visc"] = visc; + userfKernel = oudfBuildKernel(kernelInfo, "userf"); +} + +void UDF_Setup0(MPI_Comm comm, setupAide& options){ + options.getArgs("CI-MODE", ciMode); + if (ciMode) ciSetup(comm, options); + platform->par->extract("casedata", "p_rot", P_ROT); + if (platform->options.compareArgs("BUILD ONLY", "FALSE")) { + double* const nek_cb_scnrs = (double*) nek::ptr("cb_scnrs"); + nek_cb_scnrs[0] = P_ROT; + } +} + +void UDF_Setup(nrs_t *nrs) +{ + mesh_t *mesh = nrs->meshV; + cds_t *cds = nrs->cds; + + udf.uEqnSource = &userf; + +} + +void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) +{ + if (nrs->isOutputStep) { + nek::ocopyToNek(time, tstep); + nek::userchk(); + } + if (ciMode) ciTestErrors(nrs, time, tstep); +} diff --git a/examples/shlChannel/channel.usr b/examples/shlChannel/channel.usr new file mode 100644 index 000000000..5da9ea070 --- /dev/null +++ b/examples/shlChannel/channel.usr @@ -0,0 +1,126 @@ +C +C USER SPECIFIED ROUTINES: +C +C - boundary conditions +C - initial conditions +C - variable properties +C - forcing function for fluid (f) +C - forcing function for passive scalar (q) +C - general purpose routine for checking errors etc. +C +c----------------------------------------------------------------------- + subroutine useric (ix,iy,iz,ieg) + include 'SIZE' + include 'TOTAL' + include 'NEKUSE' + include 'CASEDATA' + + + real h_tilde, vx_tilde, vy_tilde ! in rotated coordinate frame + + ie = gllel(ieg) + h_tilde = ym1(ix,iy,iz,ie) * cos(P_ROT) + h_tilde = h_tilde - xm1(ix,iy,iz,ie) * sin(P_ROT) + + uxex = 1.5 * (1.0 - h_tilde ** 2.0) + uyex = 0.0 + + vx_tilde = uxex * cos(P_ROT) - uyex * sin(P_ROT) + vy_tilde = uyex * cos(P_ROT) + uxex * sin(P_ROT) + + ux = vx_tilde + uy = vy_tilde + uz = 0.0 + + return + end +c----------------------------------------------------------------------- + subroutine userchk + include 'SIZE' + include 'TOTAL' + include 'CASEDATA' + + return + end +c----------------------------------------------------------------------- + subroutine usrdat ! This routine to modify element vertices + include 'SIZE' ! _before_ mesh is generated, which + include 'TOTAL' ! guarantees GLL mapping of mesh. + integer e + + do e=1,nelv ! Rescale mesh to [0,1]x[-1,0]x[0,1] + do i=1,2**ndim ! Assumes original domain in .re2 file on [0,1]^3 + xc(i,e) = 1.0*xc(i,e) + yc(i,e) = 1.0*yc(i,e) - 1.0 + zc(i,e) = 1.0*zc(i,e) + enddo + enddo + + return + end + +c----------------------------------------------------------------------- + subroutine usrdat2() ! This routine to modify mesh coordinates + include 'SIZE' + include 'TOTAL' + include 'CASEDATA' + + common /scnrs/ sc_nrs(1) + real sc_nrs + + real xpt, ypt + + P_ROT = sc_nrs(1) + + call rescale_x(xm1,0.0,1.0) + call rescale_x(ym1,-1.0,0.0) + call rescale_x(zm1,0.0,1.0) + + ntot = nx1*ny1*nz1*nelt + + do i=1,ntot + xpt = xm1(i,1,1,1) + ypt = ym1(i,1,1,1) + + xm1(i,1,1,1) = xpt * cos(P_ROT) - ypt * sin(P_ROT) + ym1(i,1,1,1) = xpt * sin(P_ROT) + ypt * cos(P_ROT) + enddo + + do iel=1,nelt + cbc(5,iel,1) = 'P ' + cbc(6,iel,1) = 'P ' + do ifc=1,2*ndim + cbc(ifc,iel,2) = cbc(ifc,iel,1) + if (cbc(ifc,iel,1) .eq. 'W ') cbc(ifc,iel,2) = 't ' + if (cbc(ifc,iel,1) .eq. 'SYM') cbc(ifc,iel,2) = 'I ' + if (cbc(ifc,iel,1) .eq. 'W ') boundaryID(ifc,iel) = 1 + if (cbc(ifc,iel,1) .eq. 'SYM') boundaryID(ifc,iel) = 2 + enddo + enddo + + return + end +c----------------------------------------------------------------------- + subroutine usrdat3 + include 'SIZE' + include 'TOTAL' + + return + end +c----------------------------------------------------------------------- + subroutine usrsetvert(glo_num,nel,nx,ny,nz) ! to modify glo_num + integer*8 glo_num(1) + + ! kludge for periodic bc in z + nxy = nx*ny + nxyz = nx*ny*nz + do iel = 1,nel + ioff = nxyz*(iel-1) + do ixy = 1,nxy + glo_num(ioff + nxy*(nz-1) + ixy) = glo_num(ioff + ixy) + enddo + enddo + + return + end +c----------------------------------------------------------------------- diff --git a/examples/shlChannel/ci.inc b/examples/shlChannel/ci.inc new file mode 100644 index 000000000..c32144131 --- /dev/null +++ b/examples/shlChannel/ci.inc @@ -0,0 +1,93 @@ +#include + +static int ciMode = 0; + +#define PASS { if (rank == 0) printf("TESTS passed \n"); MPI_Finalize(); exit(0); } +#define FAIL { if (rank == 0) printf("TESTS failed!\n"); MPI_Finalize(); exit(2); } + +void ciSetup(MPI_Comm comm, setupAide &options) +{ + options.setArgs("POLYNOMIAL DEGREE", std::string("7")); + options.setArgs("RESTART FROM FILE", std::string("0")); + options.setArgs("SOLUTION OUTPUT INTERVAL", "0"); + options.setArgs("VISCOSITY", std::string("1e-2")); + options.setArgs("DENSITY", std::string("1")); + options.setArgs("END TIME", std::string("0.05")); + options.setArgs("DT", std::string("1e-3")); + options.setArgs("SUBCYCLING STEPS", std::string("0")); + options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); + options.setArgs("TIME INTEGRATOR", "TOMBO3"); + options.setArgs("ADVECTION", "FALSE"); + options.setArgs("VELOCITY SOLVER TOLERANCE", std::string("1e-12")); + options.setArgs("PRESSURE SOLVER TOLERANCE", std::string("1e-08")); + options.setArgs("STRESSFORMULATION", "TRUE"); + options.setArgs("ADVECTION TYPE", "CONVECTIVE"); + + if(ciMode == 1){ + platform->par->set("casedata", "p_rot", 0.0); + } + if(ciMode == 2){ + platform->par->set("casedata", "p_rot", 0.7853981633974483); + } +} + +void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) +{ + if (!nrs->lastStep) return; + + auto mesh = nrs->meshV; + + const int rank = platform->comm.mpiRank; + + std::vector Ux(mesh->Nlocal, 0.0); + std::vector Uy(mesh->Nlocal, 0.0); + std::vector Uz(mesh->Nlocal, 0.0); + + auto o_Ux = nrs->o_U + 0 * nrs->fieldOffset * sizeof(dfloat); + auto o_Uy = nrs->o_U + 1 * nrs->fieldOffset * sizeof(dfloat); + auto o_Uz = nrs->o_U + 2 * nrs->fieldOffset * sizeof(dfloat); + + o_Ux.copyTo(Ux.data(), mesh->Nlocal * sizeof(dfloat)); + o_Uy.copyTo(Uy.data(), mesh->Nlocal * sizeof(dfloat)); + o_Uz.copyTo(Uz.data(), mesh->Nlocal * sizeof(dfloat)); + + dfloat theta = 0.0; + platform->par->extract("casedata", "p_rot", theta); + + const dfloat floorTol = 1e-12; + dfloat LInfError_x = 0.0; + dfloat LInfError_y = 0.0; + + for(int i = 0; i < mesh->Nlocal; ++i){ + const auto x = mesh->x[i]; + const auto y = mesh->y[i]; + const auto h_tilde = y * cos(theta) - x * sin(theta); + const auto vx_tilde = Ux[i] * cos(theta) + Uy[i] * sin(theta); + const auto vy_tilde = Uy[i] * cos(theta) - Ux[i] * sin(theta); + + const auto uxex = 1.5 * (1.0 - h_tilde * h_tilde); + const dfloat uyex = 0.0; + + if(std::abs(uxex) > floorTol){ + LInfError_x = std::max(LInfError_x, std::abs(uxex - vx_tilde) / std::abs(uxex)); + } + + if(std::abs(uyex) > floorTol){ + LInfError_y = std::max(LInfError_y, std::abs(uyex - vy_tilde) / std::abs(uyex)); + } + } + + dfloat errors[2] = {LInfError_x, LInfError_y}; + MPI_Allreduce(MPI_IN_PLACE, errors, 2, MPI_DFLOAT, MPI_MAX, platform->comm.mpiComm); + LInfError_x = errors[0]; + LInfError_y = errors[1]; + + const dfloat tol = 1e-5; + + if(platform->comm.mpiRank == 0){ + std::cout << "LInfError in Vx: " << LInfError_x << "\n"; + std::cout << "LInfError in Vy: " << LInfError_y << "\n"; + } + + (LInfError_x < tol && LInfError_y < tol) ? (PASS) : (FAIL); +} diff --git a/examples/shlChannel/input.box b/examples/shlChannel/input.box new file mode 100644 index 000000000..cf4c3c4cf --- /dev/null +++ b/examples/shlChannel/input.box @@ -0,0 +1,22 @@ +base.rea +-3 spatial dimension ( < 0 --> generate .rea/.re2 pair) +1 number of fields +#======================================================================= +# +# Example of .box file for Taylor-Green +# +# If nelx (y or z) < 0, then genbox automatically generates the +# grid spacing in the x (y or z) direction +# with a geometric ratio given by "ratio". +# ( ratio=1 implies uniform spacing ) +# +# Note that the character bcs _must_ have 3 spaces. +# +#======================================================================= +# +Box +-8 -8 -1 nelx,nely,nelz for Box +0 1 1. x0,x1,gain (rescaled in usrdat) +0 1 1. y0,y1,gain (rescaled in usrdat) +0 1 1. z0,z1,gain +P ,P ,W ,SYM,E ,E bc's (3 chars each!) diff --git a/examples/tgv/tgv.oudf b/examples/tgv/tgv.oudf index 92a6ec4e5..669611af1 100644 --- a/examples/tgv/tgv.oudf +++ b/examples/tgv/tgv.oudf @@ -1 +1,13 @@ -// Boundary conditions +@kernel void magSqr(const dlong Ntotal, + const dlong offset, + @restrict const dfloat *U, + @restrict dfloat * uSqr) +{ + for (dlong n=0;n #include "udf.hpp" +occa::kernel magSqrKernel; + +void printDiagnostics(nrs_t *nrs, dfloat time, int tstep) +{ + mesh_t *mesh = nrs->meshV; + + const dfloat scale = 0.5/mesh->volume; + magSqrKernel(mesh->Nlocal, nrs->fieldOffset, nrs->o_U, platform->o_mempool.slice0); + const dfloat energy = scale * platform->linAlg->innerProd(mesh->Nlocal, + platform->o_mempool.slice0, mesh->o_LMM, platform->comm.mpiComm, 0); + + nrs->curlKernel( + mesh->Nelements, + 0, + mesh->o_vgeo, + mesh->o_D, + nrs->fieldOffset, + nrs->o_U, + platform->o_mempool.slice0); + magSqrKernel(mesh->Nlocal, nrs->fieldOffset, platform->o_mempool.slice0, platform->o_mempool.slice3); + const dfloat enstrophy = scale * platform->linAlg->innerProd(mesh->Nlocal, + platform->o_mempool.slice3, mesh->o_LMM, platform->comm.mpiComm, 0); + + dfloat mue, rho; + platform->options.getArgs("VISCOSITY", mue); + platform->options.getArgs("DENSITY", rho); + const dfloat nu = mue/rho; + + if(platform->comm.mpiRank == 0) + printf(" enst= %g, energy= %g, -2*nu*enst= %g", enstrophy, energy, -2*nu*enstrophy); + + static dfloat energyComputed[3] = {0}; + + if(tstep) { + dfloat sum = 0.0; + for(int i = 0 ; i < nrs->nBDF; ++i) + sum += nrs->coeffBDF[i] * energyComputed[i]; + energyComputed[2] = energyComputed[1]; + energyComputed[1] = energyComputed[0]; + energyComputed[0] = energy; + const dfloat dissipationRate = (nrs->g0*energy - sum) / nrs->dt[0]; + const dfloat nuEff = -dissipationRate/(2*enstrophy); + if(platform->comm.mpiRank == 0) + printf(", d(energy)/dt= %g, nuEff/nu= %g", dissipationRate, nuEff/nu); + } else { + energyComputed[0] = energy; + } + + if(platform->comm.mpiRank == 0) printf("\n"); +} + /* UDF Functions */ void UDF_LoadKernels(occa::properties& kernelInfo) { + magSqrKernel = oudfBuildKernel(kernelInfo, "magSqr"); } void UDF_Setup(nrs_t *nrs) @@ -27,6 +79,8 @@ void UDF_Setup(nrs_t *nrs) void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) { + printDiagnostics(nrs, time, tstep); + if (nrs->isOutputStep) { nek::ocopyToNek(time, tstep); nek::userchk(); diff --git a/examples/tgv/tgv.usr b/examples/tgv/tgv.usr index 8c5c4b289..cda5f16e0 100644 --- a/examples/tgv/tgv.usr +++ b/examples/tgv/tgv.usr @@ -30,30 +30,6 @@ C----------------------------------------------------------------------- include 'SIZE' include 'TOTAL' - common /SCRNS/ w1 (lx1*ly1*lz1*lelv), - & w2 (lx1*ly1*lz1*lelv), - & omg(lx1*ly1*lz1*lelv,ldim) - - character*80 fnames(3) - - n = nx1*ny1*nz1*nelv - -c if (mod(istep,50).ne.0) return - - sum_e1 = 0. - sum_e2 = 0. - call curl(omg,vx,vy,vz,.false.,w1,w2) - do i = 1,n - vv = vx(i,1,1,1)**2 + vy(i,1,1,1)**2 + vz(i,1,1,1)**2 - oo = omg(i,1)**2 + omg(i,2)**2 + omg(i,3)**2 - sum_e1 = sum_e1 + vv*bm1(i,1,1,1) - sum_e2 = sum_e2 + oo*bm1(i,1,1,1) - enddo - e1 = 0.5 * glsum(sum_e1,1) / volvm1 - e2 = 0.5 * glsum(sum_e2,1) / volvm1 - if (nid.eq.0) write(6,2) time, e1, e2 - 2 format(1p3e13.4,' monitor') - return end c----------------------------------------------------------------------- diff --git a/examples/turbPipe/turbPipe.udf b/examples/turbPipe/turbPipe.udf index a76e690ce..3fd9de850 100644 --- a/examples/turbPipe/turbPipe.udf +++ b/examples/turbPipe/turbPipe.udf @@ -4,7 +4,7 @@ #include #include "udf.hpp" #include "plugins/velRecycling.hpp" -#include "plugins/avg.hpp" +#include "plugins/tavg.hpp" static dfloat ZLENGTH; static int NSLABS; @@ -15,7 +15,7 @@ static int NELSLAB; void UDF_LoadKernels(occa::properties& kernelInfo) { velRecycling::buildKernel(kernelInfo); - avg::buildKernel(kernelInfo); + tavg::buildKernel(kernelInfo); } void UDF_Setup(nrs_t *nrs) @@ -31,7 +31,7 @@ void UDF_Setup(nrs_t *nrs) const hlong offset = NELSLAB * round(NSLABS * zRecycLayer/ZLENGTH); velRecycling::setup(nrs, nrs->o_usrwrk, offset, bID, wbar); - avg::setup(nrs); + tavg::setup(nrs); } void UDF_Setup0(MPI_Comm comm, setupAide &options) { @@ -51,13 +51,13 @@ void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) mesh_t *mesh = nrs->meshV; velRecycling::copy(); - avg::run(time); + tavg::run(time); if (nrs->isOutputStep) { occa::memory o_UZ = nrs->o_U + 2*nrs->fieldOffset * sizeof(dfloat); const dfloat ubar = platform->linAlg->innerProd(mesh->Nlocal, o_UZ, mesh->o_LMM, platform->comm.mpiComm)/mesh->volume; if (platform->comm.mpiRank == 0) printf(" uBulk: %g\n", ubar); - avg::outfld(); + tavg::outfld(); } } diff --git a/examples/turbPipePeriodic/README.md b/examples/turbPipePeriodic/README.md index 1f225e893..8e172b80e 100644 --- a/examples/turbPipePeriodic/README.md +++ b/examples/turbPipePeriodic/README.md @@ -1,14 +1,111 @@ # LES of a round turbulent pipe flow at Re_tau=550. -## Intra-Node Performance Results --cimode=1 +## Performance Results for `--cimode=1` -| CPU/GPU | Total Solve [s] 200 steps | -| ---------------------- | --------------------------| -| Nvidia A100 (1/2/4) | 17.3/11.7/8.5 | -| Nvidia V100 (1/2/4) | 26.2/15.4/9.4 | -| AMD MI100 (1/2/4) | 29.7/?/? | -| AMD EPYC 7742 (1/2) | 131.8/67.4 | +### NVIDIA V100 +``` + elapsedStepSum 2.71067e+01s 0.34 + solve 2.70991e+01s 0.34 + min 6.82823e-02s + max 1.12194e-01s + flop/s 8.78751e+11 + makef 7.90189e+00s 0.29 200 + udfUEqnSource 7.96262e-03s 0.00 200 + udfProperties 4.81280e-04s 0.00 201 + velocitySolve 6.94172e+00s 0.26 200 + rhs 5.17492e-01s 0.07 200 + pressureSolve 1.14328e+01s 0.42 200 + rhs 1.02998e+00s 0.09 200 + preconditioner 8.31936e+00s 0.73 784 + pMG smoother 6.30344e+00s 0.76 3136 + coarse grid 1.13952e+00s 0.14 784 + initial guess 5.96070e-01s 0.05 200 +``` + +### NVIDIA A100 +``` + elapsedStepSum 1.67510e+01s 0.28 + solve 1.67437e+01s 0.28 + min 4.11823e-02s + max 6.82533e-02s + flop/s 1.42219e+12 + + makef 5.19625e+00s 0.31 200 + udfUEqnSource 5.15341e-03s 0.00 200 + udfProperties 4.99168e-04s 0.00 201 + velocitySolve 4.17803e+00s 0.25 200 + rhs 2.99974e-01s 0.07 200 + pressureSolve 7.00880e+00s 0.42 200 + rhs 5.76952e-01s 0.08 200 + preconditioner 5.15284e+00s 0.74 784 + pMG smoother 3.77986e+00s 0.73 3136 + coarse grid 8.23711e-01s 0.16 784 + initial guess 3.66708e-01s 0.05 200 +``` + +### AMD MI100 +``` + elapsedStepSum 3.07072e+01s 0.40 + solve 3.06957e+01s 0.39 + min 7.14650e-02s + max 1.19184e-01s + flop/s 7.74037e+11 + + makef 1.05012e+01s 0.34 200 + udfUEqnSource 8.05440e-03s 0.00 200 + udfProperties 1.14064e-03s 0.00 201 + velocitySolve 6.90001e+00s 0.22 200 + rhs 4.90322e-01s 0.07 200 + pressureSolve 1.24524e+01s 0.41 200 + rhs 1.02992e+00s 0.08 200 + preconditioner 9.23485e+00s 0.74 776 + pMG smoother 7.16544e+00s 0.78 3104 + coarse grid 1.12134e+00s 0.12 776 + initial guess 6.34976e-01s 0.05 200 +``` + +### AMD MI250X/1 +``` + elapsedStepSum 2.35982e+01s 0.11 + solve 2.35922e+01s 0.11 + min 5.87667e-02s + max 9.92639e-02s + flop/s 1.00710e+12 + + makef 6.85422e+00s 0.29 200 + udfUEqnSource 6.26127e-03s 0.00 200 + udfProperties 9.04159e-04s 0.00 201 + velocitySolve 5.38714e+00s 0.23 200 + rhs 3.83831e-01s 0.07 200 + pressureSolve 1.05776e+01s 0.45 200 + rhs 8.74453e-01s 0.08 200 + preconditioner 7.89478e+00s 0.75 776 + pMG smoother 6.02886e+00s 0.76 3104 + coarse grid 1.08808e+00s 0.14 776 + initial guess 5.38374e-01s 0.05 200 +``` + +### AMD EPYC 7742 +``` + elapsedStepSum 1.41905e+02s 0.74 + solve 1.41852e+02s 0.74 + min 3.10468e-01s + max 5.58652e-01s + flop/s 1.67737e+11 + + makef 5.93037e+01s 0.42 200 + udfUEqnSource 2.25468e-02s 0.00 200 + udfProperties 1.28648e-03s 0.00 201 + velocitySolve 3.66557e+01s 0.26 200 + rhs 4.64122e+00s 0.13 200 + pressureSolve 4.11335e+01s 0.29 200 + rhs 5.57414e+00s 0.14 200 + preconditioner 2.60316e+01s 0.63 779 + pMG smoother 2.17306e+01s 0.83 3116 + coarse grid 6.11829e-01s 0.02 779 + initial guess 3.06133e+00s 0.07 200 +``` ## Reference Data [1] ftp://ftp.mech.kth.se/pub/pschlatt/DATA/PIPE/ diff --git a/examples/turbPipePeriodic/ci.inc b/examples/turbPipePeriodic/ci.inc index ed44d8990..c3929acce 100644 --- a/examples/turbPipePeriodic/ci.inc +++ b/examples/turbPipePeriodic/ci.inc @@ -55,8 +55,4 @@ void ciTestErrors(nrs_t *nrs, dfloat time, int tstep) { const int rank = platform->comm.mpiRank; mesh_t* mesh = nrs->meshV; - - platform->timer.printRunStat(tstep); - - PASS; } diff --git a/examples/turbPipePeriodic/turbPipe.par b/examples/turbPipePeriodic/turbPipe.par index 3454daace..50447a35d 100644 --- a/examples/turbPipePeriodic/turbPipe.par +++ b/examples/turbPipePeriodic/turbPipe.par @@ -1,4 +1,5 @@ [GENERAL] +verbose = true polynomialOrder = 7 #startFrom = "restart.fld" stopAt = endTime diff --git a/examples/turbPipePeriodic/turbPipe.udf b/examples/turbPipePeriodic/turbPipe.udf index 27e79f636..319d1df7e 100644 --- a/examples/turbPipePeriodic/turbPipe.udf +++ b/examples/turbPipePeriodic/turbPipe.udf @@ -54,21 +54,6 @@ void UDF_Setup(nrs_t *nrs) void UDF_ExecuteStep(nrs_t *nrs, dfloat time, int tstep) { - if(platform->options.compareArgs("CONSTANT FLOW RATE", "TRUE")){ - mesh_t * mesh = nrs->meshV; - occa::memory o_Uz = nrs->o_U + 2 * nrs->fieldOffset * sizeof(dfloat); - const dfloat ubar = platform->linAlg->innerProd(mesh->Nlocal, o_Uz, mesh->o_LMM, platform->comm.mpiComm)/mesh->volume; - - dfloat expectedUbar; - platform->options.getArgs("FLOW RATE", expectedUbar); - - if(platform->comm.mpiRank == 0 && tstep > 0){ - printf("constantFlowScale = %g, flowRateError = %g\n", - ConstantFlowRate::scaleFactor(), - std::abs(ubar - expectedUbar)); - } - fflush(stdout); - } if (nrs->lastStep) if (ciMode) ciTestErrors(nrs, time, tstep); } diff --git a/git-hooks/README.md b/git-hooks/README.md new file mode 100644 index 000000000..af041f7cc --- /dev/null +++ b/git-hooks/README.md @@ -0,0 +1 @@ +Running `./install-hooks` from the `git-hooks` directory once will install of the hooks in `.git/hooks`. diff --git a/git-hooks/install-hooks b/git-hooks/install-hooks new file mode 100755 index 000000000..56e367fb5 --- /dev/null +++ b/git-hooks/install-hooks @@ -0,0 +1,2 @@ +#!/bin/bash +cp * ../.git/hooks diff --git a/git-hooks/pre-commit b/git-hooks/pre-commit new file mode 100755 index 000000000..db22f8963 --- /dev/null +++ b/git-hooks/pre-commit @@ -0,0 +1,5 @@ +#!/bin/bash +for hook in $(find .git/hooks/ -name 'pre-commit-*') +do + ${hook} +done diff --git a/git-hooks/pre-commit-clang-format b/git-hooks/pre-commit-clang-format new file mode 100755 index 000000000..ad54fe4b0 --- /dev/null +++ b/git-hooks/pre-commit-clang-format @@ -0,0 +1,7 @@ +#!/bin/bash + +git-clang-format --style=file | tail -n +2 > .git-clang-format-changed-files +while read p; do + git add $p +done < .git-clang-format-changed-files +rm .git-clang-format-changed-files diff --git a/nrsconfig b/nrsconfig index b034b6391..d69f39cb8 100755 --- a/nrsconfig +++ b/nrsconfig @@ -6,20 +6,24 @@ set -e -a : ${NEKRS_CC:="mpicc"} : ${NEKRS_CXX:="mpic++"} : ${NEKRS_FC:="mpif77"} -: ${NEKRS_COMPILER_FLAGS:=""} +: ${NEKRS_COMPILER_FLAGS:="-O2 -g -march=native -mtune=native -ftree-vectorize"} : ${NEKRS_GPU_MPI:=1} : ${OCCA_CXX:="g++"} -: ${OCCA_CXXFLAGS:="-O3 -march=native -mtune=native"} -: ${OCCA_CUDA_COMPILER_FLAGS:="-O3 --fmad=true"} -: ${OCCA_HIP_COMPILER_FLAGS:="-O3 -ffp-contract=fast"} +: ${OCCA_CXXFLAGS:="-O3 -g -march=native -mtune=native"} +: ${OCCA_CUDA_COMPILER_FLAGS:="-O3 --fmad=true -lineinfo"} +: ${OCCA_HIP_COMPILER_FLAGS:="-O3 -g -ffp-contract=fast"} : ${OCCA_OPENCL_COMPILER_FLAGS:="-cl-std=CL2.0 -cl-mad-enable -cl-no-signed-zeros"} : ${OCCA_ENABLE_CUDA:=1} : ${OCCA_ENABLE_HIP:=1} -: ${OCCA_ENABLE_OPENCL:=1} -: ${AMGX_ENABLED:=0} +: ${OCCA_ENABLE_OPENCL:=0} +: ${ENABLE_AMGX:=0} + +if uname -a | grep 'ppc64'; then + NEKRS_COMPILER_FLAGS=$(echo "$NEKRS_COMPILER_FLAGS" | sed -e "s/march/mcpu/g") +fi : ${NEKRS_CXXFLAGS:="${NEKRS_COMPILER_FLAGS}"} : ${NEKRS_CFLAGS:="${NEKRS_COMPILER_FLAGS}"} @@ -32,15 +36,18 @@ NEKRS_BUILD_DIR=`pwd`/build mkdir -p "${NEKRS_BUILD_DIR}" > /dev/null 2>&1 cd ${NEKRS_BUILD_DIR} +export CUDAHOSTCXX=${NEKRS_CXX} + cmake -Wno-dev \ -Wfatal-errors \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DCMAKE_Fortran_COMPILER="${NEKRS_FC}" \ -DCMAKE_C_COMPILER="${NEKRS_CC}" \ -DCMAKE_CXX_COMPILER="${NEKRS_CXX}" \ - -DCMAKE_Fortran_FLAGS="${NEKRS_FFLAGS}" \ - -DCMAKE_C_FLAGS="${NEKRS_CFLAGS}" \ - -DCMAKE_CXX_FLAGS="${NEKRS_CXXFLAGS}" \ + -DCMAKE_Fortran_FLAGS_RELWITHDEBINFO="${NEKRS_FFLAGS}" \ + -DCMAKE_C_FLAGS_RELWITHDEBINFO="${NEKRS_CFLAGS}" \ + -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="${NEKRS_CFLAGS}" \ -DCMAKE_INSTALL_PREFIX="${NEKRS_INSTALL_DIR}" \ -DOCCA_CXX="${OCCA_CXX}" \ -DOCCA_CXXFLAGS="${OCCA_CXXFLAGS}" \ @@ -51,8 +58,7 @@ cmake -Wno-dev \ -DENABLE_HIP="${OCCA_ENABLE_HIP}" \ -DENABLE_OPENCL="${OCCA_ENABLE_OPENCL}" \ -DGPU_MPI="${NEKRS_GPU_MPI}" \ - -DHYPRE_BIGINT=1 \ - -DENABLE_AMGX="${AMGX_ENABLED}" \ + -DENABLE_AMGX="${ENABLE_AMGX}" \ .. if [ $? -eq 0 ]; then diff --git a/okl/bench/advsub/readCubDMatrix.okl b/okl/bench/advsub/readCubDMatrix.okl new file mode 100644 index 000000000..f37423c75 --- /dev/null +++ b/okl/bench/advsub/readCubDMatrix.okl @@ -0,0 +1,10 @@ +@kernel void readCubDMatrix(@restrict dfloat * cubD) +{ + for(dlong threadBlock = 0; threadBlock < 1; ++threadBlock; @outer(0)) { + for(int j = 0; j < p_cubNq; ++j; @inner(1)) { + for(int i = 0; i < p_cubNq; ++i; @inner(0)) { + cubD[j * p_cubNq + i] = c_D[j][i]; + } + } + } +} \ No newline at end of file diff --git a/okl/bench/advsub/readIMatrix.okl b/okl/bench/advsub/readIMatrix.okl new file mode 100644 index 000000000..04c1d9d3f --- /dev/null +++ b/okl/bench/advsub/readIMatrix.okl @@ -0,0 +1,10 @@ +@kernel void readIMatrix(@restrict dfloat * cubInterpT) +{ + for(dlong threadBlock = 0; threadBlock < 1; ++threadBlock; @outer(0)) { + for(int j = 0; j < p_cubNq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + cubInterpT[j + i * p_cubNq] = c_I[j][i]; + } + } + } +} \ No newline at end of file diff --git a/okl/cds/advectMeshVelocityHex3D.okl b/okl/cds/advectMeshVelocityHex3D.okl index 43a2caa30..e6f504851 100644 --- a/okl/cds/advectMeshVelocityHex3D.okl +++ b/okl/cds/advectMeshVelocityHex3D.okl @@ -45,7 +45,9 @@ @exclusive dfloat r_SWy[p_Nq]; @exclusive dfloat r_SWz[p_Nq]; +#ifdef smXX #pragma unroll +#endif for(int k = 0; k < p_Nq; ++k){ @barrier("local"); @@ -136,4 +138,4 @@ } } } -} \ No newline at end of file +} diff --git a/okl/cds/dirichletBC.okl b/okl/cds/dirichletBC.okl new file mode 100644 index 000000000..bf620ebc6 --- /dev/null +++ b/okl/cds/dirichletBC.okl @@ -0,0 +1,81 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +//RHS contributions for continuous solver + +@kernel void dirichletBC(const dlong Nelements, + const dlong offset, + const dlong scalarId, + const dfloat time, + @restrict const dfloat* sgeo, + @restrict const dfloat* x, + @restrict const dfloat* y, + @restrict const dfloat* z, + @restrict const dlong* vmapM, + @restrict const int* EToBM, + @restrict const int* EToB, + @restrict const dfloat* W, + @restrict dfloat* S) +{ + for(dlong e = 0; e < Nelements; e++; @outer(0)) + for(int f = 0; f < p_Nfaces; f++) { + @barrier("global"); + for(int m = 0; m < p_Nfp; ++m; @inner(0)) { + struct bcData bc; + const int n = m + f * p_Nfp; + const dlong sk = e * p_Nfp * p_Nfaces + n; + const dlong idM = vmapM[sk]; + const dlong bcType = EToB[f + p_Nfaces * e]; + + if(bcType == 1) { + bc.id = EToBM[f + p_Nfaces * e]; + bc.idM = idM; + bc.time = time; + bc.x = x[idM]; + bc.y = y[idM]; + bc.z = z[idM]; + bc.s = S[idM]; + bc.wrk = W; + bc.fieldOffset = offset; + bc.scalarId = scalarId; + + bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; + bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; + bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; + + bc.t1x = sgeo[sk * p_Nsgeo + p_T1XID]; + bc.t1y = sgeo[sk * p_Nsgeo + p_T1YID]; + bc.t1z = sgeo[sk * p_Nsgeo + p_T1ZID]; + + bc.t2x = sgeo[sk * p_Nsgeo + p_T2XID]; + bc.t2y = sgeo[sk * p_Nsgeo + p_T2YID]; + bc.t2z = sgeo[sk * p_Nsgeo + p_T2ZID]; + scalarDirichletConditions(&bc); + S[idM] = bc.s; + } + } + } +} diff --git a/okl/cds/helmholtzBCHex3D.okl b/okl/cds/helmholtzBCHex3D.okl index 9b8273d5b..dd239ff14 100644 --- a/okl/cds/helmholtzBCHex3D.okl +++ b/okl/cds/helmholtzBCHex3D.okl @@ -33,6 +33,12 @@ bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ + bc.t1x = sgeo[sk * p_Nsgeo + p_T1XID]; \ + bc.t1y = sgeo[sk * p_Nsgeo + p_T1YID]; \ + bc.t1z = sgeo[sk * p_Nsgeo + p_T1ZID]; \ + bc.t2x = sgeo[sk * p_Nsgeo + p_T2XID]; \ + bc.t2y = sgeo[sk * p_Nsgeo + p_T2YID]; \ + bc.t2z = sgeo[sk * p_Nsgeo + p_T2ZID]; \ bc.x = x[bc.idM]; \ bc.y = y[bc.idM]; \ bc.z = z[bc.idM]; \ @@ -51,6 +57,7 @@ } //RHS contributions for continuous solver + @kernel void helmholtzBCHex3D(const dlong Nelements, @restrict const dfloat* sgeo, @restrict const dlong* vmapM, @@ -224,49 +231,3 @@ } } } - -@kernel void dirichletBC(const dlong Nelements, - const dlong offset, - const dlong scalarId, - const dfloat time, - @restrict const dfloat* sgeo, - @restrict const dfloat* x, - @restrict const dfloat* y, - @restrict const dfloat* z, - @restrict const dlong* vmapM, - @restrict const int* EToBM, - @restrict const int* EToB, - @restrict const dfloat* W, - @restrict dfloat* S) -{ - for(dlong e = 0; e < Nelements; e++; @outer(0)) - for(int f = 0; f < p_Nfaces; f++) { - @barrier("global"); - for(int m = 0; m < p_Nfp; ++m; @inner(0)) { - struct bcData bc; - const int n = m + f * p_Nfp; - const dlong sk = e * p_Nfp * p_Nfaces + n; - const dlong idM = vmapM[sk]; - const dlong bcType = EToB[f + p_Nfaces * e]; - - if(bcType == 1) { - bc.id = EToBM[f + p_Nfaces * e]; - bc.idM = idM; - bc.time = time; - bc.x = x[idM]; - bc.y = y[idM]; - bc.z = z[idM]; - bc.s = S[idM]; - bc.wrk = W; - bc.fieldOffset = offset; - bc.scalarId = scalarId; - - bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; - bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; - bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; - scalarDirichletConditions(&bc); - S[idM] = bc.s; - } - } - } -} diff --git a/okl/cds/regularization/relativeMassHighestMode.okl b/okl/cds/regularization/relativeMassHighestMode.okl index ca855d061..6bc5f013b 100644 --- a/okl/cds/regularization/relativeMassHighestMode.okl +++ b/okl/cds/regularization/relativeMassHighestMode.okl @@ -19,7 +19,7 @@ for(int j = 0; j < p_Nq; ++j; @inner(1)) { for(int i = 0; i < p_Nq; ++i; @inner(0)) { const int id = i + j * p_Nq; - s_FT[0][id] = fMT[id + is * p_Nq * p_Nq]; + s_FT[j][i] = fMT[id + is * p_Nq * p_Nq]; #pragma unroll p_Nq for(int k = 0; k < p_Nq; ++k) @@ -56,8 +56,9 @@ @barrier("local"); -// loop around slices +#ifdef smXX #pragma unroll p_Nq +#endif for(int k = 0; k < p_Nq; ++k) { //load slice to @shared for(int j = 0; j < p_Nq; ++j; @inner(1)) diff --git a/okl/cds/advectionHex3D.okl b/okl/cds/strongAdvectionCubatureVolumeHex3D.okl similarity index 73% rename from okl/cds/advectionHex3D.okl rename to okl/cds/strongAdvectionCubatureVolumeHex3D.okl index 65622af30..29fcfca95 100644 --- a/okl/cds/advectionHex3D.okl +++ b/okl/cds/strongAdvectionCubatureVolumeHex3D.okl @@ -24,74 +24,6 @@ */ -@kernel void strongAdvectionVolumeHex3D(const dlong Nelements, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - const dlong voffset, - const dlong soffset, - @restrict const dfloat* S, - @restrict const dfloat* Urst, - @restrict const dfloat* RHO, - @restrict dfloat* NS) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_S[p_Nq][p_Nq]; - @exclusive dfloat s_Sloc[p_Nq]; - @shared dfloat s_D[p_Nq][p_Nq]; - - #pragma unroll p_Nq - for(int k = 0; k < p_Nq; ++k){ - for(int j = 0; j < p_Nq; ++j; @inner(1)){ - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dlong id = i + j * p_Nq; - if(k == 0) - s_D[j][i] = D[id]; - - id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat Sn = S[id + soffset]; - s_S[j][i] = Sn; - if(k == 0){ - #pragma unroll p_Nq - for(int l = 0 ; l < p_Nq; ++l){ - const dlong offset = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i + soffset; - s_Sloc[l] = S[offset]; - } - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - dfloat dSdr = 0, dSds = 0, dSdt = 0; - -#pragma unroll p_Nq - for (int n = 0; n < p_Nq; n++) { - const dfloat Dr = s_D[i][n]; - const dfloat Ds = s_D[j][n]; - const dfloat Dt = s_D[k][n]; - dSdr += Dr * s_S[j][n]; - dSds += Ds * s_S[n][i]; - dSdt += Dt * s_Sloc[n]; - } - - const dfloat Uhat = Urst[id + 0 * voffset]; - const dfloat Vhat = Urst[id + 1 * voffset]; - const dfloat What = Urst[id + 2 * voffset]; - - const dfloat rhoM = RHO[id + soffset]; - const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat IJW = vgeo[gid + p_IJWID * p_Np]; - - NS[id] = IJW * rhoM * (Uhat * dSdr + Vhat * dSds + What * dSdt); - } - } - } - } -} @kernel void strongAdvectionCubatureVolumeHex3D(const dlong Nelements, @restrict const dfloat* vgeo, @@ -140,7 +72,6 @@ for(int b = 0; b < p_cubNq; ++b; @inner(1)) for(int a = 0; a < p_cubNq; ++a; @inner(0)) if(a < p_Nq && b < p_Nq) { - // this can be improved const dlong id = e * p_Np + c * p_Nq * p_Nq + b * p_Nq + a; // U.grad U s_S[b][a] = S[id + soffset]; @@ -187,7 +118,9 @@ } } +#ifdef smXX #pragma unroll p_cubNq +#endif for(int k = 0; k < p_cubNq; ++k) { @barrier("local"); diff --git a/okl/cds/strongAdvectionVolumeHex3D.okl b/okl/cds/strongAdvectionVolumeHex3D.okl new file mode 100644 index 000000000..bc08488e5 --- /dev/null +++ b/okl/cds/strongAdvectionVolumeHex3D.okl @@ -0,0 +1,98 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + + +@kernel void strongAdvectionVolumeHex3D(const dlong Nelements, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + const dlong voffset, + const dlong soffset, + @restrict const dfloat* S, + @restrict const dfloat* Urst, + @restrict const dfloat* RHO, + @restrict dfloat* NS) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_S[p_Nq][p_Nq]; + @exclusive dfloat s_Sloc[p_Nq]; + @shared dfloat s_D[p_Nq][p_Nq]; + +#ifdef smXX + #pragma unroll p_Nq +#endif + for(int k = 0; k < p_Nq; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)){ + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dlong id = i + j * p_Nq; + if(k == 0) + s_D[j][i] = D[id]; + + id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat Sn = S[id + soffset]; + s_S[j][i] = Sn; + if(k == 0){ + #pragma unroll p_Nq + for(int l = 0 ; l < p_Nq; ++l){ + const dlong offset = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i + soffset; + s_Sloc[l] = S[offset]; + } + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + dfloat dSdr = 0, dSds = 0, dSdt = 0; + +#pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[i][n]; + const dfloat Ds = s_D[j][n]; + const dfloat Dt = s_D[k][n]; + dSdr += Dr * s_S[j][n]; + dSds += Ds * s_S[n][i]; + dSdt += Dt * s_Sloc[n]; + } + + const dfloat Uhat = Urst[id + 0 * voffset]; + const dfloat Vhat = Urst[id + 1 * voffset]; + const dfloat What = Urst[id + 2 * voffset]; + + const dfloat rhoM = RHO[id + soffset]; + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat IJW = vgeo[gid + p_IJWID * p_Np]; + + NS[id] = IJW * rhoM * (Uhat * dSdr + Vhat * dSds + What * dSdt); + } + } + } + } +} diff --git a/okl/cds/subCycleHex3D.okl b/okl/cds/subCycleHex3D.okl deleted file mode 100644 index ce597c6d7..000000000 --- a/okl/cds/subCycleHex3D.okl +++ /dev/null @@ -1,356 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ -@kernel void subCycleStrongCubatureVolumeHex3D(const dlong Nelements, - @restrict const dlong* elementList, - @restrict const dfloat* cubD, - @restrict const dfloat* cubInterpT, - const dlong offset, - const dlong cubatureOffset, - const dlong NSOffset, - @restrict const dfloat* invLumpedMassMatrix, - @restrict const dfloat* BdivW, - const dfloat c0, - const dfloat c1, - const dfloat c2, - @restrict const dfloat* conv, - @restrict const dfloat* S, - @restrict dfloat* NU) -{ - // (phi, U.grad Ud) - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_cubD[p_cubNq][p_cubNq]; - @shared dfloat s_cubInterpT[p_Nq][p_cubNq]; - - @shared dfloat s_U[p_cubNq][p_cubNq]; - - @shared dfloat s_Ud[p_cubNq][p_cubNq]; - - @shared dfloat s_Ud1[p_Nq][p_cubNq]; - - @exclusive dfloat r_U[p_cubNq], r_V[p_cubNq], r_W[p_cubNq]; - @exclusive dfloat r_U2[p_cubNq]; - @exclusive dfloat r_Ud[p_cubNq]; - - @exclusive dfloat r_c[p_nEXT]; - - @exclusive dlong element; - - for(int j = 0; j < p_cubNq; ++j; @inner(1)) { - for(int i = 0; i < p_cubNq; ++i; @inner(0)) { - const int id = i + j * p_cubNq; - element = elementList[e]; - -#pragma unroll p_nEXT - for (int s = 0; s < p_nEXT; s++) { - dfloat coeff = 0; - if(s == 0) coeff = c0; - if(s == 1) coeff = c1; - if(s == 2) coeff = c2; - r_c[s] = coeff; - } - - if (id < p_Nq * p_cubNq) { - s_cubInterpT[j][i] = cubInterpT[id]; - } - - s_cubD[j][i] = cubD[id]; -#pragma unroll p_cubNq - for(int k = 0; k < p_cubNq; ++k) { - dfloat Ue = 0.0; - dfloat Ve = 0.0; - dfloat We = 0.0; - const int id = element * p_cubNp + k * p_cubNq * p_cubNq + j * p_cubNq + i; - for(int s = 0 ; s < p_nEXT; ++s){ - const int s_offset = s * p_NVfields * cubatureOffset; - Ue += r_c[s] * conv[id + 0 * cubatureOffset + s_offset]; - Ve += r_c[s] * conv[id + 1 * cubatureOffset + s_offset]; - We += r_c[s] * conv[id + 2 * cubatureOffset + s_offset]; - } - r_U[k] = Ue; - r_V[k] = Ve; - r_W[k] = We; - r_Ud[k] = 0; - } - } - } - - for(int c = 0; c < p_Nq; ++c) { - @barrier("local"); - - for(int b = 0; b < p_cubNq; ++b; @inner(1)) - for(int a = 0; a < p_cubNq; ++a; @inner(0)) - if(a < p_Nq && b < p_Nq) { - // this can be improved - const dlong id = element * p_Np + c * p_Nq * p_Nq + b * p_Nq + a; - - s_Ud[b][a] = S[id]; - } - - @barrier("local"); - - // interpolate in 'r' - for(int b = 0; b < p_cubNq; ++b; @inner(1)) - for(int i = 0; i < p_cubNq; ++i; @inner(0)) - if(b < p_Nq) { - dfloat Ud1 = 0; - - for(int a = 0; a < p_Nq; ++a) { - dfloat Iia = s_cubInterpT[a][i]; - Ud1 += Iia * s_Ud[b][a]; - } - - s_Ud1[b][i] = Ud1; - } - - @barrier("local"); - - // interpolate in 's' - for(int j = 0; j < p_cubNq; ++j; @inner(1)) { - for(int i = 0; i < p_cubNq; ++i; @inner(0)) { - dfloat Ud2 = 0; - - // interpolate in b - for(int b = 0; b < p_Nq; ++b) { - dfloat Ijb = s_cubInterpT[b][j]; - Ud2 += Ijb * s_Ud1[b][i]; - } - - // interpolate in c progressively -#pragma unroll p_cubNq - for(int k = 0; k < p_cubNq; ++k) { - dfloat Ikc = s_cubInterpT[c][k]; - - r_Ud[k] += Ikc * Ud2; - } - } - } - } - -#pragma unroll p_cubNq - for(int k = 0; k < p_cubNq; ++k) { - @barrier("local"); - - for(int j = 0; j < p_cubNq; ++j; @inner(1)) - for(int i = 0; i < p_cubNq; ++i; @inner(0)) { - s_Ud[j][i] = r_Ud[k]; - } - - @barrier("local"); - - for(int j = 0; j < p_cubNq; ++j; @inner(1)) - for(int i = 0; i < p_cubNq; ++i; @inner(0)) { - dfloat Udr = 0, Uds = 0, Udt = 0; - - for(int n = 0; n < p_cubNq; ++n) { - dfloat Din = s_cubD[i][n]; - Udr += Din * s_Ud[j][n]; - } - - for(int n = 0; n < p_cubNq; ++n) { - dfloat Djn = s_cubD[j][n]; - Uds += Djn * s_Ud[n][i]; - } - - for(int n = 0; n < p_cubNq; ++n) { - dfloat Dkn = s_cubD[k][n]; - Udt += Dkn * r_Ud[n]; - } - - const dfloat Uhat = r_U[k]; - const dfloat Vhat = r_V[k]; - const dfloat What = r_W[k]; - - // U*dUdx + V*dUdy + W*dUdz = (U*(drdx*dUdr+dsdx*dUds+dtdx*dUdt) + V*(drdy*dUdr ..)) - - // I_f^t*(J_f*C_f^t)*G_f*\hat{D}_f*I_f*u - r_U2[k] = Uhat * Udr + Vhat * Uds + What * Udt; - } - } - - // now project back in t - for(int c = 0; c < p_Nq; ++c) { - @barrier("local"); - - for(int j = 0; j < p_cubNq; ++j; @inner(1)) { - for(int i = 0; i < p_cubNq; ++i; @inner(0)) { - dfloat rhsU = 0; - -#pragma unroll p_cubNq - for(int k = 0; k < p_cubNq; ++k) { - dfloat Ikc = s_cubInterpT[c][k]; - rhsU += Ikc * r_U2[k]; - } - - s_U[j][i] = rhsU; - } - } - - @barrier("local"); - - for(int b = 0; b < p_cubNq; ++b; @inner(1)) - for(int i = 0; i < p_cubNq; ++i; @inner(0)) - if(b < p_Nq) { - dfloat rhsU = 0; - - for(int j = 0; j < p_cubNq; ++j) { - dfloat Ijb = s_cubInterpT[b][j]; - rhsU += Ijb * s_U[j][i]; - } - - s_Ud[b][i] = rhsU; - } - - @barrier("local"); - - for(int b = 0; b < p_cubNq; ++b; @inner(1)) - for(int a = 0; a < p_cubNq; ++a; @inner(0)) - if(a < p_Nq && b < p_Nq) { - dfloat rhsU = 0; - - for(int i = 0; i < p_cubNq; ++i) { - dfloat Iia = s_cubInterpT[a][i]; - rhsU += Iia * s_Ud[b][i]; - } - - const dlong id = element * p_Np + c * p_Nq * p_Nq + b * p_Nq + a; - dfloat invLMM = p_MovingMesh ? 0.0 : invLumpedMassMatrix[id]; - dfloat bdivw = 0.0; - if(p_MovingMesh){ - #pragma unroll - for (int s = 0; s < p_nEXT; s++) { - invLMM += r_c[s] * invLumpedMassMatrix[id + s * offset]; - bdivw += r_c[s] * BdivW[id + s * offset]; - } - } - - NU[id + NSOffset] = (rhsU - bdivw * S[id]) * invLMM; - } - } - } -} - -@kernel void subCycleStrongVolumeHex3D(const dlong Nelements, - @restrict const dlong* elementList, - @restrict const dfloat* D, - const dlong offset, - const dlong NSOffset, - @restrict const dfloat* invLumpedMassMatrix, - @restrict const dfloat* BdivW, - const dfloat c0, - const dfloat c1, - const dfloat c2, - @restrict const dfloat* conv, - @restrict const dfloat* S, - @restrict dfloat* NU) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_Ud[p_Nq][p_Nq]; - @exclusive dfloat s_Udloc[p_Nq]; - - @shared dfloat s_D[p_Nq][p_Nq]; - - @exclusive dfloat r_c[p_nEXT]; - - @exclusive dlong element; - - #pragma unroll p_Nq - for(int k = 0; k < p_Nq; ++k){ - for(int j = 0; j < p_Nq; ++j; @inner(1)){ - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dlong id = i + j * p_Nq; - element = elementList[e]; - -#pragma unroll p_nEXT - for (int s = 0; s < p_nEXT; s++) { - dfloat coeff = 0; - if(s == 0) coeff = c0; - if(s == 1) coeff = c1; - if(s == 2) coeff = c2; - r_c[s] = coeff; - } - - if(k == 0) - s_D[j][i] = D[id]; - - id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat Udn = S[id]; - - s_Ud[j][i] = Udn; - if(k == 0){ - #pragma unroll p_Nq - for(int l = 0 ; l < p_Nq; ++l){ - const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - const dfloat locUdn = S[other_id]; - s_Udloc[l] = locUdn; - } - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - - dfloat duddr = 0, dudds = 0, duddt = 0; - -#pragma unroll p_Nq - for (int n = 0; n < p_Nq; n++) { - const dfloat Dr = s_D[i][n]; - const dfloat Ds = s_D[j][n]; - const dfloat Dt = s_D[k][n]; - duddr += Dr * s_Ud[j][n]; - dudds += Ds * s_Ud[n][i]; - duddt += Dt * s_Udloc[n]; - - } - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - dfloat invLMM = p_MovingMesh ? 0.0 : invLumpedMassMatrix[id]; - dfloat bdivw = 0.0; - dfloat Ue = 0, Ve = 0, We = 0; -#pragma unroll p_nEXT - for (int s = 0; s < p_nEXT; s++) { - dfloat Um = conv[id + 0 * offset + s * p_NVfields * offset]; - dfloat Vm = conv[id + 1 * offset + s * p_NVfields * offset]; - dfloat Wm = conv[id + 2 * offset + s * p_NVfields * offset]; - Ue += r_c[s] * Um; - Ve += r_c[s] * Vm; - We += r_c[s] * Wm; - if(p_MovingMesh){ - invLMM += r_c[s] * invLumpedMassMatrix[id + s * offset]; - bdivw += r_c[s] * BdivW[id + s * offset]; - } - } - - NU[id + NSOffset] = (Ue * duddr + Ve * dudds + We * duddt - bdivw * S[id]) * invLMM; - } - } - } - } -} diff --git a/okl/cds/subCycleInitU0.okl b/okl/cds/subCycleInitU0.okl new file mode 100644 index 000000000..0d8ed60a9 --- /dev/null +++ b/okl/cds/subCycleInitU0.okl @@ -0,0 +1,59 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +@kernel void subCycleInitU0(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + const dlong torder, + const dlong nEXT, + const dlong toffset, + const dlong offset, + const dfloat coef, + @restrict const dfloat* massMatrix, + @restrict const dfloat *S, + @restrict dfloat *BS){ + for(dlong n=0;n 12*6/(1*8) = 9 TFLOPSs > peak - dfloat Dkm = s_D[k][m]; - r_Ut += Dkm * r_U[m]; - r_Vt += Dkm * r_V[m]; - r_Wt += Dkm * r_W[m]; - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat Ur = 0.f, Us = 0.f; - dfloat Vr = 0.f, Vs = 0.f; - dfloat Wr = 0.f, Ws = 0.f; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 8 shared, 12 FLOPS => 12TB/s*12/(8*8) => 2.25TF on V100 - dfloat Dim = s_D[i][m]; - dfloat Djm = s_D[j][m]; - - Ur += Dim * s_U[j][m]; - Us += Djm * s_U[m][i]; - - Vr += Dim * s_V[j][m]; - Vs += Djm * s_V[m][i]; - - Wr += Dim * s_W[j][m]; - Ws += Djm * s_W[m][i]; - } - - const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; - - const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; - - const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; - - s_GUr[j][i] = (G00 * Ur + G01 * Us + G02 * r_Ut); - s_GVr[j][i] = (G00 * Vr + G01 * Vs + G02 * r_Vt); - s_GWr[j][i] = (G00 * Wr + G01 * Ws + G02 * r_Wt); - - s_GUs[j][i] = (G01 * Ur + G11 * Us + G12 * r_Ut); - s_GVs[j][i] = (G01 * Vr + G11 * Vs + G12 * r_Vt); - s_GWs[j][i] = (G01 * Wr + G11 * Ws + G12 * r_Wt); - - r_Ut = (G02 * Ur + G12 * Us + G22 * r_Ut); - r_Vt = (G02 * Vr + G12 * Vs + G22 * r_Vt); - r_Wt = (G02 * Wr + G12 * Ws + G22 * r_Wt); - - r_AU[k] += GwJ * lambda[0 * loffset] * r_U[k]; - r_AV[k] += GwJ * lambda[1 * loffset] * r_V[k]; - r_AW[k] += GwJ * lambda[2 * loffset] * r_W[k]; - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 9 shared, 18 flops => 12TB/s*18/(9*8) = 3TFLOPS/s - dfloat Dmi = s_D[m][i]; - dfloat Dmj = s_D[m][j]; - dfloat Dkm = s_D[k][m]; - - AUtmp += Dmi * s_GUr[j][m]; - AUtmp += Dmj * s_GUs[m][i]; - - AVtmp += Dmi * s_GVr[j][m]; - AVtmp += Dmj * s_GVs[m][i]; - - AWtmp += Dmi * s_GWr[j][m]; - AWtmp += Dmj * s_GWs[m][i]; - - r_AU[m] += Dkm * r_Ut; - r_AV[m] += Dkm * r_Vt; - r_AW[m] += Dkm * r_Wt; - } - - r_AU[k] += AUtmp; - r_AV[k] += AVtmp; - r_AW[k] += AWtmp; - } - } - } - - // write out - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_AU[k]; - Aq[id + 1 * offset] = r_AV[k]; - Aq[id + 2 * offset] = r_AW[k]; - } - } - } - } -} - -// Currently Implemented for -@kernel void ellipticBlockPartialAxHex3D_N3(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dlong* elementList, - @restrict const dfloat* ggeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - - @shared dfloat s_GUr[p_Nq][p_Nq]; - @shared dfloat s_GUs[p_Nq][p_Nq]; - - @shared dfloat s_GVr[p_Nq][p_Nq]; - @shared dfloat s_GVs[p_Nq][p_Nq]; - - @shared dfloat s_GWr[p_Nq][p_Nq]; - @shared dfloat s_GWs[p_Nq][p_Nq]; - - @exclusive dfloat r_Ut, r_Vt, r_Wt; - - @exclusive dlong element; - // too much register ....(2*3*8 for N=7) - @exclusive dfloat r_U[p_Nq], r_V[p_Nq], r_W[p_Nq]; - @exclusive dfloat r_AU[p_Nq], r_AV[p_Nq], r_AW[p_Nq]; - - // array of threads - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - element = elementList[e]; - //load D into local memory - // s_D[i][j] = d \phi_i at node j - s_D[j][i] = D[p_Nq * j + i]; // D is column major - - // load pencil of u into register - const dlong base = i + j * p_Nq + element * p_Np; - - for(int k = 0; k < p_Nq; k++) { - // - r_U[k] = q[base + k * p_Nq * p_Nq + 0 * offset]; - r_V[k] = q[base + k * p_Nq * p_Nq + 1 * offset]; - r_W[k] = q[base + k * p_Nq * p_Nq + 2 * offset]; - // - r_AU[k] = 0.f; - r_AV[k] = 0.f; - r_AW[k] = 0.f; - } - } - - // Layer by layer -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - // share u(:,:,k) - s_U[j][i] = r_U[k]; - s_V[j][i] = r_V[k]; - s_W[j][i] = r_W[k]; - - r_Ut = 0; - r_Vt = 0; - r_Wt = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 1 shared, 6 flops => 12*6/(1*8) = 9 TFLOPSs > peak - dfloat Dkm = s_D[k][m]; - r_Ut += Dkm * r_U[m]; - r_Vt += Dkm * r_V[m]; - r_Wt += Dkm * r_W[m]; - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat Ur = 0.f, Us = 0.f; - dfloat Vr = 0.f, Vs = 0.f; - dfloat Wr = 0.f, Ws = 0.f; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 8 shared, 12 FLOPS => 12TB/s*12/(8*8) => 2.25TF on V100 - dfloat Dim = s_D[i][m]; - dfloat Djm = s_D[j][m]; - - Ur += Dim * s_U[j][m]; - Us += Djm * s_U[m][i]; - - Vr += Dim * s_V[j][m]; - Vs += Djm * s_V[m][i]; - - Wr += Dim * s_W[j][m]; - Ws += Djm * s_W[m][i]; - } - - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; - - const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; - - const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; - - s_GUr[j][i] = (G00 * Ur + G01 * Us + G02 * r_Ut); - s_GVr[j][i] = (G00 * Vr + G01 * Vs + G02 * r_Vt); - s_GWr[j][i] = (G00 * Wr + G01 * Ws + G02 * r_Wt); - - s_GUs[j][i] = (G01 * Ur + G11 * Us + G12 * r_Ut); - s_GVs[j][i] = (G01 * Vr + G11 * Vs + G12 * r_Vt); - s_GWs[j][i] = (G01 * Wr + G11 * Ws + G12 * r_Wt); - - r_Ut = (G02 * Ur + G12 * Us + G22 * r_Ut); - r_Vt = (G02 * Vr + G12 * Vs + G22 * r_Vt); - r_Wt = (G02 * Wr + G12 * Ws + G22 * r_Wt); - - r_AU[k] += GwJ * lambda[0 * loffset] * r_U[k]; - r_AV[k] += GwJ * lambda[1 * loffset] * r_V[k]; - r_AW[k] += GwJ * lambda[2 * loffset] * r_W[k]; - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 9 shared, 18 flops => 12TB/s*18/(9*8) = 3TFLOPS/s - dfloat Dmi = s_D[m][i]; - dfloat Dmj = s_D[m][j]; - dfloat Dkm = s_D[k][m]; - - AUtmp += Dmi * s_GUr[j][m]; - AUtmp += Dmj * s_GUs[m][i]; - - AVtmp += Dmi * s_GVr[j][m]; - AVtmp += Dmj * s_GVs[m][i]; - - AWtmp += Dmi * s_GWr[j][m]; - AWtmp += Dmj * s_GWs[m][i]; - - r_AU[m] += Dkm * r_Ut; - r_AV[m] += Dkm * r_Vt; - r_AW[m] += Dkm * r_Wt; - } - - r_AU[k] += AUtmp; - r_AV[k] += AVtmp; - r_AW[k] += AWtmp; - } - } - } - - // write out - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_AU[k]; - Aq[id + 1 * offset] = r_AV[k]; - Aq[id + 2 * offset] = r_AW[k]; - } - } - } - } -} - -// Currently Implemented for -@kernel void ellipticBlockAxVarHex3D_N3(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dlong* elementList, - @restrict const dfloat* ggeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - - @shared dfloat s_GUr[p_Nq][p_Nq]; - @shared dfloat s_GUs[p_Nq][p_Nq]; - - @shared dfloat s_GVr[p_Nq][p_Nq]; - @shared dfloat s_GVs[p_Nq][p_Nq]; - - @shared dfloat s_GWr[p_Nq][p_Nq]; - @shared dfloat s_GWs[p_Nq][p_Nq]; - - @exclusive dfloat r_Ut, r_Vt, r_Wt; - - // too much register ....(2*3*8 for N=7) - @exclusive dfloat r_U[p_Nq], r_V[p_Nq], r_W[p_Nq]; - @exclusive dfloat r_AU[p_Nq], r_AV[p_Nq], r_AW[p_Nq]; - - // array of threads - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - //load D into local memory - // s_D[i][j] = d \phi_i at node j - s_D[j][i] = D[p_Nq * j + i]; // D is column major - - // load pencil of u into register - const dlong base = i + j * p_Nq + e * p_Np; - - for(int k = 0; k < p_Nq; k++) { - // - r_U[k] = q[base + k * p_Nq * p_Nq + 0 * offset]; - r_V[k] = q[base + k * p_Nq * p_Nq + 1 * offset]; - r_W[k] = q[base + k * p_Nq * p_Nq + 2 * offset]; - // - r_AU[k] = 0.f; - r_AV[k] = 0.f; - r_AW[k] = 0.f; - } - } - - // Layer by layer -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - // share u(:,:,k) - s_U[j][i] = r_U[k]; - s_V[j][i] = r_V[k]; - s_W[j][i] = r_W[k]; - - r_Ut = 0; - r_Vt = 0; - r_Wt = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 1 shared, 6 flops => 12*6/(1*8) = 9 TFLOPSs > peak - dfloat Dkm = s_D[k][m]; - r_Ut += Dkm * r_U[m]; - r_Vt += Dkm * r_V[m]; - r_Wt += Dkm * r_W[m]; - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat Ur = 0.f, Us = 0.f; - dfloat Vr = 0.f, Vs = 0.f; - dfloat Wr = 0.f, Ws = 0.f; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 8 shared, 12 FLOPS => 12TB/s*12/(8*8) => 2.25TF on V100 - dfloat Dim = s_D[i][m]; - dfloat Djm = s_D[j][m]; - - Ur += Dim * s_U[j][m]; - Us += Djm * s_U[m][i]; - - Vr += Dim * s_V[j][m]; - Vs += Djm * s_V[m][i]; - - Wr += Dim * s_W[j][m]; - Ws += Djm * s_W[m][i]; - } - - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; - const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; - const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; - const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; - - const dlong gbase = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; - const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; - const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; - - s_GUr[j][i] = u_lam0 * (G00 * Ur + G01 * Us + G02 * r_Ut); - s_GUs[j][i] = u_lam0 * (G01 * Ur + G11 * Us + G12 * r_Ut); - r_Ut = u_lam0 * (G02 * Ur + G12 * Us + G22 * r_Ut); - - s_GVr[j][i] = v_lam0 * (G00 * Vr + G01 * Vs + G02 * r_Vt); - s_GVs[j][i] = v_lam0 * (G01 * Vr + G11 * Vs + G12 * r_Vt); - r_Vt = v_lam0 * (G02 * Vr + G12 * Vs + G22 * r_Vt); - - s_GWr[j][i] = w_lam0 * (G00 * Wr + G01 * Ws + G02 * r_Wt); - s_GWs[j][i] = w_lam0 * (G01 * Wr + G11 * Ws + G12 * r_Wt); - r_Wt = w_lam0 * (G02 * Wr + G12 * Ws + G22 * r_Wt); - - r_AU[k] += GwJ * u_lam1 * r_U[k]; - r_AV[k] += GwJ * v_lam1 * r_V[k]; - r_AW[k] += GwJ * w_lam1 * r_W[k]; - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 9 shared, 18 flops => 12TB/s*18/(9*8) = 3TFLOPS/s - dfloat Dmi = s_D[m][i]; - dfloat Dmj = s_D[m][j]; - dfloat Dkm = s_D[k][m]; - - AUtmp += Dmi * s_GUr[j][m]; - AUtmp += Dmj * s_GUs[m][i]; - - AVtmp += Dmi * s_GVr[j][m]; - AVtmp += Dmj * s_GVs[m][i]; - - AWtmp += Dmi * s_GWr[j][m]; - AWtmp += Dmj * s_GWs[m][i]; - - r_AU[m] += Dkm * r_Ut; - r_AV[m] += Dkm * r_Vt; - r_AW[m] += Dkm * r_Wt; - } - - r_AU[k] += AUtmp; - r_AV[k] += AVtmp; - r_AW[k] += AWtmp; - } - } - } - - // write out - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_AU[k]; - Aq[id + 1 * offset] = r_AV[k]; - Aq[id + 2 * offset] = r_AW[k]; - } - } - } - } -} - -// Currently Implemented for -@kernel void ellipticBlockPartialAxVarHex3D_N3(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dlong* elementList, - @restrict const dfloat* ggeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - - @shared dfloat s_GUr[p_Nq][p_Nq]; - @shared dfloat s_GUs[p_Nq][p_Nq]; - - @shared dfloat s_GVr[p_Nq][p_Nq]; - @shared dfloat s_GVs[p_Nq][p_Nq]; - - @shared dfloat s_GWr[p_Nq][p_Nq]; - @shared dfloat s_GWs[p_Nq][p_Nq]; - - @exclusive dfloat r_Ut, r_Vt, r_Wt; - - @exclusive dlong element; - // too much register ....(2*3*8 for N=7) - @exclusive dfloat r_U[p_Nq], r_V[p_Nq], r_W[p_Nq]; - @exclusive dfloat r_AU[p_Nq], r_AV[p_Nq], r_AW[p_Nq]; - - // array of threads - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - element = elementList[e]; - //load D into local memory - // s_D[i][j] = d \phi_i at node j - s_D[j][i] = D[p_Nq * j + i]; // D is column major - - // load pencil of u into register - const dlong base = i + j * p_Nq + element * p_Np; - - for(int k = 0; k < p_Nq; k++) { - // - r_U[k] = q[base + k * p_Nq * p_Nq + 0 * offset]; - r_V[k] = q[base + k * p_Nq * p_Nq + 1 * offset]; - r_W[k] = q[base + k * p_Nq * p_Nq + 2 * offset]; - // - r_AU[k] = 0.f; - r_AV[k] = 0.f; - r_AW[k] = 0.f; - } - } - - // Layer by layer -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - // share u(:,:,k) - s_U[j][i] = r_U[k]; - s_V[j][i] = r_V[k]; - s_W[j][i] = r_W[k]; - - r_Ut = 0; - r_Vt = 0; - r_Wt = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 1 shared, 6 flops => 12*6/(1*8) = 9 TFLOPSs > peak - dfloat Dkm = s_D[k][m]; - r_Ut += Dkm * r_U[m]; - r_Vt += Dkm * r_V[m]; - r_Wt += Dkm * r_W[m]; - } - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat Ur = 0.f, Us = 0.f; - dfloat Vr = 0.f, Vs = 0.f; - dfloat Wr = 0.f, Ws = 0.f; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 8 shared, 12 FLOPS => 12TB/s*12/(8*8) => 2.25TF on V100 - dfloat Dim = s_D[i][m]; - dfloat Djm = s_D[j][m]; - Ur += Dim * s_U[j][m]; - Us += Djm * s_U[m][i]; - Vr += Dim * s_V[j][m]; - Vs += Djm * s_V[m][i]; - Wr += Dim * s_W[j][m]; - Ws += Djm * s_W[m][i]; - } - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; - const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; - const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; - const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; - - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; - const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; - const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; - - s_GUr[j][i] = u_lam0 * (G00 * Ur + G01 * Us + G02 * r_Ut); - s_GUs[j][i] = u_lam0 * (G01 * Ur + G11 * Us + G12 * r_Ut); - r_Ut = u_lam0 * (G02 * Ur + G12 * Us + G22 * r_Ut); - // - s_GVr[j][i] = v_lam0 * (G00 * Vr + G01 * Vs + G02 * r_Vt); - s_GVs[j][i] = v_lam0 * (G01 * Vr + G11 * Vs + G12 * r_Vt); - r_Vt = v_lam0 * (G02 * Vr + G12 * Vs + G22 * r_Vt); - // - s_GWr[j][i] = w_lam0 * (G00 * Wr + G01 * Ws + G02 * r_Wt); - s_GWs[j][i] = w_lam0 * (G01 * Wr + G11 * Ws + G12 * r_Wt); - r_Wt = w_lam0 * (G02 * Wr + G12 * Ws + G22 * r_Wt); - // - r_AU[k] += GwJ * u_lam1 * r_U[k]; - r_AV[k] += GwJ * v_lam1 * r_V[k]; - r_AW[k] += GwJ * w_lam1 * r_W[k]; - } - } - - @barrier("local"); - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; - -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - // 9 shared, 18 flops => 12TB/s*18/(9*8) = 3TFLOPS/s - dfloat Dmi = s_D[m][i]; - dfloat Dmj = s_D[m][j]; - dfloat Dkm = s_D[k][m]; - - AUtmp += Dmi * s_GUr[j][m]; - AUtmp += Dmj * s_GUs[m][i]; - - AVtmp += Dmi * s_GVr[j][m]; - AVtmp += Dmj * s_GVs[m][i]; - - AWtmp += Dmi * s_GWr[j][m]; - AWtmp += Dmj * s_GWs[m][i]; - - r_AU[m] += Dkm * r_Ut; - r_AV[m] += Dkm * r_Vt; - r_AW[m] += Dkm * r_Wt; - } - - r_AU[k] += AUtmp; - r_AV[k] += AVtmp; - r_AW[k] += AWtmp; - } - } - } - - // write out - - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int k = 0; k < p_Nq; k++) { - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_AU[k]; - Aq[id + 1 * offset] = r_AV[k]; - Aq[id + 2 * offset] = r_AW[k]; - } - } - } - } -} - -@kernel void ellipticStressAxVarHex3D(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - // AK: heavy memory usage, optimize later - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - @shared dfloat s_SUr[p_Nq][p_Nq]; - @shared dfloat s_SUs[p_Nq][p_Nq]; - - @exclusive dfloat s_Uloc[p_Nq]; - @exclusive dfloat s_Vloc[p_Nq]; - @exclusive dfloat s_Wloc[p_Nq]; - - @exclusive dfloat s_SUtloc[p_Nq]; - - @shared dfloat s_SVr[p_Nq][p_Nq]; - @shared dfloat s_SVs[p_Nq][p_Nq]; - @exclusive dfloat s_SVt[p_Nq]; - - @shared dfloat s_SWr[p_Nq][p_Nq]; - @shared dfloat s_SWs[p_Nq][p_Nq]; - @exclusive dfloat s_SWt[p_Nq]; - // - @exclusive dfloat rx, ry, rz; - @exclusive dfloat sx, sy, sz; - @exclusive dfloat tx, ty, tz; - - // Symmetric Stress Tensor - @exclusive dfloat s11,s12,s13; - @exclusive dfloat s21,s22,s23; - @exclusive dfloat s31,s32,s33; - - @exclusive dfloat r_Au[p_Nq]; - @exclusive dfloat r_Av[p_Nq]; - @exclusive dfloat r_Aw[p_Nq]; - - // prefetch q - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - if(k == 0) s_D[j][i] = D[p_Nq * j + i]; - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - s_U[j][i] = q[id + 0 * offset]; - s_V[j][i] = q[id + 1 * offset]; - s_W[j][i] = q[id + 2 * offset]; - if(k == 0) { - for(int l = 0; l < p_Nq; ++l) { - const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - s_Uloc[l] = q[other_id + 0 * offset]; - s_Vloc[l] = q[other_id + 1 * offset]; - s_Wloc[l] = q[other_id + 2 * offset]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; - rx = vgeo[gid + p_RXID * p_Np]; - ry = vgeo[gid + p_RYID * p_Np]; - rz = vgeo[gid + p_RZID * p_Np]; - - sx = vgeo[gid + p_SXID * p_Np]; - sy = vgeo[gid + p_SYID * p_Np]; - sz = vgeo[gid + p_SZID * p_Np]; - - tx = vgeo[gid + p_TXID * p_Np]; - ty = vgeo[gid + p_TYID * p_Np]; - tz = vgeo[gid + p_TZID * p_Np]; - - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - // compute 1D derivatives - dfloat ur = 0.f, us = 0.f, ut = 0.f; - dfloat vr = 0.f, vs = 0.f, vt = 0.f; - dfloat wr = 0.f, ws = 0.f, wt = 0.f; - for(int m = 0; m < p_Nq; ++m) { - const dfloat Dim = s_D[i][m]; // Dr - const dfloat Djm = s_D[j][m]; // Ds - const dfloat Dkm = s_D[k][m]; // Dt - - ur += Dim * s_U[j][m]; - us += Djm * s_U[m][i]; - ut += Dkm * s_Uloc[m]; - // - vr += Dim * s_V[j][m]; - vs += Djm * s_V[m][i]; - vt += Dkm * s_Vloc[m]; - // - wr += Dim * s_W[j][m]; - ws += Djm * s_W[m][i]; - wt += Dkm * s_Wloc[m]; - } - - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - // not sure that we need anistropic diffusion!!!! - // con be simplified for istropic diffusion - const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; - const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; - const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; - const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; - - const dfloat dudx = rx * ur + sx * us + tx * ut; - const dfloat dudy = ry * ur + sy * us + ty * ut; - const dfloat dudz = rz * ur + sz * us + tz * ut; - - const dfloat dvdx = rx * vr + sx * vs + tx * vt; - const dfloat dvdy = ry * vr + sy * vs + ty * vt; - const dfloat dvdz = rz * vr + sz * vs + tz * vt; - - const dfloat dwdx = rx * wr + sx * ws + tx * wt; - const dfloat dwdy = ry * wr + sy * ws + ty * wt; - const dfloat dwdz = rz * wr + sz * ws + tz * wt; - - s11 = u_lam0 * JW * (dudx + dudx); - s12 = u_lam0 * JW * (dudy + dvdx); - s13 = u_lam0 * JW * (dudz + dwdx); - - s21 = v_lam0 * JW * (dvdx + dudy); - s22 = v_lam0 * JW * (dvdy + dvdy); - s23 = v_lam0 * JW * (dvdz + dwdy); - - s31 = w_lam0 * JW * (dwdx + dudz); - s32 = w_lam0 * JW * (dwdy + dvdz); - s33 = w_lam0 * JW * (dwdz + dwdz); - // store in register - r_Au[k] = u_lam1 * JW * s_U[j][i]; - r_Av[k] = v_lam1 * JW * s_V[j][i]; - r_Aw[k] = w_lam1 * JW * s_W[j][i]; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; - s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; - s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; - // - s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; - s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; - s_SVt[k] = tx * s21 + ty * s22 + tz * s23; - // - s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; - s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; - s_SWt[k] = tx * s31 + ty * s32 + tz * s33; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dim = s_D[m][i]; // Dr' - const dfloat Djm = s_D[m][j]; // Ds' - - r_Au[k] += Dim * s_SUr[j][m]; - r_Au[k] += Djm * s_SUs[m][i]; - - r_Av[k] += Dim * s_SVr[j][m]; - r_Av[k] += Djm * s_SVs[m][i]; - - r_Aw[k] += Dim * s_SWr[j][m]; - r_Aw[k] += Djm * s_SWs[m][i]; - } - } - } - @barrier("local"); - } - - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dkm = s_D[m][k]; // Dt' - - r_Au[k] += Dkm * s_SUtloc[m]; - - r_Av[k] += Dkm * s_SVt[m]; - - r_Aw[k] += Dkm * s_SWt[m]; - } - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_Au[k]; - Aq[id + 1 * offset] = r_Av[k]; - Aq[id + 2 * offset] = r_Aw[k]; - } - } - } - } -} - -// -@kernel void ellipticStressPartialAxVarHex3D(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dlong* elementList, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - @exclusive dfloat s_Uloc[p_Nq]; - @exclusive dfloat s_Vloc[p_Nq]; - @exclusive dfloat s_Wloc[p_Nq]; - @shared dfloat s_SUr[p_Nq][p_Nq]; - @shared dfloat s_SUs[p_Nq][p_Nq]; - @exclusive dfloat s_SUtloc[p_Nq]; - - @shared dfloat s_SVr[p_Nq][p_Nq]; - @shared dfloat s_SVs[p_Nq][p_Nq]; - @exclusive dfloat s_SVt[p_Nq]; - - @shared dfloat s_SWr[p_Nq][p_Nq]; - @shared dfloat s_SWs[p_Nq][p_Nq]; - @exclusive dfloat s_SWt[p_Nq]; - - // - @exclusive dfloat rx, ry, rz; - @exclusive dfloat sx, sy, sz; - @exclusive dfloat tx, ty, tz; - // Symmetric Stress Tensor - @exclusive dfloat s11,s12,s13; - @exclusive dfloat s21,s22,s23; - @exclusive dfloat s31,s32,s33; - - @exclusive dfloat r_Au[p_Nq]; - @exclusive dfloat r_Av[p_Nq]; - @exclusive dfloat r_Aw[p_Nq]; - @exclusive dlong element; - - // prefetch q - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - element = elementList[e]; - if(k == 0) s_D[j][i] = D[p_Nq * j + i]; - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - s_U[j][i] = q[id + 0 * offset]; - s_V[j][i] = q[id + 1 * offset]; - s_W[j][i] = q[id + 2 * offset]; - if(k == 0) { - for(int l = 0; l < p_Nq; ++l) { - const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - s_Uloc[l] = q[other_id + 0 * offset]; - s_Vloc[l] = q[other_id + 1 * offset]; - s_Wloc[l] = q[other_id + 2 * offset]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np * p_Nvgeo; - rx = vgeo[gid + p_RXID * p_Np]; - ry = vgeo[gid + p_RYID * p_Np]; - rz = vgeo[gid + p_RZID * p_Np]; - - sx = vgeo[gid + p_SXID * p_Np]; - sy = vgeo[gid + p_SYID * p_Np]; - sz = vgeo[gid + p_SZID * p_Np]; - - tx = vgeo[gid + p_TXID * p_Np]; - ty = vgeo[gid + p_TYID * p_Np]; - tz = vgeo[gid + p_TZID * p_Np]; - - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - // compute 1D derivatives - dfloat ur = 0.f, us = 0.f, ut = 0.f; - dfloat vr = 0.f, vs = 0.f, vt = 0.f; - dfloat wr = 0.f, ws = 0.f, wt = 0.f; - for(int m = 0; m < p_Nq; ++m) { - const dfloat Dim = s_D[i][m]; // Dr - const dfloat Djm = s_D[j][m]; // Ds - const dfloat Dkm = s_D[k][m]; // Dt - - ur += Dim * s_U[j][m]; - us += Djm * s_U[m][i]; - ut += Dkm * s_Uloc[m]; - // - vr += Dim * s_V[j][m]; - vs += Djm * s_V[m][i]; - vt += Dkm * s_Vloc[m]; - // - wr += Dim * s_W[j][m]; - ws += Djm * s_W[m][i]; - wt += Dkm * s_Wloc[m]; - } - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; - const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; - const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; - const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; - - const dfloat dudx = rx * ur + sx * us + tx * ut; - const dfloat dudy = ry * ur + sy * us + ty * ut; - const dfloat dudz = rz * ur + sz * us + tz * ut; - - const dfloat dvdx = rx * vr + sx * vs + tx * vt; - const dfloat dvdy = ry * vr + sy * vs + ty * vt; - const dfloat dvdz = rz * vr + sz * vs + tz * vt; - - const dfloat dwdx = rx * wr + sx * ws + tx * wt; - const dfloat dwdy = ry * wr + sy * ws + ty * wt; - const dfloat dwdz = rz * wr + sz * ws + tz * wt; - - s11 = u_lam0 * JW * (dudx + dudx); - s12 = u_lam0 * JW * (dudy + dvdx); - s13 = u_lam0 * JW * (dudz + dwdx); - - s21 = v_lam0 * JW * (dvdx + dudy); - s22 = v_lam0 * JW * (dvdy + dvdy); - s23 = v_lam0 * JW * (dvdz + dwdy); - - s31 = w_lam0 * JW * (dwdx + dudz); - s32 = w_lam0 * JW * (dwdy + dvdz); - s33 = w_lam0 * JW * (dwdz + dwdz); - // store in register - r_Au[k] = u_lam1 * JW * s_U[j][i]; - r_Av[k] = v_lam1 * JW * s_V[j][i]; - r_Aw[k] = w_lam1 * JW * s_W[j][i]; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; - s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; - s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; - // - s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; - s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; - s_SVt[k] = tx * s21 + ty * s22 + tz * s23; - // - s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; - s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; - s_SWt[k] = tx * s31 + ty * s32 + tz * s33; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dim = s_D[m][i]; // Dr' - const dfloat Djm = s_D[m][j]; // Ds' - - r_Au[k] += Dim * s_SUr[j][m]; - r_Au[k] += Djm * s_SUs[m][i]; - - r_Av[k] += Dim * s_SVr[j][m]; - r_Av[k] += Djm * s_SVs[m][i]; - - r_Aw[k] += Dim * s_SWr[j][m]; - r_Aw[k] += Djm * s_SWs[m][i]; - } - } - } - } - -// loop over slabs - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dkm = s_D[m][k]; // Dt' - - r_Au[k] += Dkm * s_SUtloc[m]; - - r_Av[k] += Dkm * s_SVt[m]; - - r_Aw[k] += Dkm * s_SWt[m]; - } - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_Au[k]; - Aq[id + 1 * offset] = r_Av[k]; - Aq[id + 2 * offset] = r_Aw[k]; - } - } - } - } -} - -// -@kernel void ellipticStressAxHex3D(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - // AK: heavy memory usage, optimize later - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - @shared dfloat s_SUr[p_Nq][p_Nq]; - @shared dfloat s_SUs[p_Nq][p_Nq]; - @exclusive dfloat s_Uloc[p_Nq]; - @exclusive dfloat s_Vloc[p_Nq]; - @exclusive dfloat s_Wloc[p_Nq]; - @exclusive dfloat s_SUtloc[p_Nq]; - - @shared dfloat s_SVr[p_Nq][p_Nq]; - @shared dfloat s_SVs[p_Nq][p_Nq]; - @exclusive dfloat s_SVt[p_Nq]; - - @shared dfloat s_SWr[p_Nq][p_Nq]; - @shared dfloat s_SWs[p_Nq][p_Nq]; - @exclusive dfloat s_SWt[p_Nq]; - // - @exclusive dfloat rx, ry, rz; - @exclusive dfloat sx, sy, sz; - @exclusive dfloat tx, ty, tz; - - // Symmetric Stress Tensor - @exclusive dfloat s11,s12,s13; - @exclusive dfloat s21,s22,s23; - @exclusive dfloat s31,s32,s33; - - @exclusive dfloat r_Au[p_Nq]; - @exclusive dfloat r_Av[p_Nq]; - @exclusive dfloat r_Aw[p_Nq]; - - // prefetch q - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - if(k == 0) s_D[j][i] = D[p_Nq * j + i]; - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - s_U[j][i] = q[id + 0 * offset]; - s_V[j][i] = q[id + 1 * offset]; - s_W[j][i] = q[id + 2 * offset]; - if(k == 0) { - for(int l = 0; l < p_Nq; ++l) { - const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - s_Uloc[l] = q[other_id + 0 * offset]; - s_Vloc[l] = q[other_id + 1 * offset]; - s_Wloc[l] = q[other_id + 2 * offset]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; - rx = vgeo[gid + p_RXID * p_Np]; - ry = vgeo[gid + p_RYID * p_Np]; - rz = vgeo[gid + p_RZID * p_Np]; - - sx = vgeo[gid + p_SXID * p_Np]; - sy = vgeo[gid + p_SYID * p_Np]; - sz = vgeo[gid + p_SZID * p_Np]; - - tx = vgeo[gid + p_TXID * p_Np]; - ty = vgeo[gid + p_TYID * p_Np]; - tz = vgeo[gid + p_TZID * p_Np]; - - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - // compute 1D derivatives - dfloat ur = 0.f, us = 0.f, ut = 0.f; - dfloat vr = 0.f, vs = 0.f, vt = 0.f; - dfloat wr = 0.f, ws = 0.f, wt = 0.f; - for(int m = 0; m < p_Nq; ++m) { - const dfloat Dim = s_D[i][m]; // Dr - const dfloat Djm = s_D[j][m]; // Ds - const dfloat Dkm = s_D[k][m]; // Dt - - ur += Dim * s_U[j][m]; - us += Djm * s_U[m][i]; - ut += Dkm * s_Uloc[m]; - // - vr += Dim * s_V[j][m]; - vs += Djm * s_V[m][i]; - vt += Dkm * s_Vloc[m]; - // - wr += Dim * s_W[j][m]; - ws += Djm * s_W[m][i]; - wt += Dkm * s_Wloc[m]; - } - - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat dudx = rx * ur + sx * us + tx * ut; - const dfloat dudy = ry * ur + sy * us + ty * ut; - const dfloat dudz = rz * ur + sz * us + tz * ut; - - const dfloat dvdx = rx * vr + sx * vs + tx * vt; - const dfloat dvdy = ry * vr + sy * vs + ty * vt; - const dfloat dvdz = rz * vr + sz * vs + tz * vt; - - const dfloat dwdx = rx * wr + sx * ws + tx * wt; - const dfloat dwdy = ry * wr + sy * ws + ty * wt; - const dfloat dwdz = rz * wr + sz * ws + tz * wt; - - s11 = JW * (dudx + dudx); - s12 = JW * (dudy + dvdx); - s13 = JW * (dudz + dwdx); - - s21 = JW * (dvdx + dudy); - s22 = JW * (dvdy + dvdy); - s23 = JW * (dvdz + dwdy); - - s31 = JW * (dwdx + dudz); - s32 = JW * (dwdy + dvdz); - s33 = JW * (dwdz + dwdz); - // store in register - r_Au[k] = lambda[id + 1 * offset + 0 * loffset] * JW * s_U[j][i]; - r_Av[k] = lambda[id + 1 * offset + 1 * loffset] * JW * s_V[j][i]; - r_Aw[k] = lambda[id + 1 * offset + 2 * loffset] * JW * s_W[j][i]; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; - s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; - s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; - // - s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; - s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; - s_SVt[k] = tx * s21 + ty * s22 + tz * s23; - // - s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; - s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; - s_SWt[k] = tx * s31 + ty * s32 + tz * s33; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dim = s_D[m][i]; // Dr' - const dfloat Djm = s_D[m][j]; // Ds' - - r_Au[k] += Dim * s_SUr[j][m]; - r_Au[k] += Djm * s_SUs[m][i]; - - r_Av[k] += Dim * s_SVr[j][m]; - r_Av[k] += Djm * s_SVs[m][i]; - - r_Aw[k] += Dim * s_SWr[j][m]; - r_Aw[k] += Djm * s_SWs[m][i]; - } - } - } - } - - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dkm = s_D[m][k]; // Dt' - - r_Au[k] += Dkm * s_SUtloc[m]; - - r_Av[k] += Dkm * s_SVt[m]; - - r_Aw[k] += Dkm * s_SWt[m]; - } - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_Au[k]; - Aq[id + 1 * offset] = r_Av[k]; - Aq[id + 2 * offset] = r_Aw[k]; - } - } - } - } -} - -// -@kernel void ellipticStressPartialAxHex3D(const dlong Nelements, - const dlong offset, - const dlong loffset, - @restrict const dlong* elementList, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - @restrict const dfloat* S, - @restrict const dfloat* lambda, - @restrict const dfloat* q, - @restrict dfloat* Aq) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_D[p_Nq][p_Nq]; - - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - @shared dfloat s_SUr[p_Nq][p_Nq]; - @shared dfloat s_SUs[p_Nq][p_Nq]; - @exclusive dfloat s_Uloc[p_Nq]; - @exclusive dfloat s_Vloc[p_Nq]; - @exclusive dfloat s_Wloc[p_Nq]; - @exclusive dfloat s_SUtloc[p_Nq]; - - @shared dfloat s_SVr[p_Nq][p_Nq]; - @shared dfloat s_SVs[p_Nq][p_Nq]; - @exclusive dfloat s_SVt[p_Nq]; - - @shared dfloat s_SWr[p_Nq][p_Nq]; - @shared dfloat s_SWs[p_Nq][p_Nq]; - @exclusive dfloat s_SWt[p_Nq]; - - // - @exclusive dfloat rx, ry, rz; - @exclusive dfloat sx, sy, sz; - @exclusive dfloat tx, ty, tz; - // Symmetric Stress Tensor - @exclusive dfloat s11,s12,s13; - @exclusive dfloat s21,s22,s23; - @exclusive dfloat s31,s32,s33; - - @exclusive dfloat r_Au[p_Nq]; - @exclusive dfloat r_Av[p_Nq]; - @exclusive dfloat r_Aw[p_Nq]; - @exclusive dlong element; - - // prefetch q - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - element = elementList[e]; - if(k == 0) s_D[j][i] = D[p_Nq * j + i]; - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - s_U[j][i] = q[id + 0 * offset]; - s_V[j][i] = q[id + 1 * offset]; - s_W[j][i] = q[id + 2 * offset]; - if(k == 0) { - for(int l = 0; l < p_Nq; ++l) { - const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - s_Uloc[l] = q[other_id + 0 * offset]; - s_Vloc[l] = q[other_id + 1 * offset]; - s_Wloc[l] = q[other_id + 2 * offset]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np * p_Nvgeo; - rx = vgeo[gid + p_RXID * p_Np]; - ry = vgeo[gid + p_RYID * p_Np]; - rz = vgeo[gid + p_RZID * p_Np]; - - sx = vgeo[gid + p_SXID * p_Np]; - sy = vgeo[gid + p_SYID * p_Np]; - sz = vgeo[gid + p_SZID * p_Np]; - - tx = vgeo[gid + p_TXID * p_Np]; - ty = vgeo[gid + p_TYID * p_Np]; - tz = vgeo[gid + p_TZID * p_Np]; - - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - // compute 1D derivatives - dfloat ur = 0.f, us = 0.f, ut = 0.f; - dfloat vr = 0.f, vs = 0.f, vt = 0.f; - dfloat wr = 0.f, ws = 0.f, wt = 0.f; - for(int m = 0; m < p_Nq; ++m) { - const dfloat Dim = s_D[i][m]; // Dr - const dfloat Djm = s_D[j][m]; // Ds - const dfloat Dkm = s_D[k][m]; // Dt - - ur += Dim * s_U[j][m]; - us += Djm * s_U[m][i]; - ut += Dkm * s_Uloc[m]; - // - vr += Dim * s_V[j][m]; - vs += Djm * s_V[m][i]; - vt += Dkm * s_Vloc[m]; - // - wr += Dim * s_W[j][m]; - ws += Djm * s_W[m][i]; - wt += Dkm * s_Wloc[m]; - } - - const dfloat dudx = rx * ur + sx * us + tx * ut; - const dfloat dudy = ry * ur + sy * us + ty * ut; - const dfloat dudz = rz * ur + sz * us + tz * ut; - - const dfloat dvdx = rx * vr + sx * vs + tx * vt; - const dfloat dvdy = ry * vr + sy * vs + ty * vt; - const dfloat dvdz = rz * vr + sz * vs + tz * vt; - - const dfloat dwdx = rx * wr + sx * ws + tx * wt; - const dfloat dwdy = ry * wr + sy * ws + ty * wt; - const dfloat dwdz = rz * wr + sz * ws + tz * wt; - - s11 = JW * (dudx + dudx); - s12 = JW * (dudy + dvdx); - s13 = JW * (dudz + dwdx); - - s21 = JW * (dvdx + dudy); - s22 = JW * (dvdy + dvdy); - s23 = JW * (dvdz + dwdy); - - s31 = JW * (dwdx + dudz); - s32 = JW * (dwdy + dvdz); - s33 = JW * (dwdz + dwdz); - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - r_Au[k] = lambda[id + 1 * offset + 0 * loffset] * JW * s_U[j][i]; - r_Av[k] = lambda[id + 1 * offset + 1 * loffset] * JW * s_V[j][i]; - r_Aw[k] = lambda[id + 1 * offset + 2 * loffset] * JW * s_W[j][i]; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; - s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; - s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; - // - s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; - s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; - s_SVt[k] = tx * s21 + ty * s22 + tz * s23; - // - s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; - s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; - s_SWt[k] = tx * s31 + ty * s32 + tz * s33; - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dim = s_D[m][i]; // Dr' - const dfloat Djm = s_D[m][j]; // Ds' - - r_Au[k] += Dim * s_SUr[j][m]; - r_Au[k] += Djm * s_SUs[m][i]; - - r_Av[k] += Dim * s_SVr[j][m]; - r_Av[k] += Djm * s_SVs[m][i]; - - r_Aw[k] += Dim * s_SWr[j][m]; - r_Aw[k] += Djm * s_SWs[m][i]; - } - } - } - } - for(int k = 0; k < p_Nq; ++k) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { -#pragma unroll p_Nq - for(int m = 0; m < p_Nq; m++) { - const dfloat Dkm = s_D[m][k]; // Dt' - - r_Au[k] += Dkm * s_SUtloc[m]; - - r_Av[k] += Dkm * s_SVt[m]; - - r_Aw[k] += Dkm * s_SWt[m]; - } - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id + 0 * offset] = r_Au[k]; - Aq[id + 1 * offset] = r_Av[k]; - Aq[id + 2 * offset] = r_Aw[k]; - } - } - } - } -} \ No newline at end of file diff --git a/okl/elliptic/ellipticBuildDiagonalHex3D.okl b/okl/elliptic/ellipticBlockBuildDiagonalHex3D.okl similarity index 85% rename from okl/elliptic/ellipticBuildDiagonalHex3D.okl rename to okl/elliptic/ellipticBlockBuildDiagonalHex3D.okl index f73e090ab..a4aa1e6ec 100644 --- a/okl/elliptic/ellipticBuildDiagonalHex3D.okl +++ b/okl/elliptic/ellipticBlockBuildDiagonalHex3D.okl @@ -23,12 +23,11 @@ SOFTWARE. */ +#define p_MaxNFields (3) @kernel void ellipticBlockBuildDiagonalHex3D(const dlong Nelements, + const dlong Nfields, const dlong offset, const dlong loffset, - const int allNeumann, - const dfloat allNeumannScale, - @restrict const int* mapB, @restrict const dfloat* ggeo, @restrict const dfloat* D, @restrict const dfloat* S, @@ -38,13 +37,15 @@ for(dlong e = 0; e < Nelements; ++e; @outer(0)) { @shared dfloat s_D[p_Nq][p_Nq]; - @shared dfloat s_lambda0[p_Nfields][p_Nq][p_Nq]; + @shared dfloat s_lambda0[p_MaxNFields][p_Nq][p_Nq]; @shared dfloat s_Grr[p_Nq][p_Nq]; @shared dfloat s_Gss[p_Nq][p_Nq]; @exclusive dfloat s_Gtt[p_Nq]; - @exclusive dfloat s_lambdat[p_Nfields][p_Nq]; - // prefetch lamda 0 + @exclusive dfloat s_lambdat[p_MaxNFields][p_Nq]; + +#ifdef smXX #pragma unroll p_Nq +#endif for(int k = 0; k < p_Nq; ++k) { for(int j = 0; j < p_Nq; ++j; @inner(1)) { for(int i = 0; i < p_Nq; ++i; @inner(0)) { @@ -57,7 +58,7 @@ s_Grr[j][i] = ggeo[base + p_G00ID * p_Np]; s_Gss[j][i] = ggeo[base + p_G11ID * p_Np]; - for(int l = 0; l < p_Nfields; l++) + for(int l = 0; l < Nfields; l++) s_lambda0[l][j][i] = lambda[id + 0 * offset + l * loffset]; if( k == 0 ) { #pragma unroll p_Nq @@ -65,7 +66,7 @@ const dlong other_base = e * p_Nggeo * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; s_Gtt[l] = ggeo[other_base + p_G22ID * p_Np]; - for(int field = 0; field < p_Nfields; field++) + for(int field = 0; field < Nfields; field++) s_lambdat[field][l] = lambda[other_id + 0 * offset + field * loffset]; } } @@ -78,10 +79,8 @@ const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; dfloat r_q = 1.0; - for(int l = 0; l < p_Nfields; l++) { - const int if_not_masked = (mapB[id + l * offset] != 1 ) ? 1:0; + for(int l = 0; l < Nfields; l++) { - if(if_not_masked) { r_q = 0.0; // first make it zero const dlong base = e * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; @@ -90,7 +89,6 @@ dfloat gst = ggeo[base + p_G12ID * p_Np]; dfloat gwJ = ggeo[base + p_GWJID * p_Np]; - dfloat lambda_1 = lambda[id + 1 * offset + l * loffset]; dfloat lambda_0 = s_lambda0[l][j][i]; r_q += 2.0 * grs * lambda_0 * s_D[i][i] * s_D[j][j]; @@ -103,12 +101,12 @@ r_q += s_Gtt[m] * s_lambdat[l][m] * s_D[m][k] * s_D[m][k]; } +#ifndef p_poisson + dfloat lambda_1 = lambda[id + 1 * offset + l * loffset]; r_q += gwJ * lambda_1; +#endif - if(allNeumann) - r_q += allNeumannScale; - } - Aq[id + l * offset] = (pfloat) r_q; + Aq[id + l * offset] = (pfloat) r_q; } } } diff --git a/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.c b/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.c new file mode 100644 index 000000000..678a0067a --- /dev/null +++ b/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.c @@ -0,0 +1,144 @@ +extern "C" void FUNC(ellipticBlockPartialAxCoeffHex3D_N3)(const dlong & Nelements, + const dlong & offset, + const dlong & loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ ggeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq ) +{ + dfloat s_q[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqr[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqs[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqt[3][p_Nq][p_Nq][p_Nq]; + + dfloat s_D[p_Nq][p_Nq]; + dfloat s_S[p_Nq][p_Nq]; + + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + s_D[j][i] = D[j * p_Nq + i]; + s_S[j][i] = S[j * p_Nq + i]; + } + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) +#endif + for(dlong e = 0; e < Nelements; ++e) { + const dlong element = elementList[e]; + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + s_q[0][k][j][i] = q[base + 0 * offset]; + s_q[1][k][j][i] = q[base + 1 * offset]; + s_q[2][k][j][i] = q[base + 2 * offset]; + } + + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_lam00 = lambda[id + 0 * offset + 0 * loffset]; + const dfloat r_lam10 = lambda[id + 0 * offset + 1 * loffset]; + const dfloat r_lam20 = lambda[id + 0 * offset + 2 * loffset]; + + dfloat qr0 = 0.f, qr1 = 0.f, qr2 = 0.f; + dfloat qs0 = 0.f, qs1 = 0.f, qs2 = 0.f; + dfloat qt0 = 0.f, qt1 = 0.f, qt2 = 0.f; + + for(int m = 0; m < p_Nq; m++) { + qr0 += s_S[m][i] * s_q[0][k][j][m]; + qs0 += s_S[m][j] * s_q[0][k][m][i]; + qt0 += s_S[m][k] * s_q[0][m][j][i]; + // + qr1 += s_S[m][i] * s_q[1][k][j][m]; + qs1 += s_S[m][j] * s_q[1][k][m][i]; + qt1 += s_S[m][k] * s_q[1][m][j][i]; + + qr2 += s_S[m][i] * s_q[2][k][j][m]; + qs2 += s_S[m][j] * s_q[2][k][m][i]; + qt2 += s_S[m][k] * s_q[2][m][j][i]; + } + + dfloat Gqr0 = r_G00 * qr0 + r_G01 * qs0 + r_G02 * qt0; + dfloat Gqs0 = r_G01 * qr0 + r_G11 * qs0 + r_G12 * qt0; + dfloat Gqt0 = r_G02 * qr0 + r_G12 * qs0 + r_G22 * qt0; + + dfloat Gqr1 = r_G00 * qr1 + r_G01 * qs1 + r_G02 * qt1; + dfloat Gqs1 = r_G01 * qr1 + r_G11 * qs1 + r_G12 * qt1; + dfloat Gqt1 = r_G02 * qr1 + r_G12 * qs1 + r_G22 * qt1; + + dfloat Gqr2 = r_G00 * qr2 + r_G01 * qs2 + r_G02 * qt2; + dfloat Gqs2 = r_G01 * qr2 + r_G11 * qs2 + r_G12 * qt2; + dfloat Gqt2 = r_G02 * qr2 + r_G12 * qs2 + r_G22 * qt2; + + s_Gqr[0][k][j][i] = r_lam00 * Gqr0; + s_Gqs[0][k][j][i] = r_lam00 * Gqs0; + s_Gqt[0][k][j][i] = r_lam00 * Gqt0; + + s_Gqr[1][k][j][i] = r_lam10 * Gqr1; + s_Gqs[1][k][j][i] = r_lam10 * Gqs1; + s_Gqt[1][k][j][i] = r_lam10 * Gqt1; + + s_Gqr[2][k][j][i] = r_lam20 * Gqr2; + s_Gqs[2][k][j][i] = r_lam20 * Gqs2; + s_Gqt[2][k][j][i] = r_lam20 * Gqt2; + } + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_lam01 = lambda[id + 1 * offset + 0 * loffset]; + const dfloat r_lam11 = lambda[id + 1 * offset + 1 * loffset]; + const dfloat r_lam21 = lambda[id + 1 * offset + 2 * loffset]; + + dfloat r_Aq0 = r_GwJ * r_lam01 * s_q[0][k][j][i]; + dfloat r_Aq1 = r_GwJ * r_lam11 * s_q[1][k][j][i]; + dfloat r_Aq2 = r_GwJ * r_lam21 * s_q[2][k][j][i]; + + dfloat r_Aqr0 = 0.f, r_Aqs0 = 0.f, r_Aqt0 = 0.f; + dfloat r_Aqr1 = 0.f, r_Aqs1 = 0.f, r_Aqt1 = 0.f; + dfloat r_Aqr2 = 0.f, r_Aqs2 = 0.f, r_Aqt2 = 0.f; + + for(int m = 0; m < p_Nq; m++) { + r_Aqr0 += s_D[m][i] * s_Gqr[0][k][j][m]; + r_Aqr1 += s_D[m][i] * s_Gqr[1][k][j][m]; + r_Aqr2 += s_D[m][i] * s_Gqr[2][k][j][m]; + } + + for(int m = 0; m < p_Nq; m++) { + r_Aqs0 += s_D[m][j] * s_Gqs[0][k][m][i]; + r_Aqs1 += s_D[m][j] * s_Gqs[1][k][m][i]; + r_Aqs2 += s_D[m][j] * s_Gqs[2][k][m][i]; + } + + for(int m = 0; m < p_Nq; m++) { + r_Aqt0 += s_D[m][k] * s_Gqt[0][m][j][i]; + r_Aqt1 += s_D[m][k] * s_Gqt[1][m][j][i]; + r_Aqt2 += s_D[m][k] * s_Gqt[2][m][j][i]; + } + + Aq[id + 0 * offset] = r_Aqr0 + r_Aqs0 + r_Aqt0 + r_Aq0; + Aq[id + 1 * offset] = r_Aqr1 + r_Aqs1 + r_Aqt1 + r_Aq1; + Aq[id + 2 * offset] = r_Aqr2 + r_Aqs2 + r_Aqt2 + r_Aq2; + } + } +} diff --git a/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.okl b/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.okl new file mode 100644 index 000000000..60dede57d --- /dev/null +++ b/okl/elliptic/ellipticBlockPartialAxCoeffHex3D_N3.okl @@ -0,0 +1,177 @@ +@kernel void ellipticBlockPartialAxCoeffHex3D_N3(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* ggeo, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_D[p_Nq][p_Nq]; + + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + + @shared dfloat s_GUr[p_Nq][p_Nq]; + @shared dfloat s_GUs[p_Nq][p_Nq]; + + @shared dfloat s_GVr[p_Nq][p_Nq]; + @shared dfloat s_GVs[p_Nq][p_Nq]; + + @shared dfloat s_GWr[p_Nq][p_Nq]; + @shared dfloat s_GWs[p_Nq][p_Nq]; + + @exclusive dfloat r_Ut, r_Vt, r_Wt; + + @exclusive dlong element; + @exclusive dfloat r_U[p_Nq], r_V[p_Nq], r_W[p_Nq]; + @exclusive dfloat r_AU[p_Nq], r_AV[p_Nq], r_AW[p_Nq]; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + element = elementList[e]; + s_D[j][i] = D[p_Nq * j + i]; // D is column major + + const dlong base = i + j * p_Nq + element * p_Np; + + for(int k = 0; k < p_Nq; k++) { + r_U[k] = q[base + k * p_Nq * p_Nq + 0 * offset]; + r_V[k] = q[base + k * p_Nq * p_Nq + 1 * offset]; + r_W[k] = q[base + k * p_Nq * p_Nq + 2 * offset]; + + r_AU[k] = 0.f; + r_AV[k] = 0.f; + r_AW[k] = 0.f; + } + } + +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_U[j][i] = r_U[k]; + s_V[j][i] = r_V[k]; + s_W[j][i] = r_W[k]; + + r_Ut = 0; + r_Vt = 0; + r_Wt = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dkm = s_D[k][m]; + r_Ut += Dkm * r_U[m]; + r_Vt += Dkm * r_V[m]; + r_Wt += Dkm * r_W[m]; + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat Ur = 0.f, Us = 0.f; + dfloat Vr = 0.f, Vs = 0.f; + dfloat Wr = 0.f, Ws = 0.f; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dim = s_D[i][m]; + dfloat Djm = s_D[j][m]; + Ur += Dim * s_U[j][m]; + Us += Djm * s_U[m][i]; + Vr += Dim * s_V[j][m]; + Vs += Djm * s_V[m][i]; + Wr += Dim * s_W[j][m]; + Ws += Djm * s_W[m][i]; + } + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; + const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; + const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; + const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; + const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; + const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; + + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; + const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; + const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; + + s_GUr[j][i] = u_lam0 * (G00 * Ur + G01 * Us + G02 * r_Ut); + s_GUs[j][i] = u_lam0 * (G01 * Ur + G11 * Us + G12 * r_Ut); + r_Ut = u_lam0 * (G02 * Ur + G12 * Us + G22 * r_Ut); + + s_GVr[j][i] = v_lam0 * (G00 * Vr + G01 * Vs + G02 * r_Vt); + s_GVs[j][i] = v_lam0 * (G01 * Vr + G11 * Vs + G12 * r_Vt); + r_Vt = v_lam0 * (G02 * Vr + G12 * Vs + G22 * r_Vt); + + s_GWr[j][i] = w_lam0 * (G00 * Wr + G01 * Ws + G02 * r_Wt); + s_GWs[j][i] = w_lam0 * (G01 * Wr + G11 * Ws + G12 * r_Wt); + r_Wt = w_lam0 * (G02 * Wr + G12 * Ws + G22 * r_Wt); + + r_AU[k] += GwJ * u_lam1 * r_U[k]; + r_AV[k] += GwJ * v_lam1 * r_V[k]; + r_AW[k] += GwJ * w_lam1 * r_W[k]; + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dmi = s_D[m][i]; + dfloat Dmj = s_D[m][j]; + dfloat Dkm = s_D[k][m]; + + AUtmp += Dmi * s_GUr[j][m]; + AUtmp += Dmj * s_GUs[m][i]; + + AVtmp += Dmi * s_GVr[j][m]; + AVtmp += Dmj * s_GVs[m][i]; + + AWtmp += Dmi * s_GWr[j][m]; + AWtmp += Dmj * s_GWs[m][i]; + + r_AU[m] += Dkm * r_Ut; + r_AV[m] += Dkm * r_Vt; + r_AW[m] += Dkm * r_Wt; + } + + r_AU[k] += AUtmp; + r_AV[k] += AVtmp; + r_AW[k] += AWtmp; + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id + 0 * offset] = r_AU[k]; + Aq[id + 1 * offset] = r_AV[k]; + Aq[id + 2 * offset] = r_AW[k]; + } + } + } + } +} diff --git a/okl/elliptic/ellipticBlockPartialAxHex3D_N3.c b/okl/elliptic/ellipticBlockPartialAxHex3D_N3.c new file mode 100644 index 000000000..1f7d4355b --- /dev/null +++ b/okl/elliptic/ellipticBlockPartialAxHex3D_N3.c @@ -0,0 +1,145 @@ +extern "C" +void FUNC(ellipticBlockPartialAxHex3D_N3)(const dlong & Nelements, + const dlong & offset, + const dlong & loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ ggeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq ) +{ + dfloat s_q[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqr[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqs[3][p_Nq][p_Nq][p_Nq]; + dfloat s_Gqt[3][p_Nq][p_Nq][p_Nq]; + + dfloat s_D[p_Nq][p_Nq]; + dfloat s_S[p_Nq][p_Nq]; + + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + s_D[j][i] = D[j * p_Nq + i]; + s_S[j][i] = S[j * p_Nq + i]; + } + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) +#endif + for(dlong e = 0; e < Nelements; ++e) { + const dlong element = elementList[e]; + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + s_q[0][k][j][i] = q[base + 0 * offset]; + s_q[1][k][j][i] = q[base + 1 * offset]; + s_q[2][k][j][i] = q[base + 2 * offset]; + } + + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_lam00 = lambda[0 * loffset]; + const dfloat r_lam10 = lambda[1 * loffset]; + const dfloat r_lam20 = lambda[2 * loffset]; + + dfloat qr0 = 0.f, qr1 = 0.f, qr2 = 0.f; + dfloat qs0 = 0.f, qs1 = 0.f, qs2 = 0.f; + dfloat qt0 = 0.f, qt1 = 0.f, qt2 = 0.f; + + for(int m = 0; m < p_Nq; m++) { + qr0 += s_S[m][i] * s_q[0][k][j][m]; + qs0 += s_S[m][j] * s_q[0][k][m][i]; + qt0 += s_S[m][k] * s_q[0][m][j][i]; + // + qr1 += s_S[m][i] * s_q[1][k][j][m]; + qs1 += s_S[m][j] * s_q[1][k][m][i]; + qt1 += s_S[m][k] * s_q[1][m][j][i]; + + qr2 += s_S[m][i] * s_q[2][k][j][m]; + qs2 += s_S[m][j] * s_q[2][k][m][i]; + qt2 += s_S[m][k] * s_q[2][m][j][i]; + } + + dfloat Gqr0 = r_G00 * qr0 + r_G01 * qs0 + r_G02 * qt0; + dfloat Gqs0 = r_G01 * qr0 + r_G11 * qs0 + r_G12 * qt0; + dfloat Gqt0 = r_G02 * qr0 + r_G12 * qs0 + r_G22 * qt0; + + dfloat Gqr1 = r_G00 * qr1 + r_G01 * qs1 + r_G02 * qt1; + dfloat Gqs1 = r_G01 * qr1 + r_G11 * qs1 + r_G12 * qt1; + dfloat Gqt1 = r_G02 * qr1 + r_G12 * qs1 + r_G22 * qt1; + + dfloat Gqr2 = r_G00 * qr2 + r_G01 * qs2 + r_G02 * qt2; + dfloat Gqs2 = r_G01 * qr2 + r_G11 * qs2 + r_G12 * qt2; + dfloat Gqt2 = r_G02 * qr2 + r_G12 * qs2 + r_G22 * qt2; + + s_Gqr[0][k][j][i] = r_lam00 * Gqr0; + s_Gqs[0][k][j][i] = r_lam00 * Gqs0; + s_Gqt[0][k][j][i] = r_lam00 * Gqt0; + + s_Gqr[1][k][j][i] = r_lam10 * Gqr1; + s_Gqs[1][k][j][i] = r_lam10 * Gqs1; + s_Gqt[1][k][j][i] = r_lam10 * Gqt1; + + s_Gqr[2][k][j][i] = r_lam20 * Gqr2; + s_Gqs[2][k][j][i] = r_lam20 * Gqs2; + s_Gqt[2][k][j][i] = r_lam20 * Gqt2; + } + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_lam01 = lambda[id + 1 * offset + 0 * loffset]; + const dfloat r_lam11 = lambda[id + 1 * offset + 1 * loffset]; + const dfloat r_lam21 = lambda[id + 1 * offset + 2 * loffset]; + + dfloat r_Aq0 = r_GwJ * r_lam01 * s_q[0][k][j][i]; + dfloat r_Aq1 = r_GwJ * r_lam11 * s_q[1][k][j][i]; + dfloat r_Aq2 = r_GwJ * r_lam21 * s_q[2][k][j][i]; + + dfloat r_Aqr0 = 0.f, r_Aqs0 = 0.f, r_Aqt0 = 0.f; + dfloat r_Aqr1 = 0.f, r_Aqs1 = 0.f, r_Aqt1 = 0.f; + dfloat r_Aqr2 = 0.f, r_Aqs2 = 0.f, r_Aqt2 = 0.f; + + for(int m = 0; m < p_Nq; m++) { + r_Aqr0 += s_D[m][i] * s_Gqr[0][k][j][m]; + r_Aqr1 += s_D[m][i] * s_Gqr[1][k][j][m]; + r_Aqr2 += s_D[m][i] * s_Gqr[2][k][j][m]; + } + + for(int m = 0; m < p_Nq; m++) { + r_Aqs0 += s_D[m][j] * s_Gqs[0][k][m][i]; + r_Aqs1 += s_D[m][j] * s_Gqs[1][k][m][i]; + r_Aqs2 += s_D[m][j] * s_Gqs[2][k][m][i]; + } + + for(int m = 0; m < p_Nq; m++) { + r_Aqt0 += s_D[m][k] * s_Gqt[0][m][j][i]; + r_Aqt1 += s_D[m][k] * s_Gqt[1][m][j][i]; + r_Aqt2 += s_D[m][k] * s_Gqt[2][m][j][i]; + } + + Aq[id + 0 * offset] = r_Aqr0 + r_Aqs0 + r_Aqt0 + r_Aq0; + Aq[id + 1 * offset] = r_Aqr1 + r_Aqs1 + r_Aqt1 + r_Aq1; + Aq[id + 2 * offset] = r_Aqr2 + r_Aqs2 + r_Aqt2 + r_Aq2; + } + } +} diff --git a/okl/elliptic/ellipticBlockPartialAxHex3D_N3.okl b/okl/elliptic/ellipticBlockPartialAxHex3D_N3.okl new file mode 100644 index 000000000..6998fbe51 --- /dev/null +++ b/okl/elliptic/ellipticBlockPartialAxHex3D_N3.okl @@ -0,0 +1,180 @@ +@kernel void ellipticBlockPartialAxHex3D_N3(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* ggeo, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_D[p_Nq][p_Nq]; + + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + + @shared dfloat s_GUr[p_Nq][p_Nq]; + @shared dfloat s_GUs[p_Nq][p_Nq]; + + @shared dfloat s_GVr[p_Nq][p_Nq]; + @shared dfloat s_GVs[p_Nq][p_Nq]; + + @shared dfloat s_GWr[p_Nq][p_Nq]; + @shared dfloat s_GWs[p_Nq][p_Nq]; + + @exclusive dfloat r_Ut, r_Vt, r_Wt; + + @exclusive dlong element; + @exclusive dfloat r_U[p_Nq], r_V[p_Nq], r_W[p_Nq]; + @exclusive dfloat r_AU[p_Nq], r_AV[p_Nq], r_AW[p_Nq]; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + element = elementList[e]; + s_D[j][i] = D[p_Nq * j + i]; // D is column major + + const dlong base = i + j * p_Nq + element * p_Np; + + for(int k = 0; k < p_Nq; k++) { + // + r_U[k] = q[base + k * p_Nq * p_Nq + 0 * offset]; + r_V[k] = q[base + k * p_Nq * p_Nq + 1 * offset]; + r_W[k] = q[base + k * p_Nq * p_Nq + 2 * offset]; + // + r_AU[k] = 0.f; + r_AV[k] = 0.f; + r_AW[k] = 0.f; + } + } + +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_U[j][i] = r_U[k]; + s_V[j][i] = r_V[k]; + s_W[j][i] = r_W[k]; + + r_Ut = 0; + r_Vt = 0; + r_Wt = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dkm = s_D[k][m]; + r_Ut += Dkm * r_U[m]; + r_Vt += Dkm * r_V[m]; + r_Wt += Dkm * r_W[m]; + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat Ur = 0.f, Us = 0.f; + dfloat Vr = 0.f, Vs = 0.f; + dfloat Wr = 0.f, Ws = 0.f; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dim = s_D[i][m]; + dfloat Djm = s_D[j][m]; + + Ur += Dim * s_U[j][m]; + Us += Djm * s_U[m][i]; + + Vr += Dim * s_V[j][m]; + Vs += Djm * s_V[m][i]; + + Wr += Dim * s_W[j][m]; + Ws += Djm * s_W[m][i]; + } + + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; + + const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; + + const dfloat GwJ = ggeo[gbase + p_GWJID * p_Np]; + const dfloat r_lam00 = lambda[0 * loffset]; + const dfloat r_lam10 = lambda[1 * loffset]; + const dfloat r_lam20 = lambda[2 * loffset]; + + s_GUr[j][i] = r_lam00 * (G00 * Ur + G01 * Us + G02 * r_Ut); + s_GUs[j][i] = r_lam00 * (G01 * Ur + G11 * Us + G12 * r_Ut); + r_Ut = r_lam00 * (G02 * Ur + G12 * Us + G22 * r_Ut); + + s_GVr[j][i] = r_lam10 * (G00 * Vr + G01 * Vs + G02 * r_Vt); + s_GVs[j][i] = r_lam10 * (G01 * Vr + G11 * Vs + G12 * r_Vt); + r_Vt = r_lam10 * (G02 * Vr + G12 * Vs + G22 * r_Vt); + + s_GWr[j][i] = r_lam20 * (G00 * Wr + G01 * Ws + G02 * r_Wt); + s_GWs[j][i] = r_lam20 * (G01 * Wr + G11 * Ws + G12 * r_Wt); + r_Wt = r_lam20 * (G02 * Wr + G12 * Ws + G22 * r_Wt); + + r_AU[k] += GwJ * lambda[0 * loffset] * r_U[k]; + r_AV[k] += GwJ * lambda[1 * loffset] * r_V[k]; + r_AW[k] += GwJ * lambda[2 * loffset] * r_W[k]; + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat AUtmp = 0, AVtmp = 0, AWtmp = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + dfloat Dmi = s_D[m][i]; + dfloat Dmj = s_D[m][j]; + dfloat Dkm = s_D[k][m]; + + AUtmp += Dmi * s_GUr[j][m]; + AUtmp += Dmj * s_GUs[m][i]; + + AVtmp += Dmi * s_GVr[j][m]; + AVtmp += Dmj * s_GVs[m][i]; + + AWtmp += Dmi * s_GWr[j][m]; + AWtmp += Dmj * s_GWs[m][i]; + + r_AU[m] += Dkm * r_Ut; + r_AV[m] += Dkm * r_Vt; + r_AW[m] += Dkm * r_Wt; + } + + r_AU[k] += AUtmp; + r_AV[k] += AVtmp; + r_AW[k] += AWtmp; + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id + 0 * offset] = r_AU[k]; + Aq[id + 1 * offset] = r_AV[k]; + Aq[id + 2 * offset] = r_AW[k]; + } + } + } + } +} diff --git a/okl/elliptic/ellipticSerialUpdatePCG.c b/okl/elliptic/ellipticBlockUpdatePCG.c similarity index 92% rename from okl/elliptic/ellipticSerialUpdatePCG.c rename to okl/elliptic/ellipticBlockUpdatePCG.c index e1e573973..7e5249e14 100644 --- a/okl/elliptic/ellipticSerialUpdatePCG.c +++ b/okl/elliptic/ellipticBlockUpdatePCG.c @@ -24,14 +24,12 @@ */ -extern "C" -void FUNC(ellipticBlockUpdatePCG)(const dlong & N, +extern "C" void FUNC(ellipticBlockUpdatePCG)(const dlong & N, const dlong & offset, const dfloat* __restrict__ cpu_invDegree, const dfloat* __restrict__ cpu_p, const dfloat* __restrict__ cpu_Ap, const dfloat & alpha, - dfloat* __restrict__ cpu_x, dfloat* __restrict__ cpu_r, dfloat* __restrict__ cpu_rdotr) { @@ -43,7 +41,6 @@ void FUNC(ellipticBlockUpdatePCG)(const dlong & N, for(int fld = 0; fld < p_Nfields; fld++) for(int i = 0; i < N; ++i) { const dlong n = i + fld * offset; - cpu_x[n] += alpha * cpu_p[n]; const dfloat rn = cpu_r[n] - alpha * cpu_Ap[n]; rdotr += rn * rn * cpu_invDegree[i]; diff --git a/okl/elliptic/ellipticUpdatePCG.okl b/okl/elliptic/ellipticBlockUpdatePCG.okl similarity index 95% rename from okl/elliptic/ellipticUpdatePCG.okl rename to okl/elliptic/ellipticBlockUpdatePCG.okl index 88f60b89a..e99be42cd 100644 --- a/okl/elliptic/ellipticUpdatePCG.okl +++ b/okl/elliptic/ellipticBlockUpdatePCG.okl @@ -29,7 +29,6 @@ @restrict const dfloat* p, @restrict const dfloat* Ap, const dfloat alpha, - @restrict dfloat* x, @restrict dfloat* r, @restrict dfloat* redr) { @@ -43,17 +42,14 @@ dfloat sum = 0.0; #pragma unroll for(int fld = 0; fld < p_Nfields; fld++) { - dfloat xn = x[n + fld * offset]; dfloat rn = r[n + fld * offset]; const dfloat pn = p[n + fld * offset]; const dfloat Apn = Ap[n + fld * offset]; - xn += alpha * pn; rn -= alpha * Apn; sum += rn * rn; - x[n + fld * offset] = xn; r[n + fld * offset] = rn; } s_sum[t] = sum*invDegree[n]; diff --git a/okl/elliptic/ellipticPartialAxCoeffHex3D.c b/okl/elliptic/ellipticPartialAxCoeffHex3D.c new file mode 100644 index 000000000..f6a7d8d5d --- /dev/null +++ b/okl/elliptic/ellipticPartialAxCoeffHex3D.c @@ -0,0 +1,95 @@ +extern "C" void FUNC(ellipticPartialAxCoeffHex3D)(const dlong & Nelements, + const dlong & offset, + const dlong & loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ ggeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq ) +{ + dfloat s_q[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqr[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqs[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqt[p_Nq][p_Nq][p_Nq]; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) +#endif + for(dlong e = 0; e < Nelements; ++e) { + const dlong element = elementList[e]; + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + const dfloat qbase = q[base]; + s_q[k][j][i] = qbase; + } + + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_lam0 = lambda[id + 0 * offset]; + + dfloat qr = 0.f; + dfloat qs = 0.f; + dfloat qt = 0.f; + + for(int m = 0; m < p_Nq; m++){ + qr += S[m*p_Nq + i] * s_q[k][j][m]; + qs += S[m*p_Nq + j] * s_q[k][m][i]; + qt += S[m*p_Nq + k] * s_q[m][j][i]; + } + + dfloat Gqr = r_G00 * qr; + Gqr += r_G01 * qs; + Gqr += r_G02 * qt; + + dfloat Gqs = r_G01 * qr; + Gqs += r_G11 * qs; + Gqs += r_G12 * qt; + + dfloat Gqt = r_G02 * qr; + Gqt += r_G12 * qs; + Gqt += r_G22 * qt; + + s_Gqr[k][j][i] = r_lam0 * Gqr; + s_Gqs[k][j][i] = r_lam0 * Gqs; + s_Gqt[k][j][i] = r_lam0 * Gqt; + } + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + dfloat r_Aq = 0; +#ifndef p_poisson + const dfloat r_lam1 = lambda[id + 1 * offset]; + r_Aq = ggeo[gbase + p_GWJID * p_Np] * r_lam1 * s_q[k][j][i]; +#endif + dfloat r_Aqr = 0, r_Aqs = 0, r_Aqt = 0; + + for(int m = 0; m < p_Nq; m++){ + r_Aqr += D[m*p_Nq+i] * s_Gqr[k][j][m]; + r_Aqs += D[m*p_Nq+j] * s_Gqs[k][m][i]; + r_Aqt += D[m*p_Nq+k] * s_Gqt[m][j][i]; + } + + Aq[id] = r_Aqr + r_Aqs + r_Aqt + r_Aq; + } + } +} diff --git a/okl/elliptic/ellipticPartialAxCoeffHex3D.okl b/okl/elliptic/ellipticPartialAxCoeffHex3D.okl new file mode 100644 index 000000000..13892b7ed --- /dev/null +++ b/okl/elliptic/ellipticPartialAxCoeffHex3D.okl @@ -0,0 +1,143 @@ +@kernel void ellipticPartialAxCoeffHex3D(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* ggeo, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { +#if (p_Nq % 2 == 0) + @shared dfloat s_D[p_Nq][p_Nq+1]; +#else + @shared dfloat s_D[p_Nq][p_Nq]; +#endif + @shared dfloat s_q[p_Nq][p_Nq]; + + @shared dfloat s_Gqr[p_Nq][p_Nq]; + @shared dfloat s_Gqs[p_Nq][p_Nq]; + + @exclusive dfloat r_qt, r_Gqt, r_Auk; + @exclusive dfloat r_q[p_Nq]; + @exclusive dfloat r_Aq[p_Nq]; + + @exclusive dlong element; + + @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + @exclusive dfloat r_lam0, r_lam1; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_D[j][i] = D[p_Nq * j + i]; + element = elementList[e]; + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + const dlong base = i + j * p_Nq + element * p_Np; + r_q[k] = q[base + k * p_Nq * p_Nq]; + r_Aq[k] = 0; + } + } + } + + @barrier("local"); + +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + r_G00 = ggeo[gbase + p_G00ID * p_Np]; + r_G01 = ggeo[gbase + p_G01ID * p_Np]; + r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + r_G11 = ggeo[gbase + p_G11ID * p_Np]; + r_G12 = ggeo[gbase + p_G12ID * p_Np]; + r_G22 = ggeo[gbase + p_G22ID * p_Np]; + + r_GwJ = ggeo[gbase + p_GWJID * p_Np]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + r_lam0 = lambda[id + 0 * offset]; + +#ifdef p_poisson + r_lam1 = 0; +#else + r_lam1 = lambda[id + 1 * offset]; +#endif + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_q[j][i] = r_q[k]; + + r_qt = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) + r_qt += s_D[k][m] * r_q[m]; + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat qr = 0; + dfloat qs = 0; + +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + qr += s_D[i][m] * s_q[j][m]; + qs += s_D[j][m] * s_q[m][i]; + } + + s_Gqs[j][i] = r_lam0 * (r_G01 * qr + r_G11 * qs + r_G12 * r_qt); + s_Gqr[j][i] = r_lam0 * (r_G00 * qr + r_G01 * qs + r_G02 * r_qt); + + r_Gqt = r_lam0 * (r_G02 * qr + r_G12 * qs + r_G22 * r_qt); + r_Auk = r_lam1 * r_GwJ * r_q[k]; + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_Gqs[m][i]; + r_Aq[m] += s_D[k][m] * r_Gqt; + r_Auk += s_D[m][i] * s_Gqr[j][m]; + } + + r_Aq[k] += r_Auk; + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id] = r_Aq[k]; + } + } + } + } +} diff --git a/okl/elliptic/ellipticPartialAxHex3D.c b/okl/elliptic/ellipticPartialAxHex3D.c new file mode 100644 index 000000000..babaa907e --- /dev/null +++ b/okl/elliptic/ellipticPartialAxHex3D.c @@ -0,0 +1,125 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +extern "C" void FUNC(ellipticPartialAxHex3D)(const dlong & Nelements, + const dlong & offset, + const dlong & loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ ggeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq ) +{ + dfloat s_q[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqr[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqs[p_Nq][p_Nq][p_Nq]; + dfloat s_Gqt[p_Nq][p_Nq][p_Nq]; + + dfloat s_D[p_Nq][p_Nq]; + dfloat s_S[p_Nq][p_Nq]; + + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + s_D[j][i] = D[j * p_Nq + i]; + s_S[j][i] = S[j * p_Nq + i]; + } + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) +#endif + for(dlong e = 0; e < Nelements; ++e) { + const dlong element = elementList[e]; + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + s_q[k][j][i] = q[base]; + } + + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + + dfloat qr = 0.f; + dfloat qs = 0.f; + dfloat qt = 0.f; + + for(int m = 0; m < p_Nq; m++) { + qr += s_D[i][m] * s_q[k][j][m]; + qs += s_D[j][m] * s_q[k][m][i]; + qt += s_D[k][m] * s_q[m][j][i]; + } + + dfloat Gqr = r_G00 * qr; + Gqr += r_G01 * qs; + Gqr += r_G02 * qt; + + dfloat Gqs = r_G01 * qr; + Gqs += r_G11 * qs; + Gqs += r_G12 * qt; + + dfloat Gqt = r_G02 * qr; + Gqt += r_G12 * qs; + Gqt += r_G22 * qt; + + s_Gqr[k][j][i] = Gqr; + s_Gqs[k][j][i] = Gqs; + s_Gqt[k][j][i] = Gqt; + } + + for(int k = 0; k < p_Nq; k++) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + dfloat r_Aq = 0; +#ifndef p_poisson + r_Aq = ggeo[gbase + p_GWJID * p_Np] * lambda[1*loffset] * s_q[k][j][i]; +#endif + dfloat r_Aqr = 0, r_Aqs = 0, r_Aqt = 0; + + for(int m = 0; m < p_Nq; m++) + r_Aqr += s_S[i][m] * s_Gqr[k][j][m]; + for(int m = 0; m < p_Nq; m++) + r_Aqs += s_S[j][m] * s_Gqs[k][m][i]; + for(int m = 0; m < p_Nq; m++) + r_Aqt += s_S[k][m] * s_Gqt[m][j][i]; + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id] = lambda[0*loffset]*(r_Aqr + r_Aqs + r_Aqt) + r_Aq; + } + } +} diff --git a/okl/elliptic/ellipticPartialAxHex3D.okl b/okl/elliptic/ellipticPartialAxHex3D.okl new file mode 100644 index 000000000..fece0b1c8 --- /dev/null +++ b/okl/elliptic/ellipticPartialAxHex3D.okl @@ -0,0 +1,1426 @@ +#if p_knl == 0 +// original kernel +@kernel void ellipticPartialAxHex3D_v0(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + for (dlong e = 0; e < Nelements; ++e; @outer(0)) { + +#if defined(FP32) && defined(gfxXX) + @shared dfloat s_D[p_Nq][p_Nq]; +#elif (p_Nq % 2 == 0) + @shared dfloat s_D[p_Nq][p_Nq + 1]; +#else + @shared dfloat s_D[p_Nq][p_Nq]; +#endif + @shared dfloat s_q[p_Nq][p_Nq]; + + @shared dfloat s_Gqr[p_Nq][p_Nq]; + @shared dfloat s_Gqs[p_Nq][p_Nq]; + + @exclusive dfloat r_qt, r_Gqt, r_Auk; + @exclusive dfloat r_q[p_Nq]; + @exclusive dfloat r_Aq[p_Nq]; + + @exclusive dlong element; + + @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + + for (int j = 0; j < p_Nq; ++j; @inner(1)) + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + s_D[j][i] = D[p_Nq * j + i]; + element = elementList[e]; + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong base = i + j * p_Nq + element * p_Np; + r_q[k] = q[base + k * p_Nq * p_Nq]; + r_Aq[k] = 0; + } + } + } + + @barrier("local"); + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + @barrier("local"); + for (int j = 0; j < p_Nq; ++j; @inner(1)) + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + r_G00 = ggeo[gbase + p_G00ID * p_Np]; + r_G01 = ggeo[gbase + p_G01ID * p_Np]; + r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + r_G11 = ggeo[gbase + p_G11ID * p_Np]; + r_G12 = ggeo[gbase + p_G12ID * p_Np]; + r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + r_GwJ = 0; +#else + r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + s_q[j][i] = r_q[k]; + + r_qt = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) + r_qt += s_D[k][m] * r_q[m]; + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat qr = 0; + dfloat qs = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + qr += s_D[i][m] * s_q[j][m]; + qs += s_D[j][m] * s_q[m][i]; + } + + s_Gqs[j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt); + s_Gqr[j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt); + + const dfloat lambda0 = lambda[0 * loffset]; + + r_Gqt = lambda0 * (r_G02 * qr + r_G12 * qs + r_G22 * r_qt); +#ifdef p_poisson + r_Auk = 0.0; +#else + const dfloat lambda1 = lambda[1 * loffset]; + r_Auk = r_GwJ * lambda1 * r_q[k]; +#endif + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_Gqs[m][i]; + r_Aq[m] += s_D[k][m] * r_Gqt; + r_Auk += s_D[m][i] * s_Gqr[j][m]; + } + + r_Aq[k] += r_Auk; + } + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id] = r_Aq[k]; + } + } + } + } +} + +#endif + +// padding for bank conflicts +#if p_Nq == 16 +#define p_pad 1 +#else +#define p_pad 0 +#endif + +#if p_Nq == 16 || p_Nq == 14 || p_Nq == 12 || p_Nq == 10 || p_Nq == 8 || p_Nq == 6 || p_Nq == 4 +#define p_pad 0 +#else +#define p_pad 1 +#endif + +#if p_knl == 1 +@kernel void ellipticPartialAxHex3D_v1(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + + for (dlong e = 0; e < Nelements; e++; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_Nq][p_Nq + p_pad]; + @shared dfloat s_v[p_Nq][p_Nq + p_pad]; + @shared dfloat s_w[p_Nq][p_Nq + p_pad]; + + @exclusive dfloat r_GDut, r_Auk; + + // register array to hold u(i,j,0:N) private to thread + @exclusive dfloat r_u[p_Nq]; + // array for results Au(i,j,0:N) + @exclusive dfloat r_Au[p_Nq]; + + @exclusive dlong element; + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load D into local memory + // s_D[i][j] = d \phi_i at node j + s_D[j][i] = D[p_Nq * j + i]; // D is column major + + element = elementList[e]; + + const dlong base = i + j * p_Nq + element * p_Np; + +// load pencil of u into register +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = base + k * p_Nq * p_Nq; + r_u[k] = (id != -1) ? q[id] : 0.0; + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + r_Au[k] = 0.0; + } + } + } + + // Layer by layer +#ifdef smXX +// only force some type of unrolling in CUDA mode +#pragma unroll p_Nq +#endif + for (int k = 0; k < p_Nq; k++) { + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // share u(:,:,k) + s_q[j][i] = r_u[k]; + } + } + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // prefetch geometric factors + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + const dfloat r_GwJ = 0; +#else + const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + + dfloat ur = 0.f; + dfloat us = 0.f; + dfloat ut = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ut += s_D[k][m] * r_u[m]; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ur += s_D[i][m] * s_q[j][m]; + us += s_D[j][m] * s_q[m][i]; + } + + const dfloat lambda0 = lambda[0 * loffset]; + + s_w[j][i] = (r_G01 * ur + r_G11 * us + r_G12 * ut); + s_v[j][i] = (r_G00 * ur + r_G01 * us + r_G02 * ut); + r_GDut = lambda0 * (r_G02 * ur + r_G12 * us + r_G22 * ut); +#ifdef p_poisson + r_Auk = 0.0; +#else + r_Auk = r_GwJ * lambda[1 * loffset] * r_u[k]; +#endif + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Au[m] += s_D[k][m] * r_GDut; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_w[m][i]; + r_Auk += s_D[m][i] * s_v[j][m]; + } + + r_Au[k] += r_Auk; + } + } + } // end Layer by layer + + @barrier("local"); + + // write out + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = element * p_Np + j * p_Nq + i; + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + Aq[id + k * p_Nq * p_Nq] = r_Au[k]; + } + } + } + } +} +#endif + +#if defined(FP32) +#if p_Nq == 2 +#define p_NelementsPerBlk 63 +#elif p_Nq == 3 +#define p_NelementsPerBlk 27 +#elif p_Nq == 4 +#define p_NelementsPerBlk 15 +#elif p_Nq == 5 +#define p_NelementsPerBlk 9 +#elif p_Nq == 6 +#define p_NelementsPerBlk 7 +#elif p_Nq == 7 +#define p_NelementsPerBlk 5 +#elif p_Nq == 8 +#define p_NelementsPerBlk 5 +#elif p_Nq == 9 +#define p_NelementsPerBlk 3 +#elif p_Nq == 10 +#define p_NelementsPerBlk 5 +#elif p_Nq == 11 +#define p_NelementsPerBlk 3 +#elif p_Nq == 12 +#define p_NelementsPerBlk 3 +#elif p_Nq == 13 +#define p_NelementsPerBlk 3 +#elif p_Nq == 14 +#define p_NelementsPerBlk 3 +#elif p_Nq == 15 +#define p_NelementsPerBlk 3 +#else +#define p_NelementsPerBlk 1 +#endif + +#else + +// 2D, blocked version +#if p_N == 1 +#define p_NelementsPerBlk 16 +#elif p_N == 2 +#define p_NelementsPerBlk 56 +#elif p_N == 3 +#define p_NelementsPerBlk 32 +#elif p_N == 4 +#define p_NelementsPerBlk 5 +#elif p_N == 5 +#define p_NelementsPerBlk 1 +#elif p_N == 6 +#define p_NelementsPerBlk 5 +#elif p_N == 7 +#define p_NelementsPerBlk 1 +#elif p_N == 8 +#define p_NelementsPerBlk 3 +#elif p_N == 9 +#define p_NelementsPerBlk 1 +#elif p_N == 10 +#define p_NelementsPerBlk 1 +#elif p_N == 11 +#define p_NelementsPerBlk 1 +#elif p_N == 12 +#define p_NelementsPerBlk 1 +#elif p_N == 13 +#define p_NelementsPerBlk 1 +#elif p_N == 14 +#define p_NelementsPerBlk 1 +#elif p_N == 15 +#define p_NelementsPerBlk 1 +#else +#define p_NelementsPerBlk 1 +#endif + +#endif + +// padding for bank conflicts +#if p_Nq == 16 +#define p_pad 1 +#else +#define p_pad 0 +#endif + +#if p_knl == 2 + +@kernel void ellipticPartialAxHex3D_v2(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + + for (dlong eo = 0; eo < Nelements; eo += p_NelementsPerBlk; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_NelementsPerBlk][p_Nq][p_Nq + p_pad]; + @shared dfloat s_v[p_NelementsPerBlk][p_Nq][p_Nq + p_pad]; + @shared dfloat s_w[p_NelementsPerBlk][p_Nq][p_Nq + p_pad]; + + @exclusive dfloat r_GDut, r_Auk; + + // register array to hold u(i,j,0:N) private to thread + @exclusive dfloat r_u[p_Nq]; + // array for results Au(i,j,0:N) + @exclusive dfloat r_Au[p_Nq]; + + @exclusive dlong r_e, element; + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load D into local memory + // s_D[i][j] = d \phi_i at node j + if (es == 0) { + s_D[j][i] = D[p_Nq * j + i]; // D is column major + } + + r_e = es + eo; + + if (r_e < Nelements) { + element = elementList[r_e]; + + const dlong base = i + j * p_Nq + element * p_Np; + +// load pencil of u into register +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = base + k * p_Nq * p_Nq; + r_u[k] = (id != -1) ? q[id] : 0.0; + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + r_Au[k] = 0.0; + } + } + } + } + } + + // Layer by layer +#ifdef smXX +// only force some type of unrolling in CUDA mode +#pragma unroll p_Nq +#endif +#ifdef gfxXX +// on HIP, tell the compiler to not unroll this loop +#pragma nounroll +#endif + for (int k = 0; k < p_Nq; k++) { + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // share u(:,:,k) + s_q[es][j][i] = r_u[k]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + + if (r_e < Nelements) { + // prefetch geometric factors + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + r_G00 = ggeo[gbase + p_G00ID * p_Np]; + r_G01 = ggeo[gbase + p_G01ID * p_Np]; + r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + r_G11 = ggeo[gbase + p_G11ID * p_Np]; + r_G12 = ggeo[gbase + p_G12ID * p_Np]; + r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + r_GwJ = 0; +#else + r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + } + + dfloat ur = 0.f; + dfloat us = 0.f; + dfloat ut = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ut += s_D[k][m] * r_u[m]; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ur += s_D[i][m] * s_q[es][j][m]; + us += s_D[j][m] * s_q[es][m][i]; + } + + const dfloat lambda0 = lambda[0 * loffset]; + + s_w[es][j][i] = (r_G01 * ur + r_G11 * us + r_G12 * ut); + s_v[es][j][i] = (r_G00 * ur + r_G01 * us + r_G02 * ut); + r_GDut = lambda0 * (r_G02 * ur + r_G12 * us + r_G22 * ut); + +#ifdef p_poisson + r_Auk = 0.0; +#else + const dfloat lambda1 = lambda[1 * loffset]; + r_Auk = r_GwJ * lambda1 * r_u[k]; +#endif + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Au[m] += s_D[k][m] * r_GDut; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_w[es][m][i]; + r_Auk += s_D[m][i] * s_v[es][j][m]; + } + + r_Au[k] += r_Auk; + } + } + } + } // end Layer by layer + + @barrier("local"); + + // write out + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + if (r_e < Nelements) { + const dlong id = element * p_Np + j * p_Nq + i; + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + Aq[id + k * p_Nq * p_Nq] = r_Au[k]; + } + } + } + } + } + } +} +#endif + +// 3D thread structure, good for low orders +#if p_Nq < 11 + +#if defined(FP32) +#if p_Nq == 2 +#define p_NelementsPerBlk 27 +#elif p_Nq == 3 +#define p_NelementsPerBlk 15 +#elif p_Nq == 4 +#define p_NelementsPerBlk 15 +#elif p_Nq == 5 +#define p_NelementsPerBlk 8 +#elif p_Nq == 6 +#define p_NelementsPerBlk 4 +#elif p_Nq == 7 +#define p_NelementsPerBlk 2 +#elif p_Nq == 8 +#define p_NelementsPerBlk 2 +#else +#define p_NelementsPerBlk 1 +#endif + +#else + +#if p_N == 1 +#define p_NelementsPerBlk 8 +#elif p_N == 2 +#define p_NelementsPerBlk 4 +#elif p_N == 3 +#define p_NelementsPerBlk 2 +#elif p_N == 4 +#define p_NelementsPerBlk 1 +#elif p_N == 5 +#define p_NelementsPerBlk 1 +#elif p_N == 6 +#define p_NelementsPerBlk 1 +#elif p_N == 7 +#define p_NelementsPerBlk 1 +#else +#define p_NelementsPerBlk 1 +#endif + +#endif + +#if p_knl == 3 +@kernel void ellipticPartialAxHex3D_v3(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ +// padding for bank conflicts +#if (p_Nq == 8 || p_Nq == 4) && defined(smXX) +#define p_pad 1 +#else +#define p_pad 0 +#endif + +#if p_Nq == 16 || p_Nq == 14 || p_Nq == 12 || p_Nq == 8 || p_Nq == 6 || p_Nq == 4 +#define p_pad 0 +#else +#define p_pad 1 +#endif + + for (int eo = 0; eo < Nelements; eo += p_NelementsPerBlk; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_DT[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_NelementsPerBlk][p_Nq][p_Nq][p_Nq + p_pad]; + @shared dfloat s_Gqr[p_NelementsPerBlk][p_Nq][p_Nq][p_Nq + p_pad]; + @shared dfloat s_Gqs[p_NelementsPerBlk][p_Nq][p_Nq][p_Nq + p_pad]; + @shared dfloat s_Gqt[p_NelementsPerBlk][p_Nq][p_Nq][p_Nq + p_pad]; + + @exclusive dlong element; + @exclusive dfloat r_wJ; + + @exclusive int k, es; + + for (int ke = 0; ke < p_Nq * p_NelementsPerBlk; ++ke; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load operators + if (ke == 0) { + const int id = j * p_Nq + i; + const dfloat Dji = D[id]; + s_D[j][i] = Dji; + s_DT[i][j] = Dji; + } + + k = ke % p_Nq; + es = ke / p_Nq; + dlong r_e = es + eo; + element = (r_e < Nelements) ? elementList[r_e] : -1; + if (element != -1) { + const dlong id = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + if (id != -1) + s_q[es][k][j][i] = q[id]; + else + s_q[es][k][j][i] = 0.0; + } + } + } + } + + @barrier("local"); + + for (int ke = 0; ke < p_Nq * p_NelementsPerBlk; ++ke; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + if (element != -1) { + + // 't' terms + dfloat tmp = 0.0; + + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat G02 = ggeo[gbase + p_G02ID * p_Np]; + + const dfloat G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + r_wJ = 0; +#else + r_wJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + + // #pragma unroll p_Unr + for (int m = 0; m < p_Nq; ++m) { + const dfloat pmji = s_q[es][m][j][i]; + const dfloat Dkm = s_DT[m][k]; + tmp += Dkm * pmji; + } + + s_Gqr[es][k][j][i] = G02 * tmp; + s_Gqs[es][k][j][i] = G12 * tmp; + s_Gqt[es][k][j][i] = G22 * tmp; + + // 'r' terms + tmp = 0; + // #pragma unroll p_Unr + for (int m = 0; m < p_Nq; ++m) { + const dfloat Dim = s_D[i][m]; + tmp += Dim * s_q[es][k][j][m]; + } + + s_Gqr[es][k][j][i] += G00 * tmp; + s_Gqs[es][k][j][i] += G01 * tmp; + s_Gqt[es][k][j][i] += G02 * tmp; + + // 's' terms + tmp = 0; + // #pragma unroll p_Unr + for (int m = 0; m < p_Nq; ++m) { + const dfloat Djm = s_D[j][m]; + tmp += Djm * s_q[es][k][m][i]; + } + + s_Gqr[es][k][j][i] += G01 * tmp; + s_Gqs[es][k][j][i] += G11 * tmp; + s_Gqt[es][k][j][i] += lambda[0 * loffset] * G12 * tmp; + } + } + } + } + + @barrier("local"); + + for (int ke = 0; ke < p_Nq * p_NelementsPerBlk; ++ke; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + if (element != -1) { +#ifdef p_poisson + dfloat tmpAp = 0.0; +#else + dfloat tmpAp = s_q[es][k][j][i] * lambda[1 * loffset] * r_wJ; +#endif + + // use same matrix for both slices + // #pragma unroll p_Unr + for (int m = 0; m < p_Nq; ++m) { + const dfloat Dmi = s_D[m][i]; + const dfloat Dmj = s_D[m][j]; + + tmpAp += Dmi * s_Gqr[es][k][j][m]; + tmpAp += Dmj * s_Gqs[es][k][m][i]; + } + + // #pragma unroll p_Unr + for (int m = 0; m < p_Nq; ++m) { + const dfloat Gpt = s_Gqt[es][m][j][i]; + const dfloat Dmk = s_D[m][k]; + tmpAp += Dmk * Gpt; + } + + const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; + Aq[base] = tmpAp; + } + } + } + } + } +} +#endif +#endif + +// padding for bank conflicts +#if p_Nq == 16 +#define p_pad 1 +#else +#define p_pad 0 +#endif + +#if p_Nq == 3 +#define p_NelementsPerBlk 31 +#define p_pad 1 +#elif p_Nq == 4 +#define p_NelementsPerBlk 31 +#elif p_Nq == 5 +#define p_NelementsPerBlk 19 +#define p_pad 1 +#elif p_Nq == 6 +#define p_NelementsPerBlk 7 +#elif p_Nq == 7 +#define p_NelementsPerBlk 5 +#define p_pad 1 +#elif p_Nq == 8 +#define p_NelementsPerBlk 3 +#elif p_Nq == 9 +#define p_NelementsPerBlk 5 +#define p_pad 1 +#elif p_Nq == 10 +#define p_NelementsPerBlk 3 +#elif p_Nq == 11 +#define p_NelementsPerBlk 2 +#define p_pad 1 +#elif p_Nq == 12 +#define p_NelementsPerBlk 3 +#elif p_Nq == 13 +#define p_NelementsPerBlk 3 +#define p_pad 1 +#elif p_Nq == 14 +#define p_NelementsPerBlk 3 +#elif p_Nq == 15 +#define p_NelementsPerBlk 1 +#else +#define p_NelementsPerBlk 1 +#endif + +#if p_knl == 4 +@kernel void ellipticPartialAxHex3D_v4(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + + for (dlong eo = 0; eo < Nelements; eo += p_NelementsPerBlk; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_Nq][p_Nq][p_NelementsPerBlk]; + @shared dfloat s_v[p_Nq][p_Nq][p_NelementsPerBlk]; + @shared dfloat s_w[p_Nq][p_Nq][p_NelementsPerBlk]; + + @exclusive dfloat r_Auk; + + // register array to hold u(i,j,0:N) private to thread + @exclusive dfloat r_u[p_Nq]; + // array for results Au(i,j,0:N) + @exclusive dfloat r_Au[p_Nq]; + + @exclusive dlong r_e, element; + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load D into local memory + // s_D[i][j] = d \phi_i at node j + if (es == 0) { + s_D[j][i] = D[p_Nq * j + i]; // D is column major + } + + r_e = es + eo; + + if (r_e < Nelements) { + element = elementList[r_e]; + + const dlong base = i + j * p_Nq + element * p_Np; + + // load pencil of u into register +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = base + k * p_Nq * p_Nq; + r_u[k] = (id != -1) ? q[id] : 0.0; + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + r_Au[k] = 0.0; + } + } + } + } + } + + // Layer by layer +#ifdef smXX +// only force some type of unrolling in CUDA mode +#pragma unroll p_Nq +#endif +#ifdef gfxXX +// on HIP, tell the compiler to not unroll this loop +#pragma nounroll +#endif + for (int k = 0; k < p_Nq; k++) { + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // share u(:,:,k) + s_q[j][i][es] = r_u[k]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + + if (r_e < Nelements) { + // prefetch geometric factors + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + r_G00 = ggeo[gbase + p_G00ID * p_Np]; + r_G01 = ggeo[gbase + p_G01ID * p_Np]; + r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + r_G11 = ggeo[gbase + p_G11ID * p_Np]; + r_G12 = ggeo[gbase + p_G12ID * p_Np]; + r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + r_GwJ = 0; +#else + r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + } + + dfloat ur = 0.f; + dfloat us = 0.f; + dfloat ut = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ur += s_D[i][m] * s_q[j][m][es]; + us += s_D[j][m] * s_q[m][i][es]; + } + + const dfloat lambda0 = lambda[0 * loffset]; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ut += s_D[k][m] * r_u[m]; + } + + s_w[j][i][es] = (r_G01 * ur + r_G11 * us + r_G12 * ut); + s_v[j][i][es] = (r_G00 * ur + r_G01 * us + r_G02 * ut); + dfloat r_GDut = lambda0 * (r_G02 * ur + r_G12 * us + r_G22 * ut); + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Au[m] += s_D[k][m] * r_GDut; + } + +#ifdef p_poisson + r_Auk = 0.0; +#else + const dfloat lambda1 = lambda[1 * loffset]; + r_Auk = r_GwJ * lambda1 * r_u[k]; +#endif + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_w[m][i][es]; + r_Auk += s_D[m][i] * s_v[j][m][es]; + } + + r_Au[k] += r_Auk; + } + } + } + } // end Layer by layer + + @barrier("local"); + + // write out + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + if (r_e < Nelements) { + const dlong id = element * p_Np + j * p_Nq + i; + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + Aq[id + k * p_Nq * p_Nq] = r_Au[k]; + } + } + } + } + } + } +} +#endif + +#if p_knl == 5 +@kernel void ellipticPartialAxHex3D_v5(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + + for (dlong eo = 0; eo < Nelements; eo += p_NelementsPerBlk; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_Nq][p_Nq][p_NelementsPerBlk]; + @shared dfloat s_v[p_Nq][p_Nq][p_NelementsPerBlk]; + @shared dfloat s_w[p_Nq][p_Nq][p_NelementsPerBlk]; + + @exclusive dfloat r_GDut, r_Auk; + + // register array to hold u(i,j,0:N) private to thread + @exclusive dfloat r_u[p_Nq]; + // array for results Au(i,j,0:N) + @exclusive dfloat r_Au[p_Nq]; + + @exclusive dlong r_e, element; + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load D into local memory + // s_D[i][j] = d \phi_i at node j + if (es == 0) { + s_D[j][i] = D[p_Nq * j + i]; // D is column major + } + + r_e = es + eo; + + if (r_e < Nelements) { + element = elementList[r_e]; + + const dlong base = i + j * p_Nq + element * p_Np; + +// load pencil of u into register +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = base + k * p_Nq * p_Nq; + r_u[k] = (id != -1) ? q[id] : 0.0; + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + r_Au[k] = 0.0; + } + } + } + } + } + + // Layer by layer +#ifdef smXX +// only force some type of unrolling in CUDA mode +#pragma unroll p_Nq +#endif +#ifdef gfxXX +// on HIP, tell the compiler to not unroll this loop +#pragma nounroll +#endif + for (int k = 0; k < p_Nq; k++) { + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // share u(:,:,k) + s_q[j][i][es] = r_u[k]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + + if (r_e < Nelements) { + // prefetch geometric factors + + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + r_G00 = ggeo[gbase + p_G00ID * p_Np]; + r_G01 = ggeo[gbase + p_G01ID * p_Np]; + r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + r_G11 = ggeo[gbase + p_G11ID * p_Np]; + r_G12 = ggeo[gbase + p_G12ID * p_Np]; + r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + r_GwJ = 0; +#else + r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + } + + dfloat ur = 0.f; + dfloat us = 0.f; + dfloat ut = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ur += s_D[i][m] * s_q[j][m][es]; + us += s_D[j][m] * s_q[m][i][es]; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ut += s_D[k][m] * r_u[m]; + } + + const dfloat lambda0 = lambda[0 * loffset]; + + s_w[j][i][es] = (r_G01 * ur + r_G11 * us + r_G12 * ut); + s_v[j][i][es] = (r_G00 * ur + r_G01 * us + r_G02 * ut); + r_GDut = lambda0 * (r_G02 * ur + r_G12 * us + r_G22 * ut); + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Au[m] += s_D[k][m] * r_GDut; + } + +#ifdef p_poisson + r_Auk = 0.0; +#else + const dfloat lambda1 = lambda[1 * loffset]; + r_Auk = r_GwJ * lambda1 * r_u[k]; +#endif + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_w[m][i][es]; + r_Auk += s_D[m][i] * s_v[j][m][es]; + } + + r_Au[k] += r_Auk; + } + } + } + } // end Layer by layer + + @barrier("local"); + + // write out + for (int es = 0; es < p_NelementsPerBlk; ++es; @inner(2)) { + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + if (r_e < Nelements) { + const dlong id = element * p_Np + j * p_Nq + i; + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + Aq[id + k * p_Nq * p_Nq] = r_Au[k]; + } + } + } + } + } + } +} +#endif + +// padding for bank conflicts +#if p_Nq == 16 +#define p_pad 1 +#else +#define p_pad 0 +#endif + +#if p_Nq == 3 +#define p_NelementsPerBlk 31 +#define p_pad 1 +#elif p_Nq == 4 +#define p_NelementsPerBlk 31 +#define p_pad 1 +#elif p_Nq == 5 +#define p_NelementsPerBlk 19 +#define p_pad 1 +#elif p_Nq == 6 +#define p_NelementsPerBlk 7 +#elif p_Nq == 7 +#define p_NelementsPerBlk 5 +#define p_pad 1 +#elif p_Nq == 8 +#define p_NelementsPerBlk 3 +#elif p_Nq == 9 +#define p_NelementsPerBlk 3 +#define p_pad 1 +#elif p_Nq == 10 +#define p_NelementsPerBlk 3 +#define p_pad 0 +#elif p_Nq == 11 +#define p_NelementsPerBlk 2 +#define p_pad 1 +#elif p_Nq == 12 +#define p_NelementsPerBlk 3 +#elif p_Nq == 13 +#define p_NelementsPerBlk 3 +#define p_pad 1 +#elif p_Nq == 14 +#define p_NelementsPerBlk 3 +#elif p_Nq == 15 +#define p_NelementsPerBlk 1 +#else +#define p_NelementsPerBlk 1 +#endif + +// padding for bank conflicts +#if p_Nq == 16 || p_Nq == 14 || p_Nq == 12 || p_Nq == 10 || p_Nq == 8 || p_Nq == 6 || p_Nq == 4 +#define p_pad 0 +#else +#define p_pad 1 +#endif + +#if p_knl == 6 +@kernel void ellipticPartialAxHex3D_v6(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong *elementList, + @restrict const dfloat *ggeo, + @restrict const dfloat *D, + @restrict const dfloat *S, + @restrict const dfloat *lambda, + @restrict const dfloat *q, + @restrict dfloat *Aq) +{ + + for (dlong e = 0; e < Nelements; e++; @outer(0)) { + + @shared dfloat s_D[p_Nq][p_Nq + p_pad]; + @shared dfloat s_q[p_Nq][p_Nq + p_pad]; + @shared dfloat s_v[p_Nq][p_Nq + p_pad]; + @shared dfloat s_w[p_Nq][p_Nq + p_pad]; + + @exclusive dfloat r_GDut, r_Auk; + + // register array to hold u(i,j,0:N) private to thread + @exclusive dfloat r_u[p_Nq]; + // array for results Au(i,j,0:N) + @exclusive dfloat r_Au[p_Nq]; + + @exclusive dlong element; + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + + // load D into local memory + // s_D[i][j] = d \phi_i at node j + s_D[j][i] = D[p_Nq * j + i]; // D is column major + + element = elementList[e]; + + const dlong base = i + j * p_Nq + element * p_Np; + +// load pencil of u into register +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + const dlong id = base + k * p_Nq * p_Nq; + r_u[k] = (id != -1) ? q[id] : 0.0; + } + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + r_Au[k] = 0.0; + } + } + } + + // Layer by layer +#ifdef smXX +// only force some type of unrolling in CUDA mode +#pragma unroll p_Nq +#endif + for (int k = 0; k < p_Nq; k++) { + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // share u(:,:,k) + s_q[j][i] = r_u[k]; + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + // prefetch geometric factors + const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; + const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; + const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; + + const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; + const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; + const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; + +#ifdef p_poisson + const dfloat r_GwJ = 0; +#else + const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; +#endif + + dfloat ur = 0.f; + dfloat us = 0.f; + dfloat ut = 0; + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ut += s_D[k][m] * r_u[m]; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + ur += s_D[i][m] * s_q[j][m]; + us += s_D[j][m] * s_q[m][i]; + } + + const dfloat lambda0 = lambda[0 * loffset]; + + s_w[j][i] = (r_G01 * ur + r_G11 * us + r_G12 * ut); + s_v[j][i] = (r_G00 * ur + r_G01 * us + r_G02 * ut); + r_GDut = lambda0 * (r_G02 * ur + r_G12 * us + r_G22 * ut); +#ifdef p_poisson + r_Auk = 0.0; +#else + r_Auk = r_GwJ * lambda[1 * loffset] * r_u[k]; +#endif + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Au[m] += s_D[k][m] * r_GDut; + } + +#pragma unroll p_Nq + for (int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_w[m][i]; + r_Auk += s_D[m][i] * s_v[j][m]; + } + + r_Au[k] += r_Auk; + } + } + } // end Layer by layer + + @barrier("local"); + + // write out + for (int j = 0; j < p_Nq; ++j; @inner(1)) { + for (int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = element * p_Np + j * p_Nq + i; + +#pragma unroll p_Nq + for (int k = 0; k < p_Nq; k++) { + Aq[id + k * p_Nq * p_Nq] = r_Au[k]; + } + } + } + } +} +#endif diff --git a/okl/elliptic/ellipticPartialAxTrilinearHex3D.okl b/okl/elliptic/ellipticPartialAxTrilinearHex3D.okl new file mode 100644 index 000000000..d29cdf4b3 --- /dev/null +++ b/okl/elliptic/ellipticPartialAxTrilinearHex3D.okl @@ -0,0 +1,207 @@ +#define p_eighth ((dfloat)0.125) + +@kernel void ellipticPartialAxTrilinearHex3D(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* EXYZ, + @restrict const dfloat* gllzw, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { +#if (p_Nq % 2 == 0) + @shared dfloat s_D[p_Nq][p_Nq+1]; +#else + @shared dfloat s_D[p_Nq][p_Nq]; +#endif + @shared dfloat s_q[p_Nq][p_Nq]; + + @shared dfloat s_Gqr[p_Nq][p_Nq]; + @shared dfloat s_Gqs[p_Nq][p_Nq]; + + @shared dfloat s_gllwz[2][p_Nq]; + @shared dfloat s_EXYZ[p_dim][p_Nverts]; + + @exclusive dfloat r_qt, r_Gqt, r_Auk; + @exclusive dfloat r_q[p_Nq]; + @exclusive dfloat r_Aq[p_Nq]; + + @exclusive dlong element; + + @exclusive dfloat r_G00, r_G01, r_G02, r_G11, r_G12, r_G22, r_GwJ; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_D[j][i] = D[p_Nq * j + i]; + + if(j < 2) + s_gllwz[j][i] = gllzw[j * p_Nq + i]; + + element = elementList[e]; + const dlong base = i + j * p_Nq + element * p_Np; + for(int k = 0; k < p_Nq; k++) { + r_q[k] = q[base + k * p_Nq * p_Nq]; + r_Aq[k] = 0; + } + + int n = i + j * p_Nq; + while(n < p_Nverts * p_dim) { + s_EXYZ[0][n] = EXYZ[element * p_Nverts * p_dim + n]; + n += p_Nq * p_Nq; + } + } + + @barrier("local"); + + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dfloat rn = s_gllwz[0][i]; + const dfloat sn = s_gllwz[0][j]; + const dfloat tn = s_gllwz[0][k]; + +#define xe s_EXYZ[0] +#define ye s_EXYZ[1] +#define ze s_EXYZ[2] + + const dfloat xr = p_eighth * + ( (1 - tn) * (1 - sn) * (xe[1] - xe[0]) + (1 - tn) * (1 + sn) * + (xe[2] - xe[3]) + + (1 + tn) * (1 - sn) * (xe[5] - xe[4]) + (1 + tn) * (1 + sn) * + (xe[6] - xe[7]) ); + const dfloat xs = p_eighth * + ( (1 - tn) * (1 - rn) * (xe[3] - xe[0]) + (1 - tn) * (1 + rn) * + (xe[2] - xe[1]) + + (1 + tn) * (1 - rn) * (xe[7] - xe[4]) + (1 + tn) * (1 + rn) * + (xe[6] - xe[5]) ); + const dfloat xt = p_eighth * + ( (1 - rn) * (1 - sn) * (xe[4] - xe[0]) + (1 + rn) * (1 - sn) * + (xe[5] - xe[1]) + + (1 + rn) * (1 + sn) * (xe[6] - xe[2]) + (1 - rn) * (1 + sn) * + (xe[7] - xe[3]) ); + + const dfloat yr = p_eighth * + ( (1 - tn) * (1 - sn) * (ye[1] - ye[0]) + (1 - tn) * (1 + sn) * + (ye[2] - ye[3]) + + (1 + tn) * (1 - sn) * (ye[5] - ye[4]) + (1 + tn) * (1 + sn) * + (ye[6] - ye[7]) ); + const dfloat ys = p_eighth * + ( (1 - tn) * (1 - rn) * (ye[3] - ye[0]) + (1 - tn) * (1 + rn) * + (ye[2] - ye[1]) + + (1 + tn) * (1 - rn) * (ye[7] - ye[4]) + (1 + tn) * (1 + rn) * + (ye[6] - ye[5]) ); + const dfloat yt = p_eighth * + ( (1 - rn) * (1 - sn) * (ye[4] - ye[0]) + (1 + rn) * (1 - sn) * + (ye[5] - ye[1]) + + (1 + rn) * (1 + sn) * (ye[6] - ye[2]) + (1 - rn) * (1 + sn) * + (ye[7] - ye[3]) ); + + const dfloat zr = p_eighth * + ( (1 - tn) * (1 - sn) * (ze[1] - ze[0]) + (1 - tn) * (1 + sn) * + (ze[2] - ze[3]) + + (1 + tn) * (1 - sn) * (ze[5] - ze[4]) + (1 + tn) * (1 + sn) * + (ze[6] - ze[7]) ); + const dfloat zs = p_eighth * + ( (1 - tn) * (1 - rn) * (ze[3] - ze[0]) + (1 - tn) * (1 + rn) * + (ze[2] - ze[1]) + + (1 + tn) * (1 - rn) * (ze[7] - ze[4]) + (1 + tn) * (1 + rn) * + (ze[6] - ze[5]) ); + const dfloat zt = p_eighth * + ( (1 - rn) * (1 - sn) * (ze[4] - ze[0]) + (1 + rn) * (1 - sn) * + (ze[5] - ze[1]) + + (1 + rn) * (1 + sn) * (ze[6] - ze[2]) + (1 - rn) * (1 + sn) * + (ze[7] - ze[3]) ); + + const dfloat J = xr * (ys * zt - zs * yt) - yr * (xs * zt - zs * xt) + zr * + (xs * yt - ys * xt); + + const dfloat rx = (ys * zt - zs * yt), ry = -(xs * zt - zs * xt), + rz = (xs * yt - ys * xt); + const dfloat sx = -(yr * zt - zr * yt), sy = (xr * zt - zr * xt), + sz = -(xr * yt - yr * xt); + const dfloat tx = (yr * zs - zr * ys), ty = -(xr * zs - zr * xs), + tz = (xr * ys - yr * xs); + + const dfloat W = s_gllwz[1][i] * s_gllwz[1][j] * s_gllwz[1][k]; + const dfloat sc = W / J; + + r_G00 = sc * (rx * rx + ry * ry + rz * rz); + r_G01 = sc * (rx * sx + ry * sy + rz * sz); + r_G02 = sc * (rx * tx + ry * ty + rz * tz); + r_G11 = sc * (sx * sx + sy * sy + sz * sz); + r_G12 = sc * (sx * tx + sy * ty + sz * tz); + r_G22 = sc * (tx * tx + ty * ty + tz * tz); + +#ifndef p_poisson + r_GwJ = W * J; +#else + r_GwJ = 0.0; +#endif + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_q[j][i] = r_q[k]; + + r_qt = 0; + + #pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) + r_qt += s_D[k][m] * r_q[m]; + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dfloat qr = 0; + dfloat qs = 0; + + #pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + qr += s_D[i][m] * s_q[j][m]; + qs += s_D[j][m] * s_q[m][i]; + } + + s_Gqs[j][i] = (r_G01 * qr + r_G11 * qs + r_G12 * r_qt); + s_Gqr[j][i] = (r_G00 * qr + r_G01 * qs + r_G02 * r_qt); + + r_Gqt = lambda[0*loffset] * (r_G02 * qr + r_G12 * qs + r_G22 * r_qt); + r_Auk = r_GwJ * lambda[1*loffset] * r_q[k]; + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + #pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + r_Auk += s_D[m][j] * s_Gqs[m][i]; + r_Aq[m] += s_D[k][m] * r_Gqt; // DT(m,k)*ut(i,j,k,e) + r_Auk += s_D[m][i] * s_Gqr[j][m]; + } + + r_Aq[k] += r_Auk; + } + } + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; k++) { + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id] = r_Aq[k]; + } + } + } +} diff --git a/okl/elliptic/ellipticSchwarzSolverHex3D.c b/okl/elliptic/ellipticSchwarzSolverHex3D.c deleted file mode 100644 index 9b2fc8544..000000000 --- a/okl/elliptic/ellipticSchwarzSolverHex3D.c +++ /dev/null @@ -1,522 +0,0 @@ -extern "C" void FUNC(preFDM) (const dlong& Nelements, - const pfloat* __restrict__ u, - pfloat* __restrict__ work1) -{ - #define getIdx(k,j,i,e) ((k)*p_Nq_e*p_Nq_e+(j)*p_Nq_e+(i)+(e)*p_Nq_e*p_Nq_e*p_Nq_e) - #define getIdx2(k,j,i,e) ((k-1)*p_Nq*p_Nq+(j-1)*p_Nq+(i-1)+(e)*p_Nq*p_Nq*p_Nq) - #define sWork1(k,j,i,e) (work1[(getIdx(k,j,i,e))]) - #define uArr(k,j,i,e) (u[(getIdx2(k,j,i,e))]) - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for -#endif - for (dlong elem = 0; elem < Nelements; elem++) { - #pragma unroll - for(int k = 0; k < p_Nq_e; ++k){ - #pragma unroll - for(int j = 0; j < p_Nq_e; ++j){ - #pragma unroll - for(int i = 0; i < p_Nq_e; ++i){ - const bool iBound = i>=1 && i <(p_Nq_e-1); - const bool jBound = j>=1 && j <(p_Nq_e-1); - const bool kBound = k>=1 && k <(p_Nq_e-1); - if(iBound && jBound && kBound){ - const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - sWork1(k,j,i,elem) = uArr(k,j,i,elem); - } else { - sWork1(k,j,i,elem) = 0.0; - } - } - } - } - - - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - sWork1(l1,j,k,elem) = uArr(l2,j,k,elem); - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - sWork1(p_Nq_e - l1 - 1,j,k,elem) = uArr(p_Nq_e - l2 - 1,j,k,elem); - } - } - - - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - sWork1(i,l1,k,elem) = uArr(i,l2,k,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - sWork1(i,p_Nq_e - l1 - 1,k,elem) = uArr(i,p_Nq_e - l2 - 1,k,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 2; - sWork1(i,j,l1,elem) = uArr(i,j,l2,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 2; - sWork1(i,j,p_Nq_e - l1 - 1,elem) = uArr(i,j,p_Nq_e - l2 - 1,elem); - } - } - } - #undef getIdx - #undef getIdx2 - #undef sWork1 - #undef uArr -} - -extern "C" void FUNC(postFDM) (const dlong& Nelements, - pfloat* __restrict__ my_work1, - pfloat* __restrict__ my_work2, - pfloat* __restrict__ Su, - const pfloat* __restrict__ wts) -{ - pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; - pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; - for (dlong elem = 0; elem < Nelements; ++elem) { - #pragma unroll - for(int k = 0; k < p_Nq_e; ++k){ - #pragma unroll - for(int j = 0; j < p_Nq_e; ++j){ - #pragma unroll - for(int i = 0; i < p_Nq_e; ++i) { - const dlong elem_offset = elem * p_Nq_e * p_Nq_e * p_Nq_e; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - work1[k][j][i] = my_work2[idx]; - work2[k][j][i] = my_work1[idx]; - } - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work1[l1][j][k] = work1[l1][j][k] - work2[l2][j][k]; - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - - work2[p_Nq_e - l2 - 1][j][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work1[i][l1][k] = work1[i][l1][k] - work2[i][l2][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - - work2[i][p_Nq_e - l2 - 1][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 0; - work1[i][j][l1] = work1[i][j][l1] - work2[i][j][l2]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 0; - work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - - work2[i][j][p_Nq_e - l2 - 1]; - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 2; - const int l2 = 0; - work1[l1][j][k] = work1[l1][j][k] + work1[l2][j][k]; - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 2; - const int l2 = 0; - work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] + - work1[p_Nq_e - l2 - 1][j][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 2; - const int l2 = 0; - work1[i][l1][k] = work1[i][l1][k] + work1[i][l2][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 2; - const int l2 = 0; - work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] + - work1[i][p_Nq_e - l2 - 1][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 2; - const int l2 = 0; - work1[i][j][l1] = work1[i][j][l1] + work1[i][j][l2]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 2; - const int l2 = 0; - work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] + - work1[i][j][p_Nq_e - l2 - 1]; - } - } - #pragma unroll - for(int k = 0; k < p_Nq; ++k){ - #pragma unroll - for(int j = 0; j < p_Nq; ++j){ - #pragma unroll - for(int i = 0; i < p_Nq; ++i){ - const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; - } - } - } - } -} - -extern "C" void FUNC(fusedFDM) ( - const dlong& Nelements, - const dlong& localNelements, - const dlong* __restrict__ elementList, - pfloat* __restrict__ Su, - const pfloat* __restrict__ S_x, - const pfloat* __restrict__ S_y, - const pfloat* __restrict__ S_z, - const pfloat* __restrict__ inv_L, -#if p_restrict - const dfloat* __restrict__ wts, -#endif - pfloat* __restrict__ u - ) -{ -#define getIdx(k,j,i,e) ((k)*p_Nq_e*p_Nq_e+(j)*p_Nq_e+(i)+(e)*p_Nq_e*p_Nq_e*p_Nq_e) -#define work1(k,j,i,e) (u[(getIdx(k,j,i,e))]) - pfloat S_x_e[p_Nq_e][p_Nq_e]; - pfloat S_y_e[p_Nq_e][p_Nq_e]; - pfloat S_z_e[p_Nq_e][p_Nq_e]; - pfloat S_x_eT[p_Nq_e][p_Nq_e]; - pfloat S_y_eT[p_Nq_e][p_Nq_e]; - pfloat S_z_eT[p_Nq_e][p_Nq_e]; - pfloat tmp[p_Nq_e][p_Nq_e][p_Nq_e]; - pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; - - for (dlong my_elem = 0; my_elem < Nelements; ++my_elem) { - const dlong element = my_elem; - const dlong elem = element; - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - work1(l1,j,k,elem) = work1(l1,j,k,elem) - work1(l2,j,k,elem); - } - } - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - work1(p_Nq_e - l1 - 1,j,k,elem) = work1(p_Nq_e - l1 - 1,j,k,elem) - - work1(p_Nq_e - l2 - 1,j,k,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - work1(i,l1,k,elem) = work1(i,l1,k,elem) - work1(i,l2,k,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 2; - work1(i,p_Nq_e - l1 - 1,k,elem) = work1(i,p_Nq_e - l1 - 1,k,elem) - - work1(i,p_Nq_e - l2 - 1,k,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 2; - work1(i,j,l1,elem) = work1(i,j,l1,elem) - work1(i,j,l2,elem); - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 2; - work1(i,j,p_Nq_e - l1 - 1,elem) = work1(i,j,p_Nq_e - l1 - 1,elem) - - work1(i,j,p_Nq_e - l2 - 1,elem); - } - } - #pragma unroll - for (int i = 0; i < p_Nq_e; i++){ - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - const int ij = j + i * p_Nq_e; - S_x_e[i][j] = S_x[ij + element * p_Nq_e * p_Nq_e]; - S_y_e[i][j] = S_y[ij + element * p_Nq_e * p_Nq_e]; - S_z_e[i][j] = S_z[ij + element * p_Nq_e * p_Nq_e]; - S_x_eT[j][i] = S_x_e[i][j]; - S_y_eT[j][i] = S_y_e[i][j]; - S_z_eT[j][i] = S_z_e[i][j]; - } - } - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_x_e[l][j] * work1(k,i,l,elem); - work2[k][i][j] = value; - } - } - } - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_y_e[l][j] * work2[k][l][i]; - //work1(j,i,k,elem) = value; - tmp[j][k][i] = value; - } - } - } - - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_z_e[l][k] * tmp[j][l][i]; - work2[k][i][j] = value * inv_L[v + element * p_Nq_e * p_Nq_e * p_Nq_e]; - } - } - } - - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_x_eT[l][i] * work2[k][l][j]; - tmp[k][j][i] = value; - } - } - } - - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_y_eT[l][j] * tmp[k][l][i]; - work2[j][k][i] = value; - } - } - } - #pragma unroll - for (int k = 0; k < p_Nq_e; k++) { - #pragma unroll - for (int j = 0; j < p_Nq_e; j++) { - #pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; - #pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_z_eT[l][k] * work2[j][l][i]; - -#if (!p_restrict) - const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; - const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - Su[v] = value; -#endif - tmp[k][j][i] = value; - } - } - } -#if (!p_restrict) - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 0; - work2[l1][j][k] = tmp[l2][j][k]; - work2[p_Nq_e - l1 - 1][j][k] = tmp[p_Nq_e - l2 - 1][j][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work2[i][l1][k] = tmp[i][l2][k]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int k = 1; k < p_Nq_e-1; ++k){ - const int l1 = 0; - const int l2 = 0; - work2[i][p_Nq_e - l1 - 1][k] = tmp[i][p_Nq_e - l2 - 1][k]; - } - } - - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 0; - work2[i][j][l1] = tmp[i][j][l2]; - } - } - #pragma unroll - for(int i = 1; i < p_Nq_e-1; ++i){ - #pragma unroll - for(int j = 1; j < p_Nq_e-1; ++j){ - const int l1 = 0; - const int l2 = 0; - work2[i][j][p_Nq_e - l1 - 1] = tmp[i][j][p_Nq_e - l2 - 1]; - } - } - - #pragma unroll - for(int k = 0; k < p_Nq_e; ++k){ - #pragma unroll - for(int j = 0; j < p_Nq_e; ++j){ - #pragma unroll - for(int i = 0; i < p_Nq_e; ++i) { - const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - u[idx] = work2[k][j][i]; - } - } - } -#else /* if (!p_restrict) */ - #pragma unroll - for(int k = 0; k < p_Nq; ++k){ - #pragma unroll - for(int j = 0; j < p_Nq; ++j){ - #pragma unroll - for(int i = 0; i < p_Nq; ++i){ - const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - Su[idx] = tmp[k + 1][j + 1][i + 1] * wts[idx]; - } - } - } - -#endif - } -#undef getIdx -#undef work1 -} diff --git a/okl/elliptic/ellipticSchwarzSolverHex3D.okl b/okl/elliptic/ellipticSchwarzSolverHex3D.okl deleted file mode 100644 index 63a0a5595..000000000 --- a/okl/elliptic/ellipticSchwarzSolverHex3D.okl +++ /dev/null @@ -1,429 +0,0 @@ -@kernel void preFDM(const dlong Nelements, - @restrict const pfloat* u, - @restrict pfloat* work1) -{ - for (dlong elem = 0; elem < Nelements; elem++; @outer) { - @shared pfloat sWork1[p_Nq_e][p_Nq_e][p_Nq_e]; - for(int k = 0; k < p_Nq_e; ++k; @inner) { - for(int j = 0; j < p_Nq_e; ++j; @inner) { -#pragma unroll - for(int i = 0; i < p_Nq_e; ++i) - sWork1[k][j][i] = 0.0; - } - } - - @barrier("local"); - - for(int k = 0; k < p_Nq; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i < p_Nq && j < p_Nq) { - const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - sWork1[k + 1][j + 1][i + 1] = u[idx]; - } - } - } - } - - @barrier("local"); - - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int j = 0; j < p_Nq_e; ++j; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - sWork1[l1][j][k] = sWork1[l2][j][k]; - sWork1[p_Nq_e - l1 - 1][j][k] = sWork1[p_Nq_e - l2 - 1][j][k]; - } - } - } - - @barrier("local"); - - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - sWork1[i][l1][k] = sWork1[i][l2][k]; - sWork1[i][p_Nq_e - l1 - 1][k] = sWork1[i][p_Nq_e - l2 - 1][k]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - sWork1[i][j][l1] = sWork1[i][j][l2]; - sWork1[i][j][p_Nq_e - l1 - 1] = sWork1[i][j][p_Nq_e - l2 - 1]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner) { - const dlong elem_offset = p_Nq_e * p_Nq_e * p_Nq_e * elem; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - work1[idx] = sWork1[k][j][i]; - } - } - } - } -} - -@kernel void postFDM(const dlong Nelements, - @restrict pfloat* my_work1, - @restrict pfloat* my_work2, - @restrict pfloat* Su, - @restrict const pfloat* wts) -{ - for (dlong elem = 0; elem < Nelements; ++elem; @outer) { - @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; - @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; - for(int k = 0; k < p_Nq_e; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner) { - const dlong elem_offset = elem * p_Nq_e * p_Nq_e * p_Nq_e; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - work1[k][j][i] = my_work2[idx]; - work2[k][j][i] = my_work1[idx]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int j = 0; j < p_Nq_e; ++j; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work1[l1][j][k] = work1[l1][j][k] - work2[l2][j][k]; - work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - - work2[p_Nq_e - l2 - 1][j][k]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work1[i][l1][k] = work1[i][l1][k] - work2[i][l2][k]; - work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - - work2[i][p_Nq_e - l2 - 1][k]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work1[i][j][l1] = work1[i][j][l1] - work2[i][j][l2]; - work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - - work2[i][j][p_Nq_e - l2 - 1]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int j = 0; j < p_Nq_e; ++j; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 2; - const int l2 = 0; - work1[l1][j][k] = work1[l1][j][k] + work1[l2][j][k]; - work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] + - work1[p_Nq_e - l2 - 1][j][k]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { - const int l1 = 2; - const int l2 = 0; - work1[i][l1][k] = work1[i][l1][k] + work1[i][l2][k]; - work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] + - work1[i][p_Nq_e - l2 - 1][k]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 2; - const int l2 = 0; - work1[i][j][l1] = work1[i][j][l1] + work1[i][j][l2]; - work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] + - work1[i][j][p_Nq_e - l2 - 1]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i < p_Nq && j < p_Nq) { - const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; - } - } - } - } - } -} - -@kernel void fusedFDM( - const dlong Nelements, - const dlong localNelements, - @restrict const dlong* elementList, - @restrict pfloat* Su, - @restrict const pfloat* S_x, - @restrict const pfloat* S_y, - @restrict const pfloat* S_z, - @restrict const pfloat* inv_L, -#if p_restrict - @restrict const dfloat* wts, -#endif - @restrict pfloat* u - ) -{ -#if p_overlap - for (dlong my_elem = 0; my_elem < localNelements; ++my_elem; @outer) { -#else - for (dlong my_elem = 0; my_elem < Nelements; ++my_elem; @outer) { -#endif - @shared pfloat S_x_e[p_Nq_e][p_Nq_e]; - @shared pfloat S_y_e[p_Nq_e][p_Nq_e]; - @shared pfloat S_z_e[p_Nq_e][p_Nq_e]; - @shared pfloat S_x_eT[p_Nq_e][p_Nq_e]; - @shared pfloat S_y_eT[p_Nq_e][p_Nq_e]; - @shared pfloat S_z_eT[p_Nq_e][p_Nq_e]; - @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; - @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; - @exclusive dlong element; - - for(int k = 0; k < p_Nq_e; ++k) { - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner) { - for(int i = 0; i < p_Nq_e; ++i; @inner) { -#if p_overlap - element = elementList[my_elem]; -#else - element = my_elem; -#endif - const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - work1[k][j][i] = u[idx]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int j = 0; j < p_Nq_e; ++j; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - work1[l1][j][k] = work1[l1][j][k] - work1[l2][j][k]; - work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - - work1[p_Nq_e - l2 - 1][j][k]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - work1[i][l1][k] = work1[i][l1][k] - work1[i][l2][k]; - work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - - work1[i][p_Nq_e - l2 - 1][k]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 2; - work1[i][j][l1] = work1[i][j][l1] - work1[i][j][l2]; - work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - - work1[i][j][p_Nq_e - l2 - 1]; - } - } - } - @barrier("local"); - for (int i = 0; i < p_Nq_e; i++; @inner){ - for (int j = 0; j < p_Nq_e; j++; @inner) { - const int ij = j + i * p_Nq_e; - S_x_e[i][j] = S_x[ij + element * p_Nq_e * p_Nq_e]; - S_y_e[i][j] = S_y[ij + element * p_Nq_e * p_Nq_e]; - S_z_e[i][j] = S_z[ij + element * p_Nq_e * p_Nq_e]; - S_x_eT[j][i] = S_x_e[i][j]; - S_y_eT[j][i] = S_y_e[i][j]; - S_z_eT[j][i] = S_z_e[i][j]; - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++; @inner) { - for (int j = 0; j < p_Nq_e; j++; @inner) { -#pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_x_eT[j][l] * work1[k][i][l]; - work2[k][j][i] = value; - } - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++; @inner) { - for (int j = 0; j < p_Nq_e; j++; @inner) { -#pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_y_eT[j][l] * work2[k][i][l]; - work1[j][i][k] = value; - } - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++) { - @barrier("local"); - for (int j = 0; j < p_Nq_e; j++; @inner) { - for (int i = 0; i < p_Nq_e; i++; @inner) { - const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_z_eT[k][l] * work1[j][i][l]; - work2[k][j][i] = value * inv_L[v + element * p_Nq_e * p_Nq_e * p_Nq_e]; - } - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++; @inner) { - for (int j = 0; j < p_Nq_e; j++; @inner) { -#pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_x_e[i][l] * work2[k][j][l]; - work1[k][i][j] = value; - } - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++; @inner) { - for (int j = 0; j < p_Nq_e; j++; @inner) { -#pragma unroll - for (int i = 0; i < p_Nq_e; i++) { - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_y_e[j][l] * work1[k][i][l]; - work2[j][i][k] = value; - } - } - } - @barrier("local"); - for (int k = 0; k < p_Nq_e; k++) { - @barrier("local"); - for (int j = 0; j < p_Nq_e; j++; @inner) { - for (int i = 0; i < p_Nq_e; i++; @inner) { - pfloat value = 0.0; -#pragma unroll - for (int l = 0; l < p_Nq_e; l++) - value += S_z_e[k][l] * work2[j][i][l]; - -#if (!p_restrict) - const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; - const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - Su[v] = value; -#endif - work1[k][j][i] = value; - } - } - } -#if (!p_restrict) - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int j = 0; j < p_Nq_e; ++j; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work2[l1][j][k] = work1[l2][j][k]; - work2[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l2 - 1][j][k]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work2[i][l1][k] = work1[i][l2][k]; - work2[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l2 - 1][k]; - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { - const int l1 = 0; - const int l2 = 0; - work2[i][j][l1] = work1[i][j][l2]; - work2[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l2 - 1]; - } - } - } - @barrier("local"); - for(int k = 0; k < p_Nq_e; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner) { - const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; - const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; - u[idx] = work2[k][j][i]; - } - } - } - -#else /* if (!p_restrict) */ - @barrier("local"); - for(int k = 0; k < p_Nq; ++k){ - @barrier("local"); - for(int j = 0; j < p_Nq_e; ++j; @inner){ - for(int i = 0; i < p_Nq_e; ++i; @inner){ - if(i < p_Nq && j < p_Nq) { - const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; - const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; - Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; - } - } - } - } - -#endif - } -} diff --git a/okl/elliptic/ellipticSerialAxHex3D.c b/okl/elliptic/ellipticSerialAxHex3D.c deleted file mode 100644 index 4d7f5a3ce..000000000 --- a/okl/elliptic/ellipticSerialAxHex3D.c +++ /dev/null @@ -1,541 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -extern "C" -void FUNC(ellipticAxHex3D)(const dlong & Nelements, - const dfloat* __restrict__ ggeo, - const dfloat* __restrict__ D, - const dfloat* __restrict__ S, - const dfloat & lambda, - const dfloat* __restrict__ q, - dfloat* __restrict__ Aq ) -{ - dfloat s_q[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqr[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqs[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqt[p_Nq][p_Nq][p_Nq]; - - dfloat s_D[p_Nq][p_Nq]; - dfloat s_S[p_Nq][p_Nq]; - - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - s_D[j][i] = D[j * p_Nq + i]; - s_S[j][i] = S[j * p_Nq + i]; - } - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) -#endif - for(dlong e = 0; e < Nelements; ++e) { - const dlong element = e; - - for(int k = 0; k < p_Nq; k++) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; - const dfloat qbase = q[base]; - s_q[k][j][i] = qbase; - } - - for(int k = 0; k < p_Nq; ++k) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; - const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; - - dfloat qr = 0.f; - dfloat qs = 0.f; - dfloat qt = 0.f; - - for(int m = 0; m < p_Nq; m++) { - qr += s_D[i][m] * s_q[k][j][m]; - qs += s_D[j][m] * s_q[k][m][i]; - qt += s_D[k][m] * s_q[m][j][i]; - } - - dfloat Gqr = r_G00 * qr; - Gqr += r_G01 * qs; - Gqr += r_G02 * qt; - - dfloat Gqs = r_G01 * qr; - Gqs += r_G11 * qs; - Gqs += r_G12 * qt; - - dfloat Gqt = r_G02 * qr; - Gqt += r_G12 * qs; - Gqt += r_G22 * qt; - - s_Gqr[k][j][i] = Gqr; - s_Gqs[k][j][i] = Gqs; - s_Gqt[k][j][i] = Gqt; - } - - for(int k = 0; k < p_Nq; k++) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; - - dfloat r_Aq = r_GwJ * lambda * s_q[k][j][i]; - dfloat r_Aqr = 0, r_Aqs = 0, r_Aqt = 0; - - for(int m = 0; m < p_Nq; m++) - r_Aqr += s_S[i][m] * s_Gqr[k][j][m]; - for(int m = 0; m < p_Nq; m++) - r_Aqs += s_S[j][m] * s_Gqs[k][m][i]; - for(int m = 0; m < p_Nq; m++) - r_Aqt += s_S[k][m] * s_Gqt[m][j][i]; - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - Aq[id] = r_Aqr + r_Aqs + r_Aqt + r_Aq; - } - } -} - -extern "C" -void FUNC(ellipticAxVarHex3D)(const dlong & Nelements, - const dlong & offset, - const dfloat* __restrict__ ggeo, - const dfloat* __restrict__ D, - const dfloat* __restrict__ S, - const dfloat* __restrict__ lambda, - const dfloat* __restrict__ q, - dfloat* __restrict__ Aq ) -{ - dfloat s_q[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqr[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqs[p_Nq][p_Nq][p_Nq]; - dfloat s_Gqt[p_Nq][p_Nq][p_Nq]; - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) -#endif - for(dlong e = 0; e < Nelements; ++e) { - const dlong element = e; - -#pragma unroll - for(int k = 0; k < p_Nq; k++) -#pragma unroll - for(int j = 0; j < p_Nq; ++j) -#pragma unroll - for(int i = 0; i < p_Nq; ++i) { - const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; - const dfloat qbase = q[base]; - s_q[k][j][i] = qbase; - } - -#pragma unroll - for(int k = 0; k < p_Nq; ++k) -#pragma unroll - for(int j = 0; j < p_Nq; ++j) -#pragma unroll - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; - const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_lam0 = lambda[id + 0 * offset]; - - dfloat qr = 0.f; - dfloat qs = 0.f; - dfloat qt = 0.f; - -#pragma unroll - for(int m = 0; m < p_Nq; m++){ - qr += S[m*p_Nq + i] * s_q[k][j][m]; - qs += S[m*p_Nq + j] * s_q[k][m][i]; - qt += S[m*p_Nq + k] * s_q[m][j][i]; - } - - dfloat Gqr = r_G00 * qr; - Gqr += r_G01 * qs; - Gqr += r_G02 * qt; - - dfloat Gqs = r_G01 * qr; - Gqs += r_G11 * qs; - Gqs += r_G12 * qt; - - dfloat Gqt = r_G02 * qr; - Gqt += r_G12 * qs; - Gqt += r_G22 * qt; - - s_Gqr[k][j][i] = r_lam0 * Gqr; - s_Gqs[k][j][i] = r_lam0 * Gqs; - s_Gqt[k][j][i] = r_lam0 * Gqt; - } - -#pragma unroll - for(int k = 0; k < p_Nq; k++) -#pragma unroll - for(int j = 0; j < p_Nq; ++j) -#pragma unroll - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_lam1 = lambda[id + 1 * offset]; - - dfloat r_Aq = r_GwJ * r_lam1 * s_q[k][j][i]; - dfloat r_Aqr = 0, r_Aqs = 0, r_Aqt = 0; - -#pragma unroll - for(int m = 0; m < p_Nq; m++){ - r_Aqr += D[m*p_Nq+i] * s_Gqr[k][j][m]; - r_Aqs += D[m*p_Nq+j] * s_Gqs[k][m][i]; - r_Aqt += D[m*p_Nq+k] * s_Gqt[m][j][i]; - } - - Aq[id] = r_Aqr + r_Aqs + r_Aqt + r_Aq; - } - } -} - -extern "C" -void FUNC(ellipticBlockAxVarHex3D_N3)(const dlong & Nelements, - const dlong & offset, - const dlong & loffset, - const dfloat* __restrict__ ggeo, - const dfloat* __restrict__ D, - const dfloat* __restrict__ S, - const dfloat* __restrict__ lambda, - const dfloat* __restrict__ q, - dfloat* __restrict__ Aq ) -{ - dfloat s_q[3][p_Nq][p_Nq][p_Nq]; - dfloat s_Gqr[3][p_Nq][p_Nq][p_Nq]; - dfloat s_Gqs[3][p_Nq][p_Nq][p_Nq]; - dfloat s_Gqt[3][p_Nq][p_Nq][p_Nq]; - - dfloat s_D[p_Nq][p_Nq]; - dfloat s_S[p_Nq][p_Nq]; - - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - s_D[j][i] = D[j * p_Nq + i]; - s_S[j][i] = S[j * p_Nq + i]; - } - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for private(s_q, s_Gqr, s_Gqs, s_Gqt) -#endif - for(dlong e = 0; e < Nelements; ++e) { - const dlong element = e; - - for(int k = 0; k < p_Nq; k++) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong base = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np; - s_q[0][k][j][i] = q[base + 0 * offset]; - s_q[1][k][j][i] = q[base + 1 * offset]; - s_q[2][k][j][i] = q[base + 2 * offset]; - } - - for(int k = 0; k < p_Nq; ++k) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_G00 = ggeo[gbase + p_G00ID * p_Np]; - const dfloat r_G01 = ggeo[gbase + p_G01ID * p_Np]; - const dfloat r_G11 = ggeo[gbase + p_G11ID * p_Np]; - const dfloat r_G12 = ggeo[gbase + p_G12ID * p_Np]; - const dfloat r_G02 = ggeo[gbase + p_G02ID * p_Np]; - const dfloat r_G22 = ggeo[gbase + p_G22ID * p_Np]; - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat r_lam00 = lambda[id + 0 * offset + 0 * loffset]; - const dfloat r_lam10 = lambda[id + 0 * offset + 1 * loffset]; - const dfloat r_lam20 = lambda[id + 0 * offset + 2 * loffset]; - - dfloat qr0 = 0.f, qr1 = 0.f, qr2 = 0.f; - dfloat qs0 = 0.f, qs1 = 0.f, qs2 = 0.f; - dfloat qt0 = 0.f, qt1 = 0.f, qt2 = 0.f; - - for(int m = 0; m < p_Nq; m++) { - qr0 += s_S[m][i] * s_q[0][k][j][m]; - qs0 += s_S[m][j] * s_q[0][k][m][i]; - qt0 += s_S[m][k] * s_q[0][m][j][i]; - // - qr1 += s_S[m][i] * s_q[1][k][j][m]; - qs1 += s_S[m][j] * s_q[1][k][m][i]; - qt1 += s_S[m][k] * s_q[1][m][j][i]; - - qr2 += s_S[m][i] * s_q[2][k][j][m]; - qs2 += s_S[m][j] * s_q[2][k][m][i]; - qt2 += s_S[m][k] * s_q[2][m][j][i]; - } - - dfloat Gqr0 = r_G00 * qr0 + r_G01 * qs0 + r_G02 * qt0; - dfloat Gqs0 = r_G01 * qr0 + r_G11 * qs0 + r_G12 * qt0; - dfloat Gqt0 = r_G02 * qr0 + r_G12 * qs0 + r_G22 * qt0; - - dfloat Gqr1 = r_G00 * qr1 + r_G01 * qs1 + r_G02 * qt1; - dfloat Gqs1 = r_G01 * qr1 + r_G11 * qs1 + r_G12 * qt1; - dfloat Gqt1 = r_G02 * qr1 + r_G12 * qs1 + r_G22 * qt1; - - dfloat Gqr2 = r_G00 * qr2 + r_G01 * qs2 + r_G02 * qt2; - dfloat Gqs2 = r_G01 * qr2 + r_G11 * qs2 + r_G12 * qt2; - dfloat Gqt2 = r_G02 * qr2 + r_G12 * qs2 + r_G22 * qt2; - - s_Gqr[0][k][j][i] = r_lam00 * Gqr0; - s_Gqs[0][k][j][i] = r_lam00 * Gqs0; - s_Gqt[0][k][j][i] = r_lam00 * Gqt0; - - s_Gqr[1][k][j][i] = r_lam10 * Gqr1; - s_Gqs[1][k][j][i] = r_lam10 * Gqs1; - s_Gqt[1][k][j][i] = r_lam10 * Gqt1; - - s_Gqr[2][k][j][i] = r_lam20 * Gqr2; - s_Gqs[2][k][j][i] = r_lam20 * Gqs2; - s_Gqt[2][k][j][i] = r_lam20 * Gqt2; - } - - for(int k = 0; k < p_Nq; k++) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong gbase = element * p_Nggeo * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat r_GwJ = ggeo[gbase + p_GWJID * p_Np]; - - const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat r_lam01 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat r_lam11 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat r_lam21 = lambda[id + 1 * offset + 2 * loffset]; - - dfloat r_Aq0 = r_GwJ * r_lam01 * s_q[0][k][j][i]; - dfloat r_Aq1 = r_GwJ * r_lam11 * s_q[1][k][j][i]; - dfloat r_Aq2 = r_GwJ * r_lam21 * s_q[2][k][j][i]; - - dfloat r_Aqr0 = 0.f, r_Aqs0 = 0.f, r_Aqt0 = 0.f; - dfloat r_Aqr1 = 0.f, r_Aqs1 = 0.f, r_Aqt1 = 0.f; - dfloat r_Aqr2 = 0.f, r_Aqs2 = 0.f, r_Aqt2 = 0.f; - - for(int m = 0; m < p_Nq; m++) { - r_Aqr0 += s_D[m][i] * s_Gqr[0][k][j][m]; - r_Aqr1 += s_D[m][i] * s_Gqr[1][k][j][m]; - r_Aqr2 += s_D[m][i] * s_Gqr[2][k][j][m]; - } - - for(int m = 0; m < p_Nq; m++) { - r_Aqs0 += s_D[m][j] * s_Gqs[0][k][m][i]; - r_Aqs1 += s_D[m][j] * s_Gqs[1][k][m][i]; - r_Aqs2 += s_D[m][j] * s_Gqs[2][k][m][i]; - } - - for(int m = 0; m < p_Nq; m++) { - r_Aqt0 += s_D[m][k] * s_Gqt[0][m][j][i]; - r_Aqt1 += s_D[m][k] * s_Gqt[1][m][j][i]; - r_Aqt2 += s_D[m][k] * s_Gqt[2][m][j][i]; - } - - Aq[id + 0 * offset] = r_Aqr0 + r_Aqs0 + r_Aqt0 + r_Aq0; - Aq[id + 1 * offset] = r_Aqr1 + r_Aqs1 + r_Aqt1 + r_Aq1; - Aq[id + 2 * offset] = r_Aqr2 + r_Aqs2 + r_Aqt2 + r_Aq2; - } - } -} - -// -extern "C" -void FUNC(ellipticStressAxVarHex3D)(const dlong &Nelements, - const dlong &offset, - const dlong &loffset, - const dfloat* __restrict__ vgeo, - const dfloat* __restrict__ D, - const dfloat* __restrict__ S, - const dfloat* __restrict__ lambda, - const dfloat* __restrict__ q, - dfloat* __restrict__ Aq) -{ - dfloat s_D[p_Nq][p_Nq]; - - dfloat s_U[p_Nq][p_Nq][p_Nq]; - dfloat s_V[p_Nq][p_Nq][p_Nq]; - dfloat s_W[p_Nq][p_Nq][p_Nq]; - - dfloat s_SUr[p_Nq][p_Nq][p_Nq]; - dfloat s_SUs[p_Nq][p_Nq][p_Nq]; - dfloat s_SUt[p_Nq][p_Nq][p_Nq]; - - dfloat s_SVr[p_Nq][p_Nq][p_Nq]; - dfloat s_SVs[p_Nq][p_Nq][p_Nq]; - dfloat s_SVt[p_Nq][p_Nq][p_Nq]; - - dfloat s_SWr[p_Nq][p_Nq][p_Nq]; - dfloat s_SWs[p_Nq][p_Nq][p_Nq]; - dfloat s_SWt[p_Nq][p_Nq][p_Nq]; - - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) - s_D[j][i] = D[j * p_Nq + i]; - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for private(s_U, s_V, s_W, s_SUr, s_SUs, s_SUt, s_SVr, s_SVs, s_SVt, s_SWr, s_SWs, s_SWt) -#endif - for(dlong e = 0; e < Nelements; ++e) { - for(int k = 0; k < p_Nq; ++k) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - s_U[k][j][i] = q[id + 0 * offset]; - s_V[k][j][i] = q[id + 1 * offset]; - s_W[k][j][i] = q[id + 2 * offset]; - } - - // loop over slabs - for(int k = 0; k < p_Nq; ++k) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; - const dfloat rx = vgeo[gid + p_RXID * p_Np]; - const dfloat ry = vgeo[gid + p_RYID * p_Np]; - const dfloat rz = vgeo[gid + p_RZID * p_Np]; - - const dfloat sx = vgeo[gid + p_SXID * p_Np]; - const dfloat sy = vgeo[gid + p_SYID * p_Np]; - const dfloat sz = vgeo[gid + p_SZID * p_Np]; - - const dfloat tx = vgeo[gid + p_TXID * p_Np]; - const dfloat ty = vgeo[gid + p_TYID * p_Np]; - const dfloat tz = vgeo[gid + p_TZID * p_Np]; - - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - // compute 1D derivatives - dfloat ur = 0.f, us = 0.f, ut = 0.f; - dfloat vr = 0.f, vs = 0.f, vt = 0.f; - dfloat wr = 0.f, ws = 0.f, wt = 0.f; - for(int m = 0; m < p_Nq; ++m) { - const dfloat Dim = s_D[i][m]; // Dr - const dfloat Djm = s_D[j][m]; // Ds - const dfloat Dkm = s_D[k][m]; // Dt - - ur += Dim * s_U[k][j][m]; - us += Djm * s_U[k][m][i]; - ut += Dkm * s_U[m][j][i]; - // - vr += Dim * s_V[k][j][m]; - vs += Djm * s_V[k][m][i]; - vt += Dkm * s_V[m][j][i]; - // - wr += Dim * s_W[k][j][m]; - ws += Djm * s_W[k][m][i]; - wt += Dkm * s_W[m][j][i]; - } - - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; - // const dfloat u_lam1 = lambda[id + 1*offset + 0*loffset]; - const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; - // const dfloat v_lam1 = lambda[id + 1*offset + 1*loffset]; - const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; - // const dfloat w_lam1 = lambda[id + 1*offset + 2*loffset]; - - const dfloat dudx = rx * ur + sx * us + tx * ut; - const dfloat dudy = ry * ur + sy * us + ty * ut; - const dfloat dudz = rz * ur + sz * us + tz * ut; - - const dfloat dvdx = rx * vr + sx * vs + tx * vt; - const dfloat dvdy = ry * vr + sy * vs + ty * vt; - const dfloat dvdz = rz * vr + sz * vs + tz * vt; - - const dfloat dwdx = rx * wr + sx * ws + tx * wt; - const dfloat dwdy = ry * wr + sy * ws + ty * wt; - const dfloat dwdz = rz * wr + sz * ws + tz * wt; - - const dfloat s11 = u_lam0 * JW * (dudx + dudx); - const dfloat s12 = u_lam0 * JW * (dudy + dvdx); - const dfloat s13 = u_lam0 * JW * (dudz + dwdx); - - const dfloat s21 = v_lam0 * JW * (dvdx + dudy); - const dfloat s22 = v_lam0 * JW * (dvdy + dvdy); - const dfloat s23 = v_lam0 * JW * (dvdz + dwdy); - - const dfloat s31 = w_lam0 * JW * (dwdx + dudz); - const dfloat s32 = w_lam0 * JW * (dwdy + dvdz); - const dfloat s33 = w_lam0 * JW * (dwdz + dwdz); - - s_SUr[k][j][i] = rx * s11 + ry * s12 + rz * s13; - s_SUs[k][j][i] = sx * s11 + sy * s12 + sz * s13; - s_SUt[k][j][i] = tx * s11 + ty * s12 + tz * s13; - // - s_SVr[k][j][i] = rx * s21 + ry * s22 + rz * s23; - s_SVs[k][j][i] = sx * s21 + sy * s22 + sz * s23; - s_SVt[k][j][i] = tx * s21 + ty * s22 + tz * s23; - // - s_SWr[k][j][i] = rx * s31 + ry * s32 + rz * s33; - s_SWs[k][j][i] = sx * s31 + sy * s32 + sz * s33; - s_SWt[k][j][i] = tx * s31 + ty * s32 + tz * s33; - } - -// loop over slabs - for(int k = 0; k < p_Nq; ++k) - for(int j = 0; j < p_Nq; ++j) - for(int i = 0; i < p_Nq; ++i) { - dfloat r_Au = 0.f, r_Av = 0.f, r_Aw = 0.f; - for(int m = 0; m < p_Nq; m++) { - const dfloat Dim = s_D[m][i]; // Dr' - const dfloat Djm = s_D[m][j]; // Ds' - const dfloat Dkm = s_D[m][k]; // Dt' - - r_Au += Dim * s_SUr[k][j][m]; - r_Au += Djm * s_SUs[k][m][i]; - r_Au += Dkm * s_SUt[m][j][i]; - - r_Av += Dim * s_SVr[k][j][m]; - r_Av += Djm * s_SVs[k][m][i]; - r_Av += Dkm * s_SVt[m][j][i]; - - r_Aw += Dim * s_SWr[k][j][m]; - r_Aw += Djm * s_SWs[k][m][i]; - r_Aw += Dkm * s_SWt[m][j][i]; - } - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; - const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; - const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; - - const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - // store in register - Aq[id + 0 * offset] = r_Au + u_lam1 * JW * s_U[k][j][i]; - Aq[id + 1 * offset] = r_Av + v_lam1 * JW * s_V[k][j][i]; - Aq[id + 2 * offset] = r_Aw + w_lam1 * JW * s_W[k][j][i]; - } - } -} diff --git a/okl/elliptic/ellipticStressPartialAxCoeffHex3D.c b/okl/elliptic/ellipticStressPartialAxCoeffHex3D.c new file mode 100644 index 000000000..38c4b957a --- /dev/null +++ b/okl/elliptic/ellipticStressPartialAxCoeffHex3D.c @@ -0,0 +1,166 @@ +extern "C" void FUNC(ellipticStressPartialAxCoeffHex3D)(const dlong &Nelements, + const dlong &offset, + const dlong &loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ vgeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq) +{ + dfloat s_D[p_Nq][p_Nq]; + + dfloat s_U[p_Nq][p_Nq][p_Nq]; + dfloat s_V[p_Nq][p_Nq][p_Nq]; + dfloat s_W[p_Nq][p_Nq][p_Nq]; + + dfloat s_SUr[p_Nq][p_Nq][p_Nq]; + dfloat s_SUs[p_Nq][p_Nq][p_Nq]; + dfloat s_SUt[p_Nq][p_Nq][p_Nq]; + + dfloat s_SVr[p_Nq][p_Nq][p_Nq]; + dfloat s_SVs[p_Nq][p_Nq][p_Nq]; + dfloat s_SVt[p_Nq][p_Nq][p_Nq]; + + dfloat s_SWr[p_Nq][p_Nq][p_Nq]; + dfloat s_SWs[p_Nq][p_Nq][p_Nq]; + dfloat s_SWt[p_Nq][p_Nq][p_Nq]; + + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) + s_D[j][i] = D[j * p_Nq + i]; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_U, s_V, s_W, s_SUr, s_SUs, s_SUt, s_SVr, s_SVs, s_SVt, s_SWr, s_SWs, s_SWt) +#endif + for(dlong elem = 0; elem < Nelements; ++elem) { + dlong e = elementList[elem]; + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + s_U[k][j][i] = q[id + 0 * offset]; + s_V[k][j][i] = q[id + 1 * offset]; + s_W[k][j][i] = q[id + 2 * offset]; + } + + // loop over slabs + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; + const dfloat rx = vgeo[gid + p_RXID * p_Np]; + const dfloat ry = vgeo[gid + p_RYID * p_Np]; + const dfloat rz = vgeo[gid + p_RZID * p_Np]; + + const dfloat sx = vgeo[gid + p_SXID * p_Np]; + const dfloat sy = vgeo[gid + p_SYID * p_Np]; + const dfloat sz = vgeo[gid + p_SZID * p_Np]; + + const dfloat tx = vgeo[gid + p_TXID * p_Np]; + const dfloat ty = vgeo[gid + p_TYID * p_Np]; + const dfloat tz = vgeo[gid + p_TZID * p_Np]; + + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + // compute 1D derivatives + dfloat ur = 0.f, us = 0.f, ut = 0.f; + dfloat vr = 0.f, vs = 0.f, vt = 0.f; + dfloat wr = 0.f, ws = 0.f, wt = 0.f; + for(int m = 0; m < p_Nq; ++m) { + const dfloat Dim = s_D[i][m]; // Dr + const dfloat Djm = s_D[j][m]; // Ds + const dfloat Dkm = s_D[k][m]; // Dt + + ur += Dim * s_U[k][j][m]; + us += Djm * s_U[k][m][i]; + ut += Dkm * s_U[m][j][i]; + // + vr += Dim * s_V[k][j][m]; + vs += Djm * s_V[k][m][i]; + vt += Dkm * s_V[m][j][i]; + // + wr += Dim * s_W[k][j][m]; + ws += Djm * s_W[k][m][i]; + wt += Dkm * s_W[m][j][i]; + } + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; + const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; + const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; + + const dfloat dudx = rx * ur + sx * us + tx * ut; + const dfloat dudy = ry * ur + sy * us + ty * ut; + const dfloat dudz = rz * ur + sz * us + tz * ut; + + const dfloat dvdx = rx * vr + sx * vs + tx * vt; + const dfloat dvdy = ry * vr + sy * vs + ty * vt; + const dfloat dvdz = rz * vr + sz * vs + tz * vt; + + const dfloat dwdx = rx * wr + sx * ws + tx * wt; + const dfloat dwdy = ry * wr + sy * ws + ty * wt; + const dfloat dwdz = rz * wr + sz * ws + tz * wt; + + const dfloat s11 = u_lam0 * JW * (dudx + dudx); + const dfloat s12 = u_lam0 * JW * (dudy + dvdx); + const dfloat s13 = u_lam0 * JW * (dudz + dwdx); + + const dfloat s21 = v_lam0 * JW * (dvdx + dudy); + const dfloat s22 = v_lam0 * JW * (dvdy + dvdy); + const dfloat s23 = v_lam0 * JW * (dvdz + dwdy); + + const dfloat s31 = w_lam0 * JW * (dwdx + dudz); + const dfloat s32 = w_lam0 * JW * (dwdy + dvdz); + const dfloat s33 = w_lam0 * JW * (dwdz + dwdz); + + s_SUr[k][j][i] = rx * s11 + ry * s12 + rz * s13; + s_SUs[k][j][i] = sx * s11 + sy * s12 + sz * s13; + s_SUt[k][j][i] = tx * s11 + ty * s12 + tz * s13; + // + s_SVr[k][j][i] = rx * s21 + ry * s22 + rz * s23; + s_SVs[k][j][i] = sx * s21 + sy * s22 + sz * s23; + s_SVt[k][j][i] = tx * s21 + ty * s22 + tz * s23; + // + s_SWr[k][j][i] = rx * s31 + ry * s32 + rz * s33; + s_SWs[k][j][i] = sx * s31 + sy * s32 + sz * s33; + s_SWt[k][j][i] = tx * s31 + ty * s32 + tz * s33; + } + +// loop over slabs + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + dfloat r_Au = 0.f, r_Av = 0.f, r_Aw = 0.f; + for(int m = 0; m < p_Nq; m++) { + const dfloat Dim = s_D[m][i]; // Dr' + const dfloat Djm = s_D[m][j]; // Ds' + const dfloat Dkm = s_D[m][k]; // Dt' + + r_Au += Dim * s_SUr[k][j][m]; + r_Au += Djm * s_SUs[k][m][i]; + r_Au += Dkm * s_SUt[m][j][i]; + + r_Av += Dim * s_SVr[k][j][m]; + r_Av += Djm * s_SVs[k][m][i]; + r_Av += Dkm * s_SVt[m][j][i]; + + r_Aw += Dim * s_SWr[k][j][m]; + r_Aw += Djm * s_SWs[k][m][i]; + r_Aw += Dkm * s_SWt[m][j][i]; + } + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; + const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; + const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; + + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + // store in register + Aq[id + 0 * offset] = r_Au + u_lam1 * JW * s_U[k][j][i]; + Aq[id + 1 * offset] = r_Av + v_lam1 * JW * s_V[k][j][i]; + Aq[id + 2 * offset] = r_Aw + w_lam1 * JW * s_W[k][j][i]; + } + } +} diff --git a/okl/elliptic/ellipticStressPartialAxCoeffHex3D.okl b/okl/elliptic/ellipticStressPartialAxCoeffHex3D.okl new file mode 100644 index 000000000..0d9aa7913 --- /dev/null +++ b/okl/elliptic/ellipticStressPartialAxCoeffHex3D.okl @@ -0,0 +1,199 @@ +@kernel void ellipticStressPartialAxCoeffHex3D(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_D[p_Nq][p_Nq]; + + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + @exclusive dfloat s_Uloc[p_Nq]; + @exclusive dfloat s_Vloc[p_Nq]; + @exclusive dfloat s_Wloc[p_Nq]; + @shared dfloat s_SUr[p_Nq][p_Nq]; + @shared dfloat s_SUs[p_Nq][p_Nq]; + @exclusive dfloat s_SUtloc[p_Nq]; + + @shared dfloat s_SVr[p_Nq][p_Nq]; + @shared dfloat s_SVs[p_Nq][p_Nq]; + @exclusive dfloat s_SVt[p_Nq]; + + @shared dfloat s_SWr[p_Nq][p_Nq]; + @shared dfloat s_SWs[p_Nq][p_Nq]; + @exclusive dfloat s_SWt[p_Nq]; + + @exclusive dfloat rx, ry, rz; + @exclusive dfloat sx, sy, sz; + @exclusive dfloat tx, ty, tz; + + @exclusive dfloat s11,s12,s13; + @exclusive dfloat s21,s22,s23; + @exclusive dfloat s31,s32,s33; + + @exclusive dfloat r_Au[p_Nq]; + @exclusive dfloat r_Av[p_Nq]; + @exclusive dfloat r_Aw[p_Nq]; + @exclusive dlong element; + + for(int k = 0; k < p_Nq; ++k) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + element = elementList[e]; + if(k == 0) s_D[j][i] = D[p_Nq * j + i]; + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + s_U[j][i] = q[id + 0 * offset]; + s_V[j][i] = q[id + 1 * offset]; + s_W[j][i] = q[id + 2 * offset]; + if(k == 0) { + for(int l = 0; l < p_Nq; ++l) { + const dlong other_id = element * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + s_Uloc[l] = q[other_id + 0 * offset]; + s_Vloc[l] = q[other_id + 1 * offset]; + s_Wloc[l] = q[other_id + 2 * offset]; + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + element * p_Np * p_Nvgeo; + rx = vgeo[gid + p_RXID * p_Np]; + ry = vgeo[gid + p_RYID * p_Np]; + rz = vgeo[gid + p_RZID * p_Np]; + + sx = vgeo[gid + p_SXID * p_Np]; + sy = vgeo[gid + p_SYID * p_Np]; + sz = vgeo[gid + p_SZID * p_Np]; + + tx = vgeo[gid + p_TXID * p_Np]; + ty = vgeo[gid + p_TYID * p_Np]; + tz = vgeo[gid + p_TZID * p_Np]; + + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + dfloat ur = 0.f, us = 0.f, ut = 0.f; + dfloat vr = 0.f, vs = 0.f, vt = 0.f; + dfloat wr = 0.f, ws = 0.f, wt = 0.f; + for(int m = 0; m < p_Nq; ++m) { + const dfloat Dim = s_D[i][m]; // Dr + const dfloat Djm = s_D[j][m]; // Ds + const dfloat Dkm = s_D[k][m]; // Dt + + ur += Dim * s_U[j][m]; + us += Djm * s_U[m][i]; + ut += Dkm * s_Uloc[m]; + + vr += Dim * s_V[j][m]; + vs += Djm * s_V[m][i]; + vt += Dkm * s_Vloc[m]; + + wr += Dim * s_W[j][m]; + ws += Djm * s_W[m][i]; + wt += Dkm * s_Wloc[m]; + } + + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat u_lam0 = lambda[id + 0 * offset + 0 * loffset]; + const dfloat u_lam1 = lambda[id + 1 * offset + 0 * loffset]; + const dfloat v_lam0 = lambda[id + 0 * offset + 1 * loffset]; + const dfloat v_lam1 = lambda[id + 1 * offset + 1 * loffset]; + const dfloat w_lam0 = lambda[id + 0 * offset + 2 * loffset]; + const dfloat w_lam1 = lambda[id + 1 * offset + 2 * loffset]; + + const dfloat dudx = rx * ur + sx * us + tx * ut; + const dfloat dudy = ry * ur + sy * us + ty * ut; + const dfloat dudz = rz * ur + sz * us + tz * ut; + + const dfloat dvdx = rx * vr + sx * vs + tx * vt; + const dfloat dvdy = ry * vr + sy * vs + ty * vt; + const dfloat dvdz = rz * vr + sz * vs + tz * vt; + + const dfloat dwdx = rx * wr + sx * ws + tx * wt; + const dfloat dwdy = ry * wr + sy * ws + ty * wt; + const dfloat dwdz = rz * wr + sz * ws + tz * wt; + + s11 = u_lam0 * JW * (dudx + dudx); + s12 = u_lam0 * JW * (dudy + dvdx); + s13 = u_lam0 * JW * (dudz + dwdx); + + s21 = v_lam0 * JW * (dvdx + dudy); + s22 = v_lam0 * JW * (dvdy + dvdy); + s23 = v_lam0 * JW * (dvdz + dwdy); + + s31 = w_lam0 * JW * (dwdx + dudz); + s32 = w_lam0 * JW * (dwdy + dvdz); + s33 = w_lam0 * JW * (dwdz + dwdz); + + r_Au[k] = u_lam1 * JW * s_U[j][i]; + r_Av[k] = v_lam1 * JW * s_V[j][i]; + r_Aw[k] = w_lam1 * JW * s_W[j][i]; + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; + s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; + s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; + + s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; + s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; + s_SVt[k] = tx * s21 + ty * s22 + tz * s23; + + s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; + s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; + s_SWt[k] = tx * s31 + ty * s32 + tz * s33; + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + const dfloat Dim = s_D[m][i]; // Dr' + const dfloat Djm = s_D[m][j]; // Ds' + + r_Au[k] += Dim * s_SUr[j][m]; + r_Au[k] += Djm * s_SUs[m][i]; + + r_Av[k] += Dim * s_SVr[j][m]; + r_Av[k] += Djm * s_SVs[m][i]; + + r_Aw[k] += Dim * s_SWr[j][m]; + r_Aw[k] += Djm * s_SWs[m][i]; + } + } + } + } + + for(int k = 0; k < p_Nq; ++k) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + const dfloat Dkm = s_D[m][k]; // Dt' + + r_Au[k] += Dkm * s_SUtloc[m]; + + r_Av[k] += Dkm * s_SVt[m]; + + r_Aw[k] += Dkm * s_SWt[m]; + } + const dlong id = element * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id + 0 * offset] = r_Au[k]; + Aq[id + 1 * offset] = r_Av[k]; + Aq[id + 2 * offset] = r_Aw[k]; + } + } + } + } +} diff --git a/okl/elliptic/ellipticStressPartialAxHex3D.c b/okl/elliptic/ellipticStressPartialAxHex3D.c new file mode 100644 index 000000000..58544b734 --- /dev/null +++ b/okl/elliptic/ellipticStressPartialAxHex3D.c @@ -0,0 +1,166 @@ +extern "C" +void FUNC(ellipticStressPartialAxHex3D)(const dlong &Nelements, + const dlong &offset, + const dlong &loffset, + const dlong* __restrict__ elementList, + const dfloat* __restrict__ vgeo, + const dfloat* __restrict__ D, + const dfloat* __restrict__ S, + const dfloat* __restrict__ lambda, + const dfloat* __restrict__ q, + dfloat* __restrict__ Aq) +{ + dfloat s_D[p_Nq][p_Nq]; + + dfloat s_U[p_Nq][p_Nq][p_Nq]; + dfloat s_V[p_Nq][p_Nq][p_Nq]; + dfloat s_W[p_Nq][p_Nq][p_Nq]; + + dfloat s_SUr[p_Nq][p_Nq][p_Nq]; + dfloat s_SUs[p_Nq][p_Nq][p_Nq]; + dfloat s_SUt[p_Nq][p_Nq][p_Nq]; + + dfloat s_SVr[p_Nq][p_Nq][p_Nq]; + dfloat s_SVs[p_Nq][p_Nq][p_Nq]; + dfloat s_SVt[p_Nq][p_Nq][p_Nq]; + + dfloat s_SWr[p_Nq][p_Nq][p_Nq]; + dfloat s_SWs[p_Nq][p_Nq][p_Nq]; + dfloat s_SWt[p_Nq][p_Nq][p_Nq]; + + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) + s_D[j][i] = D[j * p_Nq + i]; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(s_U, s_V, s_W, s_SUr, s_SUs, s_SUt, s_SVr, s_SVs, s_SVt, s_SWr, s_SWs, s_SWt) +#endif + for(dlong elem = 0; elem < Nelements; ++elem) { + dlong e = elementList[elem]; + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + s_U[k][j][i] = q[id + 0 * offset]; + s_V[k][j][i] = q[id + 1 * offset]; + s_W[k][j][i] = q[id + 2 * offset]; + } + + // loop over slabs + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; + const dfloat rx = vgeo[gid + p_RXID * p_Np]; + const dfloat ry = vgeo[gid + p_RYID * p_Np]; + const dfloat rz = vgeo[gid + p_RZID * p_Np]; + + const dfloat sx = vgeo[gid + p_SXID * p_Np]; + const dfloat sy = vgeo[gid + p_SYID * p_Np]; + const dfloat sz = vgeo[gid + p_SZID * p_Np]; + + const dfloat tx = vgeo[gid + p_TXID * p_Np]; + const dfloat ty = vgeo[gid + p_TYID * p_Np]; + const dfloat tz = vgeo[gid + p_TZID * p_Np]; + + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + // compute 1D derivatives + dfloat ur = 0.f, us = 0.f, ut = 0.f; + dfloat vr = 0.f, vs = 0.f, vt = 0.f; + dfloat wr = 0.f, ws = 0.f, wt = 0.f; + for(int m = 0; m < p_Nq; ++m) { + const dfloat Dim = s_D[i][m]; // Dr + const dfloat Djm = s_D[j][m]; // Ds + const dfloat Dkm = s_D[k][m]; // Dt + + ur += Dim * s_U[k][j][m]; + us += Djm * s_U[k][m][i]; + ut += Dkm * s_U[m][j][i]; + // + vr += Dim * s_V[k][j][m]; + vs += Djm * s_V[k][m][i]; + vt += Dkm * s_V[m][j][i]; + // + wr += Dim * s_W[k][j][m]; + ws += Djm * s_W[k][m][i]; + wt += Dkm * s_W[m][j][i]; + } + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat u_lam0 = lambda[0 * offset + 0 * loffset]; + const dfloat v_lam0 = lambda[0 * offset + 1 * loffset]; + const dfloat w_lam0 = lambda[0 * offset + 2 * loffset]; + + const dfloat dudx = rx * ur + sx * us + tx * ut; + const dfloat dudy = ry * ur + sy * us + ty * ut; + const dfloat dudz = rz * ur + sz * us + tz * ut; + + const dfloat dvdx = rx * vr + sx * vs + tx * vt; + const dfloat dvdy = ry * vr + sy * vs + ty * vt; + const dfloat dvdz = rz * vr + sz * vs + tz * vt; + + const dfloat dwdx = rx * wr + sx * ws + tx * wt; + const dfloat dwdy = ry * wr + sy * ws + ty * wt; + const dfloat dwdz = rz * wr + sz * ws + tz * wt; + + const dfloat s11 = u_lam0 * JW * (dudx + dudx); + const dfloat s12 = u_lam0 * JW * (dudy + dvdx); + const dfloat s13 = u_lam0 * JW * (dudz + dwdx); + + const dfloat s21 = v_lam0 * JW * (dvdx + dudy); + const dfloat s22 = v_lam0 * JW * (dvdy + dvdy); + const dfloat s23 = v_lam0 * JW * (dvdz + dwdy); + + const dfloat s31 = w_lam0 * JW * (dwdx + dudz); + const dfloat s32 = w_lam0 * JW * (dwdy + dvdz); + const dfloat s33 = w_lam0 * JW * (dwdz + dwdz); + + s_SUr[k][j][i] = rx * s11 + ry * s12 + rz * s13; + s_SUs[k][j][i] = sx * s11 + sy * s12 + sz * s13; + s_SUt[k][j][i] = tx * s11 + ty * s12 + tz * s13; + // + s_SVr[k][j][i] = rx * s21 + ry * s22 + rz * s23; + s_SVs[k][j][i] = sx * s21 + sy * s22 + sz * s23; + s_SVt[k][j][i] = tx * s21 + ty * s22 + tz * s23; + // + s_SWr[k][j][i] = rx * s31 + ry * s32 + rz * s33; + s_SWs[k][j][i] = sx * s31 + sy * s32 + sz * s33; + s_SWt[k][j][i] = tx * s31 + ty * s32 + tz * s33; + } + +// loop over slabs + for(int k = 0; k < p_Nq; ++k) + for(int j = 0; j < p_Nq; ++j) + for(int i = 0; i < p_Nq; ++i) { + dfloat r_Au = 0.f, r_Av = 0.f, r_Aw = 0.f; + for(int m = 0; m < p_Nq; m++) { + const dfloat Dim = s_D[m][i]; // Dr' + const dfloat Djm = s_D[m][j]; // Ds' + const dfloat Dkm = s_D[m][k]; // Dt' + + r_Au += Dim * s_SUr[k][j][m]; + r_Au += Djm * s_SUs[k][m][i]; + r_Au += Dkm * s_SUt[m][j][i]; + + r_Av += Dim * s_SVr[k][j][m]; + r_Av += Djm * s_SVs[k][m][i]; + r_Av += Dkm * s_SVt[m][j][i]; + + r_Aw += Dim * s_SWr[k][j][m]; + r_Aw += Djm * s_SWs[k][m][i]; + r_Aw += Dkm * s_SWt[m][j][i]; + } + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat u_lam1 = lambda[1 * offset + 0 * loffset]; + const dfloat v_lam1 = lambda[1 * offset + 1 * loffset]; + const dfloat w_lam1 = lambda[1 * offset + 2 * loffset]; + + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + Aq[id + 0 * offset] = r_Au + u_lam1 * JW * s_U[k][j][i]; + Aq[id + 1 * offset] = r_Av + v_lam1 * JW * s_V[k][j][i]; + Aq[id + 2 * offset] = r_Aw + w_lam1 * JW * s_W[k][j][i]; + } + } +} diff --git a/okl/elliptic/ellipticStressPartialAxHex3D.okl b/okl/elliptic/ellipticStressPartialAxHex3D.okl new file mode 100644 index 000000000..90be7d629 --- /dev/null +++ b/okl/elliptic/ellipticStressPartialAxHex3D.okl @@ -0,0 +1,199 @@ +@kernel void ellipticStressPartialAxHex3D(const dlong Nelements, + const dlong offset, + const dlong loffset, + @restrict const dlong* elementList, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + @restrict const dfloat* S, + @restrict const dfloat* lambda, + @restrict const dfloat* q, + @restrict dfloat* Aq) +{ + for(dlong elem = 0; elem < Nelements; ++elem; @outer(0)) { + @shared dfloat s_D[p_Nq][p_Nq]; + + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + @shared dfloat s_SUr[p_Nq][p_Nq]; + @shared dfloat s_SUs[p_Nq][p_Nq]; + @exclusive dfloat s_Uloc[p_Nq]; + @exclusive dfloat s_Vloc[p_Nq]; + @exclusive dfloat s_Wloc[p_Nq]; + @exclusive dfloat s_SUtloc[p_Nq]; + + @shared dfloat s_SVr[p_Nq][p_Nq]; + @shared dfloat s_SVs[p_Nq][p_Nq]; + @exclusive dfloat s_SVt[p_Nq]; + + @shared dfloat s_SWr[p_Nq][p_Nq]; + @shared dfloat s_SWs[p_Nq][p_Nq]; + @exclusive dfloat s_SWt[p_Nq]; + + @exclusive dfloat rx, ry, rz; + @exclusive dfloat sx, sy, sz; + @exclusive dfloat tx, ty, tz; + + @exclusive dfloat s11,s12,s13; + @exclusive dfloat s21,s22,s23; + @exclusive dfloat s31,s32,s33; + + @exclusive dfloat r_Au[p_Nq]; + @exclusive dfloat r_Av[p_Nq]; + @exclusive dfloat r_Aw[p_Nq]; + + @exclusive dlong e; + + for(int k = 0; k < p_Nq; ++k) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + if(k == 0) { + e = elementList[elem]; + s_D[j][i] = D[p_Nq * j + i]; + } + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + s_U[j][i] = q[id + 0 * offset]; + s_V[j][i] = q[id + 1 * offset]; + s_W[j][i] = q[id + 2 * offset]; + if(k == 0) { + for(int l = 0; l < p_Nq; ++l) { + const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + s_Uloc[l] = q[other_id + 0 * offset]; + s_Vloc[l] = q[other_id + 1 * offset]; + s_Wloc[l] = q[other_id + 2 * offset]; + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = i + j * p_Nq + k * p_Nq * p_Nq + e * p_Np * p_Nvgeo; + rx = vgeo[gid + p_RXID * p_Np]; + ry = vgeo[gid + p_RYID * p_Np]; + rz = vgeo[gid + p_RZID * p_Np]; + + sx = vgeo[gid + p_SXID * p_Np]; + sy = vgeo[gid + p_SYID * p_Np]; + sz = vgeo[gid + p_SZID * p_Np]; + + tx = vgeo[gid + p_TXID * p_Np]; + ty = vgeo[gid + p_TYID * p_Np]; + tz = vgeo[gid + p_TZID * p_Np]; + + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + dfloat ur = 0.f, us = 0.f, ut = 0.f; + dfloat vr = 0.f, vs = 0.f, vt = 0.f; + dfloat wr = 0.f, ws = 0.f, wt = 0.f; + for(int m = 0; m < p_Nq; ++m) { + const dfloat Dim = s_D[i][m]; // Dr + const dfloat Djm = s_D[j][m]; // Ds + const dfloat Dkm = s_D[k][m]; // Dt + + ur += Dim * s_U[j][m]; + us += Djm * s_U[m][i]; + ut += Dkm * s_Uloc[m]; + + vr += Dim * s_V[j][m]; + vs += Djm * s_V[m][i]; + vt += Dkm * s_Vloc[m]; + + wr += Dim * s_W[j][m]; + ws += Djm * s_W[m][i]; + wt += Dkm * s_Wloc[m]; + } + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat dudx = rx * ur + sx * us + tx * ut; + const dfloat dudy = ry * ur + sy * us + ty * ut; + const dfloat dudz = rz * ur + sz * us + tz * ut; + + const dfloat dvdx = rx * vr + sx * vs + tx * vt; + const dfloat dvdy = ry * vr + sy * vs + ty * vt; + const dfloat dvdz = rz * vr + sz * vs + tz * vt; + + const dfloat dwdx = rx * wr + sx * ws + tx * wt; + const dfloat dwdy = ry * wr + sy * ws + ty * wt; + const dfloat dwdz = rz * wr + sz * ws + tz * wt; + + const dfloat u_lam0 = lambda[0 * offset + 0 * loffset]; + const dfloat v_lam0 = lambda[0 * offset + 1 * loffset]; + const dfloat w_lam0 = lambda[0 * offset + 2 * loffset]; + + s11 = u_lam0 * JW * (dudx + dudx); + s12 = u_lam0 * JW * (dudy + dvdx); + s13 = u_lam0 * JW * (dudz + dwdx); + + s21 = v_lam0 * JW * (dvdx + dudy); + s22 = v_lam0 * JW * (dvdy + dvdy); + s23 = v_lam0 * JW * (dvdz + dwdy); + + s31 = w_lam0 * JW * (dwdx + dudz); + s32 = w_lam0 * JW * (dwdy + dvdz); + s33 = w_lam0 * JW * (dwdz + dwdz) +; + r_Au[k] = lambda[id + 1 * offset + 0 * loffset] * JW * s_U[j][i]; + r_Av[k] = lambda[id + 1 * offset + 1 * loffset] * JW * s_V[j][i]; + r_Aw[k] = lambda[id + 1 * offset + 2 * loffset] * JW * s_W[j][i]; + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + s_SUr[j][i] = rx * s11 + ry * s12 + rz * s13; + s_SUs[j][i] = sx * s11 + sy * s12 + sz * s13; + s_SUtloc[k] = tx * s11 + ty * s12 + tz * s13; + + s_SVr[j][i] = rx * s21 + ry * s22 + rz * s23; + s_SVs[j][i] = sx * s21 + sy * s22 + sz * s23; + s_SVt[k] = tx * s21 + ty * s22 + tz * s23; + + s_SWr[j][i] = rx * s31 + ry * s32 + rz * s33; + s_SWs[j][i] = sx * s31 + sy * s32 + sz * s33; + s_SWt[k] = tx * s31 + ty * s32 + tz * s33; + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + const dfloat Dim = s_D[m][i]; // Dr' + const dfloat Djm = s_D[m][j]; // Ds' + + r_Au[k] += Dim * s_SUr[j][m]; + r_Au[k] += Djm * s_SUs[m][i]; + + r_Av[k] += Dim * s_SVr[j][m]; + r_Av[k] += Djm * s_SVs[m][i]; + + r_Aw[k] += Dim * s_SWr[j][m]; + r_Aw[k] += Djm * s_SWs[m][i]; + } + } + } + } + + for(int k = 0; k < p_Nq; ++k) { + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int m = 0; m < p_Nq; m++) { + const dfloat Dkm = s_D[m][k]; // Dt' + + r_Au[k] += Dkm * s_SUtloc[m]; + + r_Av[k] += Dkm * s_SVt[m]; + + r_Aw[k] += Dkm * s_SWt[m]; + } + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + Aq[id + 0 * offset] = r_Au[k]; + Aq[id + 1 * offset] = r_Av[k]; + Aq[id + 2 * offset] = r_Aw[k]; + } + } + } + } +} diff --git a/okl/elliptic/fusedCopyDfloatToPfloat.c b/okl/elliptic/fusedCopyDfloatToPfloat.c new file mode 100644 index 000000000..024ceebdf --- /dev/null +++ b/okl/elliptic/fusedCopyDfloatToPfloat.c @@ -0,0 +1,43 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + + + + + +extern "C" void FUNC(fusedCopyDfloatToPfloat) (const dlong & N, + const dfloat * __restrict__ x_dfloat, + const dfloat * __restrict__ y_dfloat, + pfloat * __restrict__ x_pfloat, + pfloat * __restrict__ y_pfloat){ +#ifdef __NEKRS__OMP__ + #pragma omp parallel for +#endif + for(dlong n=0;n= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[l1][j][k] = work1[l1][j][k] - work1[l2][j][k]; + work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - work1[p_Nq_e - l2 - 1][j][k]; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; ++k; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[i][l1][k] = work1[i][l1][k] - work1[i][l2][k]; + work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - work1[i][p_Nq_e - l2 - 1][k]; + } + } + } + @barrier("local"); + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[i][j][l1] = work1[i][j][l1] - work1[i][j][l2]; + work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - work1[i][j][p_Nq_e - l2 - 1]; + } + } + } + @barrier("local"); + for (int i = 0; i < p_Nq_e; i++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + const int ij = j + i * p_Nq_e; + S_x_e[i][j] = S_x[ij + element * p_Nq_e * p_Nq_e]; + S_y_e[i][j] = S_y[ij + element * p_Nq_e * p_Nq_e]; + S_z_e[i][j] = S_z[ij + element * p_Nq_e * p_Nq_e]; + S_x_eT[j][i] = S_x_e[i][j]; + S_y_eT[j][i] = S_y_e[i][j]; + S_z_eT[j][i] = S_z_e[i][j]; + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_x_eT[j][l] * work1[k][i][l]; + work2[k][j][i] = value; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_y_eT[j][l] * work2[k][i][l]; + work1[j][i][k] = value; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++) { + @barrier("local"); + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_z_eT[k][l] * work1[j][i][l]; + work2[k][j][i] = value * inv_L[v + element * p_Nq_e * p_Nq_e * p_Nq_e]; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_x_e[i][l] * work2[k][j][l]; + work1[k][i][j] = value; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_y_e[j][l] * work1[k][i][l]; + work2[j][i][k] = value; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; k++) { + @barrier("local"); + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_z_e[k][l] * work2[j][i][l]; + +#if (!p_restrict) + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + Su[v] = value; +#endif + work1[k][j][i] = value; + } + } + } +#if (!p_restrict) + @barrier("local"); + for (int k = 0; k < p_Nq_e; ++k; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + if (k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work2[l1][j][k] = work1[l2][j][k]; + work2[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l2 - 1][j][k]; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; ++k; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work2[i][l1][k] = work1[i][l2][k]; + work2[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l2 - 1][k]; + } + } + } + @barrier("local"); + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work2[i][j][l1] = work1[i][j][l2]; + work2[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l2 - 1]; + } + } + } + @barrier("local"); + for (int k = 0; k < p_Nq_e; ++k) { + @barrier("local"); + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + u[idx] = work2[k][j][i]; + } + } + } + +#else /* if (!p_restrict) */ + @barrier("local"); + for (int k = 0; k < p_Nq; ++k) { + @barrier("local"); + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (i < p_Nq && j < p_Nq) { + const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; + const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; + Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; + } + } + } + } + +#endif + } +} +#endif + +#if p_knl == 1 +// Blocked version, tends to do well on low orders on NVIDIA +@kernel void fusedFDM_v1(const dlong Nelements, +#if p_overlap + @restrict const dlong *elementList, +#endif + @restrict pfloat *Su, + @restrict const pfloat *S_x, + @restrict const pfloat *S_y, + @restrict const pfloat *S_z, + @restrict const pfloat *inv_L, +#if p_restrict + @restrict const dfloat *wts, +#endif + @restrict pfloat *u) +{ + +#if p_Nq_e == 2 +#define p_Nblk 64 +#elif p_Nq_e == 3 +#define p_Nblk 28 +#elif p_Nq_e == 4 +#define p_Nblk 16 +#elif p_Nq_e == 5 +#define p_Nblk 10 +#elif p_Nq_e == 6 +#define p_Nblk 7 +#elif p_Nq_e == 7 +#define p_Nblk 5 +#elif p_Nq_e == 8 +#define p_Nblk 4 +#elif p_Nq_e == 9 +#define p_Nblk 3 +#elif p_Nq_e == 10 +#define p_Nblk 2 +#else +#define p_Nblk 1 +#endif + + for (dlong eo = 0; eo < Nelements; eo += p_Nblk; @outer) { + + @shared pfloat S_x_e[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat S_y_e[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat S_z_e[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat S_x_eT[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat S_y_eT[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat S_z_eT[p_Nblk][p_Nq_e][p_Nq_e]; + @shared pfloat work1[p_Nblk][p_Nq_e][p_Nq_e][p_Nq_e]; + @shared pfloat work2[p_Nblk][p_Nq_e][p_Nq_e][p_Nq_e]; + @exclusive dlong element; + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + dlong my_elem = es + eo; + +#if p_overlap + element = (my_elem < Nelements) ? elementList[my_elem] : -1; +#else + element = (my_elem < Nelements) ? my_elem : -1; +#endif + + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + + if (element != -1) { +#pragma unroll + for (int k = 0; k < p_Nq_e; ++k) { + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[es][k][j][i] = u[idx]; + } + + const int ij = j + i * p_Nq_e + element * p_Nq_e * p_Nq_e; + S_x_e[es][i][j] = S_x[ij]; + S_y_e[es][i][j] = S_y[ij]; + S_z_e[es][i][j] = S_z[ij]; + S_x_eT[es][j][i] = S_x_e[es][i][j]; + S_y_eT[es][j][i] = S_y_e[es][i][j]; + S_z_eT[es][j][i] = S_z_e[es][i][j]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[es][i][j][l1] -= work1[es][i][j][l2]; + work1[es][i][j][p_Nq_e - l1 - 1] -= work1[es][i][j][p_Nq_e - l2 - 1]; + work1[es][i][l1][j] -= work1[es][i][l2][j]; + work1[es][i][p_Nq_e - l1 - 1][j] -= work1[es][i][p_Nq_e - l2 - 1][j]; + work1[es][l1][i][j] -= work1[es][l2][i][j]; + work1[es][p_Nq_e - l1 - 1][i][j] -= work1[es][p_Nq_e - l2 - 1][i][j]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; + //#pragma unroll p_Nq_e +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_x_eT[es][i][l] * work1[es][k][j][l]; + + work2[es][k][j][i] = value; + } + } + } + } + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int j = 0; j < p_Nq_e; j++) { + pfloat value = 0.0; + //#pragma unroll p_Nq_e +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_y_eT[es][j][l] * work2[es][k][l][i]; + + work1[es][k][j][i] = value; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int k = 0; k < p_Nq_e; k++) { + + pfloat value = 0.0; + //#pragma unroll p_Nq_e +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_z_eT[es][k][l] * work1[es][l][j][i]; + + const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; + if (element != -1) + work2[es][k][j][i] = value * inv_L[v + element * p_Nq_e * p_Nq_e * p_Nq_e]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int i = 0; i < p_Nq_e; i++) { + pfloat value = 0.0; + //#pragma unroll p_Nq_e +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_x_e[es][i][l] * work2[es][k][j][l]; + + work1[es][k][j][i] = value; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int j = 0; j < p_Nq_e; j++) { + pfloat value = 0.0; + //#pragma unroll p_Nq_e +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_y_e[es][j][l] * work1[es][k][l][i]; + work2[es][k][j][i] = value; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int k = 0; k < p_Nq_e; k++) { + + pfloat value = 0.0; +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) + value += S_z_e[es][k][l] * work2[es][l][j][i]; + +#if (!p_restrict) + if (element != -1) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const int v = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + Su[v] = value; + } + work1[es][k][j][i] = value; +#else + if (element != -1) + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1 && k >= 1 && k < p_Nq_e - 1) { + const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; + const dlong idx = i - 1 + (j - 1) * p_Nq + (k - 1) * p_Nq * p_Nq + elem_offset; + Su[idx] = value * wts[idx]; + } +#endif + } + } + } + } + + @barrier("local"); + +#if (!p_restrict) + // merged these three loops into one (however, the premise is shaky at best) + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work2[es][l1][j][i] = work1[es][l2][j][i]; + work2[es][p_Nq_e - l1 - 1][j][i] = work1[es][p_Nq_e - l2 - 1][j][i]; + + work2[es][i][l1][j] = work1[es][i][l2][j]; + work2[es][i][p_Nq_e - l1 - 1][j] = work1[es][i][p_Nq_e - l2 - 1][j]; + + work2[es][i][j][l1] = work1[es][i][j][l2]; + work2[es][i][j][p_Nq_e - l1 - 1] = work1[es][i][j][p_Nq_e - l2 - 1]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (element != -1) { + for (int k = 0; k < p_Nq_e; ++k) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + u[idx] = work2[es][k][j][i]; + } + } + } + } + } + +#endif /* if (!p_restrict) */ + } +} +#endif + +#if p_knl == 2 +// Blocked kernel, tends to be good for low orders on AMD +@kernel void fusedFDM_v2(const dlong Nelements, +#if p_overlap + @restrict const dlong *elementList, +#endif + @restrict pfloat *Su, + @restrict const pfloat *S_x, + @restrict const pfloat *S_y, + @restrict const pfloat *S_z, + @restrict const pfloat *inv_L, +#if p_restrict + @restrict const dfloat *wts, +#endif + @restrict pfloat *u) +{ + +#if p_Nq_e == 2 +#define p_Nblk 63 +#elif p_Nq_e == 3 +#define p_Nblk 27 +#elif p_Nq_e == 4 +#define p_Nblk 15 +#elif p_Nq_e == 5 +#define p_Nblk 9 +#elif p_Nq_e == 6 +#define p_Nblk 7 +#elif p_Nq_e == 7 +#define p_Nblk 5 +#elif p_Nq_e == 8 +#ifdef gfxXX +#define p_Nblk 11 +#else +#define p_Nblk 3 +#endif +#elif p_Nq_e == 9 +#ifdef gfxXX +#define p_Nblk 3 +#else +#define p_Nblk 1 +#endif +#elif p_Nq_e == 10 +#ifdef gfxXX +#define p_Nblk 3 +#else +#define p_Nblk 1 +#endif +#else +#define p_Nblk 1 +#endif + + for (dlong eo = 0; eo < Nelements; eo += p_Nblk; @outer) { + + @shared pfloat S_x_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat S_y_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat S_z_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e][p_Nblk]; + @exclusive dlong element; + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + dlong my_elem = es + eo; + +#if p_overlap + element = (my_elem < Nelements) ? elementList[my_elem] : -1; +#else + element = (my_elem < Nelements) ? my_elem : -1; +#endif + + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + + if (element != -1) { +#pragma unroll + for (int k = 0; k < p_Nq_e; ++k) { + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[k][j][i][es] = u[idx]; + } + + const int ij = j + i * p_Nq_e + element * p_Nq_e * p_Nq_e; + S_x_e[i][j][es] = S_x[ij]; + S_y_e[i][j][es] = S_y[ij]; + S_z_e[i][j][es] = S_z[ij]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[i][j][l1][es] -= work1[i][j][l2][es]; + work1[i][j][p_Nq_e - l1 - 1][es] -= work1[i][j][p_Nq_e - l2 - 1][es]; + work1[i][l1][j][es] -= work1[i][l2][j][es]; + work1[i][p_Nq_e - l1 - 1][j][es] -= work1[i][p_Nq_e - l2 - 1][j][es]; + work1[l1][i][j][es] -= work1[l2][i][j][es]; + work1[p_Nq_e - l1 - 1][i][j][es] -= work1[p_Nq_e - l2 - 1][i][j][es]; + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][j][l][es]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[l][i][es] * tmp; + } + } + for (int i = 0; i < p_Nq_e; i++) + work2[k][j][i][es] = values[i]; + } + } + } + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work2[k][l][i][es]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[l][j][es] * tmp; + } + } + for (int j = 0; j < p_Nq_e; j++) + work1[k][j][i][es] = values[j]; + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[l][j][i][es]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[l][k][es] * tmp; + } + } + + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; + const pfloat tmp = inv_L[v1 + element * p_Nq_e * p_Nq_e * p_Nq_e]; + + work2[k][j][i][es] = values[k] * tmp; + } + } + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work2[k][j][l][es]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[i][l][es] * tmp; + } + } + for (int i = 0; i < p_Nq_e; i++) + work1[k][j][i][es] = values[i]; + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][l][i][es]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[j][l][es] * tmp; + } + } + for (int j = 0; j < p_Nq_e; j++) + work2[k][j][i][es] = values[j]; + } + } + } + + @barrier("local"); + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#pragma unroll + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work2[l][j][i][es]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[k][l][es] * tmp; + } + } + + if (element != -1) { +#if (!p_restrict) + for (int k = 0; k < p_Nq_e; k++) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + Su[v1] = values[k]; + work1[k][j][i][es] = values[k]; + } +#else + for (int k = 0; k < p_Nq_e; k++) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1 && k >= 1 && k < p_Nq_e - 1) { + const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; + const dlong idx1 = i - 1 + (j - 1) * p_Nq + (k - 1) * p_Nq * p_Nq + elem_offset; + Su[idx1] = values[k] * wts[idx1]; + } + } +#endif + } + } + } + } + + @barrier("local"); + +#if (!p_restrict) + // merged these three loops into one (however, the premise is shaky at best) + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work2[l1][j][i][es] = work1[l2][j][i][es]; + work2[p_Nq_e - l1 - 1][j][i][es] = work1[p_Nq_e - l2 - 1][j][i][es]; + + work2[i][l1][j][es] = work1[i][l2][j][es]; + work2[i][p_Nq_e - l1 - 1][j][es] = work1[i][p_Nq_e - l2 - 1][j][es]; + + work2[i][j][l1][es] = work1[i][j][l2][es]; + work2[i][j][p_Nq_e - l1 - 1][es] = work1[i][j][p_Nq_e - l2 - 1][es]; + } + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (element != -1) { + for (int k = 0; k < p_Nq_e; ++k) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + u[idx] = work2[k][j][i][es]; + } + } + } + } + } + +#endif + /* if (!p_restrict) */ + } +} +#endif + +#if p_knl == 3 +// Tends to do well for high orders on AMD +@kernel void fusedFDM_v3(const dlong Nelements, +#if p_overlap + @restrict const dlong *elementList, +#endif + @restrict pfloat *Su, + @restrict const pfloat *S_x, + @restrict const pfloat *S_y, + @restrict const pfloat *S_z, + @restrict const pfloat *inv_L, +#if p_restrict + @restrict const dfloat *wts, +#endif + @restrict pfloat *u) +{ + +#if p_Nq_e == 2 +#define p_Nblk 63 +#elif p_Nq_e == 3 +#define p_Nblk 27 +#elif p_Nq_e == 4 +#define p_Nblk 15 +#elif p_Nq_e == 5 +#define p_Nblk 9 +#elif p_Nq_e == 6 +#define p_Nblk 7 +#elif p_Nq_e == 7 +#define p_Nblk 5 +#elif p_Nq_e == 8 +#define p_Nblk 5 +#elif p_Nq_e == 9 +#define p_Nblk 3 +#elif p_Nq_e == 10 +#define p_Nblk 5 +#elif p_Nq_e == 11 +#define p_Nblk 3 +#elif p_Nq_e == 12 +#define p_Nblk 3 +#elif p_Nq_e == 13 +#define p_Nblk 3 +#elif p_Nq_e == 14 +#define p_Nblk 3 +#elif p_Nq_e == 15 +#define p_Nblk 3 +#else +#define p_Nblk 1 +#endif + + for (dlong eo = 0; eo < Nelements; eo += p_Nblk; @outer) { + + @shared pfloat S_x_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat S_y_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat S_z_e[p_Nq_e][p_Nq_e][p_Nblk]; + @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e][p_Nblk]; + + @exclusive dlong element; + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + dlong my_elem = es + eo; + +#if p_overlap + element = (my_elem < Nelements) ? elementList[my_elem] : -1; +#else + element = (my_elem < Nelements) ? my_elem : -1; +#endif + + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + + if (element != -1) { +#pragma unroll + for (int k = 0; k < p_Nq_e; ++k) { + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[k][j][i][es] = u[idx]; + } + + const int ij = j + i * p_Nq_e + element * p_Nq_e * p_Nq_e; + S_x_e[i][j][es] = S_x[ij]; + S_y_e[i][j][es] = S_y[ij]; + S_z_e[i][j][es] = S_z[ij]; + } + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[i][j][l1][es] -= work1[i][j][l2][es]; + work1[i][j][p_Nq_e - l1 - 1][es] -= work1[i][j][p_Nq_e - l2 - 1][es]; + work1[i][l1][j][es] -= work1[i][l2][j][es]; + work1[i][p_Nq_e - l1 - 1][j][es] -= work1[i][p_Nq_e - l2 - 1][j][es]; + work1[l1][i][j][es] -= work1[l2][i][j][es]; + work1[p_Nq_e - l1 - 1][i][j][es] -= work1[p_Nq_e - l2 - 1][i][j][es]; + } + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][j][l][es]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[l][i][es] * tmp; + } + } + + for (int i = 0; i < p_Nq_e; i++) + work1[k][j][i][es] = values[i]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][l][i][es]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[l][j][es] * tmp; + } + } + + for (int j = 0; j < p_Nq_e; j++) + work1[k][j][i][es] = values[j]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[l][j][i][es]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[l][k][es] * tmp; + } + } + + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; + const pfloat tmp = inv_L[v1 + element * p_Nq_e * p_Nq_e * p_Nq_e]; + + work1[k][j][i][es] = values[k] * tmp; + } + } + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][j][l][es]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[i][l][es] * tmp; + } + } + + for (int i = 0; i < p_Nq_e; i++) + work1[k][j][i][es] = values[i]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][l][i][es]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[j][l][es] * tmp; + } + } + for (int j = 0; j < p_Nq_e; j++) + work1[k][j][i][es] = values[j]; + } + } + } + + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[l][j][i][es]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[k][l][es] * tmp; + } + } + +#if (!p_restrict) + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + Su[v1] = values[k]; + work1[k][j][i][es] = values[k]; + } + } +#else + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1 && k >= 1 && k < p_Nq_e - 1) { + const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; + const dlong idx1 = i - 1 + (j - 1) * p_Nq + (k - 1) * p_Nq * p_Nq + elem_offset; + Su[idx1] = values[k] * wts[idx1]; + } + } + } +#endif + } + } + } + +#if (!p_restrict) + @barrier("local"); + + for (int es = 0; es < p_Nblk; ++es; @inner) + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (element != -1) { + for (int k = 0; k < p_Nq_e; ++k) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + u[idx] = work1[k][j][i][es]; + } + } + } + } +#endif /* if (!p_restrict) */ + } +} +#endif + +#if p_knl == 4 +// Tends to do well for high orders on NVIDIA +@kernel void fusedFDM_v4(const dlong Nelements, +#if p_overlap + @restrict const dlong *elementList, +#endif + @restrict pfloat *Su, + @restrict const pfloat *S_x, + @restrict const pfloat *S_y, + @restrict const pfloat *S_z, + @restrict const pfloat *inv_L, +#if p_restrict + @restrict const dfloat *wts, +#endif + @restrict pfloat *u) +{ + + for (dlong e = 0; e < Nelements; ++e; @outer) { + + @shared pfloat S_x_e[p_Nq_e][p_Nq_e]; + @shared pfloat S_y_e[p_Nq_e][p_Nq_e]; + @shared pfloat S_z_e[p_Nq_e][p_Nq_e]; + @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; + + @exclusive dlong element; + + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + dlong my_elem = e; + +#if p_overlap + element = (my_elem < Nelements) ? elementList[my_elem] : -1; +#else + element = (my_elem < Nelements) ? my_elem : -1; +#endif + + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + + if (element != -1) { +#pragma unroll + for (int k = 0; k < p_Nq_e; ++k) { + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[k][j][i] = u[idx]; + } + + const int ij = j + i * p_Nq_e + element * p_Nq_e * p_Nq_e; + S_x_e[i][j] = S_x[ij]; + S_y_e[i][j] = S_y[ij]; + S_z_e[i][j] = S_z[ij]; + } + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + work1[i][j][l1] -= work1[i][j][l2]; + work1[i][j][p_Nq_e - l1 - 1] -= work1[i][j][p_Nq_e - l2 - 1]; + work1[i][l1][j] -= work1[i][l2][j]; + work1[i][p_Nq_e - l1 - 1][j] -= work1[i][p_Nq_e - l2 - 1][j]; + work1[l1][i][j] -= work1[l2][i][j]; + work1[p_Nq_e - l1 - 1][i][j] -= work1[p_Nq_e - l2 - 1][i][j]; + } + } + } + + @barrier("local"); + + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][j][l]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[l][i] * tmp; + } + } + + for (int i = 0; i < p_Nq_e; i++) + work1[k][j][i] = values[i]; + } + } + + @barrier("local"); + + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][l][i]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[l][j] * tmp; + } + } + + for (int j = 0; j < p_Nq_e; j++) + work1[k][j][i] = values[j]; + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[l][j][i]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[l][k] * tmp; + } + } + + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e; + const pfloat tmp = inv_L[v1 + element * p_Nq_e * p_Nq_e * p_Nq_e]; + + work1[k][j][i] = values[k] * tmp; + } + } + } + } + + @barrier("local"); + + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int j = 0; j < p_Nq_e; j++; @inner) { + + pfloat values[p_Nq_e]; + + for (int i = 0; i < p_Nq_e; i++) + values[i] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][j][l]; +#pragma unroll + for (int i = 0; i < p_Nq_e; i++) { + values[i] += S_x_e[i][l] * tmp; + } + } + + for (int i = 0; i < p_Nq_e; i++) + work1[k][j][i] = values[i]; + } + } + + @barrier("local"); + + for (int k = 0; k < p_Nq_e; k++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int j = 0; j < p_Nq_e; j++) + values[j] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[k][l][i]; +#pragma unroll + for (int j = 0; j < p_Nq_e; j++) { + values[j] += S_y_e[j][l] * tmp; + } + } + for (int j = 0; j < p_Nq_e; j++) + work1[k][j][i] = values[j]; + } + } + + @barrier("local"); + + for (int j = 0; j < p_Nq_e; j++; @inner) { + for (int i = 0; i < p_Nq_e; i++; @inner) { + + pfloat values[p_Nq_e]; + + for (int k = 0; k < p_Nq_e; k++) + values[k] = 0; + +#ifdef gfxXX +#pragma nounroll +#else +#pragma unroll p_Nq_e +#endif + for (int l = 0; l < p_Nq_e; l++) { + pfloat tmp = work1[l][j][i]; +#pragma unroll + for (int k = 0; k < p_Nq_e; k++) { + values[k] += S_z_e[k][l] * tmp; + } + } + +#if (!p_restrict) + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const int v1 = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + Su[v1] = values[k]; + work1[k][j][i] = values[k]; + } + } +#else + if (element != -1) { + for (int k = 0; k < p_Nq_e; k++) { + if (i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1 && k >= 1 && k < p_Nq_e - 1) { + const dlong elem_offset = element * p_Nq * p_Nq * p_Nq; + const dlong idx1 = i - 1 + (j - 1) * p_Nq + (k - 1) * p_Nq * p_Nq + elem_offset; + Su[idx1] = values[k] * wts[idx1]; + } + } + } +#endif + } + } + +#if (!p_restrict) + @barrier("local"); + + for (int j = 0; j < p_Nq_e; ++j; @inner) { + for (int i = 0; i < p_Nq_e; ++i; @inner) { + if (element != -1) { + for (int k = 0; k < p_Nq_e; ++k) { + const dlong elem_offset = element * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + u[idx] = work1[k][j][i]; + } + } + } + } +#endif /* if (!p_restrict) */ + } +} +#endif \ No newline at end of file diff --git a/okl/elliptic/ellipticFusedResidualAndNorm.c b/okl/elliptic/fusedResidualAndNorm.c similarity index 100% rename from okl/elliptic/ellipticFusedResidualAndNorm.c rename to okl/elliptic/fusedResidualAndNorm.c diff --git a/okl/elliptic/ellipticFusedResidualAndNorm.okl b/okl/elliptic/fusedResidualAndNorm.okl similarity index 100% rename from okl/elliptic/ellipticFusedResidualAndNorm.okl rename to okl/elliptic/fusedResidualAndNorm.okl diff --git a/okl/elliptic/ellipticGather.okl b/okl/elliptic/gather.okl similarity index 88% rename from okl/elliptic/ellipticGather.okl rename to okl/elliptic/gather.okl index c7a9c3ccc..155a63f70 100644 --- a/okl/elliptic/ellipticGather.okl +++ b/okl/elliptic/gather.okl @@ -1,6 +1,6 @@ @kernel void gather( const dlong numRows, - @restrict const long long* dof_map, + @restrict const hlong* dof_map, @restrict const dfloat* w, @restrict pfloat* buffer ) diff --git a/okl/elliptic/ellipticGramSchmidtOrthogonalization.c b/okl/elliptic/gramSchmidtOrthogonalization.c similarity index 100% rename from okl/elliptic/ellipticGramSchmidtOrthogonalization.c rename to okl/elliptic/gramSchmidtOrthogonalization.c diff --git a/okl/elliptic/ellipticGramSchmidtOrthogonalization.okl b/okl/elliptic/gramSchmidtOrthogonalization.okl similarity index 100% rename from okl/elliptic/ellipticGramSchmidtOrthogonalization.okl rename to okl/elliptic/gramSchmidtOrthogonalization.okl diff --git a/okl/elliptic/multiScaledAddwOffset.okl b/okl/elliptic/multiScaledAddwOffset.okl new file mode 100644 index 000000000..596e4d8c8 --- /dev/null +++ b/okl/elliptic/multiScaledAddwOffset.okl @@ -0,0 +1,44 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + + + + +@kernel void multiScaledAddwOffset(const dlong N, + const dlong m, + const dlong destOffset, + const dlong fieldOffset, + @restrict const dfloat* alphas, + const dfloat beta, + @restrict dfloat* x) +{ + for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) + if(n < N){ + for(dlong k = 0; k < m - 1; ++k) + for(dlong fld = 0; fld < p_Nfields; ++fld) + x[n + destOffset + fld * fieldOffset] = -alphas[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset] + beta * x[n + destOffset + fld * fieldOffset]; + } +} diff --git a/okl/elliptic/ellipticResidualProjection.okl b/okl/elliptic/multiWeightedInnerProduct2.okl similarity index 65% rename from okl/elliptic/ellipticResidualProjection.okl rename to okl/elliptic/multiWeightedInnerProduct2.okl index 7a2bd20c6..8755fe6c4 100644 --- a/okl/elliptic/ellipticResidualProjection.okl +++ b/okl/elliptic/multiWeightedInnerProduct2.okl @@ -24,55 +24,8 @@ */ -@kernel void scalarMultiply(const dlong N, - const dlong fieldOffset, - const dlong offset, - const dfloat alpha, - @restrict dfloat* x) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N){ - #pragma unroll p_Nfields - for(dlong field = 0 ; field < p_Nfields; ++field){ - x[n + offset + field * fieldOffset] = alpha * x[n + offset + field * fieldOffset]; - } - } -} -@kernel void multiScaledAddwOffset(const dlong N, - const dlong m, - const dlong destOffset, - const dlong fieldOffset, - @restrict const dfloat* alphas, - const dfloat beta, - @restrict dfloat* x) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N){ - for(dlong k = 0; k < m - 1; ++k) - for(dlong fld = 0; fld < p_Nfields; ++fld) - x[n + destOffset + fld * fieldOffset] = -alphas[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset] + beta * x[n + destOffset + fld * fieldOffset]; - } -} -@kernel void accumulate(const dlong N, - const dlong m, - const dlong fieldOffset, - @restrict const dfloat* alpha, - @restrict const dfloat* x, - @restrict dfloat* y) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - // y = alpha[0] * o_xx[:,0] - for(dlong fld = 0 ; fld < p_Nfields; ++fld) - y[n + fld * fieldOffset] = alpha[0] * x[n + fld * fieldOffset]; - for(dlong k = 1; k < m; ++k) - for(dlong fld = 0 ; fld < p_Nfields; ++fld) - // y += alpha[k] * o_xx[:,k] - y[n + fld * fieldOffset] += alpha[k] * x[n + p_Nfields * k * fieldOffset + fld * fieldOffset]; - } -} @kernel void multiWeightedInnerProduct2(const dlong N, const dlong fieldOffset, diff --git a/okl/elliptic/postFDM.c b/okl/elliptic/postFDM.c new file mode 100644 index 000000000..39c83ce2b --- /dev/null +++ b/okl/elliptic/postFDM.c @@ -0,0 +1,155 @@ + + +extern "C" void FUNC(postFDM)(const dlong& Nelements, + pfloat* __restrict__ my_work1, + pfloat* __restrict__ my_work2, + pfloat* __restrict__ Su, + const pfloat* __restrict__ wts) +{ + pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; + pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; +#ifdef __NEKRS__OMP__ + #pragma omp parallel for private(work1, work2) +#endif + for (dlong elem = 0; elem < Nelements; ++elem) { + #pragma unroll + for(int k = 0; k < p_Nq_e; ++k){ + #pragma unroll + for(int j = 0; j < p_Nq_e; ++j){ + #pragma unroll + for(int i = 0; i < p_Nq_e; ++i) { + const dlong elem_offset = elem * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[k][j][i] = my_work2[idx]; + work2[k][j][i] = my_work1[idx]; + } + } + } + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 0; + work1[l1][j][k] = work1[l1][j][k] - work2[l2][j][k]; + } + } + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 0; + work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - + work2[p_Nq_e - l2 - 1][j][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 0; + work1[i][l1][k] = work1[i][l1][k] - work2[i][l2][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 0; + work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - + work2[i][p_Nq_e - l2 - 1][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 0; + const int l2 = 0; + work1[i][j][l1] = work1[i][j][l1] - work2[i][j][l2]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 0; + const int l2 = 0; + work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - + work2[i][j][p_Nq_e - l2 - 1]; + } + } + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 2; + const int l2 = 0; + work1[l1][j][k] = work1[l1][j][k] + work1[l2][j][k]; + } + } + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 2; + const int l2 = 0; + work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] + + work1[p_Nq_e - l2 - 1][j][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 2; + const int l2 = 0; + work1[i][l1][k] = work1[i][l1][k] + work1[i][l2][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 2; + const int l2 = 0; + work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] + + work1[i][p_Nq_e - l2 - 1][k]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 2; + const int l2 = 0; + work1[i][j][l1] = work1[i][j][l1] + work1[i][j][l2]; + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 2; + const int l2 = 0; + work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] + + work1[i][j][p_Nq_e - l2 - 1]; + } + } + #pragma unroll + for(int k = 0; k < p_Nq; ++k){ + #pragma unroll + for(int j = 0; j < p_Nq; ++j){ + #pragma unroll + for(int i = 0; i < p_Nq; ++i){ + const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; + const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; + Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; + } + } + } + } +} diff --git a/okl/elliptic/postFDM.okl b/okl/elliptic/postFDM.okl new file mode 100644 index 000000000..92ffbbdc3 --- /dev/null +++ b/okl/elliptic/postFDM.okl @@ -0,0 +1,109 @@ + + +@kernel void postFDM(const dlong Nelements, + @restrict pfloat* my_work1, + @restrict pfloat* my_work2, + @restrict pfloat* Su, + @restrict const pfloat* wts) +{ + for (dlong elem = 0; elem < Nelements; ++elem; @outer) { + @shared pfloat work1[p_Nq_e][p_Nq_e][p_Nq_e]; + @shared pfloat work2[p_Nq_e][p_Nq_e][p_Nq_e]; + for(int k = 0; k < p_Nq_e; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner) { + const dlong elem_offset = elem * p_Nq_e * p_Nq_e * p_Nq_e; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[k][j][i] = my_work2[idx]; + work2[k][j][i] = my_work1[idx]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int j = 0; j < p_Nq_e; ++j; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work1[l1][j][k] = work1[l1][j][k] - work2[l2][j][k]; + work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] - + work2[p_Nq_e - l2 - 1][j][k]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work1[i][l1][k] = work1[i][l1][k] - work2[i][l2][k]; + work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] - + work2[i][p_Nq_e - l2 - 1][k]; + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 0; + work1[i][j][l1] = work1[i][j][l1] - work2[i][j][l2]; + work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] - + work2[i][j][p_Nq_e - l2 - 1]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int j = 0; j < p_Nq_e; ++j; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 2; + const int l2 = 0; + work1[l1][j][k] = work1[l1][j][k] + work1[l2][j][k]; + work1[p_Nq_e - l1 - 1][j][k] = work1[p_Nq_e - l1 - 1][j][k] + + work1[p_Nq_e - l2 - 1][j][k]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { + const int l1 = 2; + const int l2 = 0; + work1[i][l1][k] = work1[i][l1][k] + work1[i][l2][k]; + work1[i][p_Nq_e - l1 - 1][k] = work1[i][p_Nq_e - l1 - 1][k] + + work1[i][p_Nq_e - l2 - 1][k]; + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 2; + const int l2 = 0; + work1[i][j][l1] = work1[i][j][l1] + work1[i][j][l2]; + work1[i][j][p_Nq_e - l1 - 1] = work1[i][j][p_Nq_e - l1 - 1] + + work1[i][j][p_Nq_e - l2 - 1]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(i < p_Nq && j < p_Nq) { + const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; + const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; + Su[idx] = work1[k + 1][j + 1][i + 1] * wts[idx]; + } + } + } + } + } +} diff --git a/okl/elliptic/preFDM.c b/okl/elliptic/preFDM.c new file mode 100644 index 000000000..cc9b8b53e --- /dev/null +++ b/okl/elliptic/preFDM.c @@ -0,0 +1,98 @@ + + +extern "C" void FUNC(preFDM)(const dlong& Nelements, + const pfloat* __restrict__ u, + pfloat* __restrict__ work1) +{ + #define getIdx(k,j,i,e) ((k)*p_Nq_e*p_Nq_e+(j)*p_Nq_e+(i)+(e)*p_Nq_e*p_Nq_e*p_Nq_e) + #define getIdx2(k,j,i,e) ((k-1)*p_Nq*p_Nq+(j-1)*p_Nq+(i-1)+(e)*p_Nq*p_Nq*p_Nq) + #define sWork1(k,j,i,e) (work1[(getIdx(k,j,i,e))]) + #define uArr(k,j,i,e) (u[(getIdx2(k,j,i,e))]) + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for +#endif + for (dlong elem = 0; elem < Nelements; elem++) { + #pragma unroll + for(int k = 0; k < p_Nq_e; ++k){ + #pragma unroll + for(int j = 0; j < p_Nq_e; ++j){ + #pragma unroll + for(int i = 0; i < p_Nq_e; ++i){ + const bool iBound = i>=1 && i <(p_Nq_e-1); + const bool jBound = j>=1 && j <(p_Nq_e-1); + const bool kBound = k>=1 && k <(p_Nq_e-1); + if(iBound && jBound && kBound){ + const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; + const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; + sWork1(k,j,i,elem) = uArr(k,j,i,elem); + } else { + sWork1(k,j,i,elem) = 0.0; + } + } + } + } + + + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 2; + sWork1(l1,j,k,elem) = uArr(l2,j,k,elem); + } + } + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 2; + sWork1(p_Nq_e - l1 - 1,j,k,elem) = uArr(p_Nq_e - l2 - 1,j,k,elem); + } + } + + + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 2; + sWork1(i,l1,k,elem) = uArr(i,l2,k,elem); + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int k = 1; k < p_Nq_e-1; ++k){ + const int l1 = 0; + const int l2 = 2; + sWork1(i,p_Nq_e - l1 - 1,k,elem) = uArr(i,p_Nq_e - l2 - 1,k,elem); + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 0; + const int l2 = 2; + sWork1(i,j,l1,elem) = uArr(i,j,l2,elem); + } + } + #pragma unroll + for(int i = 1; i < p_Nq_e-1; ++i){ + #pragma unroll + for(int j = 1; j < p_Nq_e-1; ++j){ + const int l1 = 0; + const int l2 = 2; + sWork1(i,j,p_Nq_e - l1 - 1,elem) = uArr(i,j,p_Nq_e - l2 - 1,elem); + } + } + } + #undef getIdx + #undef getIdx2 + #undef sWork1 + #undef uArr +} diff --git a/okl/elliptic/preFDM.okl b/okl/elliptic/preFDM.okl new file mode 100644 index 000000000..6d5cc3596 --- /dev/null +++ b/okl/elliptic/preFDM.okl @@ -0,0 +1,80 @@ + + +@kernel void preFDM(const dlong Nelements, + @restrict const pfloat* u, + @restrict pfloat* work1) +{ + for (dlong elem = 0; elem < Nelements; elem++; @outer) { + @shared pfloat sWork1[p_Nq_e][p_Nq_e][p_Nq_e]; + for(int k = 0; k < p_Nq_e; ++k; @inner) { + for(int j = 0; j < p_Nq_e; ++j; @inner) { +#pragma unroll + for(int i = 0; i < p_Nq_e; ++i) + sWork1[k][j][i] = 0.0; + } + } + + @barrier("local"); + + for(int k = 0; k < p_Nq; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(i < p_Nq && j < p_Nq) { + const dlong elem_offset = elem * p_Nq * p_Nq * p_Nq; + const dlong idx = i + j * p_Nq + k * p_Nq * p_Nq + elem_offset; + sWork1[k + 1][j + 1][i + 1] = u[idx]; + } + } + } + } + + @barrier("local"); + + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int j = 0; j < p_Nq_e; ++j; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + sWork1[l1][j][k] = sWork1[l2][j][k]; + sWork1[p_Nq_e - l1 - 1][j][k] = sWork1[p_Nq_e - l2 - 1][j][k]; + } + } + } + + @barrier("local"); + + for(int k = 0; k < p_Nq_e; ++k; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(k >= 1 && k < p_Nq_e - 1 && i >= 1 && i < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + sWork1[i][l1][k] = sWork1[i][l2][k]; + sWork1[i][p_Nq_e - l1 - 1][k] = sWork1[i][p_Nq_e - l2 - 1][k]; + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner){ + if(i >= 1 && i < p_Nq_e - 1 && j >= 1 && j < p_Nq_e - 1) { + const int l1 = 0; + const int l2 = 2; + sWork1[i][j][l1] = sWork1[i][j][l2]; + sWork1[i][j][p_Nq_e - l1 - 1] = sWork1[i][j][p_Nq_e - l2 - 1]; + } + } + } + @barrier("local"); + for(int k = 0; k < p_Nq_e; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq_e; ++j; @inner){ + for(int i = 0; i < p_Nq_e; ++i; @inner) { + const dlong elem_offset = p_Nq_e * p_Nq_e * p_Nq_e * elem; + const dlong idx = i + j * p_Nq_e + k * p_Nq_e * p_Nq_e + elem_offset; + work1[idx] = sWork1[k][j][i]; + } + } + } + } +} diff --git a/okl/elliptic/scaledAdd.c b/okl/elliptic/scaledAdd.c new file mode 100644 index 000000000..40665f0fb --- /dev/null +++ b/okl/elliptic/scaledAdd.c @@ -0,0 +1,13 @@ +extern "C" void FUNC(scaledAdd) (const dlong & N, + const pfloat & alpha, + const pfloat * __restrict__ x, + const pfloat & beta, + pfloat * __restrict__ y){ + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for +#endif + for(dlong n = 0; n < N; ++n){ + y[n] = alpha*x[n] + beta*y[n]; + } +} \ No newline at end of file diff --git a/okl/elliptic/scaledAdd.okl b/okl/elliptic/scaledAdd.okl new file mode 100644 index 000000000..9ac2197ae --- /dev/null +++ b/okl/elliptic/scaledAdd.okl @@ -0,0 +1,12 @@ +@kernel void scaledAdd(const dlong N, + const pfloat alpha, + @restrict const pfloat * x, + const pfloat beta, + @restrict pfloat * y){ + + for(dlong n=0;n512 - for(int t=0;t256 - for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t 512 - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512]; - @barrier("local"); -#endif -#if p_blockSize > 256 - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256]; - @barrier("local"); -#endif - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 8) s_wxy[t] += s_wxy[t + 8]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 4) s_wxy[t] += s_wxy[t + 4]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 2) s_wxy[t] += s_wxy[t + 2]; - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 1) wxy[b + v * Nblock] = s_wxy[0] + s_wxy[1]; - } - } -} diff --git a/okl/linAlg/linAlgMax.okl b/okl/linAlg/max.okl similarity index 100% rename from okl/linAlg/linAlgMax.okl rename to okl/linAlg/max.okl diff --git a/okl/linAlg/linAlgMin.okl b/okl/linAlg/min.okl similarity index 100% rename from okl/linAlg/linAlgMin.okl rename to okl/linAlg/min.okl diff --git a/okl/linAlg/norm1.c b/okl/linAlg/norm1.c new file mode 100644 index 000000000..d6e2a29ba --- /dev/null +++ b/okl/linAlg/norm1.c @@ -0,0 +1,40 @@ +/* +The MIT License (MIT) +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include + + +extern "C" void FUNC(norm1)(const dlong & Nblocks, const dlong & N, + const dfloat * __restrict__ cpu_a, + dfloat * __restrict__ normA){ + + dfloat wa2 = 0; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for reduction(+:wa2) +#endif + for(int i=0;i512 + for(int t=0;t256 + for(int t=0;t -extern "C" -void FUNC(norm1)(const dlong & Nblocks, const dlong & N, - const dfloat * __restrict__ cpu_a, - dfloat * __restrict__ normA){ - - dfloat wa2 = 0; - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for reduction(+:wa2) -#endif - for(int i=0;i512 - for(int t=0;t256 - for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t tol){ + const dfloat invMag = 1.0 / mag; + v[n + 0 * fieldOffset] = vx * invMag; + v[n + 1 * fieldOffset] = vy * invMag; + v[n + 2 * fieldOffset] = vz * invMag; + } + } +} diff --git a/okl/linAlg/linAlgAbs.okl b/okl/linAlg/vabs.okl similarity index 100% rename from okl/linAlg/linAlgAbs.okl rename to okl/linAlg/vabs.okl diff --git a/okl/linAlg/weightedInnerProd.c b/okl/linAlg/weightedInnerProd.c new file mode 100644 index 000000000..e2e3ac441 --- /dev/null +++ b/okl/linAlg/weightedInnerProd.c @@ -0,0 +1,44 @@ +/* +The MIT License (MIT) +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + + +extern "C" void FUNC(weightedInnerProd)( + const dlong & Nblocks, + const dlong & N, + const dfloat * __restrict__ cpu_w, + const dfloat * __restrict__ cpu_a, + const dfloat * __restrict__ cpu_b, + dfloat * __restrict__ cpu_wab){ + + dfloat wab = 0; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for reduction(+:wab) +#endif + for(int i=0;i512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t 512 + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 512) s_wxy[t] += s_wxy[t + 512]; + @barrier("local"); +#endif +#if p_blockSize > 256 + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 256) s_wxy[t] += s_wxy[t + 256]; + @barrier("local"); +#endif + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 128) s_wxy[t] += s_wxy[t + 128]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 64) s_wxy[t] += s_wxy[t + 64]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 32) s_wxy[t] += s_wxy[t + 32]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 16) s_wxy[t] += s_wxy[t + 16]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 8) s_wxy[t] += s_wxy[t + 8]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 4) s_wxy[t] += s_wxy[t + 4]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 2) s_wxy[t] += s_wxy[t + 2]; + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 1) wxy[b + v * Nblock] = s_wxy[0] + s_wxy[1]; + } + } +} diff --git a/okl/linAlg/weightedNorm1.c b/okl/linAlg/weightedNorm1.c new file mode 100644 index 000000000..57241f4d8 --- /dev/null +++ b/okl/linAlg/weightedNorm1.c @@ -0,0 +1,41 @@ +/* +The MIT License (MIT) +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include + + +extern "C" void FUNC(weightedNorm1)(const dlong & Nblocks, const dlong & N, + const dfloat * __restrict__ cpu_w, + const dfloat * __restrict__ cpu_a, + dfloat * __restrict__ cpu_wa){ + + dfloat wa2 = 0; + +#ifdef __NEKRS__OMP__ + #pragma omp parallel for reduction(+:wa2) +#endif + for(int i=0;i512 + for(int t=0;t256 + for(int t=0;t -extern "C" -void FUNC(weightedNorm1)(const dlong & Nblocks, const dlong & N, - const dfloat * __restrict__ cpu_w, - const dfloat * __restrict__ cpu_a, - dfloat * __restrict__ cpu_wa){ - - dfloat wa2 = 0; - -#ifdef __NEKRS__OMP__ - #pragma omp parallel for reduction(+:wa2) -#endif - for(int i=0;i512 - for(int t=0;t256 - for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 - for(int t=0;t256 - for(int t=0;t p_Nfp + +#define p_maxFieldsPerLoop 3 + +@kernel void avgBIDValue(const dlong Nelements, + const dlong BID, + const dlong Nfields, + const dlong fieldOffset, + const dlong offset, + @restrict const dfloat *sgeo, + @restrict const dlong *EToB, + @restrict const dlong *vmapM, + @restrict const dfloat *field, + @restrict dfloat *result){ + + for (dlong e = 0; e < Nelements; e++; @outer(0)) { + + // first reduction: compute surface area + for (int f = 0; f < p_Nfaces; f++) { + + @shared dfloat s_area[p_blockSize]; + + for (int m = 0; m < p_blockSize; ++m; @inner(0)) { + if (m < p_Nfp) { + if(EToB[f + p_Nfaces * e] == BID) { + const int n = m + f * p_Nfp; + const int sk = e * p_Nfp * p_Nfaces + n; + const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; + s_area[m] = WsJ; + } else { + s_area[m] = 0.0; + } + + } else { + s_area[m] = 0.0; + } + } + + // compute reduction + @barrier("local"); + +#if p_blockSize > 512 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 512) { + s_area[t] += s_area[t + 512]; + } + } + @barrier("local"); +#endif + +#if p_blockSize > 256 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 256) { + s_area[t] += s_area[t + 256]; + } + } + @barrier("local"); +#endif + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 128) { + s_area[t] += s_area[t + 128]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 64) { + s_area[t] += s_area[t + 64]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 32) { + s_area[t] += s_area[t + 32]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 16) { + s_area[t] += s_area[t + 16]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 8) { + s_area[t] += s_area[t + 8]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 4) { + s_area[t] += s_area[t + 4]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 2) { + s_area[t] += s_area[t + 2]; + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 1) { + result[f + p_Nfaces * e + Nfields * offset] = s_area[0] + s_area[1]; + } + } + + @barrier("local"); + } + + // second reduction: compute sum for each field + + for (int fldStart = 0; fldStart < Nfields; fldStart += p_maxFieldsPerLoop){ + for (int f = 0; f < p_Nfaces; f++) { + + @shared dfloat s_sum[p_maxFieldsPerLoop][p_blockSize]; + + for (int m = 0; m < p_blockSize; ++m; @inner(0)) { + + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + const int fldId = fldStart + fld; + if (m < p_Nfp && fldId < Nfields) { + if(EToB[f + p_Nfaces * e] == BID) { + const int n = m + f * p_Nfp; + const int sk = e * p_Nfp * p_Nfaces + n; + const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; + const dlong idM = vmapM[sk]; + s_sum[fld][m] = field[idM + fldId * fieldOffset] * WsJ; + } else { + s_sum[fld][m] = 0.0; + } + + } else { + s_sum[fld][m] = 0.0; + } + } + } + + // compute reduction + @barrier("local"); + +#if p_blockSize > 512 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 512) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 512]; + } + } + } + @barrier("local"); +#endif + +#if p_blockSize > 256 + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 256) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 256]; + } + } + } + @barrier("local"); +#endif + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 128) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 128]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 64) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 64]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 32) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 32]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 16) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 16]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 8) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 8]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 4) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 4]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 2) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + s_sum[fld][t] += s_sum[fld][t + 2]; + } + } + } + @barrier("local"); + + for (int t = 0; t < p_blockSize; ++t; @inner(0)) { + if (t < 1) { + #pragma unroll p_maxFieldsPerLoop + for(int fld = 0; fld < p_maxFieldsPerLoop; fld++){ + const int fldId = fldStart + fld; + if(fldId < Nfields){ + result[f + p_Nfaces * e + fldId * offset] = s_sum[fld][0] + s_sum[fld][1]; + } + } + } + } + + @barrier("local"); + } + } + } +} diff --git a/okl/mesh/constantDifferentiationInterpolationMatrices.h b/okl/mesh/constantDifferentiationInterpolationMatrices.h new file mode 100644 index 000000000..f3edb8eea --- /dev/null +++ b/okl/mesh/constantDifferentiationInterpolationMatrices.h @@ -0,0 +1,1497 @@ +#if p_Nq==2 && p_cubNq==2 +const dfloat c_DI[2][2] = { +{ -5.000000000000001e-01,5.000000000000001e-01}, +{ -5.000000000000001e-01,5.000000000000001e-01} +}; +#endif +#if p_Nq==2 && p_cubNq==3 +const dfloat c_DI[3][2] = { +{ -5.000000000000001e-01,5.000000000000001e-01}, +{ -5.000000000000001e-01,5.000000000000001e-01}, +{ -5.000000000000001e-01,5.000000000000001e-01} +}; +#endif +#if p_Nq==3 && p_cubNq==3 +const dfloat c_DI[3][3] = { +{ -1.274596669241483e+00,1.549193338482966e+00,-2.745966692414831e-01}, +{ -5.000000000000002e-01,9.020562075079396e-17,5.000000000000001e-01}, +{ 2.745966692414832e-01,-1.549193338482967e+00,1.274596669241483e+00} +}; +#endif +#if p_Nq==3 && p_cubNq==4 +const dfloat c_DI[4][3] = { +{ -1.361136311594052e+00,1.722272623188105e+00,-3.611363115940524e-01}, +{ -8.399810435848563e-01,6.799620871697123e-01,1.600189564151440e-01}, +{ -1.600189564151440e-01,-6.799620871697123e-01,8.399810435848563e-01}, +{ 3.611363115940525e-01,-1.722272623188105e+00,1.361136311594052e+00} +}; +#endif +#if p_Nq==3 && p_cubNq==5 +const dfloat c_DI[5][3] = { +{ -1.406179845938664e+00,1.812359691877327e+00,-4.061798459386636e-01}, +{ -1.038469310105683e+00,1.076938620211366e+00,-3.846931010568289e-02}, +{ -5.000000000000002e-01,1.925183573209066e-16,5.000000000000000e-01}, +{ 3.846931010568277e-02,-1.076938620211366e+00,1.038469310105683e+00}, +{ 4.061798459386639e-01,-1.812359691877328e+00,1.406179845938664e+00} +}; +#endif +#if p_Nq==4 && p_cubNq==4 +const dfloat c_DI[4][4] = { +{ -2.341837415390958e+00,2.787944890537087e+00,-6.351041115519562e-01,1.889966364058267e-01}, +{ -5.167021357255349e-01,-4.879524903135274e-01,1.337905099275668e+00,-3.332504732366054e-01}, +{ 3.332504732366055e-01,-1.337905099275668e+00,4.879524903135275e-01,5.167021357255349e-01}, +{ -1.889966364058270e-01,6.351041115519561e-01,-2.787944890537087e+00,2.341837415390958e+00} +}; +#endif +#if p_Nq==4 && p_cubNq==5 +const dfloat c_DI[5][4] = { +{ -2.547403394645993e+00,3.178008306016676e+00,-9.125586911700170e-01,2.819537797993341e-01}, +{ -1.091741383742773e+00,4.911931202885810e-01,8.549801549756264e-01,-2.544318915214344e-01}, +{ 1.249999999999998e-01,-1.397542485937368e+00,1.397542485937369e+00,-1.250000000000000e-01}, +{ 2.544318915214345e-01,-8.549801549756270e-01,-4.911931202885803e-01,1.091741383742773e+00}, +{ -2.819537797993349e-01,9.125586911700178e-01,-3.178008306016677e+00,2.547403394645995e+00} +}; +#endif +#if p_Nq==4 && p_cubNq==6 +const dfloat c_DI[6][4] = { +{ -2.670898258225681e+00,3.413531444501887e+00,-1.082357658994008e+00,3.397244727178013e-01}, +{ -1.521257706991131e+00,1.261976969086191e+00,3.910464970794696e-01,-1.317657591745296e-01}, +{ -2.800348250421349e-01,-8.605440023065498e-01,1.457091967514542e+00,-3.165131401658572e-01}, +{ 3.165131401658570e-01,-1.457091967514542e+00,8.605440023065509e-01,2.800348250421342e-01}, +{ 1.317657591745292e-01,-3.910464970794690e-01,-1.261976969086192e+00,1.521257706991132e+00}, +{ -3.397244727178022e-01,1.082357658994009e+00,-3.413531444501888e+00,2.670898258225681e+00} +}; +#endif +#if p_Nq==5 && p_cubNq==5 +const dfloat c_DI[5][5] = { +{ -3.705336453591452e+00,4.332821168763928e+00,-9.039245362321627e-01,4.206762304276799e-01,-1.442364093679935e-01}, +{ -5.287152679802736e-01,-1.097657967828329e+00,2.132593784692289e+00,-7.497385700132869e-01,2.435180211296003e-01}, +{ 3.750000000000001e-01,-1.336584577695454e+00,1.131266998246126e-15,1.336584577695453e+00,-3.749999999999999e-01}, +{ -2.435180211296004e-01,7.497385700132868e-01,-2.132593784692289e+00,1.097657967828330e+00,5.287152679802730e-01}, +{ 1.442364093679943e-01,-4.206762304276810e-01,9.039245362321652e-01,-4.332821168763933e+00,3.705336453591455e+00} +}; +#endif +#if p_Nq==5 && p_cubNq==6 +const dfloat c_DI[6][5] = { +{ -4.045819650335855e+00,4.963693724096331e+00,-1.350832236853071e+00,6.639059901075752e-01,-2.309478270149803e-01}, +{ -1.288514957558639e+00,7.734729558567645e-02,1.709988885943853e+00,-7.555949933554940e-01,2.567737693846036e-01}, +{ 3.569455308697995e-01,-1.971675836409686e+00,1.463984762476768e+00,2.448696533665309e-01,-9.412411030341221e-02}, +{ 9.412411030341292e-02,-2.448696533665329e-01,-1.463984762476766e+00,1.971675836409686e+00,-3.569455308697999e-01}, +{ -2.567737693846035e-01,7.555949933554937e-01,-1.709988885943851e+00,-7.734729558567888e-02,1.288514957558640e+00}, +{ 2.309478270149816e-01,-6.639059901075772e-01,1.350832236853074e+00,-4.963693724096335e+00,4.045819650335857e+00} +}; +#endif +#if p_Nq==5 && p_cubNq==7 +const dfloat c_DI[7][5] = { +{ -4.270151157742600e+00,5.382090160656650e+00,-1.652258691479321e+00,8.312402426318171e-01,-2.909205540665457e-01}, +{ -1.939362661760671e+00,1.170248243086537e+00,1.137924595950490e+00,-5.662571364020146e-01,1.974469591256584e-01}, +{ 1.305531081332800e-02,-1.787421808953998e+00,2.081730210205858e+00,-4.351480290853635e-01,1.277843170201756e-01}, +{ 3.749999999999997e-01,-1.336584577695452e+00,-2.120100431013261e-15,1.336584577695455e+00,-3.750000000000002e-01}, +{ -1.277843170201755e-01,4.351480290853628e-01,-2.081730210205857e+00,1.787421808953999e+00,-1.305531081332884e-02}, +{ -1.974469591256587e-01,5.662571364020161e-01,-1.137924595950494e+00,-1.170248243086532e+00,1.939362661760668e+00}, +{ 2.909205540665445e-01,-8.312402426318153e-01,1.652258691479318e+00,-5.382090160656646e+00,4.270151157742599e+00} +}; +#endif +#if p_Nq==5 && p_cubNq==8 +const dfloat c_DI[8][5] = { +{ -4.424825435248329e+00,5.671743449100494e+00,-1.863085570895966e+00,9.496707977088237e-01,-3.335032406650218e-01}, +{ -2.463220459555367e+00,2.084542218335788e+00,5.919313360917361e-01,-3.320893855486324e-01,1.188362906764755e-01}, +{ -4.638378446546674e-01,-1.189736402204834e+00,2.148873857637676e+00,-7.314294139539468e-01,2.361298031757721e-01}, +{ 4.026463751120688e-01,-1.900281325531366e+00,1.165289875084430e+00,5.030452928683884e-01,-1.707002175335204e-01}, +{ 1.707002175335200e-01,-5.030452928683872e-01,-1.165289875084431e+00,1.900281325531367e+00,-4.026463751120689e-01}, +{ -2.361298031757720e-01,7.314294139539460e-01,-2.148873857637675e+00,1.189736402204835e+00,4.638378446546664e-01}, +{ -1.188362906764756e-01,3.320893855486330e-01,-5.919313360917366e-01,-2.084542218335788e+00,2.463220459555367e+00}, +{ 3.335032406650222e-01,-9.496707977088252e-01,1.863085570895969e+00,-5.671743449100499e+00,4.424825435248333e+00} +}; +#endif +#if p_Nq==6 && p_cubNq==6 +const dfloat c_DI[6][6] = { +{ -5.366287860296875e+00,6.189142005012727e+00,-1.181123727130086e+00,5.620087029545588e-01,-3.204626575648853e-01,1.167235370245601e-01}, +{ -5.297747896967107e-01,-1.847025589401683e+00,3.053480543583716e+00,-1.022762817558617e+00,5.374079125548058e-01,-1.913252594815107e-01}, +{ 4.119421981810610e-01,-1.377203207575448e+00,-4.938531477244313e-01,2.010690106429841e+00,-8.321398645835391e-01,2.805639152725161e-01}, +{ -2.805639152725166e-01,8.321398645835394e-01,-2.010690106429843e+00,4.938531477244339e-01,1.377203207575446e+00,-4.119421981810600e-01}, +{ 1.913252594815118e-01,-5.374079125548075e-01,1.022762817558619e+00,-3.053480543583717e+00,1.847025589401680e+00,5.297747896967138e-01}, +{ -1.167235370245613e-01,3.204626575648881e-01,-5.620087029545635e-01,1.181123727130096e+00,-6.189142005012736e+00,5.366287860296877e+00} +}; +#endif +#if p_Nq==6 && p_cubNq==7 +const dfloat c_DI[7][6] = { +{ -5.850644156506078e+00,7.074846667856594e+00,-1.800508463197203e+00,9.136186505789293e-01,-5.327341779530471e-01,1.954214792208060e-01}, +{ -1.446282067733919e+00,-5.310840059400220e-01,2.739346652877441e+00,-1.183821081107123e+00,6.615180061229800e-01,-2.396775042193566e-01}, +{ 5.511102402143102e-01,-2.452616529120928e+00,1.337220663851858e+00,8.647177185150524e-01,-4.678877342179857e-01,1.674556407576923e-01}, +{ -6.250000000000089e-02,2.544242700698987e-01,-2.216265054274738e+00,2.216265054274735e+00,-2.544242700698943e-01,6.249999999999940e-02}, +{ -1.674556407576919e-01,4.678877342179859e-01,-8.647177185150535e-01,-1.337220663851858e+00,2.452616529120927e+00,-5.511102402143095e-01}, +{ 2.396775042193569e-01,-6.615180061229805e-01,1.183821081107124e+00,-2.739346652877444e+00,5.310840059400289e-01,1.446282067733915e+00}, +{ -1.954214792208060e-01,5.327341779530468e-01,-9.136186505789274e-01,1.800508463197201e+00,-7.074846667856586e+00,5.850644156506071e+00} +}; +#endif +#if p_Nq==6 && p_cubNq==8 +const dfloat c_DI[8][6] = { +{ -6.190975818800241e+00,7.701585471846824e+00,-2.246698742728907e+00,1.172210651787038e+00,-6.899666876568463e-01,2.538451255521328e-01}, +{ -2.300319914294394e+00,8.307959988065183e-01,2.130598650346839e+00,-1.043184762075071e+00,6.025332694730172e-01,-2.204232422569097e-01}, +{ 3.195876490621026e-01,-2.685022585121719e+00,2.485776408550059e+00,-1.406376376966400e-01,2.453036409123028e-02,-4.234198885033361e-03}, +{ 3.070028719137733e-01,-9.593205265902034e-01,-1.030275309011232e+00,2.233233124639669e+00,-8.208498285256179e-01,2.702096675736112e-01}, +{ -2.702096675736123e-01,8.208498285256186e-01,-2.233233124639667e+00,1.030275309011228e+00,9.593205265902064e-01,-3.070028719137736e-01}, +{ 4.234198885033603e-03,-2.453036409123042e-02,1.406376376966396e-01,-2.485776408550060e+00,2.685022585121719e+00,-3.195876490621023e-01}, +{ 2.204232422569105e-01,-6.025332694730171e-01,1.043184762075070e+00,-2.130598650346837e+00,-8.307959988065203e-01,2.300319914294394e+00}, +{ -2.538451255521362e-01,6.899666876568528e-01,-1.172210651787046e+00,2.246698742728920e+00,-7.701585471846837e+00,6.190975818800245e+00} +}; +#endif +#if p_Nq==6 && p_cubNq==9 +const dfloat c_DI[9][6] = { +{ -6.437821307068186e+00,8.158265319652571e+00,-2.575584010595129e+00,1.365327213767381e+00,-8.079269717959735e-01,2.977397560393366e-01}, +{ -3.038471130367123e+00,2.068632328444774e+00,1.459984660538854e+00,-7.830961961282553e-01,4.639642797481559e-01,-1.710139422364051e-01}, +{ -1.419171415529764e-01,-2.314502501354573e+00,2.982558481460120e+00,-7.756242528248232e-01,3.838046374631031e-01,-1.343192231908498e-01}, +{ 5.294082071875857e-01,-1.990912097934646e+00,4.309135543158122e-01,1.495235853745628e+00,-7.121572780849859e-01,2.475117607706060e-01}, +{ -6.250000000000078e-02,2.544242700698978e-01,-2.216265054274737e+00,2.216265054274735e+00,-2.544242700698952e-01,6.249999999999944e-02}, +{ -2.475117607706058e-01,7.121572780849846e-01,-1.495235853745625e+00,-4.309135543158163e-01,1.990912097934648e+00,-5.294082071875853e-01}, +{ 1.343192231908502e-01,-3.838046374631041e-01,7.756242528248238e-01,-2.982558481460121e+00,2.314502501354573e+00,1.419171415529777e-01}, +{ 1.710139422364052e-01,-4.639642797481566e-01,7.830961961282565e-01,-1.459984660538855e+00,-2.068632328444771e+00,3.038471130367121e+00}, +{ -2.977397560393389e-01,8.079269717959765e-01,-1.365327213767383e+00,2.575584010595134e+00,-8.158265319652571e+00,6.437821307068182e+00} +}; +#endif +#if p_Nq==7 && p_cubNq==7 +const dfloat c_DI[7][7] = { +{ -7.325199412665381e+00,8.358030744498961e+00,-1.473822802750005e+00,6.955550906707026e-01,-4.173910000318668e-01,2.609125674727442e-01,-9.808518719515531e-02}, +{ -5.174500232162018e-01,-2.741239114584553e+00,4.112801561446037e+00,-1.290338304727431e+00,7.026868956729586e-01,-4.240698078195857e-01,1.576087932287761e-01}, +{ 4.456676349189936e-01,-1.427103863171723e+00,-1.051114500750282e+00,2.748760351990831e+00,-1.108196975130437e+00,6.147649484879396e-01,-2.227775963453208e-01}, +{ -3.125000000000009e-01,9.075444712688203e-01,-2.006969240588749e+00,-5.830987757727883e-15,2.006969240588757e+00,-9.075444712688219e-01,3.125000000000003e-01}, +{ 2.227775963453209e-01,-6.147649484879386e-01,1.108196975130435e+00,-2.748760351990831e+00,1.051114500750284e+00,1.427103863171719e+00,-4.456676349189911e-01}, +{ -1.576087932287759e-01,4.240698078195838e-01,-7.026868956729548e-01,1.290338304727429e+00,-4.112801561446041e+00,2.741239114584560e+00,5.174500232161993e-01}, +{ 9.808518719515735e-02,-2.609125674727454e-01,4.173910000318659e-01,-6.955550906707043e-01,1.473822802750008e+00,-8.358030744498945e+00,7.325199412665364e+00} +}; +#endif +#if p_Nq==7 && p_cubNq==8 +const dfloat c_DI[8][7] = { +{ -7.958966822480723e+00,9.507400424306972e+00,-2.268885913846733e+00,1.150494690382494e+00,-7.092700580216608e-01,4.485171293485212e-01,-1.692894496888708e-01}, +{ -1.573449912492710e+00,-1.317864285732593e+00,3.933772929508256e+00,-1.622212949682414e+00,9.458841679080496e-01,-5.856280929422989e-01,2.194981434337105e-01}, +{ 7.184732800515370e-01,-2.867313331279468e+00,1.055813690918919e+00,1.629628957438890e+00,-8.600453540814834e-01,5.138138938629644e-01,-1.903711369113596e-01}, +{ -2.058291939831722e-01,6.800141103019953e-01,-2.846094661705683e+00,2.265227547936947e+00,2.077774139648464e-01,-1.650747543661308e-01,6.397953785119735e-02}, +{ -6.397953785119689e-02,1.650747543661299e-01,-2.077774139648440e-01,-2.265227547936949e+00,2.846094661705684e+00,-6.800141103019942e-01,2.058291939831703e-01}, +{ 1.903711369113602e-01,-5.138138938629646e-01,8.600453540814826e-01,-1.629628957438890e+00,-1.055813690918920e+00,2.867313331279465e+00,-7.184732800515334e-01}, +{ -2.194981434337113e-01,5.856280929422981e-01,-9.458841679080465e-01,1.622212949682412e+00,-3.933772929508257e+00,1.317864285732593e+00,1.573449912492712e+00}, +{ 1.692894496888719e-01,-4.485171293485252e-01,7.092700580216668e-01,-1.150494690382508e+00,2.268885913846761e+00,-9.507400424306992e+00,7.958966822480724e+00} +}; +#endif +#if p_Nq==7 && p_cubNq==9 +const dfloat c_DI[9][7] = { +{ -8.425875562416016e+00,1.036021884222719e+01,-2.869531491905105e+00,1.501360390414511e+00,-9.360738513446282e-01,5.947554557900551e-01,-2.248537827660018e-01}, +{ -2.613075945353255e+00,2.677851652417926e-01,3.339508718633390e+00,-1.572790370236443e+00,9.505787108361353e-01,-5.966618778632353e-01,2.246555987416150e-01}, +{ 6.170719750345540e-01,-3.522551024364185e+00,2.681445872984689e+00,3.946811649544585e-01,-2.864173969709397e-01,1.868599621173422e-01,-7.109055375591866e-02}, +{ 1.735895075104481e-01,-4.641187247727696e-01,-2.095391664517411e+00,2.963250586286815e+00,-8.624942270911838e-01,4.414045643533187e-01,-1.562400417692176e-01}, +{ -3.125000000000012e-01,9.075444712688207e-01,-2.006969240588750e+00,-3.263210127223896e-15,2.006969240588755e+00,-9.075444712688215e-01,3.125000000000004e-01}, +{ 1.562400417692182e-01,-4.414045643533189e-01,8.624942270911837e-01,-2.963250586286814e+00,2.095391664517408e+00,4.641187247727718e-01,-1.735895075104488e-01}, +{ 7.109055375591888e-02,-1.868599621173422e-01,2.864173969709392e-01,-3.946811649544558e-01,-2.681445872984695e+00,3.522551024364185e+00,-6.170719750345501e-01}, +{ -2.246555987416157e-01,5.966618778632348e-01,-9.505787108361338e-01,1.572790370236443e+00,-3.339508718633393e+00,-2.677851652417857e-01,2.613075945353250e+00}, +{ 2.248537827660044e-01,-5.947554557900574e-01,9.360738513446307e-01,-1.501360390414520e+00,2.869531491905122e+00,-1.036021884222718e+01,8.425875562416003e+00} +}; +#endif +#if p_Nq==7 && p_cubNq==10 +const dfloat c_DI[10][7] = { +{ -8.777929035722540e+00,1.100639373563824e+01,-3.330186335401597e+00,1.774145024235445e+00,-1.113295705021248e+00,7.092680034911540e-01,-2.683956872194507e-01}, +{ -3.558594023032438e+00,1.799669457431551e+00,2.590979898326290e+00,-1.332092492819793e+00,8.257396091201441e-01,-5.234411250342014e-01,1.977386760084474e-01}, +{ 2.227052852994668e-01,-3.472842161301807e+00,3.652044051615628e+00,-5.617704153642606e-01,2.473195579237615e-01,-1.368038470332464e-01,4.934752886045835e-02}, +{ 5.306684351290651e-01,-1.773472019568184e+00,-6.130258450885940e-01,2.570210049132547e+00,-1.115126257682473e+00,6.305179467738466e-01,-2.297723086962085e-01}, +{ -2.621002240317294e-01,8.304277883467515e-01,-2.825887077497901e+00,1.920837550686918e+00,5.482596639472863e-01,-3.352288188477644e-01,1.236911173964392e-01}, +{ -1.236911173964383e-01,3.352288188477616e-01,-5.482596639472795e-01,-1.920837550686925e+00,2.825887077497903e+00,-8.304277883467495e-01,2.621002240317275e-01}, +{ 2.297723086962091e-01,-6.305179467738460e-01,1.115126257682471e+00,-2.570210049132543e+00,6.130258450885865e-01,1.773472019568187e+00,-5.306684351290643e-01}, +{ -4.934752886045901e-02,1.368038470332481e-01,-2.473195579237642e-01,5.617704153642660e-01,-3.652044051615636e+00,3.472842161301805e+00,-2.227052852994599e-01}, +{ -1.977386760084492e-01,5.234411250342024e-01,-8.257396091201433e-01,1.332092492819793e+00,-2.590979898326293e+00,-1.799669457431538e+00,3.558594023032428e+00}, +{ 2.683956872194457e-01,-7.092680034911378e-01,1.113295705021225e+00,-1.774145024235419e+00,3.330186335401557e+00,-1.100639373563815e+01,8.777929035722483e+00} +}; +#endif +#if p_Nq==7 && p_cubNq==11 +const dfloat c_DI[11][7] = { +{ -9.049050551979686e+00,1.150576386211247e+01,-3.689256716613666e+00,1.988813388399178e+00,-1.253259367993385e+00,7.998431440735477e-01,-3.028537579984553e-01}, +{ -4.387909613173216e+00,3.191681873262833e+00,1.820550020118004e+00,-1.010557690801444e+00,6.407784722790915e-01,-4.098249062395382e-01,1.552818445542687e-01}, +{ -3.496903159083744e-01,-2.936751296044034e+00,4.073101598027419e+00,-1.178664926038293e+00,6.291331980538357e-01,-3.768331729606741e-01,1.397049148701214e-01}, +{ 7.126432309603175e-01,-2.799188129362384e+00,9.333134041355778e-01,1.710160739796048e+00,-8.909411418447548e-01,5.302395077471702e-01,-1.962276114319750e-01}, +{ 2.544039206979765e-03,7.540584345390436e-02,-2.550590973929565e+00,2.842950602407116e+00,-5.271183714341229e-01,2.377470995653049e-01,-8.093823926961741e-02}, +{ -3.125000000000009e-01,9.075444712688217e-01,-2.006969240588755e+00,3.819765267136848e-15,2.006969240588751e+00,-9.075444712688205e-01,3.124999999999999e-01}, +{ 8.093823926961852e-02,-2.377470995653050e-01,5.271183714341220e-01,-2.842950602407115e+00,2.550590973929563e+00,-7.540584345390354e-02,-2.544039206980431e-03}, +{ 1.962276114319754e-01,-5.302395077471693e-01,8.909411418447520e-01,-1.710160739796043e+00,-9.333134041355862e-01,2.799188129362386e+00,-7.126432309603150e-01}, +{ -1.397049148701228e-01,3.768331729606746e-01,-6.291331980538355e-01,1.178664926038296e+00,-4.073101598027424e+00,2.936751296044030e+00,3.496903159083820e-01}, +{ -1.552818445542692e-01,4.098249062395367e-01,-6.407784722790892e-01,1.010557690801440e+00,-1.820550020118000e+00,-3.191681873262832e+00,4.387909613173213e+00}, +{ 3.028537579984558e-01,-7.998431440735470e-01,1.253259367993385e+00,-1.988813388399186e+00,3.689256716613683e+00,-1.150576386211246e+01,9.049050551979674e+00} +}; +#endif +#if p_Nq==8 && p_cubNq==8 +const dfloat c_DI[8][8] = { +{ -9.582323409585538e+00,1.083983585435099e+01,-1.783564606634931e+00,8.296275346379445e-01,-5.032541521265417e-01,3.359814774081982e-01,-2.209163486401504e-01,8.461365059003234e-02}, +{ -4.906095493346552e-01,-3.782400508702779e+00,5.313358412099381e+00,-1.564918423912751e+00,8.520168073507624e-01,-5.457091003112893e-01,3.523599653529872e-01,-1.340976025416565e-01}, +{ 4.763902830640125e-01,-1.475365110959789e+00,-1.688545056690675e+00,3.570685973417450e+00,-1.368693756068840e+00,7.927075048404114e-01,-4.918596307806720e-01,1.846797931781022e-01}, +{ -3.420360099292031e-01,9.795027568868743e-01,-2.044135465448786e+00,-4.989337898562562e-01,2.663494213499109e+00,-1.189392181608271e+00,6.807279949214027e-01,-2.492275184648693e-01}, +{ 2.492275184648685e-01,-6.807279949214023e-01,1.189392181608269e+00,-2.663494213499104e+00,4.989337898562497e-01,2.044135465448790e+00,-9.795027568868743e-01,3.420360099292035e-01}, +{ -1.846797931781019e-01,4.918596307806715e-01,-7.927075048404092e-01,1.368693756068836e+00,-3.570685973417453e+00,1.688545056690683e+00,1.475365110959785e+00,-4.763902830640117e-01}, +{ 1.340976025416560e-01,-3.523599653529887e-01,5.457091003112906e-01,-8.520168073507633e-01,1.564918423912756e+00,-5.313358412099384e+00,3.782400508702775e+00,4.906095493346584e-01}, +{ -8.461365059003459e-02,2.209163486401542e-01,-3.359814774082029e-01,5.032541521265489e-01,-8.296275346379599e-01,1.783564606634959e+00,-1.083983585435102e+01,9.582323409585559e+00} +}; +#endif +#if p_Nq==8 && p_cubNq==9 +const dfloat c_DI[9][8] = { +{ -1.036916148501039e+01,1.225875899161110e+01,-2.756742913434121e+00,1.386600684977139e+00,-8.672163539736545e-01,5.872583599722545e-01,-3.887745526249832e-01,1.492772684826656e-01}, +{ -1.674802555945869e+00,-2.273582543123139e+00,5.286418576008073e+00,-2.075757683756811e+00,1.215318416620847e+00,-8.014122899686562e-01,5.242704170863951e-01,-2.004523369208397e-01}, +{ 8.661442182075746e-01,-3.232242172113430e+00,6.391244857206666e-01,2.519968180711703e+00,-1.273961385328682e+00,7.978124447673796e-01,-5.105690913003051e-01,1.937233193350919e-01}, +{ -3.358182273928728e-01,1.052152747214880e+00,-3.392639088210416e+00,2.170420557735753e+00,8.150228408907089e-01,-5.096578822344930e-01,3.220829227479370e-01,-1.215638707514973e-01}, +{ 3.906250000000055e-02,-1.248537463656932e-01,3.448188117424304e-01,-3.030359293393399e+00,3.030359293393398e+00,-3.448188117424293e-01,1.248537463656918e-01,-3.906249999999966e-02}, +{ 1.215638707514965e-01,-3.220829227479358e-01,5.096578822344906e-01,-8.150228408907049e-01,-2.170420557735759e+00,3.392639088210418e+00,-1.052152747214877e+00,3.358182273928715e-01}, +{ -1.937233193350918e-01,5.105690913003045e-01,-7.978124447673776e-01,1.273961385328680e+00,-2.519968180711705e+00,-6.391244857206643e-01,3.232242172113429e+00,-8.661442182075747e-01}, +{ 2.004523369208400e-01,-5.242704170863953e-01,8.014122899686541e-01,-1.215318416620846e+00,2.075757683756816e+00,-5.286418576008079e+00,2.273582543123144e+00,1.674802555945865e+00}, +{ -1.492772684826686e-01,3.887745526249819e-01,-5.872583599722512e-01,8.672163539736515e-01,-1.386600684977140e+00,2.756742913434121e+00,-1.225875899161109e+01,1.036916148501040e+01} +}; +#endif +#if p_Nq==8 && p_cubNq==10 +const dfloat c_DI[10][8] = { +{ -1.097025370765744e+01,1.335036450635468e+01,-3.518828846105039e+00,1.831676997583758e+00,-1.160242275573851e+00,7.902469321469575e-01,-5.245919111487529e-01,2.016283043996929e-01}, +{ -2.885013820531691e+00,-5.005243685371882e-01,4.744625207155154e+00,-2.144512929450196e+00,1.303799816248686e+00,-8.730331041180927e-01,5.750819529053727e-01,-2.204227536720453e-01}, +{ 8.980063777289373e-01,-4.294502254585712e+00,2.685468731970841e+00,1.130140542132405e+00,-6.938944178405220e-01,4.618375665965753e-01,-3.030172312996827e-01,1.159606852971588e-01}, +{ 9.028582572458213e-03,7.580295463622524e-02,-3.134632792667707e+00,3.514698476734328e+00,-6.661604216750288e-01,3.161339062584216e-01,-1.809684235023886e-01,6.609771764369153e-02}, +{ -2.862835088752814e-01,8.067226823374559e-01,-1.570810560700869e+00,-1.134968655679097e+00,2.957784258195356e+00,-1.197692673650741e+00,6.678888268748663e-01,-2.426403685016886e-01}, +{ 2.426403685016880e-01,-6.678888268748668e-01,1.197692673650740e+00,-2.957784258195348e+00,1.134968655679086e+00,1.570810560700877e+00,-8.067226823374578e-01,2.862835088752820e-01}, +{ -6.609771764369197e-02,1.809684235023900e-01,-3.161339062584219e-01,6.661604216750298e-01,-3.514698476734333e+00,3.134632792667708e+00,-7.580295463622042e-02,-9.028582572459656e-03}, +{ -1.159606852971579e-01,3.030172312996809e-01,-4.618375665965715e-01,6.938944178405156e-01,-1.130140542132396e+00,-2.685468731970850e+00,4.294502254585714e+00,-8.980063777289354e-01}, +{ 2.204227536720457e-01,-5.750819529053731e-01,8.730331041180923e-01,-1.303799816248687e+00,2.144512929450204e+00,-4.744625207155164e+00,5.005243685372012e-01,2.885013820531682e+00}, +{ -2.016283043996878e-01,5.245919111487367e-01,-7.902469321469314e-01,1.160242275573812e+00,-1.831676997583704e+00,3.518828846104944e+00,-1.335036450635454e+01,1.097025370765737e+01} +}; +#endif +#if p_Nq==8 && p_cubNq==11 +const dfloat c_DI[11][8] = { +{ -1.143758206669569e+01,1.420328710208303e+01,-4.121697826061057e+00,2.188676239034100e+00,-1.396506375785088e+00,9.543022560054044e-01,-6.344823921099559e-01,2.440030635292607e-01}, +{ -4.027779728649322e+00,1.293904974886595e+00,3.959589566234506e+00,-1.957253120640174e+00,1.220869465951525e+00,-8.262498836020166e-01,5.469056737695399e-01,-2.099869479506534e-01}, +{ 6.010341995220678e-01,-4.612590309172756e+00,4.132780185961424e+00,-1.083666577851232e-01,-3.827154810099364e-02,4.761822100320082e-02,-3.725525935051775e-02,1.505116792269785e-02}, +{ 4.422111600506859e-01,-1.350916028343909e+00,-1.826333861369935e+00,3.605180089981810e+00,-1.344187874450870e+00,7.728617957186215e-01,-4.781933234732061e-01,1.793780418868036e-01}, +{ -3.920002869803024e-01,1.175043608500042e+00,-3.031510549364044e+00,1.176905780882888e+00,1.612647151690040e+00,-8.722898853024833e-01,5.277049972044243e-01,-1.965008166305649e-01}, +{ 3.906249999999924e-02,-1.248537463656905e-01,3.448188117424253e-01,-3.030359293393395e+00,3.030359293393401e+00,-3.448188117424336e-01,1.248537463656941e-01,-3.906250000000101e-02}, +{ 1.965008166305647e-01,-5.277049972044235e-01,8.722898853024807e-01,-1.612647151690037e+00,-1.176905780882892e+00,3.031510549364045e+00,-1.175043608500040e+00,3.920002869803018e-01}, +{ -1.793780418868030e-01,4.781933234732065e-01,-7.728617957186205e-01,1.344187874450869e+00,-3.605180089981812e+00,1.826333861369933e+00,1.350916028343914e+00,-4.422111600506871e-01}, +{ -1.505116792269658e-02,3.725525935051600e-02,-4.761822100319782e-02,3.827154810098832e-02,1.083666577851323e-01,-4.132780185961431e+00,4.612590309172753e+00,-6.010341995220638e-01}, +{ 2.099869479506532e-01,-5.469056737695400e-01,8.262498836020160e-01,-1.220869465951526e+00,1.957253120640178e+00,-3.959589566234508e+00,-1.293904974886597e+00,4.027779728649323e+00}, +{ -2.440030635292599e-01,6.344823921099559e-01,-9.543022560054029e-01,1.396506375785085e+00,-2.188676239034101e+00,4.121697826061060e+00,-1.420328710208303e+01,1.143758206669569e+01} +}; +#endif +#if p_Nq==8 && p_cubNq==12 +const dfloat c_DI[12][8] = { +{ -1.180694798708752e+01,1.487989701041257e+01,-4.604277255522874e+00,2.477304313870949e+00,-1.588246647638094e+00,1.087671829215860e+00,-7.238916557877427e-01,2.784903925368531e-01}, +{ -5.064972419885655e+00,2.990876618577595e+00,3.089120031540066e+00,-1.636924509887976e+00,1.042515087260670e+00,-7.117442829654703e-01,4.730020695735014e-01,-1.818725942127304e-01}, +{ 6.679014412624308e-02,-4.334500638122956e+00,4.982285208578148e+00,-1.036231486421303e+00,5.128169491587944e-01,-3.156028758865309e-01,2.000943350952958e-01,-7.565163652769093e-02}, +{ 7.750926653934223e-01,-2.718398432766328e+00,-1.251027586416596e-01,2.940170417450600e+00,-1.386792911653795e+00,8.506170389381061e-01,-5.398266818463608e-01,2.042406631260152e-01}, +{ -2.346376608640497e-01,7.840287656735309e-01,-3.464740606978844e+00,2.837522704337007e+00,1.790364653485209e-01,-1.770612866432487e-01,1.241330540605533e-01,-4.828143493346966e-02}, +{ -2.402969355849996e-01,6.691896163983808e-01,-1.239602116728840e+00,-1.539599421141425e+00,3.104733735032819e+00,-1.158781329857287e+00,6.328629143431379e-01,-2.285064624617858e-01}, +{ 2.285064624617858e-01,-6.328629143431380e-01,1.158781329857285e+00,-3.104733735032815e+00,1.539599421141423e+00,1.239602116728842e+00,-6.691896163983814e-01,2.402969355849994e-01}, +{ 4.828143493346806e-02,-1.241330540605516e-01,1.770612866432465e-01,-1.790364653485167e-01,-2.837522704337014e+00,3.464740606978846e+00,-7.840287656735271e-01,2.346376608640488e-01}, +{ -2.042406631260153e-01,5.398266818463608e-01,-8.506170389381040e-01,1.386792911653792e+00,-2.940170417450599e+00,1.251027586416569e-01,2.718398432766331e+00,-7.750926653934217e-01}, +{ 7.565163652769047e-02,-2.000943350952959e-01,3.156028758865294e-01,-5.128169491587939e-01,1.036231486421307e+00,-4.982285208578149e+00,4.334500638122953e+00,-6.679014412624174e-02}, +{ 1.818725942127308e-01,-4.730020695735025e-01,7.117442829654715e-01,-1.042515087260673e+00,1.636924509887985e+00,-3.089120031540082e+00,-2.990876618577567e+00,5.064972419885637e+00}, +{ -2.784903925368454e-01,7.238916557877316e-01,-1.087671829215842e+00,1.588246647638071e+00,-2.477304313870923e+00,4.604277255522828e+00,-1.487989701041250e+01,1.180694798708748e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==9 +const dfloat c_DI[9][9] = { +{ -1.213779930711334e+01,1.363468993110642e+01,-2.110796837224807e+00,9.664309960964160e-01,-5.863513930495434e-01,3.992888320380585e-01,-2.830283121094550e-01,1.919819946478237e-01,-7.441590439156459e-02}, +{ -4.486575624950035e-01,-4.971534767043011e+00,6.656019132119932e+00,-1.849527566569365e+00,9.982710493135712e-01,-6.494569512084502e-01,4.505371051178682e-01,-3.024222123856702e-01,1.167717731501273e-01}, +{ 5.040855911555342e-01,-1.517257113934003e+00,-2.412823975209051e+00,4.481622778007550e+00,-1.630652512799264e+00,9.482185710335461e-01,-6.273790521014740e-01,4.120266016187904e-01,-1.578408877716277e-01}, +{ -3.699850806999482e-01,1.048599459384466e+00,-2.097069254582873e+00,-1.037038938538828e+00,3.374746241602653e+00,-1.447039179398784e+00,8.687413055969824e-01,-5.479464332524581e-01,2.069918798887884e-01}, +{ 2.734374999999996e-01,-7.417823979162540e-01,1.269413086358149e+00,-2.659310217573916e+00,-3.061132580905937e-15,2.659310217573921e+00,-1.269413086358151e+00,7.417823979162544e-01,-2.734374999999996e-01}, +{ -2.069918798887884e-01,5.479464332524572e-01,-8.687413055969803e-01,1.447039179398783e+00,-3.374746241602649e+00,1.037038938538820e+00,2.097069254582880e+00,-1.048599459384469e+00,3.699850806999467e-01}, +{ 1.578408877716282e-01,-4.120266016187889e-01,6.273790521014716e-01,-9.482185710335450e-01,1.630652512799263e+00,-4.481622778007548e+00,2.412823975209049e+00,1.517257113934003e+00,-5.040855911555320e-01}, +{ -1.167717731501274e-01,3.024222123856689e-01,-4.505371051178664e-01,6.494569512084483e-01,-9.982710493135697e-01,1.849527566569364e+00,-6.656019132119945e+00,4.971534767043021e+00,4.486575624950058e-01}, +{ 7.441590439156764e-02,-1.919819946478237e-01,2.830283121094556e-01,-3.992888320380606e-01,5.863513930495470e-01,-9.664309960964252e-01,2.110796837224838e+00,-1.363468993110642e+01,1.213779930711332e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==10 +const dfloat c_DI[10][9] = { +{ -1.308025054062210e+01,1.532727548130948e+01,-3.263878764273492e+00,1.625075629528097e+00,-1.019733960992839e+00,7.057957874872639e-01,-5.045747259926229e-01,3.437605130851173e-01,-1.334694195289097e-01}, +{ -1.753241968325556e+00,-3.392464776856402e+00,6.792723319758181e+00,-2.544395762058518e+00,1.481871047441724e+00,-9.948166692634817e-01,7.007586189132333e-01,-4.739551533557764e-01,1.835213437465952e-01}, +{ 9.987463298546819e-01,-3.557941869202173e+00,9.896085211250896e-02,3.525406232829373e+00,-1.706043535950489e+00,1.074847960777058e+00,-7.356881052639492e-01,4.908034547486046e-01,-1.890913199056149e-01}, +{ -4.543183120746976e-01,1.384074558608327e+00,-3.877866509908423e+00,1.955116745265806e+00,1.535537129910349e+00,-8.932101385678870e-01,5.889042608780937e-01,-3.859951430391065e-01,1.477574089275390e-01}, +{ 1.373043268778430e-01,-3.955106800735743e-01,8.263518967234251e-01,-3.690979464524028e+00,3.071577656229553e+00,1.447347528632994e-01,-1.664826963218262e-01,1.201555416769210e-01,-4.715133345161293e-02}, +{ 4.715133345161204e-02,-1.201555416769183e-01,1.664826963218213e-01,-1.447347528632895e-01,-3.071577656229561e+00,3.690979464524026e+00,-8.263518967234205e-01,3.955106800735713e-01,-1.373043268778416e-01}, +{ -1.477574089275394e-01,3.859951430391061e-01,-5.889042608780914e-01,8.932101385678853e-01,-1.535537129910344e+00,-1.955116745265813e+00,3.877866509908429e+00,-1.384074558608328e+00,4.543183120746961e-01}, +{ 1.890913199056142e-01,-4.908034547486020e-01,7.356881052639456e-01,-1.074847960777055e+00,1.706043535950484e+00,-3.525406232829361e+00,-9.896085211253086e-02,3.557941869202184e+00,-9.987463298546783e-01}, +{ -1.835213437465947e-01,4.739551533557739e-01,-7.007586189132305e-01,9.948166692634779e-01,-1.481871047441722e+00,2.544395762058518e+00,-6.792723319758196e+00,3.392464776856423e+00,1.753241968325549e+00}, +{ 1.334694195289029e-01,-3.437605130850985e-01,5.045747259925962e-01,-7.057957874872283e-01,1.019733960992790e+00,-1.625075629528027e+00,3.263878764273387e+00,-1.532727548130928e+01,1.308025054062196e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==11 +const dfloat c_DI[11][9] = { +{ -1.382115099177577e+01,1.666702186668847e+01,-4.192502857446798e+00,2.166042316537800e+00,-1.378298365633427e+00,9.602517382082888e-01,-6.888199201301601e-01,4.700965176065838e-01,-1.826403040549830e-01}, +{ -3.121744491333112e+00,-1.460804491882441e+00,6.332024327723754e+00,-2.750045718521860e+00,1.665520623116128e+00,-1.136398755022220e+00,8.068603954444281e-01,-5.478610608710269e-01,2.124491713463483e-01}, +{ 1.160833352175373e+00,-5.004376833277458e+00,2.514810983662448e+00,2.039820513681427e+00,-1.168721861089424e+00,7.768614800790095e-01,-5.444182756085049e-01,3.672591613863753e-01,-1.420685210092458e-01}, +{ -1.682533969377186e-01,6.233761226033752e-01,-4.123133152283866e+00,3.891663264984679e+00,-2.702373385428469e-01,5.904057750586120e-02,-1.547503396543434e-02,3.413655585280074e-03,-3.946989493298914e-04}, +{ -2.168075980253462e-01,5.935468371046155e-01,-1.024808813031292e+00,-2.289728192146326e+00,3.768698935763975e+00,-1.271060885648648e+00,7.152143686143457e-01,-4.397343575066089e-01,1.646797048752831e-01}, +{ 2.734375000000002e-01,-7.417823979162547e-01,1.269413086358150e+00,-2.659310217573922e+00,7.106935203690838e-15,2.659310217573915e+00,-1.269413086358150e+00,7.417823979162541e-01,-2.734374999999994e-01}, +{ -1.646797048752831e-01,4.397343575066086e-01,-7.152143686143446e-01,1.271060885648648e+00,-3.768698935763974e+00,2.289728192146322e+00,1.024808813031295e+00,-5.935468371046169e-01,2.168075980253448e-01}, +{ 3.946989493306963e-04,-3.413655585280409e-03,1.547503396543528e-02,-5.904057750586292e-02,2.702373385428495e-01,-3.891663264984683e+00,4.123133152283867e+00,-6.233761226033744e-01,1.682533969377172e-01}, +{ 1.420685210092463e-01,-3.672591613863738e-01,5.444182756085014e-01,-7.768614800790066e-01,1.168721861089419e+00,-2.039820513681415e+00,-2.514810983662470e+00,5.004376833277466e+00,-1.160833352175366e+00}, +{ -2.124491713463499e-01,5.478610608710256e-01,-8.068603954444269e-01,1.136398755022218e+00,-1.665520623116127e+00,2.750045718521859e+00,-6.332024327723759e+00,1.460804491882444e+00,3.121744491333115e+00}, +{ 1.826403040549834e-01,-4.700965176065837e-01,6.888199201301611e-01,-9.602517382082933e-01,1.378298365633435e+00,-2.166042316537815e+00,4.192502857446839e+00,-1.666702186668848e+01,1.382115099177575e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==12 +const dfloat c_DI[12][9] = { +{ -1.441161959414025e+01,1.774009285266386e+01,-4.945582674701227e+00,2.610870360511975e+00,-1.674687257323615e+00,1.171099814550000e+00,-8.416823344767372e-01,5.749806930233244e-01,-2.234718601073289e-01}, +{ -4.451076558028245e+00,5.665557157988829e-01,5.545794600912525e+00,-2.642944260446835e+00,1.643093654404769e+00,-1.133574731123450e+00,8.092549131301091e-01,-5.509772618996175e-01,2.138739272518625e-01}, +{ 9.776053745268820e-01,-5.709045494695973e+00,4.420707628264059e+00,5.657363111232462e-01,-4.388739877540132e-01,3.184415821790230e-01,-2.317281744692173e-01,1.590948792149605e-01,-6.193811838896784e-02}, +{ 2.940971756036401e-01,-8.013130060941980e-01,-3.116522259850887e+00,4.533318442684531e+00,-1.376920531603962e+00,7.631873304121700e-01,-4.949469897926020e-01,3.220686082471296e-01,-1.229687696058218e-01}, +{ -4.508542489892364e-01,1.309686311380358e+00,-2.918829784207632e+00,1.306846429061366e-01,2.776695782380839e+00,-1.358000403008378e+00,8.472375308722182e-01,-5.427339321107953e-01,2.061141007764913e-01}, +{ 1.876665801662179e-01,-5.304684295789797e-01,1.042146575734223e+00,-3.715006535326365e+00,2.703473084053196e+00,5.381521046050031e-01,-3.798798461428468e-01,2.490222919844873e-01,-9.510582549493529e-02}, +{ 9.510582549493574e-02,-2.490222919844867e-01,3.798798461428447e-01,-5.381521046049987e-01,-2.703473084053199e+00,3.715006535326366e+00,-1.042146575734223e+00,5.304684295789787e-01,-1.876665801662172e-01}, +{ -2.061141007764922e-01,5.427339321107947e-01,-8.472375308722160e-01,1.358000403008377e+00,-2.776695782380834e+00,-1.306846429061460e-01,2.918829784207641e+00,-1.309686311380361e+00,4.508542489892361e-01}, +{ 1.229687696058229e-01,-3.220686082471288e-01,4.949469897925995e-01,-7.631873304121687e-01,1.376920531603961e+00,-4.533318442684529e+00,3.116522259850882e+00,8.013130060942009e-01,-2.940971756036399e-01}, +{ 6.193811838896771e-02,-1.590948792149602e-01,2.317281744692169e-01,-3.184415821790238e-01,4.388739877540123e-01,-5.657363111232447e-01,-4.420707628264068e+00,5.709045494695975e+00,-9.776053745268751e-01}, +{ -2.138739272518632e-01,5.509772618996153e-01,-8.092549131301051e-01,1.133574731123448e+00,-1.643093654404767e+00,2.642944260446835e+00,-5.545794600912537e+00,-5.665557157988458e-01,4.451076558028221e+00}, +{ 2.234718601073278e-01,-5.749806930233153e-01,8.416823344767241e-01,-1.171099814549982e+00,1.674687257323588e+00,-2.610870360511940e+00,4.945582674701187e+00,-1.774009285266375e+01,1.441161959414016e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==13 +const dfloat c_DI[13][9] = { +{ -1.488841707097972e+01,1.860985814673571e+01,-5.561665703708978e+00,2.978509794614875e+00,-1.920602153406352e+00,1.346361542351237e+00,-9.688651628929407e-01,6.622871755172247e-01,-2.574665682310652e-01}, +{ -5.690535929058131e+00,2.546507665388440e+00,4.608269146380853e+00,-2.351748465309316e+00,1.492000881392361e+00,-1.038342043600569e+00,7.444786773824908e-01,-5.079678895387167e-01,1.973379569625876e-01}, +{ 5.138428833307414e-01,-5.747635508081446e+00,5.724819029287803e+00,-6.596073071308688e-01,2.552552798917000e-01,-1.400197717294624e-01,8.843788940135738e-02,-5.649369393857585e-02,2.140119896875067e-02}, +{ 7.379136989772868e-01,-2.391082578184621e+00,-1.429338253157906e+00,4.208646897846431e+00,-1.770158136270990e+00,1.070701203014751e+00,-7.202000741280251e-01,4.765963461850047e-01,-1.830791042819305e-01}, +{ -4.295431315351939e-01,1.328677826218416e+00,-4.020761109237299e+00,2.353617936048425e+00,1.215661165458721e+00,-7.429701777795482e-01,4.977223372571480e-01,-3.284096396676784e-01,1.260047932370083e-01}, +{ -9.363702350610481e-02,2.418906771399954e-01,-3.125465451123147e-01,-2.971140118223268e+00,3.777792897470676e+00,-9.514037102904782e-01,4.944411661113937e-01,-2.944955376416674e-01,1.090981940517675e-01}, +{ 2.734374999999998e-01,-7.417823979162546e-01,1.269413086358150e+00,-2.659310217573921e+00,5.432403858362169e-15,2.659310217573915e+00,-1.269413086358150e+00,7.417823979162539e-01,-2.734374999999992e-01}, +{ -1.090981940517666e-01,2.944955376416631e-01,-4.944411661113867e-01,9.514037102904704e-01,-3.777792897470674e+00,2.971140118223279e+00,3.125465451123013e-01,-2.418906771399890e-01,9.363702350610215e-02}, +{ -1.260047932370085e-01,3.284096396676792e-01,-4.977223372571486e-01,7.429701777795493e-01,-1.215661165458722e+00,-2.353617936048425e+00,4.020761109237301e+00,-1.328677826218418e+00,4.295431315351924e-01}, +{ 1.830791042819311e-01,-4.765963461850029e-01,7.202000741280230e-01,-1.070701203014750e+00,1.770158136270988e+00,-4.208646897846429e+00,1.429338253157901e+00,2.391082578184622e+00,-7.379136989772831e-01}, +{ -2.140119896874904e-02,5.649369393857557e-02,-8.843788940135942e-02,1.400197717294648e-01,-2.552552798917037e-01,6.596073071308757e-01,-5.724819029287820e+00,5.747635508081451e+00,-5.138428833307341e-01}, +{ -1.973379569625862e-01,5.079678895387142e-01,-7.444786773824869e-01,1.038342043600564e+00,-1.492000881392356e+00,2.351748465309307e+00,-4.608269146380828e+00,-2.546507665388479e+00,5.690535929058152e+00}, +{ 2.574665682310715e-01,-6.622871755172335e-01,9.688651628929517e-01,-1.346361542351259e+00,1.920602153406391e+00,-2.978509794614936e+00,5.561665703709091e+00,-1.860985814673581e+01,1.488841707097973e+01} +}; +#endif +#if p_Nq==9 && p_cubNq==14 +const dfloat c_DI[14][9] = { +{ -1.527816602431993e+01,1.932290953136529e+01,-6.070351034299732e+00,3.284426386817098e+00,-2.125843857359714e+00,1.492842091634669e+00,-1.075239766366719e+00,7.353366632038851e-01,-2.859139906748442e-01}, +{ -6.820040808253876e+00,4.406959511431096e+00,3.624260164564197e+00,-1.962074464081275e+00,1.267320860876612e+00,-8.888667533818824e-01,6.397806452063747e-01,-4.373767670958007e-01,1.700376107345543e-01}, +{ -1.479105636218397e-01,-5.269986023280382e+00,6.483237404450914e+00,-1.571802602845343e+00,8.178970036031519e-01,-5.239260557063583e-01,3.606828094260267e-01,-2.411867306188861e-01,9.299475859271678e-02}, +{ 1.042987352647991e+00,-3.802506094605711e+00,4.561015069201947e-01,3.333475025678799e+00,-1.656058552454939e+00,1.051768259409281e+00,-7.224066582649775e-01,4.827311217957109e-01,-1.860919611263482e-01}, +{ -1.886946294728634e-01,6.819773242964802e-01,-4.145210024681427e+00,3.826699764667281e+00,-1.941878930867999e-01,1.425638671821420e-02,1.411480069947494e-02,-1.598847214377169e-02,7.032743003412209e-03}, +{ -3.574958411177473e-01,1.010309817935060e+00,-1.996036056485601e+00,-1.166830785031840e+00,3.429707453906738e+00,-1.444347912140385e+00,8.627624804738676e-01,-5.430839439537221e-01,2.050147864136305e-01}, +{ 2.179463204151513e-01,-6.105228724466649e-01,1.163609800113808e+00,-3.682384899367326e+00,2.397780908789835e+00,8.355542418389329e-01,-5.331993382552491e-01,3.399681573166818e-01,-1.287523184051684e-01}, +{ 1.287523184051680e-01,-3.399681573166821e-01,5.331993382552482e-01,-8.355542418389313e-01,-2.397780908789835e+00,3.682384899367326e+00,-1.163609800113810e+00,6.105228724466646e-01,-2.179463204151493e-01}, +{ -2.050147864136309e-01,5.430839439537215e-01,-8.627624804738659e-01,1.444347912140386e+00,-3.429707453906738e+00,1.166830785031838e+00,1.996036056485603e+00,-1.010309817935061e+00,3.574958411177465e-01}, +{ -7.032743003412167e-03,1.598847214377483e-02,-1.411480069947975e-02,-1.425638671820920e-02,1.941878930867915e-01,-3.826699764667273e+00,4.145210024681430e+00,-6.819773242964862e-01,1.886946294728644e-01}, +{ 1.860919611263482e-01,-4.827311217957080e-01,7.224066582649732e-01,-1.051768259409278e+00,1.656058552454934e+00,-3.333475025678790e+00,-4.561015069202122e-01,3.802506094605719e+00,-1.042987352647986e+00}, +{ -9.299475859271550e-02,2.411867306188838e-01,-3.606828094260245e-01,5.239260557063556e-01,-8.178970036031488e-01,1.571802602845339e+00,-6.483237404450922e+00,5.269986023280393e+00,1.479105636218388e-01}, +{ -1.700376107345546e-01,4.373767670957973e-01,-6.397806452063702e-01,8.888667533818764e-01,-1.267320860876603e+00,1.962074464081263e+00,-3.624260164564173e+00,-4.406959511431123e+00,6.820040808253886e+00}, +{ 2.859139906748447e-01,-7.353366632038842e-01,1.075239766366717e+00,-1.492842091634669e+00,2.125843857359718e+00,-3.284426386817111e+00,6.070351034299777e+00,-1.932290953136529e+01,1.527816602431990e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==10 +const dfloat c_DI[10][10] = { +{ -1.499171043099842e+01,1.674265016958686e+01,-2.455667013255448e+00,1.106748472234893e+00,-6.691277900034381e-01,4.587000700604802e-01,-3.335095515679875e-01,2.454691387402785e-01,-1.699773458193766e-01,6.642428102216624e-02}, +{ -3.912511351593587e-01,-6.309204469348633e+00,8.141093304956803e+00,-2.145119335641087e+00,1.145108022419828e+00,-7.475977022992526e-01,5.308290566604936e-01,-3.858197398198189e-01,2.654303229132979e-01,-1.034683246822724e-01}, +{ 5.286934911105448e-01,-1.550387259990013e+00,-3.227143843208404e+00,5.483355969091409e+00,-1.898562432486503e+00,1.097685854622888e+00,-7.397309175029992e-01,5.237721905860973e-01,-3.556261339076193e-01,1.379430816845990e-01}, +{ -3.966368147876117e-01,1.114847497703397e+00,-2.155118190223682e+00,-1.627968116108324e+00,4.148567262987707e+00,-1.701740853337717e+00,1.028650499052122e+00,-6.939561197100476e-01,4.604599651380158e-01,-1.771051307138592e-01}, +{ 2.963700286793647e-01,-8.000125397990632e-01,1.348379019345462e+00,-2.691647194988813e+00,-5.026866926212062e-01,3.309284257071503e+00,-1.527091794880901e+00,9.391178599469279e-01,-5.988497170623768e-01,2.271367743091037e-01}, +{ -2.271367743091034e-01,5.988497170623763e-01,-9.391178599469289e-01,1.527091794880901e+00,-3.309284257071492e+00,5.026866926211868e-01,2.691647194988827e+00,-1.348379019345465e+00,8.000125397990657e-01,-2.963700286793663e-01}, +{ 1.771051307138585e-01,-4.604599651380162e-01,6.939561197100501e-01,-1.028650499052125e+00,1.701740853337720e+00,-4.148567262987704e+00,1.627968116108313e+00,2.155118190223691e+00,-1.114847497703402e+00,3.966368147876147e-01}, +{ -1.379430816845985e-01,3.556261339076205e-01,-5.237721905861009e-01,7.397309175030032e-01,-1.097685854622892e+00,1.898562432486507e+00,-5.483355969091406e+00,3.227143843208390e+00,1.550387259990027e+00,-5.286934911105506e-01}, +{ 1.034683246822716e-01,-2.654303229132973e-01,3.858197398198178e-01,-5.308290566604896e-01,7.475977022992437e-01,-1.145108022419818e+00,2.145119335641073e+00,-8.141093304956792e+00,6.309204469348652e+00,3.912511351593396e-01}, +{ -6.642428102215803e-02,1.699773458193583e-01,-2.454691387402477e-01,3.335095515679436e-01,-4.587000700604213e-01,6.691277900033601e-01,-1.106748472234775e+00,2.455667013255228e+00,-1.674265016958656e+01,1.499171043099827e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==11 +const dfloat c_DI[11][10] = { +{ -1.609161191443060e+01,1.871187338483086e+01,-3.789974481494352e+00,1.866985337122482e+00,-1.170753131147529e+00,8.171756262925937e-01,-6.000228288048876e-01,4.440930958929367e-01,-3.084371019963555e-01,1.206720137348539e-01}, +{ -1.810622309662448e+00,-4.670797630472306e+00,8.449647757978804e+00,-3.027385615162395e+00,1.748978797969179e+00,-1.180318458534459e+00,8.522291925858090e-01,-6.250772973559421e-01,4.320886233488350e-01,-1.687430606950758e-01}, +{ 1.119361830214302e+00,-3.851394117788030e+00,-5.569865386147644e-01,4.639446253808694e+00,-2.153571580506643e+00,1.350981793603953e+00,-9.431913164658677e-01,6.798393712684451e-01,-4.657643210768608e-01,1.812786255567704e-01}, +{ -5.632778319969206e-01,1.684879398556095e+00,-4.315749326792897e+00,1.633312779547420e+00,2.354985068948468e+00,-1.301569856308630e+00,8.619102342975237e-01,-6.052776314309657e-01,4.093235375472080e-01,-1.585363723673022e-01}, +{ 2.297642068797079e-01,-6.469349432604246e-01,1.255337050725984e+00,-4.276616850887461e+00,2.997140076198822e+00,7.452381637934187e-01,-5.154492858932891e-01,3.601373046403998e-01,-2.421848694899515e-01,9.356914729279342e-02}, +{ -2.734374999999944e-02,7.920519507855044e-02,-1.593770271465146e-01,4.339783761858380e-01,-3.842918861837781e+00,3.842918861837786e+00,-4.339783761858439e-01,1.593770271465183e-01,-7.920519507855341e-02,2.734375000000078e-02}, +{ -9.356914729279237e-02,2.421848694899503e-01,-3.601373046403984e-01,5.154492858932866e-01,-7.452381637934142e-01,-2.997140076198827e+00,4.276616850887460e+00,-1.255337050725980e+00,6.469349432604230e-01,-2.297642068797082e-01}, +{ 1.585363723673014e-01,-4.093235375472074e-01,6.052776314309666e-01,-8.619102342975242e-01,1.301569856308630e+00,-2.354985068948465e+00,-1.633312779547428e+00,4.315749326792900e+00,-1.684879398556097e+00,5.632778319969229e-01}, +{ -1.812786255567702e-01,4.657643210768607e-01,-6.798393712684475e-01,9.431913164658705e-01,-1.350981793603954e+00,2.153571580506644e+00,-4.639446253808688e+00,5.569865386147512e-01,3.851394117788043e+00,-1.119361830214310e+00}, +{ 1.687430606950757e-01,-4.320886233488350e-01,6.250772973559418e-01,-8.522291925858079e-01,1.180318458534457e+00,-1.748978797969176e+00,3.027385615162389e+00,-8.449647757978797e+00,4.670797630472310e+00,1.810622309662442e+00}, +{ -1.206720137348576e-01,3.084371019963612e-01,-4.440930958929403e-01,6.000228288048879e-01,-8.171756262925904e-01,1.170753131147528e+00,-1.866985337122479e+00,3.789974481494337e+00,-1.871187338483087e+01,1.609161191443063e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==12 +const dfloat c_DI[12][10] = { +{ -1.697658323578462e+01,2.030681529371152e+01,-4.888928497290483e+00,2.505154283539983e+00,-1.594830740076356e+00,1.121246529348830e+00,-8.264901939898667e-01,6.130394625446950e-01,-4.262709817529403e-01,1.668480797492368e-01}, +{ -3.327447145424347e+00,-2.603367253395844e+00,8.091909946081977e+00,-3.383410879079420e+00,2.035110276064449e+00,-1.396788239186948e+00,1.017142024311895e+00,-7.494830544668576e-01,5.193401421217305e-01,-2.030058170266357e-01}, +{ 1.406259896945400e+00,-5.658291609836236e+00,2.184292072911678e+00,3.104940395633161e+00,-1.694274527327195e+00,1.118100490372797e+00,-7.986112658503708e-01,5.823991701885287e-01,-4.013921062078225e-01,1.565774831700589e-01}, +{ -3.485283472054942e-01,1.160551746661938e+00,-5.053164697639050e+00,4.105050126327904e+00,2.991727177749410e-01,-2.906649406751098e-01,2.246309461260829e-01,-1.686000200884500e-01,1.176391612790569e-01,-4.608669256181908e-02}, +{ -1.212674422922526e-01,3.156788379103930e-01,-4.274945674634338e-01,-3.421124861976859e+00,4.427098417348022e+00,-1.151422979838149e+00,6.137865434214090e-01,-3.923675778673888e-01,2.538610733694098e-01,-9.674744261114998e-02}, +{ 2.628207749202450e-01,-7.052875848844093e-01,1.166382033009799e+00,-2.186123884538664e+00,-1.205879394373838e+00,3.654911427601637e+00,-1.553138681022428e+00,9.326944590984161e-01,-5.890740364644411e-01,2.226948866536825e-01}, +{ -2.226948866536812e-01,5.890740364644415e-01,-9.326944590984190e-01,1.553138681022431e+00,-3.654911427601636e+00,1.205879394373830e+00,2.186123884538671e+00,-1.166382033009801e+00,7.052875848844113e-01,-2.628207749202472e-01}, +{ 9.674744261115054e-02,-2.538610733694113e-01,3.923675778673928e-01,-6.137865434214149e-01,1.151422979838155e+00,-4.427098417348023e+00,3.421124861976850e+00,4.274945674634436e-01,-3.156788379103979e-01,1.212674422922545e-01}, +{ 4.608669256181835e-02,-1.176391612790560e-01,1.686000200884490e-01,-2.246309461260811e-01,2.906649406751088e-01,-2.991727177749380e-01,-4.105050126327907e+00,5.053164697639048e+00,-1.160551746661937e+00,3.485283472054952e-01}, +{ -1.565774831700586e-01,4.013921062078215e-01,-5.823991701885305e-01,7.986112658503735e-01,-1.118100490372798e+00,1.694274527327197e+00,-3.104940395633165e+00,-2.184292072911668e+00,5.658291609836235e+00,-1.406259896945406e+00}, +{ 2.030058170266349e-01,-5.193401421217290e-01,7.494830544668564e-01,-1.017142024311895e+00,1.396788239186942e+00,-2.035110276064440e+00,3.383410879079408e+00,-8.091909946081978e+00,2.603367253395884e+00,3.327447145424317e+00}, +{ -1.668480797492329e-01,4.262709817529301e-01,-6.130394625446769e-01,8.264901939898428e-01,-1.121246529348799e+00,1.594830740076307e+00,-2.505154283539900e+00,4.888928497290337e+00,-2.030681529371136e+01,1.697658323578455e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==13 +const dfloat c_DI[13][10] = { +{ -1.769641879186821e+01,2.161058190102161e+01,-5.798409629154273e+00,3.040611641692720e+00,-1.952509164674627e+00,1.378339748314406e+00,-1.018220763962788e+00,7.561768283008493e-01,-5.261430743193783e-01,2.059913046496918e-01}, +{ -4.833214037014324e+00,-3.694761061785026e-01,7.334054786168574e+00,-3.377614448607781e+00,2.086673800777821e+00,-1.448561715713917e+00,1.060954024677565e+00,-7.842277199644130e-01,5.443148566014404e-01,-2.129034407464614e-01}, +{ 1.344554718430332e+00,-6.752128450886907e+00,4.521169281438861e+00,1.441638005479699e+00,-9.321569582526712e-01,6.497010878070494e-01,-4.757431272970764e-01,3.514235613538321e-01,-2.437967089703709e-01,9.533859089725205e-02}, +{ 1.082673923687344e-01,-1.784555471810428e-01,-4.426311909277726e+00,5.321208093602262e+00,-1.210171463454275e+00,6.217714421018223e-01,-3.977398633647451e-01,2.742450983807080e-01,-1.837089529892143e-01,7.089570981347648e-02}, +{ -4.494374090385680e-01,1.275960691713574e+00,-2.573599959548352e+00,-1.103642106022778e+00,3.938512400612534e+00,-1.722372879207873e+00,1.060321775280875e+00,-7.209608575712085e-01,4.801412779102546e-01,-1.849229341284589e-01}, +{ 2.998231857200790e-01,-8.286515804807396e-01,1.510298633197113e+00,-4.017427976929390e+00,1.969334528903770e+00,1.649114987586277e+00,-9.605186191907390e-01,6.358316363170976e-01,-4.180694948956237e-01,1.602646997721556e-01}, +{ -2.734374999999989e-02,7.920519507855066e-02,-1.593770271465154e-01,4.339783761858391e-01,-3.842918861837782e+00,3.842918861837785e+00,-4.339783761858422e-01,1.593770271465173e-01,-7.920519507855305e-02,2.734375000000033e-02}, +{ -1.602646997721553e-01,4.180694948956263e-01,-6.358316363171032e-01,9.605186191907477e-01,-1.649114987586294e+00,-1.969334528903751e+00,4.017427976929382e+00,-1.510298633197115e+00,8.286515804807423e-01,-2.998231857200808e-01}, +{ 1.849229341284586e-01,-4.801412779102546e-01,7.209608575712105e-01,-1.060321775280877e+00,1.722372879207875e+00,-3.938512400612536e+00,1.103642106022776e+00,2.573599959548353e+00,-1.275960691713576e+00,4.494374090385697e-01}, +{ -7.089570981347693e-02,1.837089529892143e-01,-2.742450983807087e-01,3.977398633647458e-01,-6.217714421018213e-01,1.210171463454274e+00,-5.321208093602261e+00,4.426311909277725e+00,1.784555471810420e-01,-1.082673923687341e-01}, +{ -9.533859089725150e-02,2.437967089703686e-01,-3.514235613538317e-01,4.757431272970770e-01,-6.497010878070508e-01,9.321569582526732e-01,-1.441638005479700e+00,-4.521169281438860e+00,6.752128450886915e+00,-1.344554718430340e+00}, +{ 2.129034407464614e-01,-5.443148566014405e-01,7.842277199644162e-01,-1.060954024677567e+00,1.448561715713915e+00,-2.086673800777822e+00,3.377614448607781e+00,-7.334054786168558e+00,3.694761061784611e-01,4.833214037014352e+00}, +{ -2.059913046496931e-01,5.261430743193927e-01,-7.561768283008670e-01,1.018220763962804e+00,-1.378339748314424e+00,1.952509164674659e+00,-3.040611641692773e+00,5.798409629154351e+00,-2.161058190102176e+01,1.769641879186831e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==14 +const dfloat c_DI[14][10] = { +{ -1.828820487640615e+01,2.268651937534599e+01,-6.556043513297261e+00,3.491300937373920e+00,-2.254754907379016e+00,1.595996850956067e+00,-1.180704210863857e+00,8.775476674483772e-01,-6.108532237652800e-01,2.391959005872070e-01}, +{ -6.267721191321050e+00,1.870505816720338e+00,6.359923186736050e+00,-3.139707802673315e+00,1.979349650277954e+00,-1.386133270602328e+00,1.019784631865470e+00,-7.556422218706744e-01,5.251505548250877e-01,-2.055093539575326e-01}, +{ 9.729726095161942e-01,-7.142505118435756e+00,6.284557328878271e+00,-5.533964118071009e-02,-1.263591202423237e-01,1.233048132786256e-01,-1.019843130776021e-01,7.978602620692045e-02,-5.692937046537509e-02,2.249678552175616e-02}, +{ 6.239367398838898e-01,-1.883493928256376e+00,-2.887729891707512e+00,5.436090529003609e+00,-1.995442612575774e+00,1.171925426571083e+00,-7.950902648438585e-01,5.648729968166898e-01,-3.841854461639848e-01,1.491164512722341e-01}, +{ -5.645662977778806e-01,1.682877002550523e+00,-4.238454399951017e+00,1.480233738800343e+00,2.461785376638100e+00,-1.344401135303041e+00,8.867478722437142e-01,-6.216041749841502e-01,4.200033270433072e-01,-1.626213092598982e-01}, +{ 7.744442280945563e-02,-2.364260582926138e-01,5.837933367949631e-01,-4.131006744866546e+00,3.966724880539041e+00,-3.258633056122047e-01,8.916867817702714e-02,-3.424372472724999e-02,1.527186355022519e-02,-4.863348372098120e-03}, +{ 2.315519668388925e-01,-6.186073627739924e-01,1.008509564742815e+00,-1.804475680335658e+00,-1.691242275901732e+00,3.852194236709858e+00,-1.525600270691057e+00,8.983976905022900e-01,-5.629978921630018e-01,2.122700230715858e-01}, +{ -2.122700230715854e-01,5.629978921630019e-01,-8.983976905022923e-01,1.525600270691059e+00,-3.852194236709857e+00,1.691242275901728e+00,1.804475680335661e+00,-1.008509564742815e+00,6.186073627739934e-01,-2.315519668388933e-01}, +{ 4.863348372098619e-03,-1.527186355022570e-02,3.424372472725131e-02,-8.916867817702927e-02,3.258633056122076e-01,-3.966724880539042e+00,4.131006744866542e+00,-5.837933367949593e-01,2.364260582926127e-01,-7.744442280945574e-02}, +{ 1.626213092598990e-01,-4.200033270433082e-01,6.216041749841532e-01,-8.867478722437184e-01,1.344401135303047e+00,-2.461785376638109e+00,-1.480233738800331e+00,4.238454399951010e+00,-1.682877002550525e+00,5.645662977778824e-01}, +{ -1.491164512722344e-01,3.841854461639848e-01,-5.648729968166921e-01,7.950902648438607e-01,-1.171925426571085e+00,1.995442612575777e+00,-5.436090529003605e+00,2.887729891707501e+00,1.883493928256387e+00,-6.239367398838936e-01}, +{ -2.249678552175605e-02,5.692937046537467e-02,-7.978602620692150e-02,1.019843130776037e-01,-1.233048132786303e-01,1.263591202423338e-01,5.533964118069451e-02,-6.284557328878253e+00,7.142505118435762e+00,-9.729726095162073e-01}, +{ 2.055093539575321e-01,-5.251505548250872e-01,7.556422218706771e-01,-1.019784631865472e+00,1.386133270602325e+00,-1.979349650277949e+00,3.139707802673308e+00,-6.359923186736035e+00,-1.870505816720371e+00,6.267721191321072e+00}, +{ -2.391959005872089e-01,6.108532237652824e-01,-8.775476674483773e-01,1.180704210863850e+00,-1.595996850956054e+00,2.254754907379002e+00,-3.491300937373905e+00,6.556043513297235e+00,-2.268651937534600e+01,1.828820487640617e+01} +}; +#endif +#if p_Nq==10 && p_cubNq==15 +const dfloat c_DI[15][10] = { +{ -1.877965895893847e+01,2.358272867328635e+01,-7.191764400386866e+00,3.872501026731190e+00,-2.511188079314028e+00,1.780934256898416e+00,-1.318870655728015e+00,9.807997890297823e-01,-6.829344436467765e-01,2.674527920684263e-01}, +{ -7.601622806852959e+00,4.025640347694827e+00,5.289073993647412e+00,-2.762542518743995e+00,1.771466602373814e+00,-1.249822574514467e+00,9.230272210571727e-01,-6.853829148047028e-01,4.768498024721795e-01,-1.866871523292817e-01}, +{ 3.606004818972449e-01,-6.939334783488404e+00,7.472656604468108e+00,-1.267117348664529e+00,5.897799192798040e-01,-3.615478296377359e-01,2.483003405058219e-01,-1.771502875516064e-01,1.206733666268068e-01,-4.686046343551065e-02}, +{ 1.059117513752218e+00,-3.579603160603159e+00,-9.127679543757500e-01,4.799308647526508e+00,-2.170816724985266e+00,1.351417526255616e+00,-9.403157535200357e-01,6.766053975958459e-01,-4.631457097250597e-01,1.802002180790839e-01}, +{ -4.353021847870848e-01,1.392831607788433e+00,-5.006046854294380e+00,3.578741468112143e+00,8.004868216079968e-01,-5.640283843292944e-01,4.036484345516748e-01,-2.934929072962143e-01,2.018012259976354e-01,-7.863922735090989e-02}, +{ -2.361790455968426e-01,6.432274136629397e-01,-1.087346585805252e+00,-2.807492898426080e+00,4.441769384007928e+00,-1.458997232339687e+00,8.299065361487513e-01,-5.455994890164094e-01,3.576851790850474e-01,-1.369732617203968e-01}, +{ 3.210102545420282e-01,-8.786855135043735e-01,1.550433187627100e+00,-3.642840953567366e+00,1.105470108266016e+00,2.303970159931372e+00,-1.235472579866900e+00,7.959783197613488e-01,-5.174279352089296e-01,1.975649520197042e-01}, +{ -2.734374999999989e-02,7.920519507855042e-02,-1.593770271465145e-01,4.339783761858375e-01,-3.842918861837781e+00,3.842918861837786e+00,-4.339783761858443e-01,1.593770271465187e-01,-7.920519507855356e-02,2.734375000000078e-02}, +{ -1.975649520197038e-01,5.174279352089283e-01,-7.959783197613481e-01,1.235472579866898e+00,-2.303970159931364e+00,-1.105470108266029e+00,3.642840953567371e+00,-1.550433187627098e+00,8.786855135043737e-01,-3.210102545420290e-01}, +{ 1.369732617203967e-01,-3.576851790850478e-01,5.455994890164120e-01,-8.299065361487544e-01,1.458997232339689e+00,-4.441769384007928e+00,2.807492898426076e+00,1.087346585805255e+00,-6.432274136629414e-01,2.361790455968433e-01}, +{ 7.863922735090867e-02,-2.018012259976353e-01,2.934929072962152e-01,-4.036484345516760e-01,5.640283843292964e-01,-8.004868216079983e-01,-3.578741468112141e+00,5.006046854294378e+00,-1.392831607788435e+00,4.353021847870869e-01}, +{ -1.802002180790829e-01,4.631457097250585e-01,-6.766053975958469e-01,9.403157535200372e-01,-1.351417526255616e+00,2.170816724985268e+00,-4.799308647526516e+00,9.127679543757716e-01,3.579603160603145e+00,-1.059117513752218e+00}, +{ 4.686046343550920e-02,-1.206733666268054e-01,1.771502875516036e-01,-2.483003405058173e-01,3.615478296377279e-01,-5.897799192797946e-01,1.267117348664514e+00,-7.472656604468091e+00,6.939334783488411e+00,-3.606004818972581e-01}, +{ 1.866871523292803e-01,-4.768498024721781e-01,6.853829148047031e-01,-9.230272210571748e-01,1.249822574514468e+00,-1.771466602373815e+00,2.762542518743992e+00,-5.289073993647414e+00,-4.025640347694826e+00,7.601622806852964e+00}, +{ -2.674527920684220e-01,6.829344436467686e-01,-9.807997890297697e-01,1.318870655727999e+00,-1.780934256898391e+00,2.511188079313996e+00,-3.872501026731141e+00,7.191764400386772e+00,-2.358272867328625e+01,1.877965895893844e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==11 +const dfloat c_DI[11][11] = { +{ -1.814410962294183e+01,2.016374331448132e+01,-2.818224336740170e+00,1.250909863105465e+00,-7.525217968933184e-01,5.167135538464425e-01,-3.796583510468694e-01,2.877656322137002e-01,-2.172563699329766e-01,1.526287247247652e-01,-5.999061081653695e-02}, +{ -3.181791813709665e-01,-7.795745141951947e+00,9.768696369162058e+00,-2.452029713636130e+00,1.293891160263013e+00,-8.439394982138388e-01,6.046332496707025e-01,-4.519757831909085e-01,3.385408141349047e-01,-2.368182233999313e-01,9.292594853304514e-02}, +{ 5.501588337671608e-01,-1.573411583412116e+00,-4.133225597055359e+00,6.576605622505700e+00,-2.173955709553359e+00,1.245970734211871e+00,-8.441300953422816e-01,6.130457721843061e-01,-4.519257143721969e-01,3.134694399369551e-01,-1.226017028706802e-01}, +{ -4.221050465973923e-01,1.178148532910786e+00,-2.212978019161082e+00,-2.278252686752122e+00,4.987865795947985e+00,-1.959430759408841e+00,1.179932999226713e+00,-8.122342166506666e-01,5.823792494858531e-01,-3.982406341361767e-01,1.549147851349445e-01}, +{ 3.184222672343200e-01,-8.562198419074092e-01,1.426000874082716e+00,-2.741734448698768e+00,-1.032049415537572e+00,4.004890854999704e+00,-1.777898390748766e+00,1.103078256783019e+00,-7.539474774571268e-01,5.037263772297855e-01,-1.942690559799019e-01}, +{ -2.460937500000005e-01,6.469399638320998e-01,-1.006505408576777e+00,1.608127903725515e+00,-3.305176853378325e+00,5.818166711462129e-15,3.305176853378315e+00,-1.608127903725510e+00,1.006505408576777e+00,-6.469399638320998e-01,2.460937500000004e-01}, +{ 1.942690559799019e-01,-5.037263772297846e-01,7.539474774571258e-01,-1.103078256783018e+00,1.777898390748760e+00,-4.004890854999699e+00,1.032049415537571e+00,2.741734448698770e+00,-1.426000874082716e+00,8.562198419074093e-01,-3.184222672343200e-01}, +{ -1.549147851349459e-01,3.982406341361780e-01,-5.823792494858558e-01,8.122342166506697e-01,-1.179932999226713e+00,1.959430759408847e+00,-4.987865795947985e+00,2.278252686752112e+00,2.212978019161091e+00,-1.178148532910792e+00,4.221050465973939e-01}, +{ 1.226017028706821e-01,-3.134694399369571e-01,4.519257143721978e-01,-6.130457721843074e-01,8.441300953422837e-01,-1.245970734211877e+00,2.173955709553367e+00,-6.576605622505697e+00,4.133225597055345e+00,1.573411583412128e+00,-5.501588337671636e-01}, +{ -9.292594853304559e-02,2.368182233999300e-01,-3.385408141349039e-01,4.519757831909108e-01,-6.046332496707014e-01,8.439394982138395e-01,-1.293891160263014e+00,2.452029713636122e+00,-9.768696369162059e+00,7.795745141951953e+00,3.181791813709699e-01}, +{ 5.999061081653323e-02,-1.526287247247627e-01,2.172563699329739e-01,-2.877656322137007e-01,3.796583510468668e-01,-5.167135538464422e-01,7.525217968933225e-01,-1.250909863105463e+00,2.818224336740176e+00,-2.016374331448132e+01,1.814410962294182e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==12 +const dfloat c_DI[12][11] = { +{ -1.940283135610619e+01,2.241182496041061e+01,-4.334751134690578e+00,2.112750043542456e+00,-1.321793142354956e+00,9.256050571954455e-01,-6.875853716369634e-01,5.245455803849556e-01,-3.975500981695873e-01,2.798883518145738e-01,-1.101028903897613e-01}, +{ -1.848179939062679e+00,-6.106096103011899e+00,1.025512328448393e+01,-3.524024937535600e+00,2.017900461331905e+00,-1.362613231796467e+00,9.938178486190714e-01,-7.504365240721200e-01,5.654015007751854e-01,-3.967834531518749e-01,1.558910934205433e-01}, +{ 1.230104382640827e+00,-4.117371255094626e+00,-1.323507460409136e+00,5.857748832018763e+00,-2.614571775962574e+00,1.627949432976342e+00,-1.143665892433405e+00,8.464522532103421e-01,-6.305861685346962e-01,4.398539888023255e-01,-1.724063372141628e-01}, +{ -6.643452005101047e-01,1.960994927341221e+00,-4.715725402615766e+00,1.214306624615045e+00,3.264803197941730e+00,-1.728473751090278e+00,1.138880145109543e+00,-8.164026328411711e-01,5.977457094641259e-01,-4.131583259865694e-01,1.613747085722239e-01}, +{ 3.166140104426469e-01,-8.810203241283134e-01,1.643779078578034e+00,-4.805196862294184e+00,2.823045990494705e+00,1.438483762405799e+00,-8.974038459366865e-01,6.214816728764656e-01,-4.463733429267992e-01,3.054182867897663e-01,-1.188284263014336e-01}, +{ -9.972708136281883e-02,2.712617963028618e-01,-4.657829038151899e-01,9.471486355425235e-01,-4.523719068197564e+00,3.879935759334991e+00,7.096181936607707e-02,-1.525897801957618e-01,1.284950193058791e-01,-9.260142112040880e-02,3.661722483941099e-02}, +{ -3.661722483940943e-02,9.260142112040755e-02,-1.284950193058767e-01,1.525897801957585e-01,-7.096181936607357e-02,-3.879935759335001e+00,4.523719068197567e+00,-9.471486355425167e-01,4.657829038151871e-01,-2.712617963028596e-01,9.972708136281772e-02}, +{ 1.188284263014330e-01,-3.054182867897633e-01,4.463733429267945e-01,-6.214816728764612e-01,8.974038459366778e-01,-1.438483762405784e+00,-2.823045990494715e+00,4.805196862294183e+00,-1.643779078578030e+00,8.810203241283111e-01,-3.166140104426465e-01}, +{ -1.613747085722239e-01,4.131583259865700e-01,-5.977457094641261e-01,8.164026328411700e-01,-1.138880145109542e+00,1.728473751090281e+00,-3.264803197941728e+00,-1.214306624615053e+00,4.715725402615771e+00,-1.960994927341224e+00,6.643452005101045e-01}, +{ 1.724063372141631e-01,-4.398539888023266e-01,6.305861685346974e-01,-8.464522532103428e-01,1.143665892433406e+00,-1.627949432976347e+00,2.614571775962575e+00,-5.857748832018761e+00,1.323507460409140e+00,4.117371255094620e+00,-1.230104382640824e+00}, +{ -1.558910934205437e-01,3.967834531518709e-01,-5.654015007751793e-01,7.504365240721156e-01,-9.938178486190635e-01,1.362613231796459e+00,-2.017900461331895e+00,3.524024937535578e+00,-1.025512328448393e+01,6.106096103011935e+00,1.848179939062654e+00}, +{ 1.101028903897543e-01,-2.798883518145582e-01,3.975500981695682e-01,-5.245455803849343e-01,6.875853716369267e-01,-9.256050571954013e-01,1.321793142354902e+00,-2.112750043542367e+00,4.334751134690430e+00,-2.241182496041038e+01,1.940283135610606e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==13 +const dfloat c_DI[13][11] = { +{ -2.043517375126362e+01,2.426739834428509e+01,-5.606910609966538e+00,2.849122195682622e+00,-1.811530853721899e+00,1.278512605151555e+00,-9.538191021407474e-01,7.294743460418749e-01,-5.536849264604731e-01,3.901320662146671e-01,-1.535203138225346e-01}, +{ -3.505236707053031e+00,-3.921114369214971e+00,1.001728641626873e+01,-4.040362600842835e+00,2.411458102280289e+00,-1.656953005318232e+00,1.219303871684599e+00,-9.253332685839813e-01,6.992049571101211e-01,-4.914630748161601e-01,1.932096784854730e-01}, +{ 1.635747941607637e+00,-6.262569573856416e+00,1.706081561983244e+00,4.311543481090731e+00,-2.259468129761775e+00,1.477980777137218e+00,-1.061765444080327e+00,7.951925270968901e-01,-5.963409678314774e-01,4.174474257550814e-01,-1.638495991408063e-01}, +{ -5.267689049342785e-01,1.679140853505360e+00,-5.924850533311858e+00,4.167458671094395e+00,1.021647144097617e+00,-7.107387864946557e-01,5.120376018732519e-01,-3.825867810811676e-01,2.862697033091412e-01,-2.001019228984268e-01,7.849295484062130e-02}, +{ -1.076151934999225e-02,2.613233356467417e-03,1.874721412203274e-01,-4.508702737988262e+00,4.934104859150167e+00,-8.589964401631504e-01,4.020530728542653e-01,-2.458650602917960e-01,1.655613855147827e-01,-1.095711990541335e-01,4.209226475132472e-02}, +{ 2.230770996513129e-01,-5.926902336710086e-01,9.499791335691702e-01,-1.612994004413700e+00,-2.423881349230728e+00,4.527399003533123e+00,-1.660394540485280e+00,9.759634819931857e-01,-6.516899322077953e-01,4.306536028352744e-01,-1.654222615735552e-01}, +{ -2.460937500000007e-01,6.469399638321002e-01,-1.006505408576778e+00,1.608127903725516e+00,-3.305176853378325e+00,4.829552677927509e-15,3.305176853378316e+00,-1.608127903725511e+00,1.006505408576777e+00,-6.469399638321000e-01,2.460937500000008e-01}, +{ 1.654222615735552e-01,-4.306536028352713e-01,6.516899322077913e-01,-9.759634819931805e-01,1.660394540485269e+00,-4.527399003533130e+00,2.423881349230756e+00,1.612994004413678e+00,-9.499791335691596e-01,5.926902336710029e-01,-2.230770996513116e-01}, +{ -4.209226475132521e-02,1.095711990541346e-01,-1.655613855147854e-01,2.458650602917998e-01,-4.020530728542670e-01,8.589964401631530e-01,-4.934104859150166e+00,4.508702737988258e+00,-1.874721412203249e-01,-2.613233356469457e-03,1.076151934999275e-02}, +{ -7.849295484062124e-02,2.001019228984268e-01,-2.862697033091417e-01,3.825867810811678e-01,-5.120376018732518e-01,7.107387864946570e-01,-1.021647144097619e+00,-4.167458671094392e+00,5.924850533311859e+00,-1.679140853505363e+00,5.267689049342790e-01}, +{ 1.638495991408073e-01,-4.174474257550828e-01,5.963409678314778e-01,-7.951925270968896e-01,1.061765444080327e+00,-1.477980777137219e+00,2.259468129761774e+00,-4.311543481090727e+00,-1.706081561983255e+00,6.262569573856424e+00,-1.635747941607637e+00}, +{ -1.932096784854751e-01,4.914630748161601e-01,-6.992049571101211e-01,9.253332685839863e-01,-1.219303871684603e+00,1.656953005318238e+00,-2.411458102280298e+00,4.040362600842839e+00,-1.001728641626872e+01,3.921114369214933e+00,3.505236707053064e+00}, +{ 1.535203138225356e-01,-3.901320662146771e-01,5.536849264604917e-01,-7.294743460419006e-01,9.538191021407684e-01,-1.278512605151584e+00,1.811530853721953e+00,-2.849122195682707e+00,5.606910609966695e+00,-2.426739834428529e+01,2.043517375126372e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==14 +const dfloat c_DI[14][11] = { +{ -2.128937069397671e+01,2.581027296927423e+01,-6.677646179835078e+00,3.477375396850928e+00,-2.231510222705150e+00,1.581891505172346e+00,-1.182991615350928e+00,9.060117328351212e-01,-6.882495551549287e-01,4.851693119837825e-01,-1.909526490936140e-01}, +{ -5.178267629239932e+00,-1.503664755232486e+00,9.312392067629441e+00,-4.152681444093241e+00,2.547116510595701e+00,-1.770636325037660e+00,1.310784403653376e+00,-9.981300308583888e-01,7.556958151558220e-01,-5.317407613326476e-01,2.091321487600132e-01}, +{ 1.698202637824024e+00,-7.739281444418395e+00,4.443147696501661e+00,2.502046169420171e+00,-1.501093613655198e+00,1.026469409653227e+00,-7.527377483427168e-01,5.700038523619056e-01,-4.301314878175453e-01,3.021067300680329e-01,-1.187322015951652e-01}, +{ -1.001575877461921e-01,4.822089544822570e-01,-5.721867407448845e+00,5.954985884837544e+00,-8.508391063368854e-01,3.657159917614352e-01,-2.145221511388369e-01,1.436351134772945e-01,-1.009911390885702e-01,6.825730525851618e-02,-2.642585805771727e-02}, +{ -4.008938292240549e-01,1.115304519961801e+00,-2.066466982653704e+00,-2.443896881565696e+00,5.035120993368737e+00,-1.932802407308620e+00,1.156422611807213e+00,-7.938017578142447e-01,5.683422407890195e-01,-3.883561146847178e-01,1.510276073242660e-01}, +{ 3.700652706384648e-01,-1.009822274731261e+00,1.764482236796036e+00,-4.007772640880918e+00,9.432606850953639e-01,2.867782867567393e+00,-1.511536996234830e+00,9.892680903676999e-01,-6.924135542891343e-01,4.678876516418644e-01,-1.812013359706783e-01}, +{ -1.427304091733097e-01,3.841101851981875e-01,-6.398358237130538e-01,1.211453962968891e+00,-4.584506712767693e+00,3.498702516087294e+00,5.007717058611800e-01,-3.947479883667841e-01,2.865220108839894e-01,-1.958483268039109e-01,7.610887982521009e-02}, +{ -7.610887982521053e-02,1.958483268039103e-01,-2.865220108839881e-01,3.947479883667832e-01,-5.007717058611791e-01,-3.498702516087302e+00,4.584506712767698e+00,-1.211453962968888e+00,6.398358237130536e-01,-3.841101851981870e-01,1.427304091733097e-01}, +{ 1.812013359706794e-01,-4.678876516418631e-01,6.924135542891322e-01,-9.892680903676974e-01,1.511536996234825e+00,-2.867782867567390e+00,-9.432606850953622e-01,4.007772640880916e+00,-1.764482236796035e+00,1.009822274731261e+00,-3.700652706384655e-01}, +{ -1.510276073242658e-01,3.883561146847172e-01,-5.683422407890195e-01,7.938017578142436e-01,-1.156422611807211e+00,1.932802407308621e+00,-5.035120993368740e+00,2.443896881565707e+00,2.066466982653693e+00,-1.115304519961798e+00,4.008938292240530e-01}, +{ 2.642585805771858e-02,-6.825730525851793e-02,1.009911390885730e-01,-1.436351134772975e-01,2.145221511388394e-01,-3.657159917614403e-01,8.508391063368937e-01,-5.954985884837547e+00,5.721867407448838e+00,-4.822089544822501e-01,1.001575877461909e-01}, +{ 1.187322015951664e-01,-3.021067300680353e-01,4.301314878175481e-01,-5.700038523619086e-01,7.527377483427197e-01,-1.026469409653231e+00,1.501093613655206e+00,-2.502046169420190e+00,-4.443147696501641e+00,7.739281444418389e+00,-1.698202637824023e+00}, +{ -2.091321487600122e-01,5.317407613326466e-01,-7.556958151558237e-01,9.981300308583945e-01,-1.310784403653376e+00,1.770636325037658e+00,-2.547116510595702e+00,4.152681444093236e+00,-9.312392067629428e+00,1.503664755232452e+00,5.178267629239954e+00}, +{ 1.909526490936062e-01,-4.851693119837804e-01,6.882495551549310e-01,-9.060117328351274e-01,1.182991615350919e+00,-1.581891505172337e+00,2.231510222705157e+00,-3.477375396850936e+00,6.677646179835101e+00,-2.581027296927424e+01,2.128937069397670e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==15 +const dfloat c_DI[15][11] = { +{ -2.200240231544827e+01,2.710310020293159e+01,-7.583351722586913e+00,4.014338969964031e+00,-2.591888390675746e+00,1.842709781316756e+00,-1.380216633522641e+00,1.058030157489546e+00,-8.041657356029609e-01,5.670519373231702e-01,-2.232062511885606e-01}, +{ -6.800107388986038e+00,9.741467023315032e-01,8.328921007221915e+00,-3.988679116692627e+00,2.496961637588800e+00,-1.751158304691385e+00,1.302311466422792e+00,-9.942563763178424e-01,7.539015807545787e-01,-5.309171924510642e-01,2.088759848193658e-01}, +{ 1.432703382376904e+00,-8.500129843107528e+00,6.657212259605849e+00,7.650929538543126e-01,-6.159786009412046e-01,4.578727027679059e-01,-3.484788674479889e-01,2.690938315564342e-01,-2.052918022260936e-01,1.450335253184504e-01,-5.712954175704149e-02}, +{ 4.533305471856952e-01,-1.249812908856953e+00,-4.432755287478619e+00,6.571180486397483e+00,-2.039855049372789e+00,1.149469368914131e+00,-7.730061133765186e-01,5.592279643832674e-01,-4.113681089683764e-01,2.850108900006773e-01,-1.114217888279988e-01}, +{ -6.377942714672284e-01,1.855144751604009e+00,-4.166783135546920e+00,3.111774032084458e-01,3.815769673526782e+00,-1.902295456736560e+00,1.229262288641646e+00,-8.733259772416685e-01,6.364374569531670e-01,-4.388412582751426e-01,1.712485253334699e-01}, +{ 2.402150605051074e-01,-6.803341740206179e-01,1.346614234576293e+00,-4.959421098944930e+00,3.696558567724391e+00,6.339207851319366e-01,-4.797054690592688e-01,3.522263160428373e-01,-2.596428770223562e-01,1.798709590804990e-01,-7.030230401389131e-02}, +{ 1.338769506698485e-01,-3.508162435712475e-01,5.377275680133985e-01,-7.838230833801608e-01,-3.284259801406931e+00,4.655322354606395e+00,-1.372884449503164e+00,7.600353466756543e-01,-4.945998291214065e-01,3.229079411546095e-01,-1.234867541369966e-01}, +{ -2.460937500000002e-01,6.469399638321001e-01,-1.006505408576777e+00,1.608127903725515e+00,-3.305176853378325e+00,5.854027869258399e-15,3.305176853378315e+00,-1.608127903725510e+00,1.006505408576776e+00,-6.469399638320995e-01,2.460937500000005e-01}, +{ 1.234867541369979e-01,-3.229079411546105e-01,4.945998291214078e-01,-7.600353466756562e-01,1.372884449503163e+00,-4.655322354606398e+00,3.284259801406926e+00,7.838230833801711e-01,-5.377275680134032e-01,3.508162435712512e-01,-1.338769506698503e-01}, +{ 7.030230401389059e-02,-1.798709590804977e-01,2.596428770223542e-01,-3.522263160428344e-01,4.797054690592655e-01,-6.339207851319326e-01,-3.696558567724391e+00,4.959421098944927e+00,-1.346614234576291e+00,6.803341740206160e-01,-2.402150605051070e-01}, +{ -1.712485253334701e-01,4.388412582751434e-01,-6.364374569531691e-01,8.733259772416694e-01,-1.229262288641645e+00,1.902295456736564e+00,-3.815769673526784e+00,-3.111774032084461e-01,4.166783135546921e+00,-1.855144751604012e+00,6.377942714672294e-01}, +{ 1.114217888279989e-01,-2.850108900006763e-01,4.113681089683746e-01,-5.592279643832651e-01,7.730061133765136e-01,-1.149469368914126e+00,2.039855049372781e+00,-6.571180486397480e+00,4.432755287478634e+00,1.249812908856933e+00,-4.533305471856885e-01}, +{ 5.712954175704310e-02,-1.450335253184531e-01,2.052918022260952e-01,-2.690938315564342e-01,3.484788674479925e-01,-4.578727027679110e-01,6.159786009412096e-01,-7.650929538543276e-01,-6.657212259605837e+00,8.500129843107528e+00,-1.432703382376905e+00}, +{ -2.088759848193664e-01,5.309171924510643e-01,-7.539015807545759e-01,9.942563763178387e-01,-1.302311466422790e+00,1.751158304691390e+00,-2.496961637588801e+00,3.988679116692619e+00,-8.328921007221913e+00,-9.741467023314998e-01,6.800107388986035e+00}, +{ 2.232062511885564e-01,-5.670519373231631e-01,8.041657356029523e-01,-1.058030157489533e+00,1.380216633522607e+00,-1.842709781316715e+00,2.591888390675702e+00,-4.014338969963960e+00,7.583351722586814e+00,-2.710310020293144e+01,2.200240231544817e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==16 +const dfloat c_DI[16][11] = { +{ -2.260264820632327e+01,2.819475901030809e+01,-8.353860390541112e+00,4.474880229590067e+00,-2.901946943131290e+00,2.067447549037405e+00,-1.550297481342331e+00,1.189188717319314e+00,-9.042041522594644e-01,6.377295770209330e-01,-2.510479096783449e-01}, +{ -8.333674345069504e+00,3.406269447096243e+00,7.197135045169065e+00,-3.645386177262361e+00,2.320453278054637e+00,-1.639308946873933e+00,1.223787720860944e+00,-9.363366898117513e-01,7.108825656710042e-01,-5.009692294124132e-01,1.971473315780691e-01}, +{ 8.930916476880222e-01,-8.613556360893636e+00,8.292342041654351e+00,-7.338453189786072e-01,2.329639425152392e-01,-1.099415390200245e-01,6.331747971331463e-02,-4.080510817493127e-02,2.772411213318571e-02,-1.830760240503558e-02,7.016705768121870e-03}, +{ 9.881150111383090e-01,-3.141556054528236e+00,-2.492259603277999e+00,6.277610848020458e+00,-2.560770088759057e+00,1.554018807137612e+00,-1.079408968218363e+00,7.941603723223599e-01,-5.896753060825707e-01,4.105913015600547e-01,-1.608263193125674e-01}, +{ -6.362269946551518e-01,1.944144999805059e+00,-5.540021296154150e+00,2.904001426906332e+00,2.069287942187696e+00,-1.232219898782520e+00,8.445916745030575e-01,-6.167187256259350e-01,4.559433093053339e-01,-3.167311582613495e-01,1.239487207716282e-01}, +{ -6.444526409258855e-02,1.519208174927137e-01,-8.698372611491237e-02,-4.314839762442854e+00,5.052290171967262e+00,-1.075057734715505e+00,5.419090982273128e-01,-3.441958539981752e-01,2.366965856272554e-01,-1.584149874517417e-01,6.112065550123247e-02}, +{ 3.393580055535290e-01,-9.156498775512008e-01,1.541945836798216e+00,-3.076184782493554e+00,-5.635831664690545e-01,3.771053933260807e+00,-1.753559835765634e+00,1.103164603802710e+00,-7.586054448730363e-01,5.083018144586327e-01,-1.962410867214150e-01}, +{ -1.712376599791905e-01,4.585290618103295e-01,-7.527983172982792e-01,1.375290034095513e+00,-4.576715873304633e+00,3.166100307742520e+00,8.418852844509902e-01,-5.783764937150920e-01,4.042886642271864e-01,-2.721952302871352e-01,1.052302222577910e-01}, +{ -1.052302222577901e-01,2.721952302871350e-01,-4.042886642271850e-01,5.783764937150907e-01,-8.418852844509881e-01,-3.166100307742531e+00,4.576715873304637e+00,-1.375290034095508e+00,7.527983172982776e-01,-4.585290618103282e-01,1.712376599791900e-01}, +{ 1.962410867214159e-01,-5.083018144586321e-01,7.586054448730352e-01,-1.103164603802709e+00,1.753559835765628e+00,-3.771053933260797e+00,5.635831664690432e-01,3.076184782493564e+00,-1.541945836798219e+00,9.156498775512028e-01,-3.393580055535306e-01}, +{ -6.112065550123336e-02,1.584149874517427e-01,-2.366965856272577e-01,3.441958539981795e-01,-5.419090982273161e-01,1.075057734715513e+00,-5.052290171967265e+00,4.314839762442846e+00,8.698372611492006e-02,-1.519208174927190e-01,6.444526409258977e-02}, +{ -1.239487207716283e-01,3.167311582613503e-01,-4.559433093053356e-01,6.167187256259375e-01,-8.445916745030589e-01,1.232219898782523e+00,-2.069287942187702e+00,-2.904001426906322e+00,5.540021296154148e+00,-1.944144999805063e+00,6.362269946551522e-01}, +{ 1.608263193125683e-01,-4.105913015600559e-01,5.896753060825727e-01,-7.941603723223618e-01,1.079408968218364e+00,-1.554018807137614e+00,2.560770088759057e+00,-6.277610848020457e+00,2.492259603278003e+00,3.141556054528232e+00,-9.881150111383075e-01}, +{ -7.016705768119302e-03,1.830760240503030e-02,-2.772411213317917e-02,4.080510817492423e-02,-6.331747971330112e-02,1.099415390200065e-01,-2.329639425152156e-01,7.338453189785649e-01,-8.292342041654319e+00,8.613556360893647e+00,-8.930916476880383e-01}, +{ -1.971473315780699e-01,5.009692294124155e-01,-7.108825656710069e-01,9.363366898117554e-01,-1.223787720860948e+00,1.639308946873941e+00,-2.320453278054649e+00,3.645386177262378e+00,-7.197135045169114e+00,-3.406269447096151e+00,8.333674345069447e+00}, +{ 2.510479096783307e-01,-6.377295770209057e-01,9.042041522594272e-01,-1.189188717319273e+00,1.550297481342271e+00,-2.067447549037329e+00,2.901946943131186e+00,-4.474880229589899e+00,8.353860390540857e+00,-2.819475901030773e+01,2.260264820632306e+01} +}; +#endif +#if p_Nq==11 && p_cubNq==17 +const dfloat c_DI[17][11] = { +{ -2.311199944139125e+01,2.912341114024907e+01,-9.013280840994826e+00,4.871599457335479e+00,-3.169714014259004e+00,2.261767680668021e+00,-1.697455899459740e+00,1.302714421543278e+00,-9.908132716316749e-01,6.989270728786607e-01,-2.751563049380192e-01}, +{ -9.761270945893921e+00,5.731493270899265e+00,6.003654546276052e+00,-3.192715293339506e+00,2.062735998866664e+00,-1.466839059808040e+00,1.098804806840370e+00,-8.423603664777705e-01,6.402665768785081e-01,-4.514883172177822e-01,1.777187829761601e-01}, +{ 1.440256183441662e-01,-8.193886855032613e+00,9.384186745756425e+00,-1.935581952822421e+00,9.644929398626571e-01,-6.131612110109852e-01,4.333749295797199e-01,-3.214436799628301e-01,2.396725503994334e-01,-1.672372489333993e-01,6.555816381984664e-02}, +{ 1.405353365428385e+00,-4.914577514313052e+00,-2.804972050113058e-01,5.389145274126108e+00,-2.561055115221626e+00,1.624681437964395e+00,-1.150956794609602e+00,8.556123017210415e-01,-6.389823370710551e-01,4.462982531034529e-01,-1.750216661167424e-01}, +{ -4.145479345236464e-01,1.380023083859565e+00,-6.002924376808184e+00,4.873722313915966e+00,3.593332744502012e-01,-3.497366342845845e-01,2.735682045306941e-01,-2.117329543142386e-01,1.612743809180088e-01,-1.137538649246148e-01,4.477450718083245e-02}, +{ -3.865568819803173e-01,1.073019113611450e+00,-1.969545798239077e+00,-2.551532408784703e+00,5.063332208536345e+00,-1.912778135808609e+00,1.139374576333724e+00,-7.805793717332300e-01,5.583211900008314e-01,-3.813157711874907e-01,1.482612792510766e-01}, +{ 3.475977557435008e-01,-9.597604390852165e-01,1.744429305581980e+00,-4.600308704360772e+00,2.210052586955030e+00,1.943789007942890e+00,-1.136208767711071e+00,7.698575828492040e-01,-5.474412763554009e-01,3.727641133887166e-01,-1.447711649488593e-01}, +{ 6.076218605619488e-02,-1.545391545281731e-01,2.133156022100318e-01,-1.860976452469083e-01,-3.815354261232708e+00,4.589430864899021e+00,-1.037610380079278e+00,5.308544786987640e-01,-3.334581627884265e-01,2.140366768423912e-01,-8.134020483090953e-02}, +{ -2.460937500000012e-01,6.469399638321004e-01,-1.006505408576778e+00,1.608127903725515e+00,-3.305176853378322e+00,-1.249762334911327e-15,3.305176853378319e+00,-1.608127903725511e+00,1.006505408576777e+00,-6.469399638320998e-01,2.460937500000011e-01}, +{ 8.134020483090865e-02,-2.140366768423898e-01,3.334581627884246e-01,-5.308544786987613e-01,1.037610380079270e+00,-4.589430864899022e+00,3.815354261232717e+00,1.860976452469035e-01,-2.133156022100287e-01,1.545391545281722e-01,-6.076218605619532e-02}, +{ 1.447711649488590e-01,-3.727641133887163e-01,5.474412763553996e-01,-7.698575828492032e-01,1.136208767711071e+00,-1.943789007942895e+00,-2.210052586955018e+00,4.600308704360766e+00,-1.744429305581980e+00,9.597604390852167e-01,-3.475977557435002e-01}, +{ -1.482612792510773e-01,3.813157711874929e-01,-5.583211900008345e-01,7.805793717332337e-01,-1.139374576333726e+00,1.912778135808615e+00,-5.063332208536341e+00,2.551532408784685e+00,1.969545798239092e+00,-1.073019113611459e+00,3.865568819803197e-01}, +{ -4.477450718083242e-02,1.137538649246136e-01,-1.612743809180064e-01,2.117329543142351e-01,-2.735682045306898e-01,3.497366342845796e-01,-3.593332744501918e-01,-4.873722313915974e+00,6.002924376808183e+00,-1.380023083859561e+00,4.145479345236446e-01}, +{ 1.750216661167439e-01,-4.462982531034551e-01,6.389823370710571e-01,-8.556123017210423e-01,1.150956794609602e+00,-1.624681437964398e+00,2.561055115221627e+00,-5.389145274126114e+00,2.804972050113227e-01,4.914577514313039e+00,-1.405353365428382e+00}, +{ -6.555816381984747e-02,1.672372489334002e-01,-2.396725503994354e-01,3.214436799628371e-01,-4.333749295797279e-01,6.131612110109985e-01,-9.644929398626771e-01,1.935581952822445e+00,-9.384186745756452e+00,8.193886855032600e+00,-1.440256183441411e-01}, +{ -1.777187829761606e-01,4.514883172177835e-01,-6.402665768785114e-01,8.423603664777770e-01,-1.098804806840375e+00,1.466839059808048e+00,-2.062735998866672e+00,3.192715293339511e+00,-6.003654546276075e+00,-5.731493270899206e+00,9.761270945893880e+00}, +{ 2.751563049380006e-01,-6.989270728786310e-01,9.908132716316360e-01,-1.302714421543236e+00,1.697455899459671e+00,-2.261767680667930e+00,3.169714014258893e+00,-4.871599457335312e+00,9.013280840994582e+00,-2.912341114024873e+01,2.311199944139105e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==12 +const dfloat c_DI[12][12] = { +{ -2.159503200751458e+01,2.389798251670890e+01,-3.198482474248066e+00,1.399071111578970e+00,-8.369550615936659e-01,5.743430283405755e-01,-4.239298953051231e-01,3.255878964037814e-01,-2.538962198259964e-01,1.951852292841680e-01,-1.385725670022008e-01,5.469844317323826e-02}, +{ -2.293047628457960e-01,-9.431369682503513e+00,1.153886852455855e+01,-2.770377456559300e+00,1.445215812349903e+00,-9.400009062499806e-01,6.757150601726184e-01,-5.113540714962935e-01,3.952708951834754e-01,-3.022708780025899e-01,2.139684605826388e-01,-8.436099518971753e-02}, +{ 5.684377286548670e-01,-1.585515676366951e+00,-5.132079935564583e+00,7.761687543692362e+00,-2.457424198188892e+00,1.394947028600823e+00,-9.454412160872809e-01,6.937677587512903e-01,-5.268833987330053e-01,3.987568373685790e-01,-2.806582878823978e-01,1.104058157551894e-01}, +{ -4.464366543196110e-01,1.238397246648831e+00,-2.267704864960194e+00,-2.991410181003481e+00,5.893902846122392e+00,-2.222252564443155e+00,1.328536709828026e+00,-9.205055770530407e-01,6.779179776196476e-01,-5.042228134256328e-01,3.515783017324099e-01,-1.378004267461918e-01}, +{ 3.397776157379837e-01,-9.107486933079004e-01,1.501985736967066e+00,-2.800274766376731e+00,-1.599077484387071e+00,4.750326066275143e+00,-2.029212906041598e+00,1.256032853661453e+00,-8.771175439188956e-01,6.342621372257446e-01,-4.358113360218862e-01,1.698583201866922e-01}, +{ -2.642967146299083e-01,6.932373140168915e-01,-1.071933283449124e+00,1.689368444407062e+00,-3.333405439456569e+00,-5.054953393704105e-01,3.951803163902397e+00,-1.857280221933894e+00,1.174559022897939e+00,-8.104659287243989e-01,5.441912628065194e-01,-2.102822804665029e-01}, +{ 2.102822804665037e-01,-5.441912628065204e-01,8.104659287244032e-01,-1.174559022897942e+00,1.857280221933894e+00,-3.951803163902389e+00,5.054953393703954e-01,3.333405439456581e+00,-1.689368444407065e+00,1.071933283449122e+00,-6.932373140168920e-01,2.642967146299102e-01}, +{ -1.698583201866895e-01,4.358113360218874e-01,-6.342621372257464e-01,8.771175439188954e-01,-1.256032853661452e+00,2.029212906041597e+00,-4.750326066275136e+00,1.599077484387049e+00,2.800274766376744e+00,-1.501985736967065e+00,9.107486933079022e-01,-3.397776157379859e-01}, +{ 1.378004267461915e-01,-3.515783017324110e-01,5.042228134256345e-01,-6.779179776196478e-01,9.205055770530401e-01,-1.328536709828025e+00,2.222252564443159e+00,-5.893902846122396e+00,2.991410181003475e+00,2.267704864960204e+00,-1.238397246648837e+00,4.464366543196133e-01}, +{ -1.104058157551907e-01,2.806582878823983e-01,-3.987568373685802e-01,5.268833987330042e-01,-6.937677587512899e-01,9.454412160872819e-01,-1.394947028600824e+00,2.457424198188901e+00,-7.761687543692365e+00,5.132079935564581e+00,1.585515676366951e+00,-5.684377286548672e-01}, +{ 8.436099518971868e-02,-2.139684605826394e-01,3.022708780025893e-01,-3.952708951834735e-01,5.113540714962922e-01,-6.757150601726206e-01,9.400009062499778e-01,-1.445215812349888e+00,2.770377456559259e+00,-1.153886852455850e+01,9.431369682503531e+00,2.293047628457561e-01}, +{ -5.469844317323691e-02,1.385725670021923e-01,-1.951852292841556e-01,2.538962198259815e-01,-3.255878964037556e-01,4.239298953050889e-01,-5.743430283405232e-01,8.369550615935842e-01,-1.399071111578837e+00,3.198482474247802e+00,-2.389798251670858e+01,2.159503200751444e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==13 +const dfloat c_DI[13][12] = { +{ -2.301362256000733e+01,2.642662266520995e+01,-4.897989699553274e+00,2.362551873228266e+00,-1.473538796675570e+00,1.032812664743196e+00,-7.715092359995732e-01,5.968388484596167e-01,-4.675265811859859e-01,3.604211703901552e-01,-2.562889101299577e-01,1.012285615205099e-01}, +{ -1.866769117015190e+00,-7.696640046848491e+00,1.220770685994545e+01,-4.033760098050948e+00,2.289175083283479e+00,-1.543658016266994e+00,1.130723980389729e+00,-8.650373307848348e-01,6.730937900358549e-01,-5.167990447953282e-01,3.666540147647290e-01,-1.446900746574543e-01}, +{ 1.332463127515559e+00,-4.359228260719026e+00,-2.196962492283609e+00,7.177313148430715e+00,-3.087636782599147e+00,1.906358390271739e+00,-1.340774914065847e+00,1.003383179880723e+00,-7.707372446494668e-01,5.872456797963386e-01,-4.148628213836690e-01,1.634389898056876e-01}, +{ -7.588327062962945e-01,2.217060120136561e+00,-5.084414022316689e+00,7.045741289361711e-01,4.259316568882839e+00,-2.170272509952956e+00,1.418808073585013e+00,-1.023791612845124e+00,7.705287349452576e-01,-5.801784865372569e-01,4.072242444667414e-01,-1.600225330042619e-01}, +{ 3.983851800330533e-01,-1.100006538479501e+00,2.000050525883394e+00,-5.288660997170219e+00,2.559952906755030e+00,2.213951690253019e+00,-1.302401247141673e+00,8.932278312600376e-01,-6.541098756259441e-01,4.849497371562013e-01,-3.375530947179473e-01,1.322138817945498e-01}, +{ -1.692214943230159e-01,4.545834351172733e-01,-7.534687974452766e-01,1.410411052665502e+00,-5.136640473108005e+00,3.819825606977898e+00,6.667802453152674e-01,-5.060427172589054e-01,3.730739064754093e-01,-2.759275366737478e-01,1.915752254176872e-01,-7.494845316008655e-02}, +{ 2.050781249999944e-02,-5.650257996325492e-02,9.865545322619218e-02,-1.917528870604694e-01,5.231974657907141e-01,-4.654771388963050e+00,4.654771388963056e+00,-5.231974657907223e-01,1.917528870604736e-01,-9.865545322619465e-02,5.650257996325639e-02,-2.050781250000121e-02}, +{ 7.494845316008780e-02,-1.915752254176915e-01,2.759275366737572e-01,-3.730739064754215e-01,5.060427172589196e-01,-6.667802453152905e-01,-3.819825606977876e+00,5.136640473108006e+00,-1.410411052665513e+00,7.534687974452814e-01,-4.545834351172771e-01,1.692214943230178e-01}, +{ -1.322138817945486e-01,3.375530947179478e-01,-4.849497371562030e-01,6.541098756259455e-01,-8.932278312600369e-01,1.302401247141671e+00,-2.213951690253018e+00,-2.559952906755038e+00,5.288660997170221e+00,-2.000050525883385e+00,1.100006538479499e+00,-3.983851800330545e-01}, +{ 1.600225330042609e-01,-4.072242444667420e-01,5.801784865372587e-01,-7.705287349452566e-01,1.023791612845121e+00,-1.418808073585010e+00,2.170272509952955e+00,-4.259316568882840e+00,-7.045741289361676e-01,5.084414022316682e+00,-2.217060120136562e+00,7.588327062962984e-01}, +{ -1.634389898056878e-01,4.148628213836692e-01,-5.872456797963380e-01,7.707372446494634e-01,-1.003383179880722e+00,1.340774914065849e+00,-1.906358390271743e+00,3.087636782599160e+00,-7.177313148430716e+00,2.196962492283598e+00,4.359228260719041e+00,-1.332463127515573e+00}, +{ 1.446900746574546e-01,-3.666540147647332e-01,5.167990447953341e-01,-6.730937900358589e-01,8.650373307848420e-01,-1.130723980389742e+00,1.543658016267008e+00,-2.289175083283495e+00,4.033760098050965e+00,-1.220770685994544e+01,7.696640046848454e+00,1.866769117015212e+00}, +{ -1.012285615205127e-01,2.562889101299728e-01,-3.604211703901795e-01,4.675265811860208e-01,-5.968388484596591e-01,7.715092359996304e-01,-1.032812664743256e+00,1.473538796675635e+00,-2.362551873228355e+00,4.897989699553407e+00,-2.642662266521023e+01,2.301362256000753e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==14 +const dfloat c_DI[14][12] = { +{ -2.419593916647221e+01,2.854709575505056e+01,-6.345575322751023e+00,3.197889898241711e+00,-2.029112562461811e+00,1.434196147446836e+00,-1.076331823175222e+00,8.349640785909891e-01,-6.551854075230181e-01,5.056247887957210e-01,-3.597564337059446e-01,1.421300479634053e-01}, +{ -3.657454987457374e+00,-5.408778934406012e+00,1.210307478640919e+01,-4.717868846971605e+00,2.793588672780961e+00,-1.917774904012151e+00,1.417780420498617e+00,-1.090405793966400e+00,8.511808315411034e-01,-6.548073756237077e-01,4.650753256309265e-01,-1.836091944235458e-01}, +{ 1.850900154407183e+00,-6.822911798376053e+00,1.089922163921211e+00,5.649152300424293e+00,-2.856509263445854e+00,1.851651306064716e+00,-1.331337549256852e+00,1.008204157152845e+00,-7.798124138197126e-01,5.966032654014448e-01,-4.224333851880591e-01,1.665710627148397e-01}, +{ -7.004877281156379e-01,2.176023498731817e+00,-6.741639870259594e+00,4.090916965464022e+00,1.880941915336903e+00,-1.186004668117839e+00,8.327580903137239e-01,-6.216015850340874e-01,4.765582810838216e-01,-3.626416038918825e-01,2.559998998970544e-01,-1.008231954083003e-01}, +{ 1.077002754702314e-01,-3.275947356315784e-01,8.011067751226837e-01,-5.544254741645162e+00,5.296857746826570e+00,-4.120194070738548e-01,1.033273986843518e-01,-3.365747316514952e-02,1.093738008145410e-02,-2.538570179065693e-03,-2.891899739581326e-04,4.245414834775560e-04}, +{ 1.635232084914993e-01,-4.291956178205460e-01,6.616732875887920e-01,-9.869562168468826e-01,-3.621276611807425e+00,5.266435304926658e+00,-1.601505247018959e+00,9.011918082354090e-01,-6.032488287108813e-01,4.268181166963213e-01,-2.900120750830921e-01,1.125528713491075e-01}, +{ -2.423199416230649e-01,6.338999128924608e-01,-9.727135105800889e-01,1.505321952595841e+00,-2.806390454209420e+00,-1.257033657388715e+00,4.335671389137603e+00,-1.898045970389292e+00,1.175044397723309e+00,-8.032254816671712e-01,5.369255906775100e-01,-2.071342271689710e-01}, +{ 2.071342271689705e-01,-5.369255906775111e-01,8.032254816671753e-01,-1.175044397723312e+00,1.898045970389293e+00,-4.335671389137599e+00,1.257033657388707e+00,2.806390454209426e+00,-1.505321952595841e+00,9.727135105800863e-01,-6.338999128924612e-01,2.423199416230661e-01}, +{ -1.125528713491076e-01,2.900120750830935e-01,-4.268181166963225e-01,6.032488287108809e-01,-9.011918082354080e-01,1.601505247018959e+00,-5.266435304926659e+00,3.621276611807421e+00,9.869562168468857e-01,-6.616732875887917e-01,4.291956178205477e-01,-1.635232084914998e-01}, +{ -4.245414834777583e-04,2.891899739593514e-04,2.538570179062217e-03,-1.093738008144756e-02,3.365747316514323e-02,-1.033273986843458e-01,4.120194070738452e-01,-5.296857746826569e+00,5.544254741645168e+00,-8.011067751226849e-01,3.275947356315800e-01,-1.077002754702336e-01}, +{ 1.008231954083001e-01,-2.559998998970528e-01,3.626416038918812e-01,-4.765582810838190e-01,6.216015850340820e-01,-8.327580903137158e-01,1.186004668117830e+00,-1.880941915336885e+00,-4.090916965464034e+00,6.741639870259586e+00,-2.176023498731811e+00,7.004877281156391e-01}, +{ -1.665710627148402e-01,4.224333851880600e-01,-5.966032654014453e-01,7.798124138197089e-01,-1.008204157152843e+00,1.331337549256853e+00,-1.851651306064720e+00,2.856509263445870e+00,-5.649152300424315e+00,-1.089922163921177e+00,6.822911798376040e+00,-1.850900154407192e+00}, +{ 1.836091944235445e-01,-4.650753256309307e-01,6.548073756237127e-01,-8.511808315411067e-01,1.090405793966404e+00,-1.417780420498625e+00,1.917774904012160e+00,-2.793588672780969e+00,4.717868846971604e+00,-1.210307478640916e+01,5.408778934405973e+00,3.657454987457396e+00}, +{ -1.421300479634041e-01,3.597564337059452e-01,-5.056247887957264e-01,6.551854075230286e-01,-8.349640785910000e-01,1.076331823175237e+00,-1.434196147446840e+00,2.029112562461801e+00,-3.197889898241681e+00,6.345575322750953e+00,-2.854709575505057e+01,2.419593916647225e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==15 +const dfloat c_DI[15][12] = { +{ -2.518855360260213e+01,3.033587365030155e+01,-7.581418532764062e+00,3.920624821355077e+00,-2.512221654979312e+00,1.784066407321071e+00,-1.342383573326817e+00,1.042964084123512e+00,-8.191824699709538e-01,6.325573234999174e-01,-4.502196890981255e-01,1.778932361402812e-01}, +{ -5.489631645328982e+00,-2.827548209241641e+00,1.147156654777562e+01,-4.961732643235849e+00,3.021030934122368e+00,-2.098647269668414e+00,1.561061461764671e+00,-1.204858470128413e+00,9.425434911513495e-01,-7.260377210459225e-01,5.160443664112208e-01,-2.037908425760076e-01}, +{ 2.037142120807289e+00,-8.671654188135710e+00,4.196471871021245e+00,3.731968178538228e+00,-2.132319258761834e+00,1.438447327791397e+00,-1.053632699407742e+00,8.060194467605966e-01,-6.271472187147635e-01,4.815092324037017e-01,-3.416118987335573e-01,1.348070864311474e-01}, +{ -3.210445375484813e-01,1.157919546659124e+00,-6.984204248310085e+00,6.431724510334591e+00,-3.102541922489282e-01,1.083948489400847e-02,3.738008390937644e-02,-4.417521177954289e-02,4.076592084341753e-02,-3.404566388695535e-02,2.519774133250786e-02,-1.010343419903259e-02}, +{ -3.171380202059698e-01,8.624914258218703e-01,-1.451067924471326e+00,-3.833737123161739e+00,6.029682419023865e+00,-1.976752781160961e+00,1.137039299865720e+00,-7.744170111141836e-01,5.651695853833401e-01,-4.182198292532732e-01,2.908109356972665e-01,-1.138609764246083e-01}, +{ 4.007973450843078e-01,-1.083592639546634e+00,1.837280091540857e+00,-3.756791311888821e+00,-2.804649139842622e-01,4.109565978422498e+00,-1.973711243653667e+00,1.265630601206678e+00,-8.980875433419440e-01,6.548520489815600e-01,-4.518926220838229e-01,1.764142092632513e-01}, +{ -2.371235996506388e-01,6.307749070985725e-01,-1.016058811190127e+00,1.771373084748300e+00,-4.968404109137880e+00,2.781644338581938e+00,1.641503340230468e+00,-1.009875647229611e+00,6.978884864334737e-01,-5.014148196372042e-01,3.433177910908803e-01,-1.336249613381729e-01}, +{ 2.050781249999988e-02,-5.650257996325408e-02,9.865545322619115e-02,-1.917528870604681e-01,5.231974657907126e-01,-4.654771388963050e+00,4.654771388963057e+00,-5.231974657907241e-01,1.917528870604751e-01,-9.865545322619507e-02,5.650257996325635e-02,-2.050781250000077e-02}, +{ 1.336249613381708e-01,-3.433177910908782e-01,5.014148196372038e-01,-6.978884864334726e-01,1.009875647229606e+00,-1.641503340230454e+00,-2.781644338581956e+00,4.968404109137887e+00,-1.771373084748295e+00,1.016058811190120e+00,-6.307749070985693e-01,2.371235996506378e-01}, +{ -1.764142092632500e-01,4.518926220838234e-01,-6.548520489815620e-01,8.980875433419441e-01,-1.265630601206676e+00,1.973711243653665e+00,-4.109565978422495e+00,2.804649139842533e-01,3.756791311888823e+00,-1.837280091540850e+00,1.083592639546632e+00,-4.007973450843089e-01}, +{ 1.138609764246081e-01,-2.908109356972673e-01,4.182198292532742e-01,-5.651695853833378e-01,7.744170111141806e-01,-1.137039299865719e+00,1.976752781160963e+00,-6.029682419023871e+00,3.833737123161741e+00,1.451067924471329e+00,-8.624914258218735e-01,3.171380202059722e-01}, +{ 1.010343419903337e-02,-2.519774133250957e-02,3.404566388695958e-02,-4.076592084342437e-02,4.417521177954764e-02,-3.738008390938030e-02,-1.083948489400102e-02,3.102541922489176e-01,-6.431724510334570e+00,6.984204248310075e+00,-1.157919546659134e+00,3.210445375484867e-01}, +{ -1.348070864311489e-01,3.416118987335608e-01,-4.815092324037061e-01,6.271472187147641e-01,-8.060194467605980e-01,1.053632699407745e+00,-1.438447327791403e+00,2.132319258761851e+00,-3.731968178538254e+00,-4.196471871021207e+00,8.671654188135701e+00,-2.037142120807306e+00}, +{ 2.037908425760081e-01,-5.160443664112230e-01,7.260377210459237e-01,-9.425434911513462e-01,1.204858470128412e+00,-1.561061461764676e+00,2.098647269668418e+00,-3.021030934122373e+00,4.961732643235845e+00,-1.147156654777561e+01,2.827548209241638e+00,5.489631645328979e+00}, +{ -1.778932361402720e-01,4.502196890981135e-01,-6.325573234999065e-01,8.191824699709396e-01,-1.042964084123488e+00,1.342383573326793e+00,-1.784066407321031e+00,2.512221654979241e+00,-3.920624821354949e+00,7.581418532763824e+00,-3.033587365030133e+01,2.518855360260206e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==16 +const dfloat c_DI[16][12] = { +{ -2.602804946669998e+01,3.185448215687269e+01,-8.640520195826415e+00,4.546447351274946e+00,-2.932204846323374e+00,2.088796602548201e+00,-1.574350031347373e+00,1.224427872224813e+00,-9.623115494276574e-01,7.433640805001352e-01,-5.292007410310264e-01,2.091187672350376e-01}, +{ -7.291186361524165e+00,-1.325603909954572e-01,1.050277552899150e+01,-4.889094029400553e+00,3.038965815788834e+00,-2.130022121062807e+00,1.591791712610407e+00,-1.231885653435558e+00,9.652624036876611e-01,-7.442774720964558e-01,5.293044220074876e-01,-2.090738545708968e-01}, +{ 1.886109523183745e+00,-9.810614167454375e+00,6.844940908732784e+00,1.788529654369350e+00,-1.198964353672891e+00,8.510589660846518e-01,-6.382685080990881e-01,4.945783674074862e-01,-3.877319827803772e-01,2.990316037968966e-01,-2.126812802972377e-01,8.401126872905397e-02}, +{ 2.423287673218983e-01,-5.315676087774092e-01,-6.016873753933855e+00,7.582863461904260e+00,-1.895055370201439e+00,1.006692860174667e+00,-6.622694882342665e-01,4.782637027330847e-01,-3.598471326156594e-01,2.708366271063756e-01,-1.900395450778389e-01,7.466747960018230e-02}, +{ -6.546783867077575e-01,1.868624152560652e+00,-3.861241442283015e+00,-1.070460633028405e+00,5.199157711506412e+00,-2.363759868938490e+00,1.491308777826263e+00,-1.058590220172884e+00,7.897075823978027e-01,-5.916442697634618e-01,4.141478841687750e-01,-1.625712875658896e-01}, +{ 3.766305672629717e-01,-1.045120763149651e+00,1.932351311874111e+00,-5.456565625878811e+00,3.030213736246175e+00,1.834520848965908e+00,-1.124833231767477e+00,7.824303595324875e-01,-5.768277792936041e-01,4.291799420591609e-01,-2.992896098707442e-01,1.173102440194732e-01}, +{ 9.505372101353327e-03,-1.678353719865552e-02,-1.495289571368137e-02,2.361151636993659e-01,-4.639110479516809e+00,5.014699202718964e+00,-8.318131447179224e-01,3.814101956494088e-01,-2.306982912414800e-01,1.544112834063821e-01,-1.018694621013086e-01,3.908659291438249e-02}, +{ -2.200336559430202e-01,5.743979454657047e-01,-8.761607610050056e-01,1.336916004824440e+00,-2.387848515048661e+00,-1.806106191205421e+00,4.575372378485969e+00,-1.882859827956986e+00,1.144951781612648e+00,-7.765707179448837e-01,5.171931289952326e-01,-1.992515702800156e-01}, +{ 1.992515702800158e-01,-5.171931289952342e-01,7.765707179448880e-01,-1.144951781612651e+00,1.882859827956987e+00,-4.575372378485965e+00,1.806106191205412e+00,2.387848515048669e+00,-1.336916004824442e+00,8.761607610050038e-01,-5.743979454657056e-01,2.200336559430222e-01}, +{ -3.908659291438252e-02,1.018694621013115e-01,-1.544112834063844e-01,2.306982912414815e-01,-3.814101956494128e-01,8.318131447179311e-01,-5.014699202718971e+00,4.639110479516800e+00,-2.361151636993506e-01,1.495289571367249e-02,1.678353719866107e-02,-9.505372101356409e-03}, +{ -1.173102440194727e-01,2.992896098707439e-01,-4.291799420591618e-01,5.768277792936046e-01,-7.824303595324853e-01,1.124833231767472e+00,-1.834520848965900e+00,-3.030213736246192e+00,5.456565625878816e+00,-1.932351311874100e+00,1.045120763149647e+00,-3.766305672629717e-01}, +{ 1.625712875658899e-01,-4.141478841687755e-01,5.916442697634636e-01,-7.897075823978017e-01,1.058590220172879e+00,-1.491308777826258e+00,2.363759868938490e+00,-5.199157711506419e+00,1.070460633028417e+00,3.861241442283005e+00,-1.868624152560651e+00,6.546783867077602e-01}, +{ -7.466747960018191e-02,1.900395450778392e-01,-2.708366271063756e-01,3.598471326156565e-01,-4.782637027330843e-01,6.622694882342695e-01,-1.006692860174670e+00,1.895055370201449e+00,-7.582863461904263e+00,6.016873753933850e+00,5.315676087774096e-01,-2.423287673218989e-01}, +{ -8.401126872905618e-02,2.126812802972398e-01,-2.990316037969004e-01,3.877319827803817e-01,-4.945783674074908e-01,6.382685080990935e-01,-8.510589660846668e-01,1.198964353672924e+00,-1.788529654369410e+00,-6.844940908732706e+00,9.810614167454361e+00,-1.886109523183771e+00}, +{ 2.090738545708959e-01,-5.293044220074901e-01,7.442774720964587e-01,-9.652624036876610e-01,1.231885653435558e+00,-1.591791712610409e+00,2.130022121062809e+00,-3.038965815788842e+00,4.889094029400566e+00,-1.050277552899155e+01,1.325603909955589e-01,7.291186361524101e+00}, +{ -2.091187672350259e-01,5.292007410309938e-01,-7.433640805000956e-01,9.623115494276064e-01,-1.224427872224747e+00,1.574350031347305e+00,-2.088796602548106e+00,2.932204846323224e+00,-4.546447351274705e+00,8.640520195825985e+00,-3.185448215687219e+01,2.602804946669975e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==17 +const dfloat c_DI[17][12] = { +{ -2.674314166922277e+01,3.315203869403979e+01,-9.552312433598368e+00,5.089675007520873e+00,-3.297914603993784e+00,2.354551208761713e+00,-1.776816610645822e+00,1.382893019089030e+00,-1.087338850904019e+00,8.401751639132619e-01,-5.982132894753623e-01,2.364043645154568e-01}, +{ -9.018736431022042e+00,2.558344692553827e+00,9.334324643426173e+00,-4.598807672294201e+00,2.906447267038657e+00,-2.052003268280913e+00,1.539351273601015e+00,-1.193940894028693e+00,9.367910069241207e-01,-7.229169832750713e-01,5.143509719894809e-01,-2.032046066323541e-01}, +{ 1.436214799744993e+00,-1.026843641376934e+01,8.931311932881096e+00,2.242002295620965e-02,-2.412968059767783e-01,2.197022614725492e-01,-1.813770877575941e-01,1.475080375205895e-01,-1.188369668339247e-01,9.311780951615677e-02,-6.680779213109805e-02,2.648020237714083e-02}, +{ 8.469701189924193e-01,-2.537463644992815e+00,-4.214473975174575e+00,7.710081987934941e+00,-2.793555624145703e+00,1.644552669318778e+00,-1.132581930558786e+00,8.381695291775776e-01,-6.396810055955846e-01,4.855307294140758e-01,-3.422812250151394e-01,1.347323706448105e-01}, +{ -7.810875476998160e-01,2.321047398881167e+00,-5.764690107995042e+00,1.869492906045589e+00,3.524889738461664e+00,-1.917373324932325e+00,1.280800440665193e+00,-9.336364335610177e-01,7.065656154220435e-01,-5.336948754187940e-01,3.752376245344278e-01,-1.475514344030895e-01}, +{ 1.243396741701629e-01,-3.729893908335967e-01,8.788393768479567e-01,-5.578374236680411e+00,5.223105469230855e+00,-3.196315648918318e-01,4.744175181658745e-02,4.961505060943196e-03,-1.742495915269203e-02,1.851630940223445e-02,-1.495349482311823e-02,6.169559852910059e-03}, +{ 2.865780995170885e-01,-7.637695810953423e-01,1.236812180947091e+00,-2.173977755685077e+00,-2.362040182380873e+00,5.029925776474911e+00,-1.961085983679002e+00,1.182506658994656e+00,-8.161965138907826e-01,5.866700949283185e-01,-4.018675632842682e-01,1.564447691532819e-01}, +{ -2.680820404477894e-01,7.095438007027840e-01,-1.126229654930184e+00,1.892310923656604e+00,-4.637275799771421e+00,1.853892034434262e+00,2.397121793523736e+00,-1.353979303251474e+00,9.089162217798104e-01,-6.443517854176302e-01,4.383263888431206e-01,-1.701925791218178e-01}, +{ 2.050781250000146e-02,-5.650257996325581e-02,9.865545322619353e-02,-1.917528870604717e-01,5.231974657907180e-01,-4.654771388963053e+00,4.654771388963054e+00,-5.231974657907194e-01,1.917528870604717e-01,-9.865545322619335e-02,5.650257996325510e-02,-2.050781249999968e-02}, +{ 1.701925791218180e-01,-4.383263888431214e-01,6.443517854176347e-01,-9.089162217798153e-01,1.353979303251476e+00,-2.397121793523740e+00,-1.853892034434256e+00,4.637275799771420e+00,-1.892310923656602e+00,1.126229654930179e+00,-7.095438007027830e-01,2.680820404477899e-01}, +{ -1.564447691532810e-01,4.018675632842688e-01,-5.866700949283192e-01,8.161965138907804e-01,-1.182506658994651e+00,1.961085983678998e+00,-5.029925776474913e+00,2.362040182380880e+00,2.173977755685069e+00,-1.236812180947083e+00,7.637695810953399e-01,-2.865780995170885e-01}, +{ -6.169559852908374e-03,1.495349482311577e-02,-1.851630940223177e-02,1.742495915268944e-02,-4.961505060938724e-03,-4.744175181659633e-02,3.196315648918459e-01,-5.223105469230872e+00,5.578374236680406e+00,-8.788393768479378e-01,3.729893908335874e-01,-1.243396741701595e-01}, +{ 1.475514344030888e-01,-3.752376245344267e-01,5.336948754187942e-01,-7.065656154220417e-01,9.336364335610120e-01,-1.280800440665185e+00,1.917373324932317e+00,-3.524889738461650e+00,-1.869492906045605e+00,5.764690107995043e+00,-2.321047398881167e+00,7.810875476998200e-01}, +{ -1.347323706448122e-01,3.422812250151396e-01,-4.855307294140749e-01,6.396810055955806e-01,-8.381695291775747e-01,1.132581930558784e+00,-1.644552669318775e+00,2.793555624145706e+00,-7.710081987934942e+00,4.214473975174588e+00,2.537463644992800e+00,-8.469701189924193e-01}, +{ -2.648020237713822e-02,6.680779213109071e-02,-9.311780951614727e-02,1.188369668339126e-01,-1.475080375205745e-01,1.813770877575746e-01,-2.197022614725294e-01,2.412968059767553e-01,-2.242002295617062e-02,-8.931311932881128e+00,1.026843641376935e+01,-1.436214799744992e+00}, +{ 2.032046066323527e-01,-5.143509719894851e-01,7.229169832750741e-01,-9.367910069241184e-01,1.193940894028692e+00,-1.539351273601018e+00,2.052003268280920e+00,-2.906447267038673e+00,4.598807672294215e+00,-9.334324643426205e+00,-2.558344692553763e+00,9.018736431022008e+00}, +{ -2.364043645154510e-01,5.982132894753391e-01,-8.401751639132283e-01,1.087338850903976e+00,-1.382893019088976e+00,1.776816610645759e+00,-2.354551208761610e+00,3.297914603993620e+00,-5.089675007520620e+00,9.552312433597937e+00,-3.315203869403931e+01,2.674314166922256e+01} +}; +#endif +#if p_Nq==12 && p_cubNq==18 +const dfloat c_DI[18][12] = { +{ -2.735644086785004e+01,3.426770836601935e+01,-1.034113683985214e+01,5.562781123708223e+00,-3.617240383212857e+00,2.586888347737594e+00,-1.953945080437663e+00,1.521583140113111e+00,-1.196791392420759e+00,9.249395099007629e-01,-6.586435124089721e-01,2.602975887033827e-01}, +{ -1.064852181827273e+01,5.171634054616141e+00,8.062518139497369e+00,-4.165564463251330e+00,2.670688746871660e+00,-1.897542264880399e+00,1.428245716461737e+00,-1.109920281379742e+00,8.718986656618960e-01,-6.733258292156390e-01,4.792619741548710e-01,-1.893726402638302e-01}, +{ 7.433249134456678e-01,-1.013356968752992e+01,1.045398731332600e+01,-1.472095511042380e+00,6.351813543351548e-01,-3.759327027477634e-01,2.562198663960842e-01,-1.877447083583406e-01,1.422280570931735e-01,-1.074146136729782e-01,7.549763113367683e-02,-2.968191237837401e-02}, +{ 1.381950880850809e+00,-4.561656622741931e+00,-1.952411968829685e+00,7.086700431624927e+00,-3.095356173837579e+00,1.919516121756482e+00,-1.352671170045419e+00,1.013343000202504e+00,-7.788597106562334e-01,5.936474603728267e-01,-4.194687472209417e-01,1.652664985242398e-01}, +{ -6.726048581797673e-01,2.108237078467295e+00,-6.835816371812299e+00,4.403532556858471e+00,1.621502903114508e+00,-1.055721658470754e+00,7.490598996717384e-01,-5.618808481400239e-01,4.319265381023871e-01,-3.291809925418410e-01,2.325715469598432e-01,-9.162579402955744e-02}, +{ -2.265473634940418e-01,6.035058794668492e-01,-9.231713839907669e-01,-4.325370485898518e+00,6.027171770269193e+00,-1.745594914375054e+00,9.700144069140716e-01,-6.505041289639247e-01,4.708389860028088e-01,-3.467989294529124e-01,2.405444786031329e-01,-9.408831508083698e-02}, +{ 4.211507981537168e-01,-1.145487611937363e+00,1.980792469231123e+00,-4.338735053438220e+00,6.439970101705050e-01,3.564899324301383e+00,-1.831010229968422e+00,1.199581133750137e+00,-8.596983469527146e-01,6.301273418268633e-01,-4.360031721367862e-01,1.703863369997776e-01}, +{ -1.019479449229600e-01,2.781477107144111e-01,-4.817259090771830e-01,9.989087561057179e-01,-5.054896017653371e+00,4.429052801792825e+00,-7.732984422111521e-03,-1.234408644236187e-01,1.173103844410865e-01,-9.521282614413301e-02,6.888594390976745e-02,-2.734905032043179e-02}, +{ -1.996069592542953e-01,5.201658050818679e-01,-7.894995526132471e-01,1.190665551323585e+00,-2.053036050182572e+00,-2.219047937780021e+00,4.729007876106516e+00,-1.841894338159131e+00,1.102751454323570e+00,-7.429308853793515e-01,4.932196493398087e-01,-1.897946128067294e-01}, +{ 1.897946128067289e-01,-4.932196493398105e-01,7.429308853793559e-01,-1.102751454323572e+00,1.841894338159132e+00,-4.729007876106512e+00,2.219047937780010e+00,2.053036050182581e+00,-1.190665551323587e+00,7.894995526132460e-01,-5.201658050818692e-01,1.996069592542966e-01}, +{ 2.734905032043264e-02,-6.888594390976946e-02,9.521282614413823e-02,-1.173103844410941e-01,1.234408644236273e-01,7.732984422099344e-03,-4.429052801792816e+00,5.054896017653373e+00,-9.989087561057245e-01,4.817259090771855e-01,-2.781477107144131e-01,1.019479449229607e-01}, +{ -1.703863369997775e-01,4.360031721367870e-01,-6.301273418268657e-01,8.596983469527163e-01,-1.199581133750137e+00,1.831010229968423e+00,-3.564899324301388e+00,-6.439970101705005e-01,4.338735053438214e+00,-1.980792469231113e+00,1.145487611937361e+00,-4.211507981537191e-01}, +{ 9.408831508083720e-02,-2.405444786031341e-01,3.467989294529129e-01,-4.708389860028065e-01,6.505041289639236e-01,-9.700144069140726e-01,1.745594914375056e+00,-6.027171770269202e+00,4.325370485898518e+00,9.231713839907767e-01,-6.035058794668552e-01,2.265473634940460e-01}, +{ 9.162579402955584e-02,-2.325715469598423e-01,3.291809925418414e-01,-4.319265381023883e-01,5.618808481400226e-01,-7.490598996717340e-01,1.055721658470751e+00,-1.621502903114506e+00,-4.403532556858463e+00,6.835816371812285e+00,-2.108237078467294e+00,6.726048581797699e-01}, +{ -1.652664985242371e-01,4.194687472209417e-01,-5.936474603728271e-01,7.788597106562291e-01,-1.013343000202501e+00,1.352671170045418e+00,-1.919516121756483e+00,3.095356173837595e+00,-7.086700431624961e+00,1.952411968829748e+00,4.561656622741886e+00,-1.381950880850806e+00}, +{ 2.968191237837692e-02,-7.549763113368130e-02,1.074146136729831e-01,-1.422280570931808e-01,1.877447083583498e-01,-2.562198663960985e-01,3.759327027477756e-01,-6.351813543351622e-01,1.472095511042385e+00,-1.045398731332599e+01,1.013356968752992e+01,-7.433249134456729e-01}, +{ 1.893726402638311e-01,-4.792619741548726e-01,6.733258292156379e-01,-8.718986656618928e-01,1.109920281379742e+00,-1.428245716461737e+00,1.897542264880399e+00,-2.670688746871662e+00,4.165564463251320e+00,-8.062518139497335e+00,-5.171634054616217e+00,1.064852181827279e+01}, +{ -2.602975887033916e-01,6.586435124089898e-01,-9.249395099007869e-01,1.196791392420787e+00,-1.521583140113142e+00,1.953945080437706e+00,-2.586888347737627e+00,3.617240383212874e+00,-5.562781123708233e+00,1.034113683985215e+01,-3.426770836601949e+01,2.735644086785016e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==13 +const dfloat c_DI[13][13] = { +{ -2.534450184071924e+01,2.794537440644711e+01,-3.596441748337696e+00,1.551312326158976e+00,-9.226386014588124e-01,6.320644784543838e-01,-4.673651076655465e-01,3.612993334954688e-01,-2.860643662369132e-01,2.276800404002845e-01,-1.773883378193001e-01,1.269373025598126e-01,-5.026788527852433e-02}, +{ -1.245352121658003e-01,-1.121621984438793e+01,1.345161908871992e+01,-3.100197536157081e+00,1.599370971205902e+00,-1.036485135201056e+00,7.456503649569690e-01,-5.675861056202095e-01,4.451995392891187e-01,-3.522559495406852e-01,2.734434541683746e-01,-1.952648937819591e-01,7.726125851443304e-02}, +{ 5.834965048870917e-01,-1.586176736446985e+00,-6.224339921506236e+00,9.038745140907983e+00,-2.749206675778145e+00,1.545456199807135e+00,-1.045652889024839e+00,7.706490380638467e-01,-5.931789334033620e-01,4.639397492967491e-01,-3.575916449052510e-01,2.543313370489796e-01,-1.004711689469642e-01}, +{ -4.696507799108541e-01,1.295504016770785e+00,-2.317531331845055e+00,-3.769503066276454e+00,6.867280180048850e+00,-2.491091232243225e+00,1.476810319022152e+00,-1.024553255389815e+00,7.631181257605328e-01,-5.854181913557935e-01,4.460324792737668e-01,-3.151938214800210e-01,1.241965576251310e-01}, +{ 3.605278234287460e-01,-9.637579250949544e-01,1.576098796193197e+00,-2.862130312965922e+00,-2.209695497372163e+00,5.547489943819294e+00,-2.283812974362322e+00,1.404959740651218e+00,-9.883937262931362e-01,7.348613868715219e-01,-5.499114885015767e-01,3.848116179280248e-01,-1.510473843019288e-01}, +{ -2.819596730479230e-01,7.382326878099192e-01,-1.135846329158552e+00,1.770223009635059e+00,-3.379634026067230e+00,-1.030352714153009e+00,4.637166985019850e+00,-2.104684787506014e+00,1.329332698003957e+00,-9.380739286814045e-01,6.825274485597632e-01,-4.706081249815749e-01,1.836767545671585e-01}, +{ 2.255859374999992e-01,-5.829344906221513e-01,8.648651295090918e-01,-1.244250054865206e+00,1.938479199753327e+00,-3.947923970882851e+00,6.651565871861175e-15,3.947923970882840e+00,-1.938479199753319e+00,1.244250054865200e+00,-8.648651295090897e-01,5.829344906221543e-01,-2.255859375000019e-01}, +{ -1.836767545671562e-01,4.706081249815737e-01,-6.825274485597659e-01,9.380739286814112e-01,-1.329332698003966e+00,2.104684787506023e+00,-4.637166985019870e+00,1.030352714153045e+00,3.379634026067201e+00,-1.770223009635047e+00,1.135846329158548e+00,-7.382326878099217e-01,2.819596730479252e-01}, +{ 1.510473843019283e-01,-3.848116179280248e-01,5.499114885015806e-01,-7.348613868715296e-01,9.883937262931441e-01,-1.404959740651225e+00,2.283812974362329e+00,-5.547489943819295e+00,2.209695497372163e+00,2.862130312965918e+00,-1.576098796193200e+00,9.637579250949618e-01,-3.605278234287500e-01}, +{ -1.241965576251307e-01,3.151938214800187e-01,-4.460324792737677e-01,5.854181913557979e-01,-7.631181257605372e-01,1.024553255389819e+00,-1.476810319022153e+00,2.491091232243224e+00,-6.867280180048849e+00,3.769503066276462e+00,2.317531331845049e+00,-1.295504016770790e+00,4.696507799108575e-01}, +{ 1.004711689469628e-01,-2.543313370489764e-01,3.575916449052529e-01,-4.639397492967526e-01,5.931789334033619e-01,-7.706490380638459e-01,1.045652889024838e+00,-1.545456199807129e+00,2.749206675778129e+00,-9.038745140907963e+00,6.224339921506242e+00,1.586176736446969e+00,-5.834965048870899e-01}, +{ -7.726125851443232e-02,1.952648937819572e-01,-2.734434541683767e-01,3.522559495406873e-01,-4.451995392891190e-01,5.675861056202089e-01,-7.456503649569670e-01,1.036485135201053e+00,-1.599370971205885e+00,3.100197536157050e+00,-1.345161908871992e+01,1.121621984438795e+01,1.245352121657923e-01}, +{ 5.026788527852943e-02,-1.269373025598286e-01,1.773883378193260e-01,-2.276800404003204e-01,2.860643662369511e-01,-3.612993334955089e-01,4.673651076655894e-01,-6.320644784544368e-01,9.226386014588680e-01,-1.551312326159052e+00,3.596441748337857e+00,-2.794537440644748e+01,2.534450184071951e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==14 +const dfloat c_DI[14][13] = { +{ -2.692378142346845e+01,3.075590261907604e+01,-5.479522745185573e+00,2.616474742498425e+00,-1.626333254058497e+00,1.139618708153461e+00,-8.536158420732860e-01,6.651380385197434e-01,-5.293084317064082e-01,4.226686377018066e-01,-3.299968108843792e-01,2.364286092657360e-01,-9.367284783861489e-02}, +{ -1.866998081760022e+00,-9.441203803948698e+00,1.430636735176233e+01,-4.556174334325296e+00,2.563063573956401e+00,-1.724403192440706e+00,1.265213898666334e+00,-9.742424348037796e-01,7.696704345563364e-01,-6.117821109092388e-01,4.762734205863750e-01,-3.406678287937653e-01,1.348831074537265e-01}, +{ 1.427513314656397e+00,-4.579383365368056e+00,-3.174742722946418e+00,8.596008102975469e+00,-3.571763339940489e+00,2.186452147137335e+00,-1.536145610472146e+00,1.155239365283510e+00,-8.998856756239768e-01,7.090420955378023e-01,-5.490053344083303e-01,3.914823788765059e-01,-1.548113557076043e-01}, +{ -8.477666093786642e-01,2.456488712203011e+00,-5.426589653268604e+00,1.087646796550414e-01,5.334570286749547e+00,-2.624595238279054e+00,1.701101721361473e+00,-1.229084287623818e+00,9.358939109779082e-01,-7.273602757108638e-01,5.585105050352234e-01,-3.963953696797688e-01,1.564616179585689e-01}, +{ 4.756765465704563e-01,-1.305985449822327e+00,2.330232615499066e+00,-5.735401327562143e+00,2.215397001158003e+00,3.064712211817711e+00,-1.724996862337999e+00,1.171264686086824e+00,-8.628835401770870e-01,6.578569010444595e-01,-4.994101801615267e-01,3.522113318663045e-01,-1.386739339817392e-01}, +{ -2.356691408716703e-01,6.291299421130120e-01,-1.024129210670730e+00,1.834249602498576e+00,-5.696380601132509e+00,3.674368000994618e+00,1.341243752225184e+00,-8.875706468333427e-01,6.351354462906985e-01,-4.753285948064327e-01,3.567958219110823e-01,-2.500481779096155e-01,9.820380619112930e-02}, +{ 7.658245688787764e-02,-2.022537890408770e-01,3.182575560350564e-01,-5.172926037965658e-01,1.057488208932481e+00,-5.349961277293048e+00,4.689229397869841e+00,-8.261688632211300e-03,-1.319682493121814e-01,1.258794269600986e-01,-1.024519877024507e-01,7.425635816879472e-02,-2.950380907681601e-02}, +{ 2.950380907681564e-02,-7.425635816879253e-02,1.024519877024484e-01,-1.258794269600951e-01,1.319682493121754e-01,8.261688632222796e-03,-4.689229397869850e+00,5.349961277293041e+00,-1.057488208932469e+00,5.172926037965578e-01,-3.182575560350513e-01,2.022537890408750e-01,-7.658245688787817e-02}, +{ -9.820380619112870e-02,2.500481779096146e-01,-3.567958219110826e-01,4.753285948064341e-01,-6.351354462906994e-01,8.875706468333417e-01,-1.341243752225180e+00,-3.674368000994620e+00,5.696380601132505e+00,-1.834249602498569e+00,1.024129210670728e+00,-6.291299421130160e-01,2.356691408716717e-01}, +{ 1.386739339817397e-01,-3.522113318663064e-01,4.994101801615311e-01,-6.578569010444669e-01,8.628835401770959e-01,-1.171264686086834e+00,1.724996862338012e+00,-3.064712211817730e+00,-2.215397001157972e+00,5.735401327562120e+00,-2.330232615499067e+00,1.305985449822338e+00,-4.756765465704615e-01}, +{ -1.564616179585683e-01,3.963953696797672e-01,-5.585105050352264e-01,7.273602757108694e-01,-9.358939109779141e-01,1.229084287623825e+00,-1.701101721361478e+00,2.624595238279055e+00,-5.334570286749545e+00,-1.087646796550407e-01,5.426589653268611e+00,-2.456488712203031e+00,8.477666093786751e-01}, +{ 1.548113557076041e-01,-3.914823788765022e-01,5.490053344083288e-01,-7.090420955378020e-01,8.998856756239751e-01,-1.155239365283509e+00,1.536145610472142e+00,-2.186452147137324e+00,3.571763339940472e+00,-8.596008102975471e+00,3.174742722946471e+00,4.579383365368012e+00,-1.427513314656397e+00}, +{ -1.348831074537289e-01,3.406678287937633e-01,-4.762734205863769e-01,6.117821109092401e-01,-7.696704345563318e-01,9.742424348037756e-01,-1.265213898666328e+00,1.724403192440694e+00,-2.563063573956372e+00,4.556174334325248e+00,-1.430636735176232e+01,9.441203803948714e+00,1.866998081760018e+00}, +{ 9.367284783861507e-02,-2.364286092657334e-01,3.299968108843840e-01,-4.226686377018140e-01,5.293084317064124e-01,-6.651380385197438e-01,8.536158420732710e-01,-1.139618708153435e+00,1.626333254058450e+00,-2.616474742498348e+00,5.479522745185474e+00,-3.075590261907605e+01,2.692378142346852e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==15 +const dfloat c_DI[15][13] = { +{ -2.825815930449430e+01,3.314468265562471e+01,-7.104278110913548e+00,3.551363295450744e+00,-2.247904612371698e+00,1.589300003562635e+00,-1.196394696708249e+00,9.350490390135868e-01,-7.455268801140659e-01,5.960632081624385e-01,-4.657393837464767e-01,3.338335796071176e-01,-1.322887930728933e-01}, +{ -3.785883151783151e+00,-7.062397427958593e+00,1.434552569828875e+01,-5.413739467139923e+00,3.180748240753799e+00,-2.179541325273853e+00,1.614422045691538e+00,-1.250019396109066e+00,9.909191971489880e-01,-7.893586643516731e-01,6.153560896232774e-01,-4.404952044349884e-01,1.744633655448906e-01}, +{ 2.053222185432744e+00,-7.344190036700377e+00,3.435456548892843e-01,7.109831581923962e+00,-3.479774769458346e+00,2.235814276491868e+00,-1.605579637115539e+00,1.221846311059588e+00,-9.584970841138893e-01,7.585322561009070e-01,-5.889137090657732e-01,4.205851350698901e-01,-1.664221645143200e-01}, +{ -8.685719083155635e-01,2.650645435510548e+00,-7.508141313732406e+00,3.886058427890057e+00,2.864151633357137e+00,-1.705440142996971e+00,1.178015090805908e+00,-8.773548040041839e-01,6.794219450055742e-01,-5.333702514671212e-01,4.120443041778878e-01,-2.934379492363999e-01,1.159795330055331e-01}, +{ 2.297392573222213e-01,-6.639783264541862e-01,1.403032375230507e+00,-6.526031712771164e+00,5.524641486280619e+00,1.735365414775440e-01,-2.654925892228192e-01,2.265184476001481e-01,-1.844746033361286e-01,1.483296149997164e-01,-1.160415783015808e-01,8.317844224025901e-02,-3.295735506513540e-02}, +{ 9.085835060698197e-02,-2.325804856426148e-01,3.291565102919529e-01,-3.373718183828319e-01,-4.780792509241109e+00,5.871717939109359e+00,-1.389540496184047e+00,7.286887838521511e-01,-4.764434727654868e-01,3.404857160746064e-01,-2.491059063469032e-01,1.721950748436005e-01,-6.726768621566045e-02}, +{ -2.181040930633208e-01,5.680084731520780e-01,-8.607455349478778e-01,1.294238687857475e+00,-2.216390642290178e+00,-2.521945854716042e+00,5.255959254272478e+00,-2.034866738202655e+00,1.226987635655377e+00,-8.479656130374550e-01,6.103169218665435e-01,-4.184734595796542e-01,1.629809630332318e-01}, +{ 2.255859374999995e-01,-5.829344906221520e-01,8.648651295090922e-01,-1.244250054865207e+00,1.938479199753328e+00,-3.947923970882854e+00,1.029292228200320e-14,3.947923970882837e+00,-1.938479199753317e+00,1.244250054865200e+00,-8.648651295090890e-01,5.829344906221539e-01,-2.255859375000011e-01}, +{ -1.629809630332312e-01,4.184734595796549e-01,-6.103169218665477e-01,8.479656130374629e-01,-1.226987635655387e+00,2.034866738202665e+00,-5.255959254272474e+00,2.521945854716020e+00,2.216390642290196e+00,-1.294238687857481e+00,8.607455349478835e-01,-5.680084731520854e-01,2.181040930633246e-01}, +{ 6.726768621566095e-02,-1.721950748436004e-01,2.491059063469049e-01,-3.404857160746086e-01,4.764434727654886e-01,-7.286887838521555e-01,1.389540496184053e+00,-5.871717939109359e+00,4.780792509241102e+00,3.373718183828348e-01,-3.291565102919555e-01,2.325804856426167e-01,-9.085835060698225e-02}, +{ 3.295735506513425e-02,-8.317844224025980e-02,1.160415783015819e-01,-1.483296149997196e-01,1.844746033361338e-01,-2.265184476001494e-01,2.654925892228195e-01,-1.735365414775482e-01,-5.524641486280609e+00,6.526031712771156e+00,-1.403032375230512e+00,6.639783264541951e-01,-2.297392573222238e-01}, +{ -1.159795330055324e-01,2.934379492364020e-01,-4.120443041778946e-01,5.333702514671319e-01,-6.794219450055855e-01,8.773548040041956e-01,-1.178015090805924e+00,1.705440142996991e+00,-2.864151633357177e+00,-3.886058427889996e+00,7.508141313732389e+00,-2.650645435510577e+00,8.685719083155772e-01}, +{ 1.664221645143180e-01,-4.205851350698867e-01,5.889137090657731e-01,-7.585322561009098e-01,9.584970841138912e-01,-1.221846311059591e+00,1.605579637115543e+00,-2.235814276491868e+00,3.479774769458346e+00,-7.109831581923988e+00,-3.435456548892251e-01,7.344190036700351e+00,-2.053222185432753e+00}, +{ -1.744633655448875e-01,4.404952044349828e-01,-6.153560896232767e-01,7.893586643516732e-01,-9.909191971489851e-01,1.250019396109062e+00,-1.614422045691529e+00,2.179541325273832e+00,-3.180748240753758e+00,5.413739467139859e+00,-1.434552569828874e+01,7.062397427958641e+00,3.785883151783123e+00}, +{ 1.322887930728918e-01,-3.338335796070995e-01,4.657393837464552e-01,-5.960632081624091e-01,7.455268801140272e-01,-9.350490390135437e-01,1.196394696708187e+00,-1.589300003562551e+00,2.247904612371567e+00,-3.551363295450533e+00,7.104278110913230e+00,-3.314468265562441e+01,2.825815930449420e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==16 +const dfloat c_DI[16][13] = { +{ -2.939252352761299e+01,3.518491801193201e+01,-8.508325556733942e+00,4.369902829702007e+00,-2.794809672804246e+00,1.985897278694868e+00,-1.499101554792705e+00,1.173592063918299e+00,-9.367111599052471e-01,7.494303505997308e-01,-5.858274384510647e-01,4.200152558312084e-01,-1.664568803779227e-01}, +{ -5.770094353046919e+00,-4.334348386594739e+00,1.380441360705833e+01,-5.799939835248788e+00,3.505883637501928e+00,-2.431505788139009e+00,1.812401371702486e+00,-1.408445962107326e+00,1.119037185274181e+00,-8.927018212620264e-01,6.965485588930808e-01,-4.988746989419336e-01,1.976264849107360e-01}, +{ 2.361176489946930e+00,-9.552084133860385e+00,3.790555028421583e+00,5.118774461543699e+00,-2.815289970115926e+00,1.878077855270354e+00,-1.372448971646724e+00,1.054499265247675e+00,-8.319869949754141e-01,6.607794043569790e-01,-5.141606224434856e-01,3.676632700969322e-01,-1.455550818422167e-01}, +{ -5.477135239569887e-01,1.834360645495398e+00,-8.203486636961621e+00,6.754283418173769e+00,3.989248363991031e-01,-4.286501263357245e-01,3.449668828379047e-01,-2.750791358293391e-01,2.210259845578010e-01,-1.773161518586452e-01,1.387693903046577e-01,-9.954157324695820e-02,3.945599042064370e-02}, +{ -2.078727064373265e-01,5.446546660182493e-01,-7.664764546198212e-01,-5.235819020245050e+00,6.902393201905729e+00,-1.852962482357362e+00,1.009532242905999e+00,-6.740612379799878e-01,4.917263975928156e-01,-3.726279949924676e-01,2.818309328615289e-01,-1.983453094453798e-01,7.802776479307322e-02}, +{ 3.974463776043454e-01,-1.066298295423392e+00,1.764056621418545e+00,-3.325763847053990e+00,-1.629762105163392e+00,5.317235329311184e+00,-2.313906739027519e+00,1.446765206336284e+00,-1.025348337137533e+00,7.653673461889534e-01,-5.740318215625630e-01,4.021805507923255e-01,-1.579402862832459e-01}, +{ -3.053593782312931e-01,8.070742374515062e-01,-1.276053317832168e+00,2.124779066476741e+00,-5.052192165502832e+00,1.784815311711765e+00,2.893528911734700e+00,-1.609093081138918e+00,1.085293563136929e+00,-7.896697345235710e-01,5.839116843632662e-01,-4.060110272186088e-01,1.589759295724832e-01}, +{ 1.131919849887197e-01,-2.968970716535934e-01,4.587251496830903e-01,-7.186515722518284e-01,1.359554338516599e+00,-5.441233765037716e+00,4.300414088000033e+00,4.478670437775281e-01,-3.948633056905128e-01,3.029291525056683e-01,-2.280850338558896e-01,1.597563023461595e-01,-6.270731132825634e-02}, +{ 6.270731132825370e-02,-1.597563023461558e-01,2.280850338558855e-01,-3.029291525056637e-01,3.948633056905068e-01,-4.478670437775152e-01,-4.300414088000044e+00,5.441233765037711e+00,-1.359554338516586e+00,7.186515722518196e-01,-4.587251496830851e-01,2.968970716535915e-01,-1.131919849887189e-01}, +{ -1.589759295724825e-01,4.060110272186066e-01,-5.839116843632657e-01,7.896697345235734e-01,-1.085293563136930e+00,1.609093081138915e+00,-2.893528911734683e+00,-1.784815311711788e+00,5.052192165502836e+00,-2.124779066476735e+00,1.276053317832166e+00,-8.070742374515107e-01,3.053593782312963e-01}, +{ 1.579402862832456e-01,-4.021805507923248e-01,5.740318215625646e-01,-7.653673461889573e-01,1.025348337137540e+00,-1.446765206336291e+00,2.313906739027527e+00,-5.317235329311181e+00,1.629762105163382e+00,3.325763847053995e+00,-1.764056621418551e+00,1.066298295423402e+00,-3.974463776043505e-01}, +{ -7.802776479307150e-02,1.983453094453768e-01,-2.818309328615292e-01,3.726279949924681e-01,-4.917263975928151e-01,6.740612379799888e-01,-1.009532242905997e+00,1.852962482357354e+00,-6.902393201905720e+00,5.235819020245056e+00,7.664764546198083e-01,-5.446546660182446e-01,2.078727064373257e-01}, +{ -3.945599042064263e-02,9.954157324695964e-02,-1.387693903046610e-01,1.773161518586509e-01,-2.210259845578079e-01,2.750791358293466e-01,-3.449668828379139e-01,4.286501263357378e-01,-3.989248363991341e-01,-6.754283418173729e+00,8.203486636961623e+00,-1.834360645495431e+00,5.477135239570032e-01}, +{ 1.455550818422182e-01,-3.676632700969333e-01,5.141606224434894e-01,-6.607794043569865e-01,8.319869949754229e-01,-1.054499265247688e+00,1.372448971646740e+00,-1.878077855270372e+00,2.815289970115960e+00,-5.118774461543786e+00,-3.790555028421461e+00,9.552084133860342e+00,-2.361176489946947e+00}, +{ -1.976264849107334e-01,4.988746989419256e-01,-6.965485588930780e-01,8.927018212620258e-01,-1.119037185274177e+00,1.408445962107317e+00,-1.812401371702471e+00,2.431505788138983e+00,-3.505883637501880e+00,5.799939835248729e+00,-1.380441360705836e+01,4.334348386594900e+00,5.770094353046821e+00}, +{ 1.664568803778934e-01,-4.200152558311760e-01,5.858274384510318e-01,-7.494303505996862e-01,9.367111599051781e-01,-1.173592063918212e+00,1.499101554792587e+00,-1.985897278694703e+00,2.794809672803991e+00,-4.369902829701616e+00,8.508325556733340e+00,-3.518491801193132e+01,2.939252352761269e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==17 +const dfloat c_DI[17][13] = { +{ -3.036286934755003e+01,3.693674195589447e+01,-9.725206106468537e+00,5.086668763593873e+00,-3.275594465007144e+00,2.335202452790682e+00,-1.765988639898343e+00,1.384038807099356e+00,-1.105443496969136e+00,8.848209281508357e-01,-6.918566414037695e-01,4.961145672065133e-01,-1.966287774387649e-01}, +{ -7.744142894620562e+00,-1.440991466052560e+00,1.287128239779263e+01,-5.833401151570973e+00,3.600822002245444e+00,-2.519988102379621e+00,1.887246518006095e+00,-1.470656475548918e+00,1.170465414789176e+00,-9.347471060895569e-01,7.298551634192191e-01,-5.229348416976645e-01,2.071905417072960e-01}, +{ 2.329170502013960e+00,-1.106957042246438e+01,6.853139115590786e+00,3.001872853122421e+00,-1.862395458669179e+00,1.292675748484103e+00,-9.624232969322581e-01,7.471071761006770e-01,-5.931134678686780e-01,4.728845768554510e-01,-3.688401352943300e-01,2.641085527564930e-01,-1.046157436950666e-01}, +{ 3.405961515317163e-03,2.405117262329895e-01,-7.607664218910850e+00,8.453904968335488e+00,-1.561431847288398e+00,7.513185984512452e-01,-4.720227762772735e-01,3.337597490241598e-01,-2.505223591251818e-01,1.928675846694633e-01,-1.472005868712766e-01,1.041070769246736e-01,-4.103387668035599e-02}, +{ -6.233982211223537e-01,1.750376033875391e+00,-3.373579916319581e+00,-2.596954378281199e+00,6.554764451431781e+00,-2.695555894184386e+00,1.655407804171163e+00,-1.166824128378623e+00,8.765666751367832e-01,-6.758417791790216e-01,5.164772051861873e-01,-3.655862611199392e-01,1.441484087837974e-01}, +{ 4.787285222046238e-01,-1.312744532421402e+00,2.332660067348979e+00,-5.652501004507360e+00,2.041625407065359e+00,3.189856871315987e+00,-1.776276154573368e+00,1.201652839151679e+00,-8.837040695721913e-01,6.730714894668688e-01,-5.106720238023862e-01,3.600428539347966e-01,-1.417402656115858e-01}, +{ -1.197456365380936e-01,3.261836694906159e-01,-5.624712081635753e-01,1.155455158257120e+00,-5.693986657980062e+00,4.940757863224287e+00,3.803130383215526e-02,-1.679951020765162e-01,1.545181733727003e-01,-1.278978415032136e-01,1.009292866349363e-01,-7.254280343964714e-02,2.876379488929268e-02}, +{ -1.518952244556092e-01,3.934441468806197e-01,-5.870723318492195e-01,8.511367043973139e-01,-1.303540994531391e+00,-3.525028417286273e+00,5.487648868942469e+00,-1.783877776795519e+00,1.024371913437712e+00,-6.927057963844157e-01,4.929958549140911e-01,-3.360803012271267e-01,1.306033539573467e-01}, +{ 2.255859374999998e-01,-5.829344906221517e-01,8.648651295090920e-01,-1.244250054865206e+00,1.938479199753326e+00,-3.947923970882846e+00,-3.134375316872081e-16,3.947923970882842e+00,-1.938479199753317e+00,1.244250054865200e+00,-8.648651295090899e-01,5.829344906221543e-01,-2.255859375000024e-01}, +{ -1.306033539573456e-01,3.360803012271254e-01,-4.929958549140923e-01,6.927057963844196e-01,-1.024371913437717e+00,1.783877776795524e+00,-5.487648868942474e+00,3.525028417286276e+00,1.303540994531388e+00,-8.511367043973123e-01,5.870723318492197e-01,-3.934441468806220e-01,1.518952244556099e-01}, +{ -2.876379488929337e-02,7.254280343964875e-02,-1.009292866349388e-01,1.278978415032163e-01,-1.545181733727038e-01,1.679951020765208e-01,-3.803130383216106e-02,-4.940757863224277e+00,5.693986657980058e+00,-1.155455158257123e+00,5.624712081635779e-01,-3.261836694906203e-01,1.197456365380955e-01}, +{ 1.417402656115844e-01,-3.600428539347954e-01,5.106720238023866e-01,-6.730714894668729e-01,8.837040695721966e-01,-1.201652839151681e+00,1.776276154573367e+00,-3.189856871315974e+00,-2.041625407065373e+00,5.652501004507361e+00,-2.332660067348981e+00,1.312744532421412e+00,-4.787285222046297e-01}, +{ -1.441484087837982e-01,3.655862611199380e-01,-5.164772051861913e-01,6.758417791790281e-01,-8.765666751367891e-01,1.166824128378630e+00,-1.655407804171169e+00,2.695555894184389e+00,-6.554764451431775e+00,2.596954378281189e+00,3.373579916319591e+00,-1.750376033875406e+00,6.233982211223618e-01}, +{ 4.103387668035161e-02,-1.041070769246690e-01,1.472005868712713e-01,-1.928675846694566e-01,2.505223591251733e-01,-3.337597490241510e-01,4.720227762772618e-01,-7.513185984512246e-01,1.561431847288359e+00,-8.453904968335449e+00,7.607664218910870e+00,-2.405117262330367e-01,-3.405961515301622e-03}, +{ 1.046157436950663e-01,-2.641085527564916e-01,3.688401352943292e-01,-4.728845768554495e-01,5.931134678686745e-01,-7.471071761006715e-01,9.624232969322495e-01,-1.292675748484088e+00,1.862395458669164e+00,-3.001872853122401e+00,-6.853139115590828e+00,1.106957042246443e+01,-2.329170502013982e+00}, +{ -2.071905417072912e-01,5.229348416976586e-01,-7.298551634192200e-01,9.347471060895587e-01,-1.170465414789174e+00,1.470656475548914e+00,-1.887246518006086e+00,2.519988102379605e+00,-3.600822002245415e+00,5.833401151570932e+00,-1.287128239779265e+01,1.440991466052680e+00,7.744142894620493e+00}, +{ 1.966287774387432e-01,-4.961145672064655e-01,6.918566414037173e-01,-8.848209281507738e-01,1.105443496969060e+00,-1.384038807099269e+00,1.765988639898226e+00,-2.335202452790520e+00,3.275594465006889e+00,-5.086668763593477e+00,9.725206106467938e+00,-3.693674195589382e+01,3.036286934754975e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==18 +const dfloat c_DI[18][13] = { +{ -3.119800239532919e+01,3.844911672636646e+01,-1.078375006387131e+01,5.715349860210662e+00,-3.698635237941168e+00,2.643024894692090e+00,-2.001379283148238e+00,1.569743984426363e+00,-1.254386208574297e+00,1.004357022762671e+00,-7.854818653406931e-01,5.633163533807468e-01,-2.232737876340928e-01}, +{ -9.659614944342273e+00,1.490874671334226e+00,1.168843982498288e+01,-5.613042988095098e+00,3.523135257758106e+00,-2.483633562278944e+00,1.867162296241453e+00,-1.458273595999826e+00,1.162230948053656e+00,-9.289980404069274e-01,7.257726038156339e-01,-5.201769921783643e-01,2.061245211154829e-01}, +{ 1.980920009304639e+00,-1.188882830970825e+01,9.385991957567018e+00,9.927793986497876e-01,-8.212025917156048e-01,6.171763397411847e-01,-4.761229072228760e-01,3.767563180591347e-01,-3.025220151919979e-01,2.429065402215244e-01,-1.902896477847487e-01,1.365950297951987e-01,-5.416012171501558e-02}, +{ 6.509958083772864e-01,-1.808979660644045e+00,-6.026451096134407e+00,9.055883172485057e+00,-2.850773548673638e+00,1.617627061572366e+00,-1.099042858022026e+00,8.118097551760929e-01,-6.256886955856362e-01,4.897679181655986e-01,-3.776903710202289e-01,2.687034488716331e-01,-1.061609345680529e-01}, +{ -8.683845213152308e-01,2.528890359689118e+00,-5.715135250143510e+00,5.457289875149944e-01,5.089897949026905e+00,-2.561513794459313e+00,1.672266206572051e+00,-1.212354175185877e+00,9.248816140313622e-01,-7.195998360653326e-01,5.529204935056311e-01,-3.925744416891634e-01,1.549764085183650e-01}, +{ 3.065900238393021e-01,-8.699266813382975e-01,1.733008789270357e+00,-6.548027584704949e+00,4.966798109003236e+00,7.503277682312025e-01,-5.915711914800130e-01,4.468977560425237e-01,-3.462030741299763e-01,2.713013794728784e-01,-2.092340147167815e-01,1.488368591530353e-01,-5.879813864251741e-02}, +{ 1.839573951030595e-01,-4.825971601850280e-01,7.430486754815776e-01,-1.104717690911856e+00,-4.106357511560907e+00,5.956063043439467e+00,-1.807747426576976e+00,1.020784456772069e+00,-6.909045011265600e-01,5.031626832590823e-01,-3.721352988200415e-01,2.587622801689560e-01,-1.013189450428438e-01}, +{ -3.072830922382854e-01,8.075470906705610e-01,-1.255904033192486e+00,2.008347615859086e+00,-4.161205034100046e+00,1.442534559815058e-01,3.990031965794509e+00,-1.983849607596376e+00,1.289164038806358e+00,-9.217126829929041e-01,6.752432399018827e-01,-4.672459021668181e-01,1.826129452730125e-01}, +{ 1.391146663921827e-01,-3.637451873095827e-01,5.572510158386107e-01,-8.577158285845565e-01,1.559835348913514e+00,-5.457475800467678e+00,3.949415689338607e+00,8.229008156715925e-01,-6.023316487900751e-01,4.403996865670909e-01,-3.248589148923984e-01,2.253536094067208e-01,-8.814345208402652e-02}, +{ 8.814345208402563e-02,-2.253536094067175e-01,3.248589148923952e-01,-4.403996865670877e-01,6.023316487900701e-01,-8.229008156715798e-01,-3.949415689338620e+00,5.457475800467676e+00,-1.559835348913503e+00,8.577158285845469e-01,-5.572510158386047e-01,3.637451873095813e-01,-1.391146663921814e-01}, +{ -1.826129452730108e-01,4.672459021668174e-01,-6.752432399018857e-01,9.217126829929109e-01,-1.289164038806366e+00,1.983849607596384e+00,-3.990031965794522e+00,-1.442534559814858e-01,4.161205034100029e+00,-2.008347615859077e+00,1.255904033192484e+00,-8.075470906705655e-01,3.072830922382872e-01}, +{ 1.013189450428432e-01,-2.587622801689549e-01,3.721352988200424e-01,-5.031626832590856e-01,6.909045011265627e-01,-1.020784456772071e+00,1.807747426576976e+00,-5.956063043439467e+00,4.106357511560915e+00,1.104717690911845e+00,-7.430486754815745e-01,4.825971601850287e-01,-1.839573951030608e-01}, +{ 5.879813864251615e-02,-1.488368591530358e-01,2.092340147167814e-01,-2.713013794728786e-01,3.462030741299784e-01,-4.468977560425250e-01,5.915711914800154e-01,-7.503277682312025e-01,-4.966798109003234e+00,6.548027584704943e+00,-1.733008789270358e+00,8.699266813383055e-01,-3.065900238393056e-01}, +{ -1.549764085183635e-01,3.925744416891623e-01,-5.529204935056333e-01,7.195998360653388e-01,-9.248816140313691e-01,1.212354175185884e+00,-1.672266206572058e+00,2.561513794459317e+00,-5.089897949026918e+00,-5.457289875149646e-01,5.715135250143500e+00,-2.528890359689135e+00,8.683845213152407e-01}, +{ 1.061609345680502e-01,-2.687034488716262e-01,3.776903710202233e-01,-4.897679181655926e-01,6.256886955856259e-01,-8.118097551760812e-01,1.099042858022009e+00,-1.617627061572338e+00,2.850773548673593e+00,-9.055883172485036e+00,6.026451096134474e+00,1.808979660643963e+00,-6.509958083772659e-01}, +{ 5.416012171501561e-02,-1.365950297951980e-01,1.902896477847485e-01,-2.429065402215268e-01,3.025220151920013e-01,-3.767563180591373e-01,4.761229072228761e-01,-6.171763397411828e-01,8.212025917156135e-01,-9.927793986498107e-01,-9.385991957567013e+00,1.188882830970828e+01,-1.980920009304667e+00}, +{ -2.061245211154841e-01,5.201769921783599e-01,-7.257726038156340e-01,9.289980404069271e-01,-1.162230948053650e+00,1.458273595999822e+00,-1.867162296241452e+00,2.483633562278931e+00,-3.523135257758076e+00,5.613042988095056e+00,-1.168843982498286e+01,-1.490874671334264e+00,9.659614944342319e+00}, +{ 2.232737876340742e-01,-5.633163533807430e-01,7.854818653407002e-01,-1.004357022762676e+00,1.254386208574293e+00,-1.569743984426357e+00,2.001379283148228e+00,-2.643024894692080e+00,3.698635237941138e+00,-5.715349860210623e+00,1.078375006387133e+01,-3.844911672636669e+01,3.119800239532940e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==19 +const dfloat c_DI[19][13] = { +{ -3.192101461569459e+01,3.976180529390244e+01,-1.170828741293016e+01,6.268168275143989e+00,-4.071601552919226e+00,2.914753785959585e+00,-2.209314333643195e+00,1.733857638163548e+00,-1.386046601843068e+00,1.110041019540778e+00,-8.682665582973720e-01,6.227407925345159e-01,-2.468357299172571e-01}, +{ -1.148749803200814e+01,4.377379240557479e+00,1.035970314712100e+01,-5.216679994468991e+00,3.320977890678199e+00,-2.355755749564545e+00,1.776873943443882e+00,-1.390445678031310e+00,1.109509713554885e+00,-8.875401960929726e-01,6.937201352662237e-01,-4.973427602993193e-01,1.970983398436026e-01}, +{ 1.362453553019809e+00,-1.207135618831085e+01,1.135192495117330e+01,-7.830535170357373e-01,1.826008268469476e-01,-5.621666892010498e-02,1.682234836092176e-02,-2.239232925625622e-03,-3.413654061756417e-03,5.317075397638491e-03,-5.406062598147012e-03,4.385244403044296e-03,-1.818675349438985e-03}, +{ 1.278900289684723e+00,-4.012869303123250e+00,-3.812435213225866e+00,8.779631349731376e+00,-3.496240157019972e+00,2.114875077965114e+00,-1.478014532193119e+00,1.108368385192830e+00,-8.619242511157499e-01,6.784247561330056e-01,-5.249614020374439e-01,3.742008121220791e-01,-1.479558121137287e-01}, +{ -8.833186242827016e-01,2.679148018937134e+00,-7.359134899683322e+00,3.536901080587642e+00,3.125815961421282e+00,-1.824601717019474e+00,1.251802233956951e+00,-9.292721468222348e-01,7.183166865302284e-01,-5.632897089754899e-01,4.348715201052593e-01,-3.095800546656353e-01,1.223416499103600e-01}, +{ -2.913346136098231e-02,4.439109300626154e-02,1.756090117442043e-01,-5.946314773913931e+00,6.597167392225304e+00,-1.203616872195812e+00,5.762976719517436e-01,-3.608366938169498e-01,2.537224825946345e-01,-1.880167501758191e-01,1.402744496311235e-01,-9.796353669987745e-02,3.841998701010022e-02}, +{ 4.197480154496575e-01,-1.128876060371564e+00,1.882228560415934e+00,-3.640823327185700e+00,-1.214562477818707e+00,5.131160111605896e+00,-2.312128240624191e+00,1.460807865457822e+00,-1.040260871141017e+00,7.785021625462399e-01,-5.847398392264725e-01,4.100084699889801e-01,-1.610643690968795e-01}, +{ -2.427663477353126e-01,6.475048888448667e-01,-1.051331151007440e+00,1.870888891903980e+00,-5.673217889389456e+00,3.557493716536846e+00,1.448136229377451e+00,-9.424165253971934e-01,6.706993371540229e-01,-5.006521002383374e-01,3.752905051082545e-01,-2.628217799952535e-01,1.031922248375709e-01}, +{ -9.306768803522866e-02,2.392192613221958e-01,-3.490816382946252e-01,4.791643427095644e-01,-6.054282308609217e-01,-4.196483197906905e+00,5.502691527882382e+00,-1.461981466777850e+00,7.924462196041003e-01,-5.220347358143903e-01,3.664901252571678e-01,-2.480803415560402e-01,9.614582247054967e-02}, +{ 2.255859374999994e-01,-5.829344906221517e-01,8.648651295090913e-01,-1.244250054865205e+00,1.938479199753324e+00,-3.947923970882841e+00,-8.824855536099263e-15,3.947923970882848e+00,-1.938479199753319e+00,1.244250054865200e+00,-8.648651295090890e-01,5.829344906221543e-01,-2.255859375000014e-01}, +{ -9.614582247054951e-02,2.480803415560408e-01,-3.664901252571708e-01,5.220347358143962e-01,-7.924462196041082e-01,1.461981466777860e+00,-5.502691527882385e+00,4.196483197906895e+00,6.054282308609309e-01,-4.791643427095683e-01,3.490816382946290e-01,-2.392192613221998e-01,9.306768803523027e-02}, +{ -1.031922248375716e-01,2.628217799952544e-01,-3.752905051082575e-01,5.006521002383421e-01,-6.706993371540280e-01,9.424165253971993e-01,-1.448136229377459e+00,-3.557493716536834e+00,5.673217889389446e+00,-1.870888891903976e+00,1.051331151007441e+00,-6.475048888448725e-01,2.427663477353163e-01}, +{ 1.610643690968790e-01,-4.100084699889795e-01,5.847398392264750e-01,-7.785021625462477e-01,1.040260871141026e+00,-1.460807865457832e+00,2.312128240624202e+00,-5.131160111605900e+00,1.214562477818714e+00,3.640823327185689e+00,-1.882228560415935e+00,1.128876060371573e+00,-4.197480154496641e-01}, +{ -3.841998701010005e-02,9.796353669987309e-02,-1.402744496311195e-01,1.880167501758157e-01,-2.537224825946288e-01,3.608366938169429e-01,-5.762976719517324e-01,1.203616872195794e+00,-6.597167392225288e+00,5.946314773913938e+00,-1.756090117442268e-01,-4.439109300624741e-02,2.913346136097869e-02}, +{ -1.223416499103592e-01,3.095800546656348e-01,-4.348715201052611e-01,5.632897089754941e-01,-7.183166865302327e-01,9.292721468222391e-01,-1.251802233956955e+00,1.824601717019477e+00,-3.125815961421289e+00,-3.536901080587627e+00,7.359134899683323e+00,-2.679148018937158e+00,8.833186242827141e-01}, +{ 1.479558121137261e-01,-3.742008121220746e-01,5.249614020374436e-01,-6.784247561330073e-01,8.619242511157511e-01,-1.108368385192828e+00,1.478014532193108e+00,-2.114875077965099e+00,3.496240157019951e+00,-8.779631349731384e+00,3.812435213225943e+00,4.012869303123181e+00,-1.278900289684712e+00}, +{ 1.818675349438914e-03,-4.385244403042238e-03,5.406062598142613e-03,-5.317075397634810e-03,3.413654061752018e-03,2.239232925629551e-03,-1.682234836092496e-02,5.621666892011316e-02,-1.826008268469524e-01,7.830535170357374e-01,-1.135192495117332e+01,1.207135618831087e+01,-1.362453553019816e+00}, +{ -1.970983398435990e-01,4.973427602993156e-01,-6.937201352662246e-01,8.875401960929774e-01,-1.109509713554888e+00,1.390445678031311e+00,-1.776873943443882e+00,2.355755749564539e+00,-3.320977890678189e+00,5.216679994468986e+00,-1.035970314712110e+01,-4.377379240557279e+00,1.148749803200803e+01}, +{ 2.468357299172652e-01,-6.227407925345290e-01,8.682665582973955e-01,-1.110041019540796e+00,1.386046601843069e+00,-1.733857638163549e+00,2.209314333643192e+00,-2.914753785959578e+00,4.071601552919192e+00,-6.268168275143935e+00,1.170828741293014e+01,-3.976180529390260e+01,3.192101461569473e+01} +}; +#endif +#if p_Nq==13 && p_cubNq==20 +const dfloat c_DI[20][13] = { +{ -3.255048842294853e+01,4.090712873764020e+01,-1.251916765847401e+01,6.755753836493844e+00,-4.401276358248587e+00,3.155195980093287e+00,-2.393415438062906e+00,1.879211356829612e+00,-1.502682754231628e+00,1.203678468201827e+00,-9.416215755164717e-01,6.753991869850904e-01,-2.677153587617395e-01}, +{ -1.321215415532219e+01,7.165557238197081e+00,8.958862490208226e+00,-4.703431222041189e+00,3.032541344298619e+00,-2.163325909969156e+00,1.636627125264659e+00,-1.282958427583998e+00,1.024866121444126e+00,-8.204069615289147e-01,6.415311800088027e-01,-4.600443487812169e-01,1.823355258051539e-01}, +{ 5.270280505902218e-01,-1.170975664502319e+01,1.277603946396969e+01,-2.272304347374643e+00,1.078847859041125e+00,-6.723865711601782e-01,4.734614803561986e-01,-3.557929515687864e-01,2.768172235840215e-01,-2.178829169879340e-01,1.685752063300601e-01,-1.201492036643957e-01,4.750335190780824e-02}, +{ 1.803201316068473e+00,-6.136713977924257e+00,-1.280402847930359e+00,7.879796499182341e+00,-3.612752361280025e+00,2.275555660285872e+00,-1.619361117563946e+00,1.226269283472956e+00,-9.591423456205929e-01,7.576567766970675e-01,-5.875694325443453e-01,4.193554776573768e-01,-1.658929305005614e-01}, +{ -6.784902518626529e-01,2.185942416500000e+00,-8.135994579665836e+00,5.982928582469567e+00,1.127799229308845e+00,-8.318254999617805e-01,6.147425820243337e-01,-4.726199353349026e-01,3.724757684969977e-01,-2.954751040954547e-01,2.297026390637734e-01,-1.641595816168231e-01,6.497373467393200e-02}, +{ -4.029852060774377e-01,1.101903331819615e+00,-1.898716255314336e+00,-4.196631601272260e+00,6.930459835738472e+00,-2.361834802548750e+00,1.376219469801984e+00,-9.474057875387619e-01,7.027149611218362e-01,-5.377605577267883e-01,4.091223713445399e-01,-2.888751385548904e-01,1.137893792067776e-01}, +{ 4.804808798926462e-01,-1.316418366570972e+00,2.332494201915280e+00,-5.591462775629219e+00,1.917284166734099e+00,3.278082016444890e+00,-1.811622252501302e+00,1.222392425993562e+00,-8.978367929232693e-01,6.833655324248787e-01,-5.182767618884698e-01,3.653254008949283e-01,-1.438076747870515e-01}, +{ -1.477225331763106e-02,4.882930242353933e-02,-1.254898735114892e-01,4.498179974945654e-01,-5.347986362851987e+00,5.550598823160893e+00,-7.716704572412102e-01,3.251578598033357e-01,-1.880501679001469e-01,1.247141312293092e-01,-8.715638850723550e-02,5.870281989890724e-02,-2.269543068085027e-02}, +{ -2.774118117088713e-01,7.259887240482715e-01,-1.115526650446593e+00,1.733087223606795e+00,-3.276660367787936e+00,-1.173342135594394e+00,4.707257011745867e+00,-2.109891463720218e+00,1.327586057985504e+00,-9.352288431837937e-01,6.798460225496630e-01,-4.685414382646729e-01,1.828376707703787e-01}, +{ 1.579424753560368e-01,-4.121903012961491e-01,6.282140494815259e-01,-9.565000396768035e-01,1.696870566174595e+00,-5.435756622032507e+00,3.639633190045440e+00,1.133083746015058e+00,-7.678960968464896e-01,5.485710865670608e-01,-4.004843049130222e-01,2.764389098629190e-01,-1.079266587376660e-01}, +{ 1.079266587376639e-01,-2.764389098629179e-01,4.004843049130236e-01,-5.485710865670634e-01,7.678960968464923e-01,-1.133083746015059e+00,-3.639633190045438e+00,5.435756622032502e+00,-1.696870566174590e+00,9.565000396767998e-01,-6.282140494815255e-01,4.121903012961518e-01,-1.579424753560388e-01}, +{ -1.828376707703778e-01,4.685414382646723e-01,-6.798460225496656e-01,9.352288431838007e-01,-1.327586057985513e+00,2.109891463720225e+00,-4.707257011745862e+00,1.173342135594383e+00,3.276660367787939e+00,-1.733087223606792e+00,1.115526650446595e+00,-7.259887240482774e-01,2.774118117088748e-01}, +{ 2.269543068084885e-02,-5.870281989890399e-02,8.715638850723222e-02,-1.247141312293047e-01,1.880501679001412e-01,-3.251578598033298e-01,7.716704572412019e-01,-5.550598823160883e+00,5.347986362851990e+00,-4.498179974945745e-01,1.254898735114958e-01,-4.882930242354519e-02,1.477225331763137e-02}, +{ 1.438076747870514e-01,-3.653254008949297e-01,5.182767618884748e-01,-6.833655324248875e-01,8.978367929232820e-01,-1.222392425993575e+00,1.811622252501317e+00,-3.278082016444917e+00,-1.917284166734060e+00,5.591462775629195e+00,-2.332494201915283e+00,1.316418366570985e+00,-4.804808798926530e-01}, +{ -1.137893792067775e-01,2.888751385548893e-01,-4.091223713445406e-01,5.377605577267925e-01,-7.027149611218414e-01,9.474057875387676e-01,-1.376219469801989e+00,2.361834802548751e+00,-6.930459835738471e+00,4.196631601272262e+00,1.898716255314336e+00,-1.101903331819621e+00,4.029852060774422e-01}, +{ -6.497373467393297e-02,1.641595816168233e-01,-2.297026390637732e-01,2.954751040954578e-01,-3.724757684970030e-01,4.726199353349063e-01,-6.147425820243371e-01,8.318254999617835e-01,-1.127799229308859e+00,-5.982928582469540e+00,8.135994579665836e+00,-2.185942416500026e+00,6.784902518626650e-01}, +{ 1.658929305005590e-01,-4.193554776573724e-01,5.875694325443449e-01,-7.576567766970709e-01,9.591423456205980e-01,-1.226269283472959e+00,1.619361117563946e+00,-2.275555660285869e+00,3.612752361280019e+00,-7.879796499182357e+00,1.280402847930414e+00,6.136713977924225e+00,-1.803201316068477e+00}, +{ -4.750335190780486e-02,1.201492036643894e-01,-1.685752063300559e-01,2.178829169879277e-01,-2.768172235840125e-01,3.557929515687769e-01,-4.734614803561874e-01,6.723865711601617e-01,-1.078847859041089e+00,2.272304347374579e+00,-1.277603946396966e+01,1.170975664502323e+01,-5.270280505902593e-01}, +{ -1.823355258051489e-01,4.600443487812115e-01,-6.415311800088010e-01,8.204069615289136e-01,-1.024866121444125e+00,1.282958427583992e+00,-1.636627125264648e+00,2.163325909969138e+00,-3.032541344298588e+00,4.703431222041136e+00,-8.958862490208190e+00,-7.165557238197137e+00,1.321215415532225e+01}, +{ 2.677153587617624e-01,-6.753991869851411e-01,9.416215755165515e-01,-1.203678468201923e+00,1.502682754231737e+00,-1.879211356829754e+00,2.393415438063072e+00,-3.155195980093483e+00,4.401276358248829e+00,-6.755753836494192e+00,1.251916765847467e+01,-4.090712873764134e+01,3.255048842294921e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==14 +const dfloat c_DI[14][14] = { +{ -2.939253641064719e+01,3.230592231946941e+01,-4.012097645098290e+00,1.707677330780151e+00,-1.009686753805695e+00,6.901249998338667e-01,-5.104701716136937e-01,3.959384446376005e-01,-3.160102907815688e-01,2.557811808710455e-01,-2.067073415777446e-01,1.626983185000961e-01,-1.171377855666336e-01,4.650380499864967e-02}, +{ -3.805606663095248e-03,-1.315039372912953e+01,1.550694458051991e+01,-3.441491349502366e+00,1.756507947464320e+00,-1.133752862738018e+00,8.151980104274708e-01,-6.222513737366141e-01,4.917834178630712e-01,-3.955491400579547e-01,3.183448085961646e-01,-2.499086279590211e-01,1.796517723216731e-01,-7.127784740601130e-02}, +{ 5.953094848018973e-01,-1.575043556797312e+00,-7.410422505176678e+00,1.040784244908798e+01,-3.049395313621607e+00,1.697916630775934e+00,-1.145705305922694e+00,8.457010717831644e-01,-6.552991544283963e-01,5.205727474453409e-01,-4.156386969441448e-01,3.246450826773256e-01,-2.326995621201775e-01,9.221662843937081e-02}, +{ -4.917544560318113e-01,1.349397154638722e+00,-2.361331112069333e+00,-4.613820733468926e+00,7.908302480838153e+00,-2.766337226587043e+00,1.625831419183244e+00,-1.126781767749047e+00,8.434943740158359e-01,-6.563420661767571e-01,5.172921854737184e-01,-4.008004769381168e-01,2.859675278923408e-01,-1.131173030209795e-01}, +{ 3.807215775001134e-01,-1.015323846517881e+00,1.648161153502815e+00,-2.924227390032502e+00,-2.867378878637319e+00,6.397333527138642e+00,-2.542900107343539e+00,1.552664889665606e+00,-1.094406983040802e+00,8.234936973565520e-01,-6.361218424863573e-01,4.869110069949832e-01,-3.450403929021293e-01,1.361135888018179e-01}, +{ -2.991967413848454e-01,7.821820277806696e-01,-1.198450916944693e+00,1.850266584118980e+00,-3.435956560963535e+00,-1.583505573213424e+00,5.363895343602235e+00,-2.353775621127276e+00,1.478808653103465e+00,-1.051997939635221e+00,7.875728707712822e-01,-5.919633690160624e-01,4.153152333278771e-01,-1.631939904194518e-01}, +{ 2.404068826180497e-01,-6.205038004227549e-01,9.178045561850223e-01,-1.312641871097070e+00,2.020530818743557e+00,-3.972844185220432e+00,-5.076572802618726e-01,4.592534448840404e+00,-2.183687039270851e+00,1.401062353798898e+00,-9.967071767138922e-01,7.286080320898741e-01,-5.037104318129453e-01,1.968046925240128e-01}, +{ -1.968046925240133e-01,5.037104318129449e-01,-7.286080320898742e-01,9.967071767138953e-01,-1.401062353798897e+00,2.183687039270843e+00,-4.592534448840396e+00,5.076572802618611e-01,3.972844185220442e+00,-2.020530818743560e+00,1.312641871097068e+00,-9.178045561850212e-01,6.205038004227550e-01,-2.404068826180491e-01}, +{ 1.631939904194554e-01,-4.153152333278781e-01,5.919633690160641e-01,-7.875728707712858e-01,1.051997939635223e+00,-1.478808653103463e+00,2.353775621127274e+00,-5.363895343602231e+00,1.583505573213413e+00,3.435956560963539e+00,-1.850266584118975e+00,1.198450916944692e+00,-7.821820277806690e-01,2.991967413848436e-01}, +{ -1.361135888018188e-01,3.450403929021289e-01,-4.869110069949844e-01,6.361218424863603e-01,-8.234936973565511e-01,1.094406983040799e+00,-1.552664889665603e+00,2.542900107343539e+00,-6.397333527138652e+00,2.867378878637338e+00,2.924227390032486e+00,-1.648161153502810e+00,1.015323846517878e+00,-3.807215775001124e-01}, +{ 1.131173030209814e-01,-2.859675278923434e-01,4.008004769381214e-01,-5.172921854737241e-01,6.563420661767607e-01,-8.434943740158404e-01,1.126781767749053e+00,-1.625831419183256e+00,2.766337226587062e+00,-7.908302480838147e+00,4.613820733468893e+00,2.361331112069360e+00,-1.349397154638735e+00,4.917544560318154e-01}, +{ -9.221662843937117e-02,2.326995621201768e-01,-3.246450826773230e-01,4.156386969441410e-01,-5.205727474453390e-01,6.552991544283916e-01,-8.457010717831582e-01,1.145705305922692e+00,-1.697916630775931e+00,3.049395313621590e+00,-1.040784244908796e+01,7.410422505176695e+00,1.575043556797280e+00,-5.953094848018838e-01}, +{ 7.127784740601283e-02,-1.796517723216722e-01,2.499086279590190e-01,-3.183448085961662e-01,3.955491400579667e-01,-4.917834178630847e-01,6.222513737366208e-01,-8.151980104274802e-01,1.133752862738036e+00,-1.756507947464336e+00,3.441491349502388e+00,-1.550694458051996e+01,1.315039372912953e+01,3.805606663129142e-03}, +{ -4.650380499864326e-02,1.171377855666300e-01,-1.626983185000963e-01,2.067073415777498e-01,-2.557811808710588e-01,3.160102907815844e-01,-3.959384446376074e-01,5.104701716136928e-01,-6.901249998338770e-01,1.009686753805707e+00,-1.707677330780172e+00,4.012097645098336e+00,-3.230592231946940e+01,2.939253641064716e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==15 +const dfloat c_DI[15][14] = { +{ -3.113315858247676e+01,3.539939755355282e+01,-6.079222585574962e+00,2.874558761120601e+00,-1.780362622808984e+00,1.246453081376440e+00,-9.348001934816521e-01,7.312910566726664e-01,-5.869167842032105e-01,4.768196094692556e-01,-3.862959599932690e-01,3.045427984826698e-01,-2.194691140338811e-01,8.716298189826978e-02}, +{ -1.849311148440213e+00,-1.133889198731030e+01,1.655035144736470e+01,-5.090956926425943e+00,2.839706805425552e+00,-1.905363793671459e+00,1.398426245947453e+00,-1.080460169449757e+00,8.604793867609094e-01,-6.955753683431130e-01,5.616712195549198e-01,-4.418697335378143e-01,3.180433943278427e-01,-1.262493722027804e-01}, +{ 1.516048608194358e+00,-4.779618677921331e+00,-4.254935229543495e+00,1.011228566962531e+01,-4.066230830711277e+00,2.468344807236331e+00,-1.730621343384537e+00,1.304139123178658e+00,-1.023108982547813e+00,8.191632812022744e-01,-6.573725918017412e-01,5.151184822820041e-01,-3.699179392813718e-01,1.467056234726316e-01}, +{ -9.319480052833687e-01,2.681838127829021e+00,-5.745775954367827e+00,-5.697048924104383e-01,6.487719975658067e+00,-3.089819598487593e+00,1.985385651772664e+00,-1.433067790152548e+00,1.096959366914936e+00,-8.650962599511877e-01,6.875798334483056e-01,-5.355380571640124e-01,3.832468607340438e-01,-1.517792585400629e-01}, +{ 5.490498413959488e-01,-1.500765203399826e+00,2.638862382589226e+00,-6.151526053979044e+00,1.794891606177101e+00,3.985904722848185e+00,-2.161760174139069e+00,1.453548666822116e+00,-1.071865714847795e+00,8.269450940315918e-01,-6.483898776185691e-01,5.007959799388060e-01,-3.566797512343967e-01,1.409884814157246e-01}, +{ -2.991786878646314e-01,7.954238991485976e-01,-1.279663356867976e+00,2.226156899504494e+00,-6.213157856648331e+00,3.451953799083871e+00,2.086332497068443e+00,-1.290013579865368e+00,9.074666769099530e-01,-6.813123371168949e-01,5.254500092120746e-01,-4.017775537043593e-01,2.845380809926421e-01,-1.122184898525140e-01}, +{ 1.310631316269951e-01,-3.434365216566025e-01,5.292845765639410e-01,-8.250873792102920e-01,1.544904824944846e+00,-5.983325587746759e+00,4.639810124373638e+00,5.839696666543868e-01,-4.888264506312727e-01,3.750622629780588e-01,-2.900556348613220e-01,2.216243251398919e-01,-1.567924865574143e-01,6.180514838190436e-02}, +{ -1.611328125000069e-02,4.312512707585636e-02,-6.969293589826984e-02,1.157432968296397e-01,-2.237658131830447e-01,6.125800096199093e-01,-5.466238122747793e+00,5.466238122747806e+00,-6.125800096199284e-01,2.237658131830548e-01,-1.157432968296459e-01,6.969293589827495e-02,-4.312512707585991e-02,1.611328124999980e-02}, +{ -6.180514838190318e-02,1.567924865574108e-01,-2.216243251398869e-01,2.900556348613157e-01,-3.750622629780482e-01,4.888264506312585e-01,-5.839696666543681e-01,-4.639810124373659e+00,5.983325587746762e+00,-1.544904824944836e+00,8.250873792102817e-01,-5.292845765639339e-01,3.434365216565993e-01,-1.310631316269932e-01}, +{ 1.122184898525130e-01,-2.845380809926418e-01,4.017775537043598e-01,-5.254500092120760e-01,6.813123371168929e-01,-9.074666769099480e-01,1.290013579865364e+00,-2.086332497068435e+00,-3.451953799083885e+00,6.213157856648331e+00,-2.226156899504482e+00,1.279663356867972e+00,-7.954238991485945e-01,2.991786878646295e-01}, +{ -1.409884814157265e-01,3.566797512343964e-01,-5.007959799388071e-01,6.483898776185731e-01,-8.269450940315930e-01,1.071865714847793e+00,-1.453548666822114e+00,2.161760174139069e+00,-3.985904722848181e+00,-1.794891606177101e+00,6.151526053979040e+00,-2.638862382589228e+00,1.500765203399828e+00,-5.490498413959481e-01}, +{ 1.517792585400645e-01,-3.832468607340446e-01,5.355380571640150e-01,-6.875798334483106e-01,8.650962599511901e-01,-1.096959366914937e+00,1.433067790152552e+00,-1.985385651772675e+00,3.089819598487610e+00,-6.487719975658089e+00,5.697048924104735e-01,5.745775954367804e+00,-2.681838127829018e+00,9.319480052833659e-01}, +{ -1.467056234726323e-01,3.699179392813724e-01,-5.151184822820026e-01,6.573725918017398e-01,-8.191632812022758e-01,1.023108982547811e+00,-1.304139123178651e+00,1.730621343384537e+00,-2.468344807236332e+00,4.066230830711265e+00,-1.011228566962530e+01,4.254935229543508e+00,4.779618677921304e+00,-1.516048608194347e+00}, +{ 1.262493722027809e-01,-3.180433943278408e-01,4.418697335378101e-01,-5.616712195549165e-01,6.955753683431178e-01,-8.604793867609146e-01,1.080460169449757e+00,-1.398426245947454e+00,1.905363793671463e+00,-2.839706805425544e+00,5.090956926425922e+00,-1.655035144736471e+01,1.133889198731031e+01,1.849311148440221e+00}, +{ -8.716298189827398e-02,2.194691140338588e-01,-3.045427984826466e-01,3.862959599932462e-01,-4.768196094692326e-01,5.869167842031805e-01,-7.312910566726227e-01,9.348001934815962e-01,-1.246453081376379e+00,1.780362622808889e+00,-2.874558761120459e+00,6.079222585574744e+00,-3.539939755355243e+01,3.113315858247653e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==16 +const dfloat c_DI[16][14] = { +{ -3.262129521008634e+01,3.805924444152497e+01,-7.882537292994348e+00,3.909449939710833e+00,-2.468067683249664e+00,1.744339938023058e+00,-1.315166935199983e+00,1.032191041601237e+00,-8.301424481343153e-01,6.753539901215641e-01,-5.476445314870646e-01,4.320032101163550e-01,-3.114327767713547e-01,1.237043168250496e-01}, +{ -3.891892975566706e+00,-8.878944589479989e+00,1.674182657076423e+01,-6.126371269395097e+00,3.572378160085515e+00,-2.442335031375684e+00,1.810114820123592e+00,-1.406546630451441e+00,1.124192119042638e+00,-9.108744749636930e-01,7.366579148523869e-01,-5.801076935823682e-01,4.177839848341387e-01,-1.658809048875231e-01}, +{ 2.244044236317652e+00,-7.830471834499365e+00,-5.269154302616440e-01,8.687514609174199e+00,-4.125136692082596e+00,2.628162461609280e+00,-1.883268906362024e+00,1.436067859106085e+00,-1.134649655714881e+00,9.125825370269020e-01,-7.344895082486278e-01,5.766212040386599e-01,-4.145313931843448e-01,1.644705130807057e-01}, +{ -1.030648976959372e+00,3.103704699523309e+00,-8.229114744439714e+00,3.561951571982519e+00,3.961033160648553e+00,-2.260821120353147e+00,1.541812879667721e+00,-1.144884936998775e+00,8.903891012487848e-01,-7.090113491340981e-01,5.669762163976744e-01,-4.432940638718373e-01,3.179298035112530e-01,-1.260222412228709e-01}, +{ 3.526467436886286e-01,-9.999494410832201e-01,1.987906073485856e+00,-7.455553050442052e+00,5.627000694471334e+00,8.840543983679010e-01,-6.914762357927253e-01,5.230910789851285e-01,-4.080469140175417e-01,3.248683394363239e-01,-2.595810722228896e-01,2.028035713835388e-01,-1.453762194219170e-01,5.761203316163419e-02}, +{ 9.784057851972519e-03,-1.514472911579804e-02,-2.918406726798823e-02,3.176262966085392e-01,-5.894265472920352e+00,6.347873815278785e+00,-1.037906170940596e+00,4.745789662189988e-01,-2.897098276615790e-01,2.006623902260917e-01,-1.473322239297906e-01,1.093086899486520e-01,-7.610076351072220e-02,2.980903921378657e-02}, +{ -1.789020750841004e-01,4.636732206961167e-01,-6.931482653526698e-01,1.009918150502737e+00,-1.573654359426194e+00,-3.768876346754815e+00,6.057646109397047e+00,-2.024808078451079e+00,1.176886453450567e+00,-8.081203767065126e-01,5.935790358866589e-01,-4.413362617791217e-01,3.078191350832180e-01,-1.206763414618532e-01}, +{ 2.251069042228130e-01,-5.802087832932058e-01,8.550008736214104e-01,-1.213422792086943e+00,1.836517221504718e+00,-3.430390208147021e+00,-1.295654913369928e+00,5.005891547299476e+00,-2.236534460177615e+00,1.407801022101244e+00,-9.927580132438117e-01,7.223910592323746e-01,-4.982221484381926e-01,1.944826907746825e-01}, +{ -1.944826907746834e-01,4.982221484381936e-01,-7.223910592323770e-01,9.927580132438163e-01,-1.407801022101244e+00,2.236534460177608e+00,-5.005891547299465e+00,1.295654913369912e+00,3.430390208147034e+00,-1.836517221504721e+00,1.213422792086941e+00,-8.550008736214093e-01,5.802087832932066e-01,-2.251069042228126e-01}, +{ 1.206763414618542e-01,-3.078191350832206e-01,4.413362617791255e-01,-5.935790358866649e-01,8.081203767065180e-01,-1.176886453450572e+00,2.024808078451085e+00,-6.057646109397045e+00,3.768876346754793e+00,1.573654359426214e+00,-1.009918150502745e+00,6.931482653526772e-01,-4.636732206961200e-01,1.789020750841008e-01}, +{ -2.980903921378700e-02,7.610076351072391e-02,-1.093086899486556e-01,1.473322239297944e-01,-2.006623902260952e-01,2.897098276615855e-01,-4.745789662190065e-01,1.037906170940610e+00,-6.347873815278798e+00,5.894265472920343e+00,-3.176262966085198e-01,2.918406726797833e-02,1.514472911580333e-02,-9.784057851975366e-03}, +{ -5.761203316163477e-02,1.453762194219177e-01,-2.028035713835412e-01,2.595810722228923e-01,-3.248683394363246e-01,4.080469140175426e-01,-5.230910789851295e-01,6.914762357927248e-01,-8.840543983678999e-01,-5.627000694471325e+00,7.455553050442042e+00,-1.987906073485858e+00,9.999494410832223e-01,-3.526467436886291e-01}, +{ 1.260222412228733e-01,-3.179298035112541e-01,4.432940638718397e-01,-5.669762163976790e-01,7.090113491340990e-01,-8.903891012487839e-01,1.144884936998778e+00,-1.541812879667729e+00,2.260821120353157e+00,-3.961033160648560e+00,-3.561951571982510e+00,8.229114744439718e+00,-3.103704699523319e+00,1.030648976959372e+00}, +{ -1.644705130807109e-01,4.145313931843488e-01,-5.766212040386629e-01,7.344895082486317e-01,-9.125825370269043e-01,1.134649655714881e+00,-1.436067859106087e+00,1.883268906362030e+00,-2.628162461609289e+00,4.125136692082601e+00,-8.687514609174233e+00,5.269154302617268e-01,7.830471834499293e+00,-2.244044236317627e+00}, +{ 1.658809048875227e-01,-4.177839848341337e-01,5.801076935823600e-01,-7.366579148523780e-01,9.108744749636893e-01,-1.124192119042634e+00,1.406546630451431e+00,-1.810114820123586e+00,2.442335031375678e+00,-3.572378160085488e+00,6.126371269395053e+00,-1.674182657076426e+01,8.878944589480113e+00,3.891892975566629e+00}, +{ -1.237043168250267e-01,3.114327767713143e-01,-4.320032101163034e-01,5.476445314870110e-01,-6.753539901215152e-01,8.301424481342515e-01,-1.032191041601145e+00,1.315166935199867e+00,-1.744339938022919e+00,2.468067683249448e+00,-3.909449939710502e+00,7.882537292993798e+00,-3.805924444152407e+01,3.262129521008580e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==17 +const dfloat c_DI[17][14] = { +{ -3.390017606173286e+01,4.035552673310009e+01,-9.457305912829897e+00,4.824841615971128e+00,-3.079301458508099e+00,2.187884714017563e+00,-1.654449966530131e+00,1.300796957641300e+00,-1.047370687413147e+00,8.527254041052066e-01,-6.918251406170203e-01,5.459173788001354e-01,-3.936294396700468e-01,1.563658636657622e-01}, +{ -6.021936600385533e+00,-6.018626240318198e+00,1.630533659894254e+01,-6.663635332395261e+00,3.999782528240824e+00,-2.768315823924072e+00,2.064884349697530e+00,-1.610533815713767e+00,1.290257718206847e+00,-1.047032526478146e+00,8.476310902729635e-01,-6.679321904019879e-01,4.812163159159804e-01,-1.910960716597227e-01}, +{ 2.670729925089802e+00,-1.038404183271905e+01,3.233892770760486e+00,6.651981850382406e+00,-3.541673851591694e+00,2.339676556811340e+00,-1.704946964251328e+00,1.312172755335839e+00,-1.042583094370825e+00,8.415377890985891e-01,-6.788863849503355e-01,5.337599510878567e-01,-3.840483980667724e-01,1.524289273836832e-01}, +{ -7.758190755690897e-01,2.502739338619949e+00,-9.375400860554253e+00,6.928326944795195e+00,1.264297267867647e+00,-9.403794958576208e-01,6.979574963724211e-01,-5.394936364004704e-01,4.290974515798540e-01,-3.463919599882240e-01,2.794028990682351e-01,-2.196368419187688e-01,1.580117724971331e-01,-6.271130051200827e-02}, +{ -8.060923821271357e-02,1.822042047463280e-01,-4.049080920498711e-02,-6.626124682259994e+00,7.644209584923759e+00,-1.566258785849043e+00,7.837567504819755e-01,-5.033878055061179e-01,3.610845923251853e-01,-2.737412801480053e-01,2.122183893508023e-01,-1.627374038425575e-01,1.154255667920837e-01,-4.554908359671474e-02}, +{ 3.660888557176722e-01,-9.751416210491777e-01,1.576961618518222e+00,-2.764151342949583e+00,-3.051695121609412e+00,6.453675696222886e+00,-2.515389904100172e+00,1.527279861533832e+00,-1.073769303312354e+00,8.068393159363048e-01,-6.227425428367398e-01,4.764333692896605e-01,-3.375217999700495e-01,1.331329186089110e-01}, +{ -3.471083160494637e-01,9.131681052441681e-01,-1.424670755597814e+00,2.296824743888200e+00,-4.894461259795984e+00,5.868939404065876e-01,4.185850332038703e+00,-2.142464897277909e+00,1.412602120244190e+00,-1.027852513271445e+00,7.790451233969913e-01,-5.897132507359469e-01,4.153352826455468e-01,-1.634486551358230e-01}, +{ 1.928743267964813e-01,-5.024144232113723e-01,7.619230820482964e-01,-1.148367943174458e+00,1.991822489303372e+00,-5.893888047234540e+00,3.602602859663322e+00,1.608008357481822e+00,-1.035834366566459e+00,7.360251240706588e-01,-5.495893381932857e-01,4.122720118664072e-01,-2.888972383972677e-01,1.134631055470238e-01}, +{ -1.611328125000038e-02,4.312512707585794e-02,-6.969293589827169e-02,1.157432968296424e-01,-2.237658131830493e-01,6.125800096199163e-01,-5.466238122747798e+00,5.466238122747801e+00,-6.125800096199207e-01,2.237658131830501e-01,-1.157432968296416e-01,6.969293589827091e-02,-4.312512707585725e-02,1.611328124999949e-02}, +{ -1.134631055470231e-01,2.888972383972683e-01,-4.122720118664080e-01,5.495893381932873e-01,-7.360251240706585e-01,1.035834366566458e+00,-1.608008357481828e+00,-3.602602859663318e+00,5.893888047234543e+00,-1.991822489303372e+00,1.148367943174455e+00,-7.619230820482957e-01,5.024144232113730e-01,-1.928743267964809e-01}, +{ 1.634486551358255e-01,-4.153352826455466e-01,5.897132507359474e-01,-7.790451233969951e-01,1.027852513271447e+00,-1.412602120244189e+00,2.142464897277912e+00,-4.185850332038712e+00,-5.868939404065787e-01,4.894461259795976e+00,-2.296824743888191e+00,1.424670755597811e+00,-9.131681052441660e-01,3.471083160494616e-01}, +{ -1.331329186089127e-01,3.375217999700511e-01,-4.764333692896640e-01,6.227425428367445e-01,-8.068393159363074e-01,1.073769303312357e+00,-1.527279861533836e+00,2.515389904100180e+00,-6.453675696222883e+00,3.051695121609386e+00,2.764151342949608e+00,-1.576961618518236e+00,9.751416210491857e-01,-3.660888557176740e-01}, +{ 4.554908359671477e-02,-1.154255667920861e-01,1.627374038425596e-01,-2.122183893508068e-01,2.737412801480121e-01,-3.610845923251924e-01,5.033878055061266e-01,-7.837567504819916e-01,1.566258785849070e+00,-7.644209584923765e+00,6.626124682259969e+00,4.049080920500959e-02,-1.822042047463398e-01,8.060923821271888e-02}, +{ 6.271130051200702e-02,-1.580117724971359e-01,2.196368419187754e-01,-2.794028990682427e-01,3.463919599882272e-01,-4.290974515798572e-01,5.394936364004788e-01,-6.979574963724318e-01,9.403794958576345e-01,-1.264297267867672e+00,-6.928326944795162e+00,9.375400860554251e+00,-2.502739338619966e+00,7.758190755690944e-01}, +{ -1.524289273836827e-01,3.840483980667717e-01,-5.337599510878568e-01,6.788863849503365e-01,-8.415377890985865e-01,1.042583094370822e+00,-1.312172755335836e+00,1.704946964251326e+00,-2.339676556811332e+00,3.541673851591668e+00,-6.651981850382334e+00,-3.233892770760606e+00,1.038404183271911e+01,-2.670729925089799e+00}, +{ 1.910960716597252e-01,-4.812163159159779e-01,6.679321904019826e-01,-8.476310902729582e-01,1.047032526478148e+00,-1.290257718206848e+00,1.610533815713764e+00,-2.064884349697527e+00,2.768315823924069e+00,-3.999782528240811e+00,6.663635332395232e+00,-1.630533659894256e+01,6.018626240318286e+00,6.021936600385475e+00}, +{ -1.563658636657692e-01,3.936294396700156e-01,-5.459173788000945e-01,6.918251406169672e-01,-8.527254041051417e-01,1.047370687413059e+00,-1.300796957641185e+00,1.654449966529997e+00,-2.187884714017404e+00,3.079301458507865e+00,-4.824841615970778e+00,9.457305912829332e+00,-4.035552673309920e+01,3.390017606173232e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==18 +const dfloat c_DI[18][14] = { +{ -3.500508493783860e+01,4.234682952652953e+01,-1.083560204205651e+01,5.634243511997662e+00,-3.621860487575974e+00,2.582328573279177e+00,-1.956484487511529e+00,1.540062423800637e+00,-1.240947574900007e+00,1.010826551639488e+00,-8.203639389592129e-01,6.474847676047945e-01,-4.669219307431476e-01,1.854900447336955e-01}, +{ -8.161787903004004e+00,-2.943823269856004e+00,1.542611916845340e+01,-6.815626272824795e+00,4.179011953193515e+00,-2.918897512258094e+00,2.187646795981969e+00,-1.711081681085908e+00,1.373231376330967e+00,-1.115650150005624e+00,9.038687433153446e-01,-7.125965580803423e-01,5.135418023189823e-01,-2.039564924794099e-01}, +{ 2.759702838527704e+00,-1.227587985317724e+01,6.688599092914443e+00,4.392930638661652e+00,-2.595131588483938e+00,1.774408995441297e+00,-1.314184662886787e+00,1.020588409556298e+00,-8.153649037682725e-01,6.604493135845501e-01,-5.340189758559951e-01,4.204746773440227e-01,-3.027942989014729e-01,1.202203170437379e-01}, +{ -2.540964337231706e-01,1.044168475662799e+00,-9.183634472478744e+00,9.176147782776768e+00,-1.044318321191027e+00,3.930187781799227e-01,-2.115581702980318e-01,1.351979752007467e-01,-9.512365665879739e-02,7.074051902541850e-02,-5.399311064450412e-02,4.093979021092861e-02,-2.883420922158375e-02,1.134505315927401e-02}, +{ -5.525165856396566e-01,1.526327595057671e+00,-2.748045018444876e+00,-4.215662011204978e+00,7.842745340795160e+00,-2.878652887122901e+00,1.714582021331960e+00,-1.195461211718638e+00,8.978486460772825e-01,-7.000135177124691e-01,5.523921165346106e-01,-4.283244193859209e-01,3.057394633373165e-01,-1.209595319045623e-01}, +{ 5.447097687802984e-01,-1.480897231266432e+00,2.557962474933455e+00,-5.589083970345737e+00,8.020316016566288e-01,4.627739343704898e+00,-2.381367777747684e+00,1.573266102713081e+00,-1.150366415563324e+00,8.833045813564534e-01,-6.906054871309800e-01,5.324784209269435e-01,-3.788751856832504e-01,1.497037736656488e-01}, +{ -2.395417567589584e-01,6.411927021499693e-01,-1.052096191903297e+00,1.922665833252864e+00,-6.436323623446624e+00,4.477115121318545e+00,1.164041432617025e+00,-8.194853139299758e-01,6.015650808146915e-01,-4.609473141926551e-01,3.595255624945163e-01,-2.767022250344600e-01,1.966594526581849e-01,-7.766876003982591e-02}, +{ -5.953488251625460e-02,1.511301180831144e-01,-2.124821485426914e-01,2.639491678699653e-01,-1.971108907516179e-01,-5.035870766691354e+00,5.993211146207161e+00,-1.323133892272746e+00,6.789651433736075e-01,-4.396087379899646e-01,3.126207480579285e-01,-2.281380601048676e-01,1.575029389367884e-01,-6.149988365906845e-02}, +{ 2.086020088187208e-01,-5.370686859066468e-01,7.890582313345926e-01,-1.112953111540918e+00,1.662067979993927e+00,-2.983793320398936e+00,-1.895976097212885e+00,5.281275585531956e+00,-2.233201118035799e+00,1.382904982574769e+00,-9.679937060766426e-01,7.016495873970937e-01,-4.829487312587331e-01,1.883763947795007e-01}, +{ -1.883763947794994e-01,4.829487312587326e-01,-7.016495873970938e-01,9.679937060766454e-01,-1.382904982574768e+00,2.233201118035794e+00,-5.281275585531948e+00,1.895976097212871e+00,2.983793320398950e+00,-1.662067979993931e+00,1.112953111540916e+00,-7.890582313345926e-01,5.370686859066474e-01,-2.086020088187202e-01}, +{ 6.149988365906987e-02,-1.575029389367870e-01,2.281380601048662e-01,-3.126207480579277e-01,4.396087379899623e-01,-6.789651433736006e-01,1.323133892272732e+00,-5.993211146207159e+00,5.035870766691369e+00,1.971108907516019e-01,-2.639491678699560e-01,2.124821485426864e-01,-1.511301180831097e-01,5.953488251625207e-02}, +{ 7.766876003982687e-02,-1.966594526581857e-01,2.767022250344605e-01,-3.595255624945189e-01,4.609473141926569e-01,-6.015650808146910e-01,8.194853139299773e-01,-1.164041432617029e+00,-4.477115121318546e+00,6.436323623446624e+00,-1.922665833252859e+00,1.052096191903295e+00,-6.411927021499687e-01,2.395417567589577e-01}, +{ -1.497037736656496e-01,3.788751856832500e-01,-5.324784209269443e-01,6.906054871309826e-01,-8.833045813564535e-01,1.150366415563322e+00,-1.573266102713079e+00,2.381367777747682e+00,-4.627739343704890e+00,-8.020316016566400e-01,5.589083970345740e+00,-2.557962474933457e+00,1.480897231266433e+00,-5.447097687802968e-01}, +{ 1.209595319045630e-01,-3.057394633373177e-01,4.283244193859229e-01,-5.523921165346133e-01,7.000135177124698e-01,-8.978486460772819e-01,1.195461211718637e+00,-1.714582021331967e+00,2.878652887122913e+00,-7.842745340795157e+00,4.215662011204971e+00,2.748045018444879e+00,-1.526327595057674e+00,5.525165856396564e-01}, +{ -1.134505315927173e-02,2.883420922157924e-02,-4.093979021091917e-02,5.399311064449056e-02,-7.074051902540823e-02,9.512365665878582e-02,-1.351979752007284e-01,2.115581702980089e-01,-3.930187781798885e-01,1.044318321190965e+00,-9.176147782776710e+00,9.183634472478763e+00,-1.044168475662853e+00,2.540964337231858e-01}, +{ -1.202203170437399e-01,3.027942989014755e-01,-4.204746773440244e-01,5.340189758559946e-01,-6.604493135845432e-01,8.153649037682630e-01,-1.020588409556289e+00,1.314184662886778e+00,-1.774408995441281e+00,2.595131588483909e+00,-4.392930638661602e+00,-6.688599092914517e+00,1.227587985317727e+01,-2.759702838527695e+00}, +{ 2.039564924794135e-01,-5.135418023189822e-01,7.125965580803392e-01,-9.038687433153439e-01,1.115650150005628e+00,-1.373231376330970e+00,1.711081681085908e+00,-2.187646795981971e+00,2.918897512258101e+00,-4.179011953193518e+00,6.815626272824776e+00,-1.542611916845337e+01,2.943823269855930e+00,8.161787903004058e+00}, +{ -1.854900447336973e-01,4.669219307431522e-01,-6.474847676048078e-01,8.203639389592309e-01,-1.010826551639514e+00,1.240947574900039e+00,-1.540062423800675e+00,1.956484487511579e+00,-2.582328573279248e+00,3.621860487576043e+00,-5.634243511997754e+00,1.083560204205672e+01,-4.234682952652977e+01,3.500508493783870e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==19 +const dfloat c_DI[19][14] = { +{ -3.596473655366272e+01,4.408167607653118e+01,-1.204552470896764e+01,6.350676980813566e+00,-4.103625971431576e+00,2.933110605696839e+00,-2.225313436428262e+00,1.753132109868828e+00,-1.413387332200802e+00,1.151694749541025e+00,-9.349085783893863e-01,7.380028463399591e-01,-5.322446441698432e-01,2.114478564588423e-01}, +{ -1.025908929790819e+01,2.120558482849935e-01,1.424906808008273e+01,-6.680048414005961e+00,4.165431503446109e+00,-2.930772822411575e+00,2.205031520166183e+00,-1.728597142294402e+00,1.389271885164788e+00,-1.129737004546896e+00,9.158469936577281e-01,-7.223273260947091e-01,5.206752029177606e-01,-2.068090264585530e-01}, +{ 2.521184110679350e+00,-1.346572680723147e+01,9.657303742218385e+00,2.166943043298838e+00,-1.495321316350280e+00,1.073297501258705e+00,-8.130342716624822e-01,6.392922597998029e-01,-5.146025121110082e-01,4.188402305046163e-01,-3.397232548419677e-01,2.680251190666368e-01,-1.932350744548810e-01,7.675722982574962e-02}, +{ 4.131450569601924e-01,-9.899007379159607e-01,-7.888158849795491e+00,1.028765743603673e+01,-2.723833954969578e+00,1.474043555424574e+00,-9.818946065323003e-01,7.196994250880011e-01,-5.553034016707129e-01,4.399451202913784e-01,-3.506474155374070e-01,2.735759494877588e-01,-1.959671990325076e-01,7.763962216532035e-02}, +{ -9.014734273185148e-01,2.583173309590787e+00,-5.432261217746321e+00,-1.001934994265698e+00,6.700386391759360e+00,-3.121313833762532e+00,1.991964881424058e+00,-1.433251252339352e+00,1.095169192770068e+00,-8.627644055759732e-01,6.852667638562553e-01,-5.335131356665134e-01,3.817063639291476e-01,-1.511546366547710e-01}, +{ 4.674598269994930e-01,-1.301593543422032e+00,2.435110925333559e+00,-7.213462196786020e+00,4.323592096775355e+00,2.067172271267869e+00,-1.313119564225765e+00,9.322610418413858e-01,-7.058807676359224e-01,5.527727237184731e-01,-4.373370792601366e-01,3.396404129070017e-01,-2.426481548990069e-01,9.603200738574706e-02}, +{ 5.257491152254651e-02,-1.290282214430017e-01,1.542587841970147e-01,2.577452586054698e-03,-5.676100916598496e+00,6.490840201685688e+00,-1.294909286927094e+00,6.418740883982983e-01,-4.097664423484754e-01,2.916908371700189e-01,-2.179220357730384e-01,1.634555519326537e-01,-1.145140081198905e-01,4.496908371772095e-02}, +{ -2.928588998115982e-01,7.652298941676680e-01,-1.170802436034113e+00,1.801494633038130e+00,-3.311193080263925e+00,-1.747461514461591e+00,5.436353503623951e+00,-2.351632165028309e+00,1.471162113045951e+00,-1.044503655132969e+00,7.811307201555441e-01,-5.867644844722586e-01,4.115318027353325e-01,-1.616864315618118e-01}, +{ 2.268506826137280e-01,-5.891560857895388e-01,8.862246487745520e-01,-1.312925624395461e+00,2.188470480808900e+00,-5.612638668267095e+00,2.635089701727232e+00,2.440828193572023e+00,-1.436035425330162e+00,9.891717372578596e-01,-7.277106508444219e-01,5.415568271847736e-01,-3.779095676310851e-01,1.481837503186960e-01}, +{ -1.611328125000012e-02,4.312512707585920e-02,-6.969293589827406e-02,1.157432968296452e-01,-2.237658131830526e-01,6.125800096199225e-01,-5.466238122747803e+00,5.466238122747799e+00,-6.125800096199162e-01,2.237658131830478e-01,-1.157432968296409e-01,6.969293589827089e-02,-4.312512707585725e-02,1.611328124999923e-02}, +{ -1.481837503186959e-01,3.779095676310837e-01,-5.415568271847717e-01,7.277106508444222e-01,-9.891717372578563e-01,1.436035425330153e+00,-2.440828193572013e+00,-2.635089701727243e+00,5.612638668267103e+00,-2.188470480808900e+00,1.312925624395456e+00,-8.862246487745481e-01,5.891560857895377e-01,-2.268506826137263e-01}, +{ 1.616864315618124e-01,-4.115318027353321e-01,5.867644844722582e-01,-7.811307201555464e-01,1.044503655132970e+00,-1.471162113045948e+00,2.351632165028306e+00,-5.436353503623956e+00,1.747461514461600e+00,3.311193080263914e+00,-1.801494633038118e+00,1.170802436034109e+00,-7.652298941676650e-01,2.928588998115963e-01}, +{ -4.496908371772097e-02,1.145140081198907e-01,-1.634555519326545e-01,2.179220357730395e-01,-2.916908371700193e-01,4.097664423484756e-01,-6.418740883982982e-01,1.294909286927098e+00,-6.490840201685696e+00,5.676100916598496e+00,-2.577452586050190e-03,-1.542587841970177e-01,1.290282214430039e-01,-5.257491152254594e-02}, +{ -9.603200738574763e-02,2.426481548990083e-01,-3.396404129070043e-01,4.373370792601433e-01,-5.527727237184776e-01,7.058807676359241e-01,-9.322610418413906e-01,1.313119564225773e+00,-2.067172271267883e+00,-4.323592096775330e+00,7.213462196786003e+00,-2.435110925333563e+00,1.301593543422037e+00,-4.674598269994939e-01}, +{ 1.511546366547717e-01,-3.817063639291489e-01,5.335131356665170e-01,-6.852667638562598e-01,8.627644055759742e-01,-1.095169192770068e+00,1.433251252339354e+00,-1.991964881424063e+00,3.121313833762542e+00,-6.700386391759353e+00,1.001934994265674e+00,5.432261217746341e+00,-2.583173309590797e+00,9.014734273185153e-01}, +{ -7.763962216532089e-02,1.959671990325043e-01,-2.735759494877538e-01,3.506474155374020e-01,-4.399451202913733e-01,5.553034016707046e-01,-7.196994250879886e-01,9.818946065322864e-01,-1.474043555424556e+00,2.723833954969545e+00,-1.028765743603670e+01,7.888158849795524e+00,9.899007379159037e-01,-4.131450569601736e-01}, +{ -7.675722982574973e-02,1.932350744548782e-01,-2.680251190666321e-01,3.397232548419624e-01,-4.188402305046067e-01,5.146025121109944e-01,-6.392922597997894e-01,8.130342716624666e-01,-1.073297501258678e+00,1.495321316350239e+00,-2.166943043298758e+00,-9.657303742218495e+00,1.346572680723149e+01,-2.521184110679322e+00}, +{ 2.068090264585560e-01,-5.206752029177609e-01,7.223273260947090e-01,-9.158469936577283e-01,1.129737004546898e+00,-1.389271885164787e+00,1.728597142294396e+00,-2.205031520166185e+00,2.930772822411578e+00,-4.165431503446094e+00,6.680048414005938e+00,-1.424906808008278e+01,-2.120558482848067e-01,1.025908929790807e+01}, +{ -2.114478564588325e-01,5.322446441698477e-01,-7.380028463399667e-01,9.349085783894052e-01,-1.151694749541058e+00,1.413387332200833e+00,-1.753132109868847e+00,2.225313436428292e+00,-2.933110605696901e+00,4.103625971431651e+00,-6.350676980813684e+00,1.204552470896787e+01,-4.408167607653138e+01,3.596473655366278e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==20 +const dfloat c_DI[20][14] = { +{ -3.680251402901809e+01,4.560010952336886e+01,-1.311121078135471e+01,6.986028007594845e+00,-4.531994506505180e+00,3.245410533758734e+00,-2.464819740292742e+00,1.943042203227685e+00,-1.567125810220583e+00,1.277308355510712e+00,-1.037061499967746e+00,8.187347504851306e-01,-5.905077838813924e-01,2.346007772944748e-01}, +{ -1.228035737767783e+01,3.356632476988764e+00,1.288363273066138e+01,-6.336485070760904e+00,4.007252631134454e+00,-2.836978832292072e+00,2.141470878066121e+00,-1.682020600733414e+00,1.353488759952012e+00,-1.101518217819711e+00,8.934427552084767e-01,-7.048971083399759e-01,5.082119292829095e-01,-2.018749536702150e-01}, +{ 1.991067505671131e+00,-1.398859518398500e+01,1.207006846342743e+01,1.263182582077498e-01,-3.840658502440912e-01,3.378717820356841e-01,-2.774024050266823e-01,2.273540765033710e-01,-1.874955755789948e-01,1.549302666236790e-01,-1.268905807996062e-01,1.007266677481163e-01,-7.287713915578564e-02,2.898971457299984e-02}, +{ 1.109801247553489e+00,-3.307285033448483e+00,-5.806793814099367e+00,1.042227209353198e+01,-3.739159577946645e+00,2.199814643512875e+00,-1.521098135250988e+00,1.137712098503267e+00,-8.885628979025915e-01,7.094251222325150e-01,-5.682653856363710e-01,4.447752070306673e-01,-3.191876173050061e-01,1.265520492246606e-01}, +{ -1.039891412242975e+00,3.083904874826149e+00,-7.589349872863716e+00,2.328773488528019e+00,4.802401660680818e+00,-2.600024785311379e+00,1.741593268067172e+00,-1.282094703947364e+00,9.922496201925980e-01,-7.877712178388649e-01,6.287708419803018e-01,-4.910278355830339e-01,3.519257578779005e-01,-1.394596843656263e-01}, +{ 1.808311811461634e-01,-5.374297658011157e-01,1.233931088460596e+00,-7.333652055731310e+00,6.748742078028966e+00,-3.124939727840458e-01,-2.633659707843271e-03,5.273317456210692e-02,-5.909696367614811e-02,5.486769074185210e-02,-4.754690825829299e-02,3.889069483277277e-02,-2.857825135155201e-02,1.143566953785115e-02}, +{ 3.554263173399678e-01,-9.459183255438388e-01,1.525482148191663e+00,-2.650293062815349e+00,-3.180712069136890e+00,6.490552672347523e+00,-2.493327221448517e+00,1.507659128559281e+00,-1.057990570093701e+00,7.941706689227531e-01,-6.125935450182999e-01,4.684982271111182e-01,-3.318326640640901e-01,1.308782956483799e-01}, +{ -3.392767444893551e-01,8.969924768890239e-01,-1.419663871871311e+00,2.370911055257157e+00,-5.704106161846627e+00,2.127183995468847e+00,3.134238316117199e+00,-1.762084664513376e+00,1.199290801671387e+00,-8.860713436406974e-01,6.772866823612109e-01,-5.151985108496067e-01,3.638267601618774e-01,-1.433287907157287e-01}, +{ 3.720161266653647e-02,-1.005592500799950e-01,1.678257371331885e-01,-3.035386198291836e-01,7.480479787288664e-01,-5.686123163257604e+00,5.534533744105720e+00,-5.069900779711679e-01,1.540746056350927e-01,-6.625085199284868e-02,3.369155478294579e-02,-1.885108129558487e-02,1.083069128607461e-02,-3.892879912040388e-03}, +{ 1.927243748593304e-01,-4.957230071149427e-01,7.264720992143093e-01,-1.019372788445156e+00,1.505355134920508e+00,-2.615263134815190e+00,-2.363101392003221e+00,5.468508292599075e+00,-2.200598084790272e+00,1.343273633987286e+00,-9.341882470338964e-01,6.748711314295985e-01,-4.637104212401031e-01,1.807524084326733e-01}, +{ -1.807524084326739e-01,4.637104212401029e-01,-6.748711314295998e-01,9.341882470338997e-01,-1.343273633987285e+00,2.200598084790266e+00,-5.468508292599075e+00,2.363101392003228e+00,2.615263134815186e+00,-1.505355134920505e+00,1.019372788445150e+00,-7.264720992143064e-01,4.957230071149417e-01,-1.927243748593308e-01}, +{ 3.892879912039265e-03,-1.083069128607622e-02,1.885108129558770e-02,-3.369155478294982e-02,6.625085199285351e-02,-1.540746056350985e-01,5.069900779711755e-01,-5.534533744105727e+00,5.686123163257602e+00,-7.480479787288580e-01,3.035386198291767e-01,-1.678257371331837e-01,1.005592500799928e-01,-3.720161266653568e-02}, +{ 1.433287907157306e-01,-3.638267601618781e-01,5.151985108496088e-01,-6.772866823612161e-01,8.860713436407011e-01,-1.199290801671388e+00,1.762084664513380e+00,-3.134238316117213e+00,-2.127183995468831e+00,5.704106161846616e+00,-2.370911055257148e+00,1.419663871871308e+00,-8.969924768890234e-01,3.392767444893556e-01}, +{ -1.308782956483800e-01,3.318326640640892e-01,-4.684982271111174e-01,6.125935450182994e-01,-7.941706689227516e-01,1.057990570093699e+00,-1.507659128559278e+00,2.493327221448518e+00,-6.490552672347538e+00,3.180712069136920e+00,2.650293062815326e+00,-1.525482148191656e+00,9.459183255438357e-01,-3.554263173399664e-01}, +{ -1.143566953785033e-02,2.857825135155043e-02,-3.889069483277149e-02,4.754690825829214e-02,-5.486769074184930e-02,5.909696367614287e-02,-5.273317456210130e-02,2.633659707833684e-03,3.124939727840643e-01,-6.748742078028973e+00,7.333652055731299e+00,-1.233931088460587e+00,5.374297658011111e-01,-1.808311811461612e-01}, +{ 1.394596843656259e-01,-3.519257578779008e-01,4.910278355830369e-01,-6.287708419803059e-01,7.877712178388642e-01,-9.922496201925947e-01,1.282094703947362e+00,-1.741593268067174e+00,2.600024785311382e+00,-4.802401660680809e+00,-2.328773488528032e+00,7.589349872863729e+00,-3.083904874826159e+00,1.039891412242975e+00}, +{ -1.265520492246592e-01,3.191876173050062e-01,-4.447752070306642e-01,5.682653856363664e-01,-7.094251222325151e-01,8.885628979025911e-01,-1.137712098503266e+00,1.521098135250988e+00,-2.199814643512871e+00,3.739159577946628e+00,-1.042227209353196e+01,5.806793814099381e+00,3.307285033448457e+00,-1.109801247553479e+00}, +{ -2.898971457299870e-02,7.287713915578686e-02,-1.007266677481185e-01,1.268905807996062e-01,-1.549302666236715e-01,1.874955755789880e-01,-2.273540765033732e-01,2.774024050266877e-01,-3.378717820356857e-01,3.840658502440938e-01,-1.263182582077515e-01,-1.207006846342745e+01,1.398859518398501e+01,-1.991067505671120e+00}, +{ 2.018749536702167e-01,-5.082119292829087e-01,7.048971083399732e-01,-8.934427552084737e-01,1.101518217819710e+00,-1.353488759952008e+00,1.682020600733401e+00,-2.141470878066108e+00,2.836978832292068e+00,-4.007252631134444e+00,6.336485070760855e+00,-1.288363273066129e+01,-3.356632476988865e+00,1.228035737767788e+01}, +{ -2.346007772944816e-01,5.905077838814404e-01,-8.187347504852029e-01,1.037061499967842e+00,-1.277308355510836e+00,1.567125810220740e+00,-1.943042203227887e+00,2.464819740293004e+00,-3.245410533759085e+00,4.531994506505625e+00,-6.986028007595487e+00,1.311121078135582e+01,-4.560010952337034e+01,3.680251402901885e+01} +}; +#endif +#if p_Nq==14 && p_cubNq==21 +const dfloat c_DI[21][14] = { +{ -3.753750962047845e+01,4.693516660019624e+01,-1.405318040445914e+01,7.550836208811377e+00,-4.913646250492942e+00,3.523950553750492e+00,-2.678562447700529e+00,2.112584895973656e+00,-1.704407935991361e+00,1.389493279553555e+00,-1.128303085088403e+00,8.908481290214302e-01,-6.425530547632263e-01,2.552831316673056e-01}, +{ -1.420566909252268e+01,6.428234055336849e+00,1.141046522862088e+01,-5.847457265324379e+00,3.744123966263201e+00,-2.665294772793785e+00,2.017762874168394e+00,-1.587595344196144e+00,1.278901733458305e+00,-1.041561262887422e+00,8.452115883756574e-01,-6.670478123543901e-01,4.810092484149138e-01,-1.910831445593998e-01}, +{ 1.216402045326183e+00,-1.391889317035777e+01,1.392590056455286e+01,-1.651022574486760e+00,6.514117623808792e-01,-3.656310540274880e-01,2.417720898933795e-01,-1.749534427355004e-01,1.334625886448933e-01,-1.048048275874480e-01,8.299253556308119e-02,-6.446400086789295e-02,4.605269894754116e-02,-1.822521524594815e-02}, +{ 1.743580652112341e+00,-5.659995688485677e+00,-3.250959929444257e+00,9.805930840428779e+00,-4.165159264266035e+00,2.567838738980110e+00,-1.812862436395613e+00,1.371227177389940e+00,-1.078151788748345e+00,8.644605725625357e-01,-6.943618824752679e-01,5.444217378720965e-01,-3.910936032067722e-01,1.551248736761629e-01}, +{ -9.489701977308296e-01,2.938043530128711e+00,-8.952167834082990e+00,5.299736966779276e+00,2.649041016110865e+00,-1.655676661395648e+00,1.164545488009263e+00,-8.777367097896704e-01,6.883948455846178e-01,-5.509946078470683e-01,4.420543848149218e-01,-3.463304718171334e-01,2.486791239873893e-01,-9.861887275170386e-02}, +{ -2.088512472966688e-01,5.413569444498466e-01,-7.183869752507184e-01,-6.113732496401554e+00,7.858149421174331e+00,-2.026242472617543e+00,1.092168623204419e+00,-7.275094656353503e-01,5.328416121303420e-01,-4.091864583066062e-01,3.198374144279486e-01,-2.465345947654966e-01,1.753812785029215e-01,-6.929158361587003e-02}, +{ 5.307641237828336e-01,-1.437723175487459e+00,2.453873839026159e+00,-5.140664672898484e+00,8.537152816569395e-02,5.052429106313030e+00,-2.498901487520277e+00,1.629751660953627e+00,-1.184374809067416e+00,9.063085443069470e-01,-7.071375362151543e-01,5.445473431493820e-01,-3.871918013710851e-01,1.529473368622009e-01}, +{ -1.835680019926375e-01,4.951048227091908e-01,-8.303368945361355e-01,1.600211929617613e+00,-6.457749120414012e+00,5.119395634634247e+00,5.170740534107146e-01,-4.635242691101317e-01,3.630505011900462e-01,-2.864267750871158e-01,2.269367410553857e-01,-1.762245345823231e-01,1.258556316876208e-01,-4.979971858246184e-02}, +{ -2.028562567384483e-01,5.267199648237189e-01,-7.915199896129486e-01,1.167555547775312e+00,-1.889807443482466e+00,-3.435389523705382e+00,5.998374857148514e+00,-2.126390119281808e+00,1.256460928262732e+00,-8.692223637939849e-01,6.410271488482205e-01,-4.777051048235134e-01,3.335985582025205e-01,-1.308462036224670e-01}, +{ 2.434915710432144e-01,-6.311255815268076e-01,9.442619454925507e-01,-1.383128251198947e+00,2.246247637572174e+00,-5.259939113723728e+00,1.779460675939188e+00,3.104824864055234e+00,-1.720157406294079e+00,1.160520358487723e+00,-8.453127271535934e-01,6.257294755893739e-01,-4.354268179311984e-01,1.705533696488957e-01}, +{ -1.611328124999942e-02,4.312512707586034e-02,-6.969293589827560e-02,1.157432968296475e-01,-2.237658131830561e-01,6.125800096199288e-01,-5.466238122747807e+00,5.466238122747794e+00,-6.125800096199084e-01,2.237658131830424e-01,-1.157432968296361e-01,6.969293589826703e-02,-4.312512707585459e-02,1.611328124999942e-02}, +{ -1.705533696488974e-01,4.354268179311991e-01,-6.257294755893751e-01,8.453127271535976e-01,-1.160520358487726e+00,1.720157406294078e+00,-3.104824864055240e+00,-1.779460675939179e+00,5.259939113723726e+00,-2.246247637572171e+00,1.383128251198943e+00,-9.442619454925489e-01,6.311255815268078e-01,-2.434915710432140e-01}, +{ 1.308462036224700e-01,-3.335985582025209e-01,4.777051048235144e-01,-6.410271488482232e-01,8.692223637939850e-01,-1.256460928262728e+00,2.126390119281802e+00,-5.998374857148515e+00,3.435389523705386e+00,1.889807443482462e+00,-1.167555547775307e+00,7.915199896129466e-01,-5.267199648237177e-01,2.028562567384475e-01}, +{ 4.979971858246279e-02,-1.258556316876208e-01,1.762245345823231e-01,-2.269367410553868e-01,2.864267750871157e-01,-3.630505011900437e-01,4.635242691101308e-01,-5.170740534107118e-01,-5.119395634634254e+00,6.457749120414009e+00,-1.600211929617604e+00,8.303368945361320e-01,-4.951048227091893e-01,1.835680019926361e-01}, +{ -1.529473368622026e-01,3.871918013710847e-01,-5.445473431493829e-01,7.071375362151571e-01,-9.063085443069472e-01,1.184374809067414e+00,-1.629751660953625e+00,2.498901487520274e+00,-5.052429106313023e+00,-8.537152816570293e-02,5.140664672898486e+00,-2.453873839026163e+00,1.437723175487461e+00,-5.307641237828323e-01}, +{ 6.929158361586987e-02,-1.753812785029242e-01,2.465345947654991e-01,-3.198374144279523e-01,4.091864583066112e-01,-5.328416121303470e-01,7.275094656353568e-01,-1.092168623204433e+00,2.026242472617566e+00,-7.858149421174334e+00,6.113732496401534e+00,7.183869752507357e-01,-5.413569444498568e-01,2.088512472966738e-01}, +{ 9.861887275170289e-02,-2.486791239873868e-01,3.463304718171318e-01,-4.420543848149210e-01,5.509946078470599e-01,-6.883948455846065e-01,8.777367097896625e-01,-1.164545488009254e+00,1.655676661395630e+00,-2.649041016110820e+00,-5.299736966779331e+00,8.952167834083015e+00,-2.938043530128708e+00,9.489701977308259e-01}, +{ -1.551248736761635e-01,3.910936032067734e-01,-5.444217378720972e-01,6.943618824752692e-01,-8.644605725625377e-01,1.078151788748344e+00,-1.371227177389939e+00,1.812862436395617e+00,-2.567838738980112e+00,4.165159264266023e+00,-9.805930840428768e+00,3.250959929444269e+00,5.659995688485650e+00,-1.743580652112329e+00}, +{ 1.822521524594500e-02,-4.605269894753564e-02,6.446400086788774e-02,-8.299253556307838e-02,1.048048275874525e-01,-1.334625886449015e-01,1.749534427355057e-01,-2.417720898933823e-01,3.656310540274919e-01,-6.514117623808801e-01,1.651022574486755e+00,-1.392590056455287e+01,1.391889317035778e+01,-1.216402045326173e+00}, +{ 1.910831445594017e-01,-4.810092484149184e-01,6.670478123543949e-01,-8.452115883756633e-01,1.041561262887430e+00,-1.278901733458312e+00,1.587595344196147e+00,-2.017762874168396e+00,2.665294772793793e+00,-3.744123966263215e+00,5.847457265324389e+00,-1.141046522862091e+01,-6.428234055336768e+00,1.420566909252263e+01}, +{ -2.552831316672979e-01,6.425530547631969e-01,-8.908481290213923e-01,1.128303085088363e+00,-1.389493279553520e+00,1.704407935991314e+00,-2.112584895973575e+00,2.678562447700423e+00,-3.523950553750378e+00,4.913646250492774e+00,-7.550836208811130e+00,1.405318040445879e+01,-4.693516660019568e+01,3.753750962047810e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==15 +const dfloat c_DI[15][15] = { +{ -3.373914837117184e+01,3.697962786571060e+01,-4.445444223187027e+00,1.868191302577720e+00,-1.098165391537377e+00,7.486627709921336e-01,-5.535155954334526e-01,4.300251522084111e-01,-3.447521893129591e-01,2.816167174369299e-01,-2.317506814668523e-01,1.894978849215767e-01,-1.503447954810074e-01,1.087655181769140e-01,-4.326596443377759e-02}, +{ 1.329308777000537e-01,-1.523396141843973e+01,1.770483652466537e+01,-3.794247153163127e+00,1.916710543081449e+00,-1.232003525823835e+00,8.847603559057404e-01,-6.761345248729995e-01,5.365743071226871e-01,-4.354179891061508e-01,3.567325157400258e-01,-2.908239230893252e-01,2.302867409757067e-01,-1.664081917608205e-01,6.616486106495523e-02}, +{ 6.038569846966917e-01,-1.551871365291377e+00,-8.690613596370042e+00,1.186900514133666e+01,-3.358017801626527e+00,1.852552000977332e+00,-1.246089347985637e+00,9.199091424195167e-01,-7.152284506045996e-01,5.728926221076306e-01,-4.653573447249626e-01,3.772215000565064e-01,-2.975986109586993e-01,2.145840338122873e-01,-8.524490784477717e-02}, +{ -5.127493000838265e-01,1.400020316883520e+00,-2.398351967085392e+00,-5.525210742084300e+00,9.017129010451196e+00,-3.048164926403111e+00,1.776146468072241e+00,-1.228344913657053e+00,9.214334494972631e-01,-7.221966498609335e-01,5.785276790576420e-01,-4.647142449045281e-01,3.644978162648061e-01,-2.619352871447840e-01,1.039132909972585e-01}, +{ 4.003858733755731e-01,-1.065483600354056e+00,1.718037593498430e+00,-2.984612937336691e+00,-3.574300609126896e+00,7.300365916717425e+00,-2.807036774221549e+00,1.700457357751480e+00,-1.197948394217519e+00,9.064042960160161e-01,-7.106072208881616e-01,5.630585696003780e-01,-4.378590676017498e-01,3.131062814662697e-01,-1.239672846789491e-01}, +{ -3.160725107177700e-01,8.252270366011828e-01,-1.259846665632606e+00,1.929192986966071e+00,-3.497651246883281e+00,-2.170197910275026e+00,6.133342826746655e+00,-2.606089282093927e+00,1.626243006076415e+00,-1.159678528161523e+00,8.791186449121173e-01,-6.825232856858309e-01,5.241925953506122e-01,-3.722143423652446e-01,1.469566751621554e-01}, +{ 2.548707797039755e-01,-6.571991140297524e-01,9.696325721714262e-01,-1.379956614137462e+00,2.102773815924315e+00,-4.015323019032570e+00,-1.030017757264877e+00,5.270676125990836e+00,-2.429107234817705e+00,1.551346619855165e+00,-1.113131881614637e+00,8.377605481893058e-01,-6.318051466309470e-01,4.441419407073531e-01,-1.746616350144272e-01}, +{ -2.094726562499999e-01,5.356855072902387e-01,-7.732388425498719e-01,1.053799509049800e+00,-1.471725451746940e+00,2.264813641904639e+00,-4.588914964624974e+00,1.383309711983865e-14,4.588914964624958e+00,-2.264813641904632e+00,1.471725451746936e+00,-1.053799509049796e+00,7.732388425498701e-01,-5.356855072902391e-01,2.094726562499976e-01}, +{ 1.746616350144268e-01,-4.441419407073525e-01,6.318051466309483e-01,-8.377605481893088e-01,1.113131881614641e+00,-1.551346619855167e+00,2.429107234817700e+00,-5.270676125990815e+00,1.030017757264841e+00,4.015323019032591e+00,-2.102773815924319e+00,1.379956614137462e+00,-9.696325721714264e-01,6.571991140297543e-01,-2.548707797039769e-01}, +{ -1.469566751621560e-01,3.722143423652460e-01,-5.241925953506157e-01,6.825232856858359e-01,-8.791186449121240e-01,1.159678528161528e+00,-1.626243006076419e+00,2.606089282093932e+00,-6.133342826746652e+00,2.170197910275016e+00,3.497651246883289e+00,-1.929192986966074e+00,1.259846665632609e+00,-8.252270366011861e-01,3.160725107177720e-01}, +{ 1.239672846789507e-01,-3.131062814662711e-01,4.378590676017520e-01,-5.630585696003804e-01,7.106072208881633e-01,-9.064042960160181e-01,1.197948394217522e+00,-1.700457357751485e+00,2.807036774221556e+00,-7.300365916717423e+00,3.574300609126884e+00,2.984612937336694e+00,-1.718037593498432e+00,1.065483600354061e+00,-4.003858733755735e-01}, +{ -1.039132909972600e-01,2.619352871447841e-01,-3.644978162648047e-01,4.647142449045272e-01,-5.785276790576414e-01,7.221966498609285e-01,-9.214334494972578e-01,1.228344913657051e+00,-1.776146468072237e+00,3.048164926403102e+00,-9.017129010451207e+00,5.525210742084340e+00,2.398351967085363e+00,-1.400020316883510e+00,5.127493000838212e-01}, +{ 8.524490784477534e-02,-2.145840338122874e-01,2.975986109586977e-01,-3.772215000565053e-01,4.653573447249630e-01,-5.728926221076269e-01,7.152284506045941e-01,-9.199091424195144e-01,1.246089347985632e+00,-1.852552000977322e+00,3.358017801626515e+00,-1.186900514133663e+01,8.690613596370044e+00,1.551871365291346e+00,-6.038569846966806e-01}, +{ -6.616486106495192e-02,1.664081917608273e-01,-2.302867409757177e-01,2.908239230893283e-01,-3.567325157400221e-01,4.354179891061474e-01,-5.365743071226845e-01,6.761345248729973e-01,-8.847603559057434e-01,1.232003525823833e+00,-1.916710543081441e+00,3.794247153163106e+00,-1.770483652466539e+01,1.523396141843975e+01,-1.329308777000382e-01}, +{ 4.326596443375763e-02,-1.087655181769094e-01,1.503447954809953e-01,-1.894978849215635e-01,2.317506814668304e-01,-2.816167174368989e-01,3.447521893129197e-01,-4.300251522083570e-01,5.535155954333933e-01,-7.486627709920561e-01,1.098165391537262e+00,-1.868191302577537e+00,4.445444223186738e+00,-3.697962786571011e+01,3.373914837117152e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==16 +const dfloat c_DI[16][15] = { +{ -3.564164224962857e+01,4.035690695849897e+01,-6.696991206600313e+00,3.136823372779190e+00,-1.935734308942559e+00,1.353558400697338e+00,-1.015545226727812e+00,7.962314167051158e-01,-6.421860755860836e-01,5.267216677851119e-01,-4.346730741117832e-01,3.561084012140942e-01,-2.828907920004301e-01,2.048091348937113e-01,-8.149641897599746e-02}, +{ -1.814040148634128e+00,-1.338903613847404e+01,1.893909765778627e+01,-5.637875456284835e+00,3.119189036537658e+00,-2.086843772438767e+00,1.530992873816918e+00,-1.184927519144617e+00,9.479810683957076e-01,-7.734140474809239e-01,6.359710955660532e-01,-5.197627696244408e-01,4.122433876998666e-01,-2.981801759429712e-01,1.186049082222503e-01}, +{ 1.598666421064509e+00,-4.961273629342763e+00,-5.436109158150760e+00,1.172499879945844e+01,-4.570516088405785e+00,2.752102216394407e+00,-1.924691813728515e+00,1.451205457742756e+00,-1.142784720049171e+00,9.228937394806452e-01,-7.537576271850546e-01,6.132341481371265e-01,-4.849437207672459e-01,3.501569986821071e-01,-1.391810233306945e-01}, +{ -1.012005260040474e+00,2.895056014267569e+00,-6.044624976717674e+00,-1.328274021555487e+00,7.716671531391072e+00,-3.564800986783162e+00,2.271420521807109e+00,-1.636186523431570e+00,1.255242660169896e+00,-9.973744109875985e-01,8.060143902020959e-01,-6.511823766881709e-01,5.126395356361694e-01,-3.691833500529054e-01,1.465872527831310e-01}, +{ 6.189978617968895e-01,-1.685861001458806e+00,2.929393944047868e+00,-6.541596671693960e+00,1.302565810905172e+00,4.973993951600765e+00,-2.610369853030312e+00,1.738906979437566e+00,-1.280725893527873e+00,9.933874637692126e-01,-7.906786364307283e-01,6.325395206504461e-01,-4.948578731417041e-01,3.550893912114879e-01,-1.407849941360230e-01}, +{ -3.599640181074721e-01,9.541818022342574e-01,-1.521880633598565e+00,2.591704452321129e+00,-6.694399576535974e+00,3.158801420666897e+00,2.896389642294777e+00,-1.708908661682804e+00,1.186520703488809e+00,-8.904573465208021e-01,6.946651025719617e-01,-5.487210444417838e-01,4.258847323033709e-01,-3.042068544476708e-01,1.203902794538680e-01}, +{ 1.837515728480397e-01,-4.796645446692009e-01,7.316763554578738e-01,-1.116674693328844e+00,1.994494123818027e+00,-6.567041304583046e+00,4.515582792710643e+00,1.244571702304160e+00,-8.701101280790505e-01,6.387436797471878e-01,-4.900912175798087e-01,3.828311297803191e-01,-2.950115653893912e-01,2.098524498380298e-01,-8.291035287493798e-02}, +{ -6.116337992401100e-02,1.587806448792821e-01,-2.383141313922184e-01,3.498624225183599e-01,-5.616438351864400e-01,1.162183003236685e+00,-6.172249780437830e+00,5.499011637600598e+00,-9.061721797680958e-02,-1.077290730848238e-01,1.183691618503223e-01,-1.037491237214349e-01,8.412635661003555e-02,-6.129915789464577e-02,2.443247292293194e-02}, +{ -2.443247292293194e-02,6.129915789464269e-02,-8.412635661003234e-02,1.037491237214307e-01,-1.183691618503146e-01,1.077290730848146e-01,9.061721797682408e-02,-5.499011637600612e+00,6.172249780437829e+00,-1.162183003236673e+00,5.616438351864311e-01,-3.498624225183535e-01,2.383141313922142e-01,-1.587806448792800e-01,6.116337992401100e-02}, +{ 8.291035287493587e-02,-2.098524498380256e-01,2.950115653893856e-01,-3.828311297803131e-01,4.900912175798013e-01,-6.387436797471771e-01,8.701101280790321e-01,-1.244571702304128e+00,-4.515582792710674e+00,6.567041304583046e+00,-1.994494123818008e+00,1.116674693328831e+00,-7.316763554578657e-01,4.796645446691972e-01,-1.837515728480367e-01}, +{ -1.203902794538684e-01,3.042068544476688e-01,-4.258847323033697e-01,5.487210444417843e-01,-6.946651025719626e-01,8.904573465208028e-01,-1.186520703488805e+00,1.708908661682798e+00,-2.896389642294760e+00,-3.158801420666915e+00,6.694399576535980e+00,-2.591704452321125e+00,1.521880633598563e+00,-9.541818022342571e-01,3.599640181074665e-01}, +{ 1.407849941360232e-01,-3.550893912114885e-01,4.948578731417057e-01,-6.325395206504488e-01,7.906786364307318e-01,-9.933874637692162e-01,1.280725893527874e+00,-1.738906979437568e+00,2.610369853030320e+00,-4.973993951600773e+00,-1.302565810905167e+00,6.541596671693951e+00,-2.929393944047867e+00,1.685861001458809e+00,-6.189978617968871e-01}, +{ -1.465872527831305e-01,3.691833500529059e-01,-5.126395356361702e-01,6.511823766881744e-01,-8.060143902020995e-01,9.973744109876010e-01,-1.255242660169897e+00,1.636186523431573e+00,-2.271420521807114e+00,3.564800986783165e+00,-7.716671531391076e+00,1.328274021555503e+00,6.044624976717673e+00,-2.895056014267579e+00,1.012005260040472e+00}, +{ 1.391810233306940e-01,-3.501569986821064e-01,4.849437207672436e-01,-6.132341481371228e-01,7.537576271850523e-01,-9.228937394806387e-01,1.142784720049164e+00,-1.451205457742755e+00,1.924691813728509e+00,-2.752102216394394e+00,4.570516088405776e+00,-1.172499879945845e+01,5.436109158150840e+00,4.961273629342670e+00,-1.598666421064479e+00}, +{ -1.186049082222460e-01,2.981801759429694e-01,-4.122433876998706e-01,5.197627696244376e-01,-6.359710955660453e-01,7.734140474809132e-01,-9.479810683956968e-01,1.184927519144606e+00,-1.530992873816903e+00,2.086843772438736e+00,-3.119189036537614e+00,5.637875456284756e+00,-1.893909765778626e+01,1.338903613847416e+01,1.814040148634050e+00}, +{ 8.149641897599480e-02,-2.048091348936793e-01,2.828907920003854e-01,-3.561084012140427e-01,4.346730741117140e-01,-5.267216677850257e-01,6.421860755859931e-01,-7.962314167050094e-01,1.015545226727675e+00,-1.353558400697150e+00,1.935734308942298e+00,-3.136823372778783e+00,6.696991206599613e+00,-4.035690695849785e+01,3.564164224962787e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==17 +const dfloat c_DI[17][15] = { +{ -3.728493585280302e+01,4.329008535078473e+01,-8.679987716017127e+00,4.272070067027006e+00,-2.689683603880884e+00,1.899600891710261e+00,-1.433271330465201e+00,1.127634802535934e+00,-9.115164716928202e-01,7.487578432483609e-01,-6.185479637045597e-01,5.071085079776136e-01,-4.030330258255630e-01,2.918713442796896e-01,-1.161528431744183e-01}, +{ -3.976554135828163e+00,-1.085607962899225e+01,1.928983549823197e+01,-6.854574732548753e+00,3.968070750510990e+00,-2.706169126970223e+00,2.005328644253289e+00,-1.561198559169523e+00,1.253663556970871e+00,-1.025327353062922e+00,8.445254350912244e-01,-6.909907609190912e-01,5.484579057499942e-01,-3.968795601334070e-01,1.578920668159891e-01}, +{ 2.424508939838450e+00,-8.285121426370814e+00,-1.516579220498556e+00,1.037751719256671e+01,-4.789522220177540e+00,3.027053593112726e+00,-2.163543275617909e+00,1.650722345025955e+00,-1.309234615435320e+00,1.062196073537065e+00,-8.701927481986209e-01,7.094168053043225e-01,-5.617541462482688e-01,4.059365626271697e-01,-1.614038594653729e-01}, +{ -1.186738594937237e+00,3.536461225230264e+00,-8.909030103159596e+00,3.126197637644953e+00,5.163444171205481e+00,-2.845916517310545e+00,1.919882768423065e+00,-1.420929471340855e+00,1.106823876295710e+00,-8.877305664250951e-01,7.217728466102439e-01,-5.854543581932222e-01,4.620786748931007e-01,-3.332688596583225e-01,1.324072707220560e-01}, +{ 4.747706299295213e-01,-1.331629111471655e+00,2.553316202035000e+00,-8.335864344187684e+00,5.612885971999837e+00,1.708117727328039e+00,-1.164533482488182e+00,8.478466761434057e-01,-6.520586814923494e-01,5.183865772837769e-01,-4.189302611541531e-01,3.384115822522619e-01,-2.663761219469053e-01,1.918150314556887e-01,-7.615839568660193e-02}, +{ -7.643983824128581e-02,2.146832036961772e-01,-4.011846416346906e-01,9.670296940588339e-01,-6.958821578315725e+00,6.701691249864720e+00,-5.591901972468961e-01,1.520705569356898e-01,-5.578060572984419e-02,2.223862997765904e-02,-8.315851544482902e-03,2.137717626709947e-03,4.721195124779835e-04,-1.204964175256166e-03,6.145052159116232e-04}, +{ -1.289241763713440e-01,3.318999216466225e-01,-4.866849819090296e-01,6.770183413592367e-01,-9.036832686261292e-01,-4.983006490928615e+00,6.739695380327180e+00,-1.877146299011340e+00,1.038142953798771e+00,-7.002838890972127e-01,5.148537873569836e-01,-3.924806924824025e-01,2.981209521432521e-01,-2.103772063372748e-01,8.285566813130263e-02}, +{ 2.097158599945450e-01,-5.392817204976319e-01,7.898033150936130e-01,-1.107123588091743e+00,1.632564210690700e+00,-2.829093068764450e+00,-2.596672776607602e+00,5.964135185880953e+00,-2.398295421518193e+00,1.470693270346239e+00,-1.035675657607630e+00,7.715748980383026e-01,-5.785089467194648e-01,4.053876407472012e-01,-1.592232009848402e-01}, +{ -2.094726562499989e-01,5.356855072902387e-01,-7.732388425498720e-01,1.053799509049799e+00,-1.471725451746940e+00,2.264813641904638e+00,-4.588914964624965e+00,-2.294548110753321e-15,4.588914964624967e+00,-2.264813641904634e+00,1.471725451746936e+00,-1.053799509049796e+00,7.732388425498702e-01,-5.356855072902391e-01,2.094726562499986e-01}, +{ 1.592232009848402e-01,-4.053876407472007e-01,5.785089467194654e-01,-7.715748980383050e-01,1.035675657607633e+00,-1.470693270346241e+00,2.398295421518191e+00,-5.964135185880955e+00,2.596672776607607e+00,2.829093068764446e+00,-1.632564210690697e+00,1.107123588091739e+00,-7.898033150936098e-01,5.392817204976315e-01,-2.097158599945450e-01}, +{ -8.285566813130252e-02,2.103772063372746e-01,-2.981209521432512e-01,3.924806924824019e-01,-5.148537873569831e-01,7.002838890972125e-01,-1.038142953798772e+00,1.877146299011343e+00,-6.739695380327183e+00,4.983006490928619e+00,9.036832686261268e-01,-6.770183413592357e-01,4.866849819090288e-01,-3.318999216466223e-01,1.289241763713435e-01}, +{ -6.145052159097358e-04,1.204964175252343e-03,-4.721195124739452e-04,-2.137717626714953e-03,8.315851544489852e-03,-2.223862997766524e-02,5.578060572985461e-02,-1.520705569357072e-01,5.591901972469249e-01,-6.701691249864736e+00,6.958821578315709e+00,-9.670296940588116e-01,4.011846416346753e-01,-2.146832036961684e-01,7.643983824128299e-02}, +{ 7.615839568660243e-02,-1.918150314556867e-01,2.663761219469047e-01,-3.384115822522602e-01,4.189302611541498e-01,-5.183865772837752e-01,6.520586814923467e-01,-8.478466761434009e-01,1.164533482488171e+00,-1.708117727328015e+00,-5.612885971999869e+00,8.335864344187685e+00,-2.553316202034987e+00,1.331629111471651e+00,-4.747706299295168e-01}, +{ -1.324072707220556e-01,3.332688596583225e-01,-4.620786748931049e-01,5.854543581932268e-01,-7.217728466102468e-01,8.877305664251001e-01,-1.106823876295714e+00,1.420929471340858e+00,-1.919882768423073e+00,2.845916517310555e+00,-5.163444171205504e+00,-3.126197637644901e+00,8.909030103159573e+00,-3.536461225230275e+00,1.186738594937239e+00}, +{ 1.614038594653744e-01,-4.059365626271700e-01,5.617541462482694e-01,-7.094168053043236e-01,8.701927481986235e-01,-1.062196073537067e+00,1.309234615435323e+00,-1.650722345025960e+00,2.163543275617914e+00,-3.027053593112726e+00,4.789522220177535e+00,-1.037751719256665e+01,1.516579220498419e+00,8.285121426370907e+00,-2.424508939838463e+00}, +{ -1.578920668159873e-01,3.968795601334093e-01,-5.484579057499950e-01,6.909907609190884e-01,-8.445254350912217e-01,1.025327353062915e+00,-1.253663556970865e+00,1.561198559169520e+00,-2.005328644253281e+00,2.706169126970200e+00,-3.968070750510964e+00,6.854574732548703e+00,-1.928983549823197e+01,1.085607962899235e+01,3.976554135828107e+00}, +{ 1.161528431744117e-01,-2.918713442796502e-01,4.030330258255148e-01,-5.071085079775636e-01,6.185479637044902e-01,-7.487578432482742e-01,9.115164716927160e-01,-1.127634802535796e+00,1.433271330465034e+00,-1.899600891710041e+00,2.689683603880577e+00,-4.272070067026541e+00,8.679987716016379e+00,-4.329008535078358e+01,3.728493585280233e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==18 +const dfloat c_DI[18][15] = { +{ -3.871065309521984e+01,4.584624493937984e+01,-1.042754491039567e+01,5.285150915932665e+00,-3.365672286692658e+00,2.390285127394282e+00,-1.809115892027909e+00,1.426036749691153e+00,-1.154144834527098e+00,9.488461471836009e-01,-7.842841393529596e-01,6.432336249053705e-01,-5.113509308016438e-01,3.703695972765247e-01,-1.474010127456583e-01}, +{ -6.247025517688204e+00,-7.875997127432966e+00,1.896992680418569e+01,-7.550006702548382e+00,4.501309394635943e+00,-3.108374808010749e+00,2.318421170934374e+00,-1.811865631485467e+00,1.458474918581170e+00,-1.194746005556966e+00,9.851377786079307e-01,-8.066331927272992e-01,6.405555179948149e-01,-4.636560483797333e-01,1.844794488898470e-01}, +{ 2.966520949469854e+00,-1.117109330321112e+01,2.533926313727932e+00,8.322946982587521e+00,-4.304872464376853e+00,2.818912660605220e+00,-2.048015759163398e+00,1.576734894359124e+00,-1.257441457495756e+00,1.023804906761968e+00,-8.407313490418219e-01,6.864898024776628e-01,-5.441628008451568e-01,3.934641085304230e-01,-1.564834843856008e-01}, +{ -1.002577028688897e+00,3.157840799784300e+00,-1.049889529090126e+01,6.960643389869237e+00,2.274448836746599e+00,-1.513797519708529e+00,1.088103653621437e+00,-8.303522407751367e-01,6.581241905469142e-01,-5.335554110581143e-01,4.368479456263851e-01,-3.559760594807765e-01,2.817925513135967e-01,-2.035908589515354e-01,8.094304205577807e-02}, +{ 5.899255461770017e-02,-2.098156175271605e-01,7.074194921876681e-01,-7.989736870608509e+00,8.252676786674954e+00,-1.124465599191974e+00,4.702096012355722e-01,-2.722216864503199e-01,1.832064543943200e-01,-1.337928934495128e-01,1.022012073545494e-01,-7.950113156588491e-02,6.106008728331564e-02,-4.333889242871594e-02,1.710650747399822e-02}, +{ 3.124055780952020e-01,-8.257454285259548e-01,1.303030952945388e+00,-2.110381060840026e+00,-4.508437872085052e+00,7.496008562815852e+00,-2.571759556096868e+00,1.508740466814456e+00,-1.047437327992878e+00,7.861478726217265e-01,-6.133199247364940e-01,4.844742808299434e-01,-3.760216502923453e-01,2.685897895727629e-01,-1.062946831257137e-01}, +{ -3.644225192237138e-01,9.551068309314030e-01,-1.474143720536182e+00,2.316475152509685e+00,-4.547795472829652e+00,-7.492393945271869e-01,5.467249047459468e+00,-2.578668199873596e+00,1.660999532745442e+00,-1.202132714966238e+00,9.188109560677293e-01,-7.168505897477800e-01,5.521966665477450e-01,-3.927555215445747e-01,1.551699469874486e-01}, +{ 2.553972245743277e-01,-6.628035797062397e-01,9.950819248413179e-01,-1.468586662387951e+00,2.428104227926057e+00,-6.060406874611754e+00,2.637848475242321e+00,2.880292601502831e+00,-1.670592205143994e+00,1.151417672118717e+00,-8.568886758769917e-01,6.580696522719055e-01,-5.021278224195340e-01,3.552506491607645e-01,-1.400566074917748e-01}, +{ -9.257005739454316e-02,2.391674344252274e-01,-3.545849850905195e-01,5.087655515710753e-01,-7.837157295777987e-01,1.494534051442322e+00,-6.288998062200506e+00,5.105716765072181e+00,3.853156642568361e-01,-3.862926184586313e-01,3.091893089139274e-01,-2.436590756901324e-01,1.880495199141229e-01,-1.337480992993862e-01,5.283033211582522e-02}, +{ -5.283033211582300e-02,1.337480992993840e-01,-1.880495199141218e-01,2.436590756901293e-01,-3.091893089139219e-01,3.862926184586248e-01,-3.853156642568234e-01,-5.105716765072194e+00,6.288998062200505e+00,-1.494534051442310e+00,7.837157295777905e-01,-5.087655515710691e-01,3.545849850905153e-01,-2.391674344252259e-01,9.257005739454183e-02}, +{ 1.400566074917757e-01,-3.552506491607643e-01,5.021278224195348e-01,-6.580696522719088e-01,8.568886758769966e-01,-1.151417672118723e+00,1.670592205143997e+00,-2.880292601502842e+00,-2.637848475242302e+00,6.060406874611741e+00,-2.428104227926050e+00,1.468586662387947e+00,-9.950819248413169e-01,6.628035797062408e-01,-2.553972245743259e-01}, +{ -1.551699469874501e-01,3.927555215445748e-01,-5.521966665477467e-01,7.168505897477835e-01,-9.188109560677333e-01,1.202132714966243e+00,-1.660999532745445e+00,2.578668199873601e+00,-5.467249047459477e+00,7.492393945271980e-01,4.547795472829646e+00,-2.316475152509682e+00,1.474143720536179e+00,-9.551068309314036e-01,3.644225192237123e-01}, +{ 1.062946831257160e-01,-2.685897895727658e-01,3.760216502923484e-01,-4.844742808299464e-01,6.133199247364979e-01,-7.861478726217310e-01,1.047437327992885e+00,-1.508740466814463e+00,2.571759556096876e+00,-7.496008562815850e+00,4.508437872085035e+00,2.110381060840036e+00,-1.303030952945395e+00,8.257454285259608e-01,-3.124055780952034e-01}, +{ -1.710650747399774e-02,4.333889242871722e-02,-6.106008728331647e-02,7.950113156588395e-02,-1.022012073545501e-01,1.337928934495114e-01,-1.832064543943196e-01,2.722216864503215e-01,-4.702096012355723e-01,1.124465599191971e+00,-8.252676786674954e+00,7.989736870608508e+00,-7.074194921876644e-01,2.098156175271595e-01,-5.899255461769774e-02}, +{ -8.094304205578606e-02,2.035908589515392e-01,-2.817925513136004e-01,3.559760594807846e-01,-4.368479456263963e-01,5.335554110581284e-01,-6.581241905469305e-01,8.303522407751557e-01,-1.088103653621461e+00,1.513797519708569e+00,-2.274448836746672e+00,-6.960643389869137e+00,1.049889529090123e+01,-3.157840799784339e+00,1.002577028688908e+00}, +{ 1.564834843856024e-01,-3.934641085304247e-01,5.441628008451596e-01,-6.864898024776637e-01,8.407313490418236e-01,-1.023804906761971e+00,1.257441457495756e+00,-1.576734894359123e+00,2.048015759163393e+00,-2.818912660605208e+00,4.304872464376835e+00,-8.322946982587473e+00,-2.533926313728027e+00,1.117109330321117e+01,-2.966520949469854e+00}, +{ -1.844794488898440e-01,4.636560483797395e-01,-6.405555179948230e-01,8.066331927273019e-01,-9.851377786079405e-01,1.194746005556969e+00,-1.458474918581171e+00,1.811865631485474e+00,-2.318421170934378e+00,3.108374808010748e+00,-4.501309394635948e+00,7.550006702548372e+00,-1.896992680418567e+01,7.875997127432907e+00,6.247025517688264e+00}, +{ 1.474010127456680e-01,-3.703695972765306e-01,5.113509308016564e-01,-6.432336249053855e-01,7.842841393529675e-01,-9.488461471836160e-01,1.154144834527134e+00,-1.426036749691199e+00,1.809115892027971e+00,-2.390285127394356e+00,3.365672286692747e+00,-5.285150915932780e+00,1.042754491039594e+01,-4.584624493938017e+01,3.871065309521996e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==19 +const dfloat c_DI[19][15] = { +{ -3.995329113044558e+01,4.808234256181181e+01,-1.197029780253863e+01,6.188572803390829e+00,-3.970798602700764e+00,2.830339431305127e+00,-2.146522947616947e+00,1.694086648879747e+00,-1.372180981498985e+00,1.128702010744970e+00,-9.332886843033594e-01,7.656316977310429e-01,-6.087539005583110e-01,4.409611966474694e-01,-1.755023008484446e-01}, +{ -8.546560893555473e+00,-4.634877428049360e+00,1.816048890509198e+01,-7.831023249928674e+00,4.770800085850951e+00,-3.325061103460199e+00,2.492084943251546e+00,-1.953148467757932e+00,1.575043272877725e+00,-1.291780397760224e+00,1.066013030878920e+00,-8.733348325561866e-01,6.937745597196511e-01,-5.022849221752050e-01,1.998664975724797e-01}, +{ 3.176674107933239e+00,-1.343030705322432e+01,6.358482245492322e+00,5.950755574371498e+00,-3.387708707049848e+00,2.289467323463041e+00,-1.688273793983381e+00,1.310578487756932e+00,-1.050495505669806e+00,8.581284662738881e-01,-7.062331340141041e-01,5.775205511194557e-01,-4.582274580984476e-01,3.315153905374247e-01,-1.318764949078921e-01}, +{ -5.232929148198074e-01,1.863525162583240e+00,-1.073098735216496e+01,9.747210781517513e+00,-3.515867965697060e-01,-5.802188583259649e-02,1.102553177864989e-01,-1.093300190885930e-01,9.798748866127291e-02,-8.513767895903451e-02,7.273782374638094e-02,-6.090149709159830e-02,4.904044778183391e-02,-3.578106153863062e-02,1.428218398818318e-02}, +{ -4.499169599077159e-01,1.219243672850259e+00,-2.020738815422384e+00,-5.886902787057143e+00,9.036514189028951e+00,-2.903877384243533e+00,1.668459043873254e+00,-1.146541652402616e+00,8.570592851572322e-01,-6.703098773932992e-01,5.362274439052511e-01,-4.303494709749946e-01,3.373499216545388e-01,-2.423452530003804e-01,9.612864393258158e-02}, +{ 5.761950644573666e-01,-1.555673821990629e+00,2.627296475107663e+00,-5.310090501994392e+00,-6.259527212720248e-01,6.090990681853463e+00,-2.902553733283471e+00,1.873958734008045e+00,-1.359714338072836e+00,1.045802179139292e+00,-8.281052622564512e-01,6.603071705999617e-01,-5.155152463232382e-01,3.694730895714965e-01,-1.464177695442437e-01}, +{ -3.415402412035953e-01,9.072570495928964e-01,-1.455978435372383e+00,2.517964610889250e+00,-6.881534496373303e+00,3.680824645815840e+00,2.474615522475359e+00,-1.512799949710373e+00,1.063445862378359e+00,-8.029644450592530e-01,6.285832055449504e-01,-4.975684454749842e-01,3.866803139969783e-01,-2.764043738846950e-01,1.094191763849517e-01}, +{ 4.240991329766453e-02,-1.144942732644864e-01,1.904969841836614e-01,-3.427522971833087e-01,8.378595959910885e-01,-6.277765798100279e+00,6.093186786383659e+00,-5.442214829642372e-01,1.603814089466568e-01,-6.608702537156243e-02,3.171560134580459e-02,-1.653615123747027e-02,9.057315747317737e-03,-5.004075863483583e-03,1.753498088974736e-03}, +{ 1.592816763528355e-01,-4.085090347653062e-01,5.940459327895329e-01,-8.206988049576260e-01,1.172883052013300e+00,-1.854222743803197e+00,-3.715135439467285e+00,6.285152382281792e+00,-2.183385957489923e+00,1.284401418563645e+00,-8.875467090016205e-01,6.545055601437660e-01,-4.878820961875062e-01,3.408003737104332e-01,-1.336896101828402e-01}, +{ -2.094726562499994e-01,5.356855072902382e-01,-7.732388425498712e-01,1.053799509049798e+00,-1.471725451746937e+00,2.264813641904634e+00,-4.588914964624957e+00,-1.247918856377756e-14,4.588914964624974e+00,-2.264813641904635e+00,1.471725451746936e+00,-1.053799509049796e+00,7.732388425498707e-01,-5.356855072902397e-01,2.094726562499985e-01}, +{ 1.336896101828415e-01,-3.408003737104339e-01,4.878820961875074e-01,-6.545055601437687e-01,8.875467090016237e-01,-1.284401418563649e+00,2.183385957489925e+00,-6.285152382281789e+00,3.715135439467272e+00,1.854222743803208e+00,-1.172883052013305e+00,8.206988049576278e-01,-5.940459327895340e-01,4.085090347653081e-01,-1.592816763528360e-01}, +{ -1.753498088975042e-03,5.004075863483948e-03,-9.057315747317496e-03,1.653615123747023e-02,-3.171560134580505e-02,6.608702537156219e-02,-1.603814089466589e-01,5.442214829642394e-01,-6.093186786383659e+00,6.277765798100278e+00,-8.378595959910879e-01,3.427522971833087e-01,-1.904969841836628e-01,1.144942732644873e-01,-4.240991329766355e-02}, +{ -1.094191763849484e-01,2.764043738846944e-01,-3.866803139969794e-01,4.975684454749866e-01,-6.285832055449547e-01,8.029644450592566e-01,-1.063445862378361e+00,1.512799949710375e+00,-2.474615522475358e+00,-3.680824645815838e+00,6.881534496373298e+00,-2.517964610889248e+00,1.455978435372381e+00,-9.072570495928973e-01,3.415402412035951e-01}, +{ 1.464177695442448e-01,-3.694730895714968e-01,5.155152463232398e-01,-6.603071705999639e-01,8.281052622564536e-01,-1.045802179139295e+00,1.359714338072838e+00,-1.873958734008048e+00,2.902553733283479e+00,-6.090990681853478e+00,6.259527212720494e-01,5.310090501994368e+00,-2.627296475107656e+00,1.555673821990629e+00,-5.761950644573650e-01}, +{ -9.612864393258480e-02,2.423452530003829e-01,-3.373499216545413e-01,4.303494709749978e-01,-5.362274439052540e-01,6.703098773933023e-01,-8.570592851572373e-01,1.146541652402623e+00,-1.668459043873263e+00,2.903877384243541e+00,-9.036514189028949e+00,5.886902787057128e+00,2.020738815422408e+00,-1.219243672850273e+00,4.499169599077195e-01}, +{ -1.428218398818659e-02,3.578106153863345e-02,-4.904044778183803e-02,6.090149709160607e-02,-7.273782374638982e-02,8.513767895904711e-02,-9.798748866128744e-02,1.093300190886084e-01,-1.102553177865214e-01,5.802188583263220e-02,3.515867965696524e-01,-9.747210781517447e+00,1.073098735216496e+01,-1.863525162583292e+00,5.232929148198207e-01}, +{ 1.318764949078881e-01,-3.315153905374187e-01,4.582274580984445e-01,-5.775205511194497e-01,7.062331340140970e-01,-8.581284662738824e-01,1.050495505669795e+00,-1.310578487756920e+00,1.688273793983366e+00,-2.289467323463017e+00,3.387708707049806e+00,-5.950755574371406e+00,-6.358482245492472e+00,1.343030705322439e+01,-3.176674107933224e+00}, +{ -1.998664975724829e-01,5.022849221752058e-01,-6.937745597196496e-01,8.733348325561847e-01,-1.066013030878918e+00,1.291780397760217e+00,-1.575043272877714e+00,1.953148467757921e+00,-2.492084943251540e+00,3.325061103460183e+00,-4.770800085850919e+00,7.831023249928624e+00,-1.816048890509202e+01,4.634877428049566e+00,8.546560893555343e+00}, +{ 1.755023008484642e-01,-4.409611966474862e-01,6.087539005583285e-01,-7.656316977310679e-01,9.332886843033740e-01,-1.128702010744988e+00,1.372180981499022e+00,-1.694086648879795e+00,2.146522947617009e+00,-2.830339431305199e+00,3.970798602700860e+00,-6.188572803390966e+00,1.197029780253889e+01,-4.808234256181209e+01,3.995329113044565e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==20 +const dfloat c_DI[20][15] = { +{ -4.104134004619191e+01,5.004626085741084e+01,-1.333553561474224e+01,6.994678090499452e+00,-4.512448124630358e+00,3.224832434106709e+00,-2.449251686875689e+00,1.934709986664932e+00,-1.567972671888850e+00,1.290244482606950e+00,-1.067141512734268e+00,8.755949558391553e-01,-6.962673579003344e-01,5.043880540141785e-01,-2.007518461785751e-01}, +{ -1.081977152696324e+01,-1.270821053634490e+00,1.700738121478504e+01,-7.793206613780099e+00,4.829237757261741e+00,-3.390673482509182e+00,2.551145341620084e+00,-2.004024699937318e+00,1.618421480600060e+00,-1.328637709524678e+00,1.097146575832953e+00,-8.992406006187176e-01,7.145623854912039e-01,-5.174241599719335e-01,2.059050913485765e-01}, +{ 3.053091494168042e+00,-1.499422685106292e+01,9.748768081932461e+00,3.534359137585247e+00,-2.253066692616773e+00,1.579953796143656e+00,-1.185512945377470e+00,9.292505312415591e-01,-7.492752331217284e-01,6.144294603633992e-01,-5.069753852283462e-01,4.152968248220869e-01,-3.298852232758141e-01,2.388217752893083e-01,-9.502877086270434e-02}, +{ 1.439892251685125e-01,-1.069200098604141e-01,-9.770023767199806e+00,1.138802818049220e+01,-2.410750005427467e+00,1.218366711120006e+00,-7.864394311834819e-01,5.673670875008550e-01,-4.349300232100669e-01,3.451842287993652e-01,-2.786669140410997e-01,2.249530770368360e-01,-1.769903308102635e-01,1.274159061829805e-01,-5.058393456816040e-02}, +{ -8.860133099588479e-01,2.503447012326835e+00,-4.956461313040610e+00,-2.716859343420847e+00,8.305952519062627e+00,-3.566128343433927e+00,2.223022348136165e+00,-1.585205492608022e+00,1.209328882109959e+00,-9.576046836332239e-01,7.721689723243602e-01,-6.229366741674359e-01,4.899485268045319e-01,-3.526517114926855e-01,1.399926109911207e-01}, +{ 5.987786040540380e-01,-1.648543935682975e+00,2.969650102509177e+00,-7.588017816349351e+00,3.345829012934849e+00,3.563749293782924e+00,-2.061787503650279e+00,1.420091288463029e+00,-1.063068112186967e+00,8.322399908900555e-01,-6.662139695175562e-01,5.349167038980707e-01,-4.194476450031822e-01,3.013769709264644e-01,-1.195529850682978e-01}, +{ -9.003239952863533e-02,2.505883708109511e-01,-4.577255754478785e-01,1.058244739433048e+00,-7.002225761034249e+00,6.621980060468287e+00,-4.549796687551781e-01,8.819224320773381e-02,-1.092149810453501e-02,-1.158979231407160e-02,1.813915658736998e-02,-1.878793450931810e-02,1.672620666490015e-02,-1.282015264868649e-02,5.212005170262005e-03}, +{ -2.356848665282727e-01,6.122715963668545e-01,-9.215106199992495e-01,1.364745673280024e+00,-2.238171442035135e+00,-3.651221669473865e+00,6.613424714232655e+00,-2.398680012579054e+00,1.430203202509387e+00,-9.988019938163895e-01,7.485202447213871e-01,-5.771702299572055e-01,4.414519610025757e-01,-3.127365498785443e-01,1.233599921548314e-01}, +{ 2.742083888048408e-01,-7.092973726101790e-01,1.055461084994557e+00,-1.528908025580366e+00,2.422835889335919e+00,-5.233912810439119e+00,9.133800057940566e-01,4.125582630551746e+00,-2.155236777133507e+00,1.433019880744775e+00,-1.048025876410767e+00,7.970796435814718e-01,-6.047748325235260e-01,4.265494547100941e-01,-1.679612838199953e-01}, +{ -1.158985749189188e-01,2.987964631274104e-01,-4.405210535954613e-01,6.253719953655738e-01,-9.443257733334970e-01,1.726239639802869e+00,-6.327593865312577e+00,4.741852895880772e+00,7.872090777333162e-01,-6.129244311057458e-01,4.621162503856311e-01,-3.549261170744115e-01,2.703452440218423e-01,-1.909889437648613e-01,7.524719278805803e-02}, +{ -7.524719278805980e-02,1.909889437648604e-01,-2.703452440218423e-01,3.549261170744130e-01,-4.621162503856324e-01,6.129244311057471e-01,-7.872090777333173e-01,-4.741852895880771e+00,6.327593865312579e+00,-1.726239639802867e+00,9.443257733334951e-01,-6.253719953655723e-01,4.405210535954607e-01,-2.987964631274113e-01,1.158985749189179e-01}, +{ 1.679612838199964e-01,-4.265494547100923e-01,6.047748325235260e-01,-7.970796435814727e-01,1.048025876410766e+00,-1.433019880744775e+00,2.155236777133500e+00,-4.125582630551729e+00,-9.133800057940769e-01,5.233912810439126e+00,-2.422835889335915e+00,1.528908025580362e+00,-1.055461084994554e+00,7.092973726101784e-01,-2.742083888048401e-01}, +{ -1.233599921548312e-01,3.127365498785446e-01,-4.414519610025761e-01,5.771702299572070e-01,-7.485202447213888e-01,9.988019938163903e-01,-1.430203202509387e+00,2.398680012579055e+00,-6.613424714232665e+00,3.651221669473886e+00,2.238171442035118e+00,-1.364745673280016e+00,9.215106199992443e-01,-6.122715963668521e-01,2.356848665282725e-01}, +{ -5.212005170261547e-03,1.282015264868774e-02,-1.672620666490221e-02,1.878793450932233e-02,-1.813915658737631e-02,1.158979231408036e-02,1.092149810452550e-02,-8.819224320772290e-02,4.549796687551620e-01,-6.621980060468270e+00,7.002225761034253e+00,-1.058244739433066e+00,4.577255754478879e-01,-2.505883708109576e-01,9.003239952863851e-02}, +{ 1.195529850682993e-01,-3.013769709264642e-01,4.194476450031837e-01,-5.349167038980717e-01,6.662139695175580e-01,-8.322399908900586e-01,1.063068112186969e+00,-1.420091288463031e+00,2.061787503650277e+00,-3.563749293782911e+00,-3.345829012934871e+00,7.588017816349357e+00,-2.969650102509174e+00,1.648543935682977e+00,-5.987786040540388e-01}, +{ -1.399926109911213e-01,3.526517114926869e-01,-4.899485268045343e-01,6.229366741674400e-01,-7.721689723243652e-01,9.576046836332273e-01,-1.209328882109963e+00,1.585205492608029e+00,-2.223022348136172e+00,3.566128343433928e+00,-8.305952519062620e+00,2.716859343420835e+00,4.956461313040632e+00,-2.503447012326852e+00,8.860133099588506e-01}, +{ 5.058393456815952e-02,-1.274159061829809e-01,1.769903308102627e-01,-2.249530770368351e-01,2.786669140410976e-01,-3.451842287993597e-01,4.349300232100624e-01,-5.673670875008520e-01,7.864394311834753e-01,-1.218366711119993e+00,2.410750005427450e+00,-1.138802818049217e+01,9.770023767199804e+00,1.069200098603848e-01,-1.439892251685040e-01}, +{ 9.502877086270356e-02,-2.388217752893084e-01,3.298852232758183e-01,-4.152968248220913e-01,5.069753852283509e-01,-6.144294603634055e-01,7.492752331217337e-01,-9.292505312415662e-01,1.185512945377475e+00,-1.579953796143657e+00,2.253066692616773e+00,-3.534359137585250e+00,-9.748768081932486e+00,1.499422685106294e+01,-3.053091494168031e+00}, +{ -2.059050913485745e-01,5.174241599719347e-01,-7.145623854911989e-01,8.992406006187150e-01,-1.097146575832963e+00,1.328637709524678e+00,-1.618421480600059e+00,2.004024699937322e+00,-2.551145341620078e+00,3.390673482509170e+00,-4.829237757261732e+00,7.793206613780049e+00,-1.700738121478496e+01,1.270821053634386e+00,1.081977152696331e+01}, +{ 2.007518461786049e-01,-5.043880540142456e-01,6.962673579004246e-01,-8.755949558392644e-01,1.067141512734388e+00,-1.290244482607093e+00,1.567972671889035e+00,-1.934709986665173e+00,2.449251686876012e+00,-3.224832434107115e+00,4.512448124630896e+00,-6.994678090500237e+00,1.333553561474364e+01,-5.004626085741278e+01,4.104134004619291e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==21 +const dfloat c_DI[21][15] = { +{ -4.199832348050113e+01,5.177807624522266e+01,-1.454706963314327e+01,7.714960912607263e+00,-4.997711831514768e+00,3.578710348485139e+00,-2.721005638209220e+00,2.150806235194948e+00,-1.743855986470890e+00,1.435388197824256e+00,-1.187421829970171e+00,9.744166948189622e-01,-7.749184366675215e-01,5.613936905769797e-01,-2.234454882532232e-01}, +{ -1.302942258553499e+01,2.116914490321361e+00,1.562409590532590e+01,-7.516824790955707e+00,4.724216170116528e+00,-3.337499382006583e+00,2.519371578086025e+00,-1.982904886585266e+00,1.603338512636327e+00,-1.317332031056587e+00,1.088415077870343e+00,-8.924207410192961e-01,7.093187983272363e-01,-5.137023438499342e-01,2.044362283246473e-01}, +{ 2.621724608275601e+00,-1.587277709231714e+01,1.260549681621729e+01,1.248571608145472e+00,-1.055156066026642e+00,7.983901612028479e-01,-6.197113403992189e-01,4.947692168293694e-01,-4.034005076508608e-01,3.331742487566756e-01,-2.762178669080263e-01,2.269901856703506e-01,-1.806805001702906e-01,1.309636377813561e-01,-5.213710940678948e-02}, +{ 8.870577687263083e-01,-2.478825893761744e+00,-7.891961156579828e+00,1.198089895219799e+01,-3.809016145769728e+00,2.169857738299780e+00,-1.480649875657606e+00,1.101655717382967e+00,-8.606003057160638e-01,6.914387017465680e-01,-5.627924761718385e-01,4.568237498365108e-01,-3.607176052798906e-01,2.602312535008235e-01,-1.034004227542528e-01}, +{ -1.141219794523371e+00,3.326705740028992e+00,-7.554319090184505e+00,8.361933346324470e-01,6.588199951048060e+00,-3.335280142951497e+00,2.187308538923140e+00,-1.597075886402853e+00,1.234550833187104e+00,-9.855058425863019e-01,7.988186432510118e-01,-6.466433335142415e-01,5.097114650596732e-01,-3.673455871889520e-01,1.459011712212922e-01}, +{ 3.849125868286560e-01,-1.093872128934267e+00,2.190668163753835e+00,-8.446165583366712e+00,6.490691730381801e+00,8.826182706343146e-01,-7.184143188913484e-01,5.504303081720334e-01,-4.334498559063251e-01,3.491568851693521e-01,-2.844396074625659e-01,2.309392390442642e-01,-1.823603060377466e-01,1.315562260166989e-01,-5.227160940198947e-02}, +{ 2.446550196967069e-01,-6.424632477286178e-01,9.926882444640744e-01,-1.496582417840872e+00,-5.109967445298774e+00,7.543504870722511e+00,-2.335098642203945e+00,1.330858408271689e+00,-9.117818270083194e-01,6.793349165586836e-01,-5.276365881320501e-01,4.156219464706613e-01,-3.220153323207900e-01,2.297814485748824e-01,-9.089935422584150e-02}, +{ -3.837497410676671e-01,1.008536496293302e+00,-1.568987014118516e+00,2.512870752008735e+00,-5.244239783543047e+00,3.158133591369177e-01,4.867007891381736e+00,-2.447756646825850e+00,1.609387520431787e+00,-1.176198855434168e+00,9.039066413566994e-01,-7.075396864860281e-01,5.461118233380745e-01,-3.888639592499036e-01,1.537012027779290e-01}, +{ 1.618235522618139e-01,-4.232428671191407e-01,6.490104945689485e-01,-1.001446986678796e+00,1.833751093378876e+00,-6.589403744430171e+00,4.864608255114197e+00,8.962834847985829e-01,-6.813454469251105e-01,5.128557260821408e-01,-3.981193971329863e-01,3.129792112781888e-01,-2.420706495158418e-01,1.725392191304078e-01,-6.822194481110960e-02}, +{ 1.116669020584471e-01,-2.854876978462738e-01,4.116273655285805e-01,-5.587599479097916e-01,7.682184478805281e-01,-1.075377363289548e+00,-4.508536971278323e+00,6.377942983368500e+00,-1.881894361885927e+00,1.056990411296049e+00,-7.151011155331082e-01,5.213235551991263e-01,-3.860622182299781e-01,2.687140488626807e-01,-1.052640382209624e-01}, +{ -2.094726562499997e-01,5.356855072902382e-01,-7.732388425498712e-01,1.053799509049798e+00,-1.471725451746936e+00,2.264813641904632e+00,-4.588914964624949e+00,-2.694149925743180e-14,4.588914964624982e+00,-2.264813641904638e+00,1.471725451746936e+00,-1.053799509049797e+00,7.732388425498707e-01,-5.356855072902397e-01,2.094726562500000e-01}, +{ 1.052640382209638e-01,-2.687140488626802e-01,3.860622182299780e-01,-5.213235551991271e-01,7.151011155331095e-01,-1.056990411296050e+00,1.881894361885924e+00,-6.377942983368501e+00,4.508536971278329e+00,1.075377363289540e+00,-7.682184478805230e-01,5.587599479097880e-01,-4.116273655285789e-01,2.854876978462726e-01,-1.116669020584458e-01}, +{ 6.822194481110916e-02,-1.725392191304063e-01,2.420706495158411e-01,-3.129792112781893e-01,3.981193971329879e-01,-5.128557260821411e-01,6.813454469251076e-01,-8.962834847985807e-01,-4.864608255114201e+00,6.589403744430170e+00,-1.833751093378871e+00,1.001446986678793e+00,-6.490104945689471e-01,4.232428671191402e-01,-1.618235522618124e-01}, +{ -1.537012027779290e-01,3.888639592499041e-01,-5.461118233380766e-01,7.075396864860314e-01,-9.039066413567031e-01,1.176198855434174e+00,-1.609387520431792e+00,2.447756646825856e+00,-4.867007891381739e+00,-3.158133591369158e-01,5.244239783543046e+00,-2.512870752008734e+00,1.568987014118515e+00,-1.008536496293304e+00,3.837497410676685e-01}, +{ 9.089935422584206e-02,-2.297814485748843e-01,3.220153323207935e-01,-4.156219464706648e-01,5.276365881320543e-01,-6.793349165586859e-01,9.117818270083256e-01,-1.330858408271701e+00,2.335098642203955e+00,-7.543504870722506e+00,5.109967445298756e+00,1.496582417840884e+00,-9.926882444640819e-01,6.424632477286240e-01,-2.446550196967108e-01}, +{ 5.227160940198622e-02,-1.315562260166971e-01,1.823603060377472e-01,-2.309392390442645e-01,2.844396074625656e-01,-3.491568851693516e-01,4.334498559063210e-01,-5.504303081720255e-01,7.184143188913393e-01,-8.826182706342985e-01,-6.490691730381823e+00,8.446165583366710e+00,-2.190668163753822e+00,1.093872128934263e+00,-3.849125868286522e-01}, +{ -1.459011712212932e-01,3.673455871889516e-01,-5.097114650596746e-01,6.466433335142447e-01,-7.988186432510141e-01,9.855058425863032e-01,-1.234550833187103e+00,1.597075886402851e+00,-2.187308538923137e+00,3.335280142951483e+00,-6.588199951048010e+00,-8.361933346325195e-01,7.554319090184562e+00,-3.326705740029018e+00,1.141219794523373e+00}, +{ 1.034004227542554e-01,-2.602312535008248e-01,3.607176052798886e-01,-4.568237498365116e-01,5.627924761718401e-01,-6.914387017465633e-01,8.606003057160596e-01,-1.101655717382968e+00,1.480649875657604e+00,-2.169857738299770e+00,3.809016145769714e+00,-1.198089895219797e+01,7.891961156579828e+00,2.478825893761714e+00,-8.870577687263003e-01}, +{ 5.213710940679173e-02,-1.309636377813551e-01,1.806805001702916e-01,-2.269901856703536e-01,2.762178669080298e-01,-3.331742487566789e-01,4.034005076508593e-01,-4.947692168293650e-01,6.197113403992145e-01,-7.983901612028464e-01,1.055156066026643e+00,-1.248571608145483e+00,-1.260549681621731e+01,1.587277709231716e+01,-2.621724608275593e+00}, +{ -2.044362283246528e-01,5.137023438499378e-01,-7.093187983272333e-01,8.924207410192989e-01,-1.088415077870356e+00,1.317332031056590e+00,-1.603338512636328e+00,1.982904886585280e+00,-2.519371578086035e+00,3.337499382006585e+00,-4.724216170116541e+00,7.516824790955716e+00,-1.562409590532595e+01,-2.116914490321249e+00,1.302942258553494e+01}, +{ 2.234454882532191e-01,-5.613936905769574e-01,7.749184366674893e-01,-9.744166948189161e-01,1.187421829970106e+00,-1.435388197824185e+00,1.743855986470810e+00,-2.150806235194841e+00,2.721005638209093e+00,-3.578710348484961e+00,4.997711831514509e+00,-7.714960912606888e+00,1.454706963314277e+01,-5.177807624522192e+01,4.199832348050068e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==22 +const dfloat c_DI[22][15] = { +{ -4.284369832249769e+01,5.331129295771051e+01,-1.562543987607084e+01,8.359794225833763e+00,-5.433118322918024e+00,3.896574508441368e+00,-2.965250731336012e+00,2.345098942146515e+00,-1.902030605431895e+00,1.565938969285716e+00,-1.295620773530653e+00,1.063319036307747e+00,-8.456782428965285e-01,6.126812841684689e-01,-2.438630492124194e-01}, +{ -1.515178727293864e+01,5.459131723009000e+00,1.409684463751984e+01,-7.066801393067953e+00,4.496162029242370e+00,-3.193630940497275e+00,2.417723657296038e+00,-1.906157042045513e+00,1.542958732379125e+00,-1.268639492340234e+00,1.048699088495416e+00,-8.601436524712464e-01,6.838140495229366e-01,-4.952954487217588e-01,1.971213246178977e-01}, +{ 1.922750447034980e+00,-1.612143049233333e+01,1.490244009021622e+01,-8.062998130717153e-01,1.041136702569947e-01,2.016797036512052e-02,-4.820912750203767e-02,5.228229193585684e-02,-4.933689251666060e-02,4.428606650878905e-02,-3.865735058319931e-02,3.283335163028749e-02,-2.668530647293477e-02,1.957641515308924e-02,-7.831320621459983e-03}, +{ 1.609338144077002e+00,-5.000538206761555e+00,-5.393937227700462e+00,1.171538759295577e+01,-4.578093088386963e+00,2.758553500215735e+00,-1.929790920959895e+00,1.455289929450737e+00,-1.146114971392097e+00,9.256422555957239e-01,-7.560344922421465e-01,6.151039958297069e-01,-4.864313672055915e-01,3.512349684535915e-01,-1.396101119295547e-01}, +{ -1.171188704116825e+00,3.534360401695175e+00,-9.470061590339174e+00,4.240622781107686e+00,4.390696779243249e+00,-2.525167366223459e+00,1.728438322004653e+00,-1.288273170564430e+00,1.007510578333110e+00,-8.100798680029735e-01,6.596989463444957e-01,-5.356718608532218e-01,4.230757603312809e-01,-3.052599697973756e-01,1.212989608378116e-01}, +{ 1.448332115568779e-02,-8.710747709165492e-02,4.889150914236495e-01,-7.863875202238657e+00,8.395947422659601e+00,-1.330696504711160e+00,6.008662281760149e-01,-3.651568008246070e-01,2.538711477023785e-01,-1.896039529626073e-01,1.471204927896633e-01,-1.156916824457706e-01,8.949961188326913e-02,-6.379825088728204e-02,2.522655537147328e-02}, +{ 5.124259185863980e-01,-1.375048071099098e+00,2.276871820982473e+00,-4.309357841133974e+00,-1.984492689168539e+00,6.736775174255562e+00,-2.955452742681667e+00,1.860349936099016e+00,-1.333918896327083e+00,1.019189588188647e+00,-8.037796358513769e-01,6.392733816183785e-01,-4.982935853064692e-01,3.568013676133925e-01,-1.413437257756602e-01}, +{ -3.252088154870331e-01,8.653502671032349e-01,-1.395663540484716e+00,2.443963062164106e+00,-6.994486243591807e+00,4.050117463456167e+00,2.163341377754282e+00,-1.361715862473684e+00,9.669462858369738e-01,-7.337215090116098e-01,5.759948497300128e-01,-4.567200158595440e-01,3.553058908816258e-01,-2.541273453051926e-01,1.006241352871829e-01}, +{ -8.230397217239582e-02,2.100481751415025e-01,-3.001963097291910e-01,3.906765435209598e-01,-3.899860368636613e-01,-5.425877131762115e+00,6.665165515958457e+00,-1.577364110622159e+00,8.298178227179912e-01,-5.464725834327244e-01,3.963562115884010e-01,-2.996750871472183e-01,2.264882416419250e-01,-1.593751673959210e-01,6.269788855614900e-02}, +{ 2.634104433078819e-01,-6.797466736854365e-01,1.005000828346485e+00,-1.436484308763020e+00,2.209738009994250e+00,-4.343414991230200e+00,-5.480069887449505e-01,5.013304444325773e+00,-2.390058401157455e+00,1.542615844360853e+00,-1.112355981648649e+00,8.394456924367518e-01,-6.340624199402248e-01,4.461076224776918e-01,-1.754931200797496e-01}, +{ -1.335502801445472e-01,3.438610171286685e-01,-5.052603563865578e-01,7.126606407930793e-01,-1.063007529421728e+00,1.891825057123599e+00,-6.322662391161827e+00,4.413142745453172e+00,1.127498450355369e+00,-7.986237496601041e-01,5.857821516455771e-01,-4.443002144827181e-01,3.362025391960777e-01,-2.367048538990878e-01,9.313677346102711e-02}, +{ -9.313677346102667e-02,2.367048538990910e-01,-3.362025391960848e-01,4.443002144827279e-01,-5.857821516455902e-01,7.986237496601252e-01,-1.127498450355408e+00,-4.413142745453138e+00,6.322662391161828e+00,-1.891825057123614e+00,1.063007529421738e+00,-7.126606407930874e-01,5.052603563865652e-01,-3.438610171286747e-01,1.335502801445494e-01}, +{ 1.754931200797532e-01,-4.461076224776908e-01,6.340624199402252e-01,-8.394456924367542e-01,1.112355981648651e+00,-1.542615844360856e+00,2.390058401157456e+00,-5.013304444325786e+00,5.480069887449743e-01,4.343414991230181e+00,-2.209738009994241e+00,1.436484308763013e+00,-1.005000828346481e+00,6.797466736854357e-01,-2.634104433078801e-01}, +{ -6.269788855614855e-02,1.593751673959218e-01,-2.264882416419259e-01,2.996750871472200e-01,-3.963562115884032e-01,5.464725834327258e-01,-8.298178227179931e-01,1.577364110622163e+00,-6.665165515958461e+00,5.425877131762114e+00,3.899860368636638e-01,-3.906765435209609e-01,3.001963097291924e-01,-2.100481751415036e-01,8.230397217239560e-02}, +{ -1.006241352871811e-01,2.541273453051895e-01,-3.553058908816233e-01,4.567200158595408e-01,-5.759948497300079e-01,7.337215090116058e-01,-9.669462858369656e-01,1.361715862473666e+00,-2.163341377754246e+00,-4.050117463456206e+00,6.994486243591819e+00,-2.443963062164095e+00,1.395663540484706e+00,-8.653502671032306e-01,3.252088154870283e-01}, +{ 1.413437257756617e-01,-3.568013676133948e-01,4.982935853064729e-01,-6.392733816183834e-01,8.037796358513829e-01,-1.019189588188654e+00,1.333918896327090e+00,-1.860349936099024e+00,2.955452742681676e+00,-6.736775174255538e+00,1.984492689168472e+00,4.309357841134021e+00,-2.276871820982491e+00,1.375048071099111e+00,-5.124259185864030e-01}, +{ -2.522655537147309e-02,6.379825088728713e-02,-8.949961188327471e-02,1.156916824457767e-01,-1.471204927896712e-01,1.896039529626155e-01,-2.538711477023903e-01,3.651568008246230e-01,-6.008662281760406e-01,1.330696504711196e+00,-8.395947422659626e+00,7.863875202238633e+00,-4.889150914236048e-01,8.710747709163055e-02,-1.448332115567993e-02}, +{ -1.212989608378099e-01,3.052599697973721e-01,-4.230757603312778e-01,5.356718608532192e-01,-6.596989463444904e-01,8.100798680029668e-01,-1.007510578333100e+00,1.288273170564416e+00,-1.728438322004634e+00,2.525167366223428e+00,-4.390696779243173e+00,-4.240622781107773e+00,9.470061590339217e+00,-3.534360401695183e+00,1.171188704116825e+00}, +{ 1.396101119295570e-01,-3.512349684535934e-01,4.864313672055918e-01,-6.151039958297067e-01,7.560344922421489e-01,-9.256422555957258e-01,1.146114971392100e+00,-1.455289929450744e+00,1.929790920959898e+00,-2.758553500215732e+00,4.578093088386963e+00,-1.171538759295576e+01,5.393937227700442e+00,5.000538206761554e+00,-1.609338144076997e+00}, +{ 7.831320621461968e-03,-1.957641515309860e-02,2.668530647294887e-02,-3.283335163030439e-02,3.865735058322148e-02,-4.428606650881495e-02,4.933689251668782e-02,-5.228229193589153e-02,4.820912750207690e-02,-2.016797036517462e-02,-1.041136702569115e-01,8.062998130715607e-01,-1.490244009021609e+01,1.612143049233336e+01,-1.922750447035031e+00}, +{ -1.971213246178944e-01,4.952954487217600e-01,-6.838140495229332e-01,8.601436524712447e-01,-1.048699088495423e+00,1.268639492340234e+00,-1.542958732379124e+00,1.906157042045520e+00,-2.417723657296039e+00,3.193630940497266e+00,-4.496162029242360e+00,7.066801393067924e+00,-1.409684463751982e+01,-5.459131723008984e+00,1.515178727293863e+01}, +{ 2.438630492124076e-01,-6.126812841684363e-01,8.456782428964775e-01,-1.063319036307691e+00,1.295620773530585e+00,-1.565938969285636e+00,1.902030605431809e+00,-2.345098942146396e+00,2.965250731335857e+00,-3.896574508441157e+00,5.433118322917734e+00,-8.359794225833324e+00,1.562543987607019e+01,-5.331129295770954e+01,4.284369832249712e+01} +}; +#endif +#if p_Nq==15 && p_cubNq==23 +const dfloat c_DI[23][15] = { +{ -4.359360515230317e+01,5.467394482320615e+01,-1.658825693991238e+01,8.938371271938026e+00,-5.824536469683872e+00,4.182591272950695e+00,-3.185138231429301e+00,2.520070713817985e+00,-2.044504951394911e+00,1.683547453141229e+00,-1.393102415169065e+00,1.143420551907662e+00,-9.094359116186415e-01,6.588947907077527e-01,-2.622608061581495e-01}, +{ -1.717292336831618e+01,8.709233869779782e+00,1.248989553450876e+01,-6.494374431791202e+00,4.178164894099708e+00,-2.982468664196428e+00,2.263830971907972e+00,-1.787629220047573e+00,1.448461910830287e+00,-1.191734854064215e+00,9.855725646521852e-01,-8.086156500414948e-01,6.429790479381677e-01,-4.657736502834820e-01,1.853810450237212e-01}, +{ 1.001497318057691e+00,-1.581810072036399e+01,1.665848773342728e+01,-2.582170591012574e+00,1.163085155665366e+00,-7.063454951090229e-01,4.911977061750911e-01,-3.680003934170697e-01,2.882897229170854e-01,-2.319063010583071e-01,1.888608502062856e-01,-1.533360809951569e-01,1.210890661910898e-01,-8.735997560850228e-02,3.471200492473969e-02}, +{ 2.237907419451537e+00,-7.469436715198327e+00,-2.545099107512275e+00,1.080585659071581e+01,-4.812213239562420e+00,3.009342656848223e+00,-2.140560307140652e+00,1.628895573229730e+00,-1.289863935956628e+00,1.045404870877023e+00,-8.558502581606342e-01,6.974038895228230e-01,-5.520766693465553e-01,3.988733812120387e-01,-1.585841489796869e-01}, +{ -9.819164045614763e-01,3.105867199340450e+00,-1.054872351634662e+01,7.153130044044757e+00,2.110767031012542e+00,-1.428948505919398e+00,1.032638526265162e+00,-7.900282645270305e-01,6.270577504925072e-01,-5.088161210555053e-01,4.168296973983018e-01,-3.397908148485987e-01,2.690449166252123e-01,-1.944081293871598e-01,7.729659146685436e-02}, +{ -4.078769278313311e-01,1.098891580198834e+00,-1.774048952570675e+00,-6.115360905975249e+00,9.035669559404139e+00,-2.798809465750995e+00,1.591686606623551e+00,-1.088691499647196e+00,8.117232632307312e-01,-6.338571389253337e-01,5.065551707813404e-01,-4.062675150264553e-01,3.183373253116611e-01,-2.286306728807987e-01,9.067957305777974e-02}, +{ 6.201351877221903e-01,-1.690539512989463e+00,2.946609181135717e+00,-6.654119496398796e+00,1.498527519658901e+00,4.848669414338596e+00,-2.568060245165724e+00,1.715996913442534e+00,-1.265734560851989e+00,9.825887500510900e-01,-7.824900541809714e-01,6.261959776703309e-01,-4.899972285356310e-01,3.516437738877159e-01,-1.394256197844990e-01}, +{ -1.000682356671278e-01,2.770777610128569e-01,-4.993411335445175e-01,1.124944130449499e+00,-7.032197890373909e+00,6.560154727504317e+00,-3.760154978041636e-01,4.010929466854860e-02,2.275282900593457e-02,-3.694718750980090e-02,3.795269536464176e-02,-3.445199545517912e-02,2.888936231246239e-02,-2.151034167447508e-02,8.651481710914313e-03}, +{ -3.026610034217785e-01,7.895261492187545e-01,-1.202369138249901e+00,1.830524953430035e+00,-3.261255019376546e+00,-2.465310756850532e+00,6.249047953341583e+00,-2.586863007307503e+00,1.601911506272696e+00,-1.138290302524914e+00,8.612224449515085e-01,-6.678512047146312e-01,5.125631977119679e-01,-3.638127711014279e-01,1.436169986206881e-01}, +{ 2.321357289959917e-01,-6.036558429808454e-01,9.112894655505971e-01,-1.360656013161824e+00,2.310774361005052e+00,-6.345537607801884e+00,3.456832897718438e+00,2.208613371241197e+00,-1.360451174911123e+00,9.575585829998892e-01,-7.199702953320662e-01,5.561028414093474e-01,-4.257448036469341e-01,3.017642985881205e-01,-1.190558096739577e-01}, +{ 6.948855309056245e-02,-1.767672512154967e-01,2.514127256536164e-01,-3.315323244077200e-01,4.259148704968016e-01,-4.575032098050096e-01,-5.071386776310941e+00,6.334201926406354e+00,-1.548032348011456e+00,8.216895487895860e-01,-5.412641889435537e-01,3.888237405930102e-01,-2.854978811000740e-01,1.977924595263485e-01,-7.733984476202771e-02}, +{ -2.094726562499995e-01,5.356855072902382e-01,-7.732388425498705e-01,1.053799509049797e+00,-1.471725451746937e+00,2.264813641904633e+00,-4.588914964624954e+00,-1.890894030801509e-14,4.588914964624976e+00,-2.264813641904637e+00,1.471725451746938e+00,-1.053799509049797e+00,7.732388425498719e-01,-5.356855072902406e-01,2.094726562499989e-01}, +{ 7.733984476202860e-02,-1.977924595263483e-01,2.854978811000734e-01,-3.888237405930107e-01,5.412641889435551e-01,-8.216895487895872e-01,1.548032348011456e+00,-6.334201926406357e+00,5.071386776310944e+00,4.575032098050092e-01,-4.259148704968007e-01,3.315323244077186e-01,-2.514127256536143e-01,1.767672512154961e-01,-6.948855309056157e-02}, +{ 1.190558096739589e-01,-3.017642985881219e-01,4.257448036469392e-01,-5.561028414093541e-01,7.199702953320735e-01,-9.575585829999009e-01,1.360451174911138e+00,-2.208613371241229e+00,-3.456832897718399e+00,6.345537607801867e+00,-2.310774361005053e+00,1.360656013161826e+00,-9.112894655506004e-01,6.036558429808492e-01,-2.321357289959929e-01}, +{ -1.436169986206861e-01,3.638127711014283e-01,-5.125631977119693e-01,6.678512047146328e-01,-8.612224449515103e-01,1.138290302524917e+00,-1.601911506272697e+00,2.586863007307503e+00,-6.249047953341591e+00,2.465310756850547e+00,3.261255019376535e+00,-1.830524953430029e+00,1.202369138249896e+00,-7.895261492187543e-01,3.026610034217787e-01}, +{ -8.651481710914316e-03,2.151034167447274e-02,-2.888936231245953e-02,3.445199545517799e-02,-3.795269536464201e-02,3.694718750979897e-02,-2.275282900592960e-02,-4.010929466855401e-02,3.760154978041771e-01,-6.560154727504320e+00,7.032197890373896e+00,-1.124944130449488e+00,4.993411335445106e-01,-2.770777610128524e-01,1.000682356671264e-01}, +{ 1.394256197844999e-01,-3.516437738877165e-01,4.899972285356351e-01,-6.261959776703340e-01,7.824900541809728e-01,-9.825887500510935e-01,1.265734560851994e+00,-1.715996913442541e+00,2.568060245165734e+00,-4.848669414338610e+00,-1.498527519658885e+00,6.654119496398781e+00,-2.946609181135713e+00,1.690539512989467e+00,-6.201351877221914e-01}, +{ -9.067957305778340e-02,2.286306728808012e-01,-3.183373253116630e-01,4.062675150264584e-01,-5.065551707813438e-01,6.338571389253367e-01,-8.117232632307361e-01,1.088691499647203e+00,-1.591686606623560e+00,2.798809465751003e+00,-9.035669559404136e+00,6.115360905975227e+00,1.774048952570706e+00,-1.098891580198852e+00,4.078769278313381e-01}, +{ -7.729659146685608e-02,1.944081293871588e-01,-2.690449166252122e-01,3.397908148485991e-01,-4.168296973983003e-01,5.088161210555049e-01,-6.270577504925055e-01,7.900282645270255e-01,-1.032638526265157e+00,1.428948505919390e+00,-2.110767031012521e+00,-7.153130044044764e+00,1.054872351634662e+01,-3.105867199340457e+00,9.819164045614734e-01}, +{ 1.585841489796898e-01,-3.988733812120409e-01,5.520766693465577e-01,-6.974038895228263e-01,8.558502581606383e-01,-1.045404870877026e+00,1.289863935956630e+00,-1.628895573229733e+00,2.140560307140654e+00,-3.009342656848221e+00,4.812213239562421e+00,-1.080585659071579e+01,2.545099107512252e+00,7.469436715198332e+00,-2.237907419451530e+00}, +{ -3.471200492474513e-02,8.735997560850364e-02,-1.210890661910895e-01,1.533360809951572e-01,-1.888608502062837e-01,2.319063010583022e-01,-2.882897229170829e-01,3.680003934170669e-01,-4.911977061750910e-01,7.063454951090183e-01,-1.163085155665360e+00,2.582170591012556e+00,-1.665848773342730e+01,1.581810072036402e+01,-1.001497318057673e+00}, +{ -1.853810450237212e-01,4.657736502834869e-01,-6.429790479381730e-01,8.086156500414940e-01,-9.855725646521867e-01,1.191734854064217e+00,-1.448461910830287e+00,1.787629220047579e+00,-2.263830971907985e+00,2.982468664196438e+00,-4.178164894099713e+00,6.494374431791210e+00,-1.248989553450888e+01,-8.709233869779473e+00,1.717292336831599e+01}, +{ 2.622608061581531e-01,-6.588947907077796e-01,9.094359116186848e-01,-1.143420551907707e+00,1.393102415169100e+00,-1.683547453141276e+00,2.044504951394983e+00,-2.520070713818074e+00,3.185138231429422e+00,-4.182591272950832e+00,5.824536469684052e+00,-8.938371271938305e+00,1.658825693991295e+01,-5.467394482320688e+01,4.359360515230351e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==16 +const dfloat c_DI[16][16] = { +{ -3.838434719682315e+01,4.196649173042269e+01,-4.896475486831696e+00,2.032869278066199e+00,-1.188114372192413e+00,8.077596573068975e-01,-5.966564321605711e-01,4.638441400907967e-01,-3.728125408032548e-01,3.061741503537852e-01,-2.545254953471973e-01,2.121600391533486e-01,-1.750894255366666e-01,1.397970925858685e-01,-1.015260990366554e-01,4.045096075202187e-02}, +{ 2.857088540621128e-01,-1.746697432603434e+01,2.004528492044021e+01,-4.158448768617835e+00,2.080027185279240e+00,-1.331353983299444e+00,9.545665067043440e-01,-7.296625780768884e-01,5.803634243210581e-01,-4.733686167200410e-01,3.916848680671437e-01,-3.254380011103640e-01,2.679786884903514e-01,-2.136482043372029e-01,1.550239802711247e-01,-6.174394943946682e-02}, +{ 6.091237726230616e-01,-1.516484298965557e+00,-1.006511584570120e+01,1.342223988654793e+01,-3.675072888980839e+00,2.009488481332614e+00,-1.347080677886165e+00,9.938034860525196e-01,-7.739670339557507e-01,6.228285848127262e-01,-5.107315510023454e-01,4.217439217528933e-01,-3.458228349755090e-01,2.749463481973335e-01,-1.991746267350131e-01,7.927527688329897e-02}, +{ -5.326345326330855e-01,1.447329290021881e+00,-2.428072083828943e+00,-6.504253166873754e+00,1.019384352601326e+01,-3.336647598571094e+00,1.928053076099571e+00,-1.329853121832748e+00,9.981185007770174e-01,-7.853089079815266e-01,6.346197333652330e-01,-5.189350254909981e-01,4.227166997119268e-01,-3.346334741419180e-01,2.417973261129612e-01,-9.614024074778076e-02}, +{ 4.195361684367000e-01,-1.114254867431882e+00,1.785625320668119e+00,-3.041985717651542e+00,-4.331888934135993e+00,8.256872185330122e+00,-3.076496256654556e+00,1.849011630193302e+00,-1.300372059337599e+00,9.863220971631365e-01,-7.792124803793598e-01,6.278617990898930e-01,-5.065019091572653e-01,3.984565404144567e-01,-2.868629145553184e-01,1.138893980077849e-01}, +{ -3.326253010162913e-01,8.674498939231451e-01,-1.320082167791508e+00,2.006778495775615e+00,-3.561722014872640e+00,-2.793709983330915e+00,6.946258969775523e+00,-2.862375169581415e+00,1.773175210022847e+00,-1.264277055323648e+00,9.640993520320222e-01,-7.599997311415302e-01,6.045543889843943e-01,-4.713916737227689e-01,3.376382779264623e-01,-1.337714916592934e-01}, +{ 2.690513380322282e-01,-6.931945537396100e-01,1.020544675344804e+00,-1.446297867290839e+00,2.184745306080398e+00,-4.068653942730110e+00,-1.574447403788371e+00,5.984157867143651e+00,-2.676628565467615e+00,1.698808251172499e+00,-1.222367836670710e+00,9.319404302263086e-01,-7.263217341014864e-01,5.592632138109314e-01,-3.977300146346888e-01,1.571308366126106e-01}, +{ -2.218119151949122e-01,5.668539353598441e-01,-8.168269506542112e-01,1.109765160167089e+00,-1.541539847623808e+00,2.347127056893906e+00,-4.611169300106076e+00,-5.093663983973962e-01,5.232182975194748e+00,-2.507895485063130e+00,1.623066835525642e+00,-1.172676354588958e+00,8.863036028053918e-01,-6.701940763053144e-01,4.718601616671998e-01,-1.856793996800135e-01}, +{ 1.856793996800176e-01,-4.718601616671990e-01,6.701940763053040e-01,-8.863036028053818e-01,1.172676354588952e+00,-1.623066835525639e+00,2.507895485063123e+00,-5.232182975194736e+00,5.093663983973848e-01,4.611169300106081e+00,-2.347127056893905e+00,1.541539847623812e+00,-1.109765160167099e+00,8.168269506542224e-01,-5.668539353598441e-01,2.218119151949062e-01}, +{ -1.571308366126130e-01,3.977300146346884e-01,-5.592632138109230e-01,7.263217341014786e-01,-9.319404302263067e-01,1.222367836670714e+00,-1.698808251172497e+00,2.676628565467609e+00,-5.984157867143643e+00,1.574447403788350e+00,4.068653942730130e+00,-2.184745306080412e+00,1.446297867290855e+00,-1.020544675344821e+00,6.931945537396119e-01,-2.690513380322219e-01}, +{ 1.337714916592963e-01,-3.376382779264626e-01,4.713916737227631e-01,-6.045543889843913e-01,7.599997311415325e-01,-9.640993520320269e-01,1.264277055323650e+00,-1.773175210022846e+00,2.862375169581420e+00,-6.946258969775523e+00,2.793709983330906e+00,3.561722014872650e+00,-2.006778495775635e+00,1.320082167791529e+00,-8.674498939231465e-01,3.326253010162842e-01}, +{ -1.138893980077842e-01,2.868629145553162e-01,-3.984565404144469e-01,5.065019091572583e-01,-6.278617990898886e-01,7.792124803793538e-01,-9.863220971631304e-01,1.300372059337590e+00,-1.849011630193295e+00,3.076496256654544e+00,-8.256872185330117e+00,4.331888934136014e+00,3.041985717651528e+00,-1.785625320668127e+00,1.114254867431872e+00,-4.195361684366871e-01}, +{ 9.614024074777927e-02,-2.417973261129581e-01,3.346334741419100e-01,-4.227166997119202e-01,5.189350254909929e-01,-6.346197333652277e-01,7.853089079815182e-01,-9.981185007770108e-01,1.329853121832734e+00,-1.928053076099547e+00,3.336647598571064e+00,-1.019384352601324e+01,6.504253166873768e+00,2.428072083828930e+00,-1.447329290021857e+00,5.326345326330654e-01}, +{ -7.927527688329783e-02,1.991746267350073e-01,-2.749463481973216e-01,3.458228349754954e-01,-4.217439217528819e-01,5.107315510023327e-01,-6.228285848127105e-01,7.739670339557317e-01,-9.938034860524894e-01,1.347080677886123e+00,-2.009488481332558e+00,3.675072888980771e+00,-1.342223988654795e+01,1.006511584570129e+01,1.516484298965475e+00,-6.091237726230202e-01}, +{ 6.174394943947031e-02,-1.550239802711323e-01,2.136482043372099e-01,-2.679786884903590e-01,3.254380011103743e-01,-3.916848680671642e-01,4.733686167200553e-01,-5.803634243210635e-01,7.296625780768988e-01,-9.545665067043574e-01,1.331353983299455e+00,-2.080027185279273e+00,4.158448768617931e+00,-2.004528492044040e+01,1.746697432603436e+01,-2.857088540620038e-01}, +{ -4.045096075202537e-02,1.015260990366493e-01,-1.397970925858560e-01,1.750894255366410e-01,-2.121600391533130e-01,2.545254953471487e-01,-3.061741503537249e-01,3.728125408031823e-01,-4.638441400907069e-01,5.966564321604726e-01,-8.077596573067731e-01,1.188114372192242e+00,-2.032869278065958e+00,4.896475486831321e+00,-4.196649173042126e+01,3.838434719682197e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==17 +const dfloat c_DI[17][16] = { +{ -4.044914711760586e+01,4.562827757981907e+01,-7.332752623296914e+00,3.403277882318508e+00,-2.092513522943058e+00,1.461079757400523e+00,-1.096130323549025e+00,8.604746037409293e-01,-6.960647499618420e-01,5.741716053687845e-01,-4.787912163830443e-01,3.999691800796255e-01,-3.305867661645850e-01,2.642211090579997e-01,-1.920050399571851e-01,7.651964207607075e-02}, +{ -1.761437730339596e+00,-1.559112763970063e+01,2.147217986402735e+01,-6.196754624227057e+00,3.401565630458617e+00,-2.269035351106631e+00,1.663291848584080e+00,-1.288341605226461e+00,1.033462761710234e+00,-8.477569441457157e-01,7.042422501203235e-01,-5.867499295524039e-01,4.840808967033168e-01,-3.864313486522297e-01,2.806101450548906e-01,-1.117982237080924e-01}, +{ 1.675824364022875e+00,-5.125372730895220e+00,-6.717175101996506e+00,1.343328236118274e+01,-5.084234822104125e+00,3.037771670673593e+00,-2.118668043655218e+00,1.597096260883825e+00,-1.260186929625164e+00,1.022745675522119e+00,-8.434948911691411e-01,6.992850194718397e-01,-5.749592330493680e-01,4.579424543021272e-01,-3.320930800830048e-01,1.322370265186241e-01}, +{ -1.088435064874041e+00,3.097648110502748e+00,-6.325167365840144e+00,-2.164990851673678e+00,9.019853460432918e+00,-4.048714381827579e+00,2.559057286505024e+00,-1.838725013332029e+00,1.411597492740516e+00,-1.126139895391884e+00,9.183134208807943e-01,-7.554921965649043e-01,6.179443410626192e-01,-4.904969558315451e-01,3.549821429112891e-01,-1.412345297001035e-01}, +{ 6.859406692845009e-01,-1.862527110857558e+00,3.204500946541397e+00,-6.909089535173385e+00,7.415691920995013e-01,6.026344215397093e+00,-3.069189248147429e+00,2.026615345801443e+00,-1.489325640248438e+00,1.157833948109470e+00,-9.287340997833216e-01,7.557704133378080e-01,-6.136747067612500e-01,4.848004395027515e-01,-3.498824708535426e-01,1.390476417509577e-01}, +{ -4.182696578362965e-01,1.106149515378797e+00,-1.752398568204833e+00,2.935148633235673e+00,-7.145673216405756e+00,2.799636717370042e+00,3.767258736333182e+00,-2.141236913221871e+00,1.470266364708496e+00,-1.101203002934459e+00,8.634827261050037e-01,-6.924772657886860e-01,5.569151999538785e-01,-4.372596297977913e-01,3.144418110171712e-01,-1.247814499125499e-01}, +{ 2.346250078105140e-01,-6.109649470970655e-01,9.258200015770044e-01,-1.393685259355100e+00,2.412893686203294e+00,-7.109888026776982e+00,4.323322978946373e+00,1.967224192997947e+00,-1.270415643521044e+00,9.119291188575926e-01,-6.965866496238874e-01,5.493748597884915e-01,-4.370537147031423e-01,3.407834215790131e-01,-2.440823867754690e-01,9.670336009246215e-02}, +{ -1.052682007357544e-01,2.718142242687552e-01,-4.023874750430927e-01,5.758025602763291e-01,-8.828130820647652e-01,1.667902463341631e+00,-6.821598785800934e+00,5.457898937163869e+00,4.987367081105313e-01,-4.671077828411240e-01,3.712102364967337e-01,-2.954905184857148e-01,2.355782908862024e-01,-1.837208927629294e-01,1.315563357785193e-01,-5.211301858825609e-02}, +{ 1.309204101562367e-02,-3.440237394195049e-02,5.303203820657958e-02,-8.059277838610701e-02,1.322462961906361e-01,-2.557458139019142e-01,7.021017873809685e-01,-6.277470788189989e+00,6.277470788190000e+00,-7.021017873809812e-01,2.557458139019190e-01,-1.322462961906410e-01,8.059277838611209e-02,-5.303203820658283e-02,3.440237394195244e-02,-1.309204101562545e-02}, +{ 5.211301858825874e-02,-1.315563357785206e-01,1.837208927629292e-01,-2.355782908862030e-01,2.954905184857164e-01,-3.712102364967367e-01,4.671077828411296e-01,-4.987367081105444e-01,-5.457898937163861e+00,6.821598785800941e+00,-1.667902463341635e+00,8.828130820647699e-01,-5.758025602763372e-01,4.023874750431011e-01,-2.718142242687577e-01,1.052682007357510e-01}, +{ -9.670336009246644e-02,2.440823867754695e-01,-3.407834215790068e-01,4.370537147031368e-01,-5.493748597884920e-01,6.965866496238933e-01,-9.119291188575965e-01,1.270415643521051e+00,-1.967224192997967e+00,-4.323322978946345e+00,7.109888026776972e+00,-2.412893686203305e+00,1.393685259355117e+00,-9.258200015770212e-01,6.109649470970696e-01,-2.346250078105106e-01}, +{ 1.247814499125512e-01,-3.144418110171704e-01,4.372596297977852e-01,-5.569151999538732e-01,6.924772657886852e-01,-8.634827261050048e-01,1.101203002934456e+00,-1.470266364708489e+00,2.141236913221865e+00,-3.767258736333166e+00,-2.799636717370065e+00,7.145673216405776e+00,-2.935148633235698e+00,1.752398568204859e+00,-1.106149515378799e+00,4.182696578362875e-01}, +{ -1.390476417509593e-01,3.498824708535417e-01,-4.848004395027426e-01,6.136747067612418e-01,-7.557704133378045e-01,9.287340997833198e-01,-1.157833948109466e+00,1.489325640248427e+00,-2.026615345801436e+00,3.069189248147421e+00,-6.026344215397074e+00,-7.415691920995290e-01,6.909089535173426e+00,-3.204500946541438e+00,1.862527110857557e+00,-6.859406692844844e-01}, +{ 1.412345297001074e-01,-3.549821429112858e-01,4.904969558315326e-01,-6.179443410626062e-01,7.554921965648931e-01,-9.183134208807825e-01,1.126139895391871e+00,-1.411597492740502e+00,1.838725013332012e+00,-2.559057286504998e+00,4.048714381827545e+00,-9.019853460432913e+00,2.164990851673699e+00,6.325167365840139e+00,-3.097648110502709e+00,1.088435064873999e+00}, +{ -1.322370265186270e-01,3.320930800830048e-01,-4.579424543021191e-01,5.749592330493613e-01,-6.992850194718380e-01,8.434948911691459e-01,-1.022745675522123e+00,1.260186929625164e+00,-1.597096260883822e+00,2.118668043655206e+00,-3.037771670673576e+00,5.084234822104121e+00,-1.343328236118273e+01,6.717175101996363e+00,5.125372730895324e+00,-1.675824364022857e+00}, +{ 1.117982237081001e-01,-2.806101450548935e-01,3.864313486522277e-01,-4.840808967033196e-01,5.867499295524099e-01,-7.042422501203361e-01,8.477569441457268e-01,-1.033462761710244e+00,1.288341605226476e+00,-1.663291848584096e+00,2.269035351106643e+00,-3.401565630458664e+00,6.196754624227197e+00,-2.147217986402756e+01,1.559112763970064e+01,1.761437730339701e+00}, +{ -7.651964207607831e-02,1.920050399571644e-01,-2.642211090579671e-01,3.305867661645410e-01,-3.999691800795780e-01,4.787912163829933e-01,-5.741716053687189e-01,6.960647499617516e-01,-8.604746037408143e-01,1.096130323548899e+00,-1.461079757400374e+00,2.092513522942863e+00,-3.403277882318243e+00,7.332752623296534e+00,-4.562827757981761e+01,4.044914711760462e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==18 +const dfloat c_DI[18][16] = { +{ -4.224876250188248e+01,4.883666727904352e+01,-9.496348654588004e+00,4.639158115322181e+00,-2.912795201609437e+00,2.055249431142114e+00,-1.551066312990391e+00,1.222067834389424e+00,-9.909361491891305e-01,8.187373131415124e-01,-6.835056510488899e-01,5.714387354752386e-01,-4.725749520677017e-01,3.778458284998681e-01,-2.746353954803711e-01,1.094602818425510e-01}, +{ -4.040710726521473e+00,-1.299196799123203e+01,2.198789832653113e+01,-7.597455638790386e+00,4.367530181902255e+00,-2.971036398555071e+00,2.200335011390663e+00,-1.714655063990154e+00,1.380733585917105e+00,-1.135538501223984e+00,9.449766986933603e-01,-7.882947019276869e-01,6.509166006664089e-01,-5.199090612274837e-01,3.776643802125351e-01,-1.504867018451887e-01}, +{ 2.595585126211459e+00,-8.710914493230383e+00,-2.621536855073769e+00,1.217618629825942e+01,-5.470615843586469e+00,3.431304533500314e+00,-2.445792931840753e+00,1.865668016115597e+00,-1.482727779865736e+00,1.208982524871461e+00,-1.000237724337364e+00,8.310313801114390e-01,-6.843011348973348e-01,5.455684514782617e-01,-3.958697830681150e-01,1.576702153519717e-01}, +{ -1.337059724209455e+00,3.950376056466652e+00,-9.551901199497550e+00,2.585119282072771e+00,6.464889249775003e+00,-3.455952468990556e+00,2.309113165031818e+00,-1.703235340106657e+00,1.327024759627821e+00,-1.068399032719084e+00,8.764714632905491e-01,-7.239961262187552e-01,5.938097627309747e-01,-4.721888636479924e-01,3.420953962493964e-01,-1.361663798549354e-01}, +{ 5.951277123043526e-01,-1.656818489906992e+00,3.098540626823080e+00,-9.170597355694127e+00,5.490322982704377e+00,2.636242404095233e+00,-1.676717765411530e+00,1.194719221906255e+00,-9.112980905111976e-01,7.237762054309128e-01,-5.883851907748304e-01,4.830222685626133e-01,-3.944956880625096e-01,3.128252529898456e-01,-2.262652776970211e-01,9.000118324153639e-02}, +{ -1.655689839653304e-01,4.511627298813021e-01,-7.788337981628457e-01,1.604349380832294e+00,-7.974584799421807e+00,6.940632235031829e+00,3.531431759344079e-02,-2.279639707782717e-01,2.158493081837856e-01,-1.852986933541535e-01,1.564006200574011e-01,-1.311026611476478e-01,1.084142825623591e-01,-8.661586805130940e-02,6.291192287876375e-02,-2.506602213981099e-02}, +{ -7.133980021008514e-02,1.809114421555882e-01,-2.536284330159191e-01,3.128418770740703e-01,-2.232726959350972e-01,-6.156477733711030e+00,7.305049965859168e+00,-1.602078149092642e+00,8.227975939850406e-01,-5.367923229532070e-01,3.892882922026532e-01,-2.975208740694025e-01,2.320346279796963e-01,-1.786816186111884e-01,1.270647704114837e-01,-5.019694206912882e-02}, +{ 1.828195479715873e-01,-4.690034330011094e-01,6.825589949789499e-01,-9.447315085318529e-01,1.356276694973340e+00,-2.176038421942141e+00,-3.882028847069026e+00,6.815730607441430e+00,-2.428107972492101e+00,1.442368781158928e+00,-1.006594826122047e+00,7.545760514261372e-01,-5.821766006532694e-01,4.455351636360697e-01,-3.157602997828847e-01,1.245760680079890e-01}, +{ -2.106624556443427e-01,5.379345035185844e-01,-7.735594100052575e-01,1.046856491275024e+00,-1.443478705383481e+00,2.163879864814771e+00,-4.057073650289367e+00,-1.325835316549271e+00,5.668933078017695e+00,-2.570728443839116e+00,1.635245385269745e+00,-1.171944394001065e+00,8.818257017282682e-01,-6.651040111773964e-01,4.676221116121034e-01,-1.839107493468951e-01}, +{ 1.839107493468990e-01,-4.676221116121020e-01,6.651040111773848e-01,-8.818257017282569e-01,1.171944394001057e+00,-1.635245385269740e+00,2.570728443839107e+00,-5.668933078017684e+00,1.325835316549260e+00,4.057073650289373e+00,-2.163879864814772e+00,1.443478705383486e+00,-1.046856491275034e+00,7.735594100052701e-01,-5.379345035185854e-01,2.106624556443369e-01}, +{ -1.245760680079918e-01,3.157602997828819e-01,-4.455351636360593e-01,5.821766006532582e-01,-7.545760514261296e-01,1.006594826122043e+00,-1.442368781158919e+00,2.428107972492089e+00,-6.815730607441441e+00,3.882028847069062e+00,2.176038421942114e+00,-1.356276694973330e+00,9.447315085318524e-01,-6.825589949789529e-01,4.690034330011051e-01,-1.828195479715828e-01}, +{ 5.019694206912818e-02,-1.270647704114822e-01,1.786816186111837e-01,-2.320346279796915e-01,2.975208740693998e-01,-3.892882922026514e-01,5.367923229532031e-01,-8.227975939850319e-01,1.602078149092634e+00,-7.305049965859163e+00,6.156477733711041e+00,2.232726959350788e-01,-3.128418770740615e-01,2.536284330159130e-01,-1.809114421555824e-01,7.133980021008100e-02}, +{ 2.506602213981185e-02,-6.291192287876321e-02,8.661586805130549e-02,-1.084142825623532e-01,1.311026611476453e-01,-1.564006200574038e-01,1.852986933541542e-01,-2.158493081837863e-01,2.279639707782755e-01,-3.531431759344964e-02,-6.940632235031815e+00,7.974584799421811e+00,-1.604349380832313e+00,7.788337981628617e-01,-4.511627298813051e-01,1.655689839653234e-01}, +{ -9.000118324153805e-02,2.262652776970212e-01,-3.128252529898409e-01,3.944956880625051e-01,-4.830222685626128e-01,5.883851907748315e-01,-7.237762054309124e-01,9.112980905111958e-01,-1.194719221906260e+00,1.676717765411542e+00,-2.636242404095257e+00,-5.490322982704358e+00,9.170597355694152e+00,-3.098540626823125e+00,1.656818489906995e+00,-5.951277123043396e-01}, +{ 1.361663798549380e-01,-3.420953962493953e-01,4.721888636479839e-01,-5.938097627309711e-01,7.239961262187550e-01,-8.764714632905442e-01,1.068399032719079e+00,-1.327024759627822e+00,1.703235340106658e+00,-2.309113165031815e+00,3.455952468990556e+00,-6.464889249775059e+00,-2.585119282072692e+00,9.551901199497532e+00,-3.950376056466614e+00,1.337059724209410e+00}, +{ -1.576702153519700e-01,3.958697830681113e-01,-5.455684514782470e-01,6.843011348973193e-01,-8.310313801114280e-01,1.000237724337355e+00,-1.208982524871452e+00,1.482727779865727e+00,-1.865668016115580e+00,2.445792931840726e+00,-3.431304533500276e+00,5.470615843586421e+00,-1.217618629825936e+01,2.621536855073612e+00,8.710914493230433e+00,-2.595585126211393e+00}, +{ 1.504867018451963e-01,-3.776643802125440e-01,5.199090612274887e-01,-6.509166006664177e-01,7.882947019276992e-01,-9.449766986933827e-01,1.135538501224010e+00,-1.380733585917133e+00,1.714655063990185e+00,-2.200335011390695e+00,2.971036398555109e+00,-4.367530181902344e+00,7.597455638790581e+00,-2.198789832653133e+01,1.299196799123190e+01,4.040710726521669e+00}, +{ -1.094602818425678e-01,2.746353954803938e-01,-3.778458284998940e-01,4.725749520677380e-01,-5.714387354752808e-01,6.835056510489386e-01,-8.187373131415732e-01,9.909361491892156e-01,-1.222067834389544e+00,1.551066312990551e+00,-2.055249431142319e+00,2.912795201609725e+00,-4.639158115322650e+00,9.496348654588907e+00,-4.883666727904388e+01,4.224876250188224e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==19 +const dfloat c_DI[19][16] = { +{ -4.382327841025823e+01,5.165593031833733e+01,-1.141840911840834e+01,5.750602807827074e+00,-3.653880770005843e+00,2.593234937903541e+00,-1.963485603764013e+00,1.550110544125184e+00,-1.258576157020653e+00,1.040789395022297e+00,-8.694168916734472e-01,7.271833174994510e-01,-6.015560207614620e-01,4.810693161459231e-01,-3.497050474845898e-01,1.393873825157861e-01}, +{ -6.446894673350577e+00,-9.902905148258101e+00,2.179468076985039e+01,-8.456876814775407e+00,5.009395582550733e+00,-3.451143009928478e+00,2.572881553350043e+00,-2.012802081242866e+00,1.624837802494197e+00,-1.338506445319818e+00,1.115153176214666e+00,-9.309951124122723e-01,7.691719684813446e-01,-6.145890756752279e-01,4.465380388419123e-01,-1.779465308205349e-01}, +{ 3.249382892134614e+00,-1.191663881785299e+01,1.697072040793749e+00,1.012455750510954e+01,-5.099642785391109e+00,3.312458863097329e+00,-2.399330161291701e+00,1.846505297378514e+00,-1.475460272022234e+00,1.207304864796987e+00,-1.001237408236471e+00,8.332335632267615e-01,-6.868919926072602e-01,5.480449782392580e-01,-3.978440096884704e-01,1.584854423134796e-01}, +{ -1.226242397259504e+00,3.796753707809544e+00,-1.157478736569027e+01,6.858226951432901e+00,3.419219831774214e+00,-2.140023185594694e+00,1.508761767537717e+00,-1.142254337718811e+00,9.032727005826642e-01,-7.340133992337914e-01,6.058502204068263e-01,-5.025321570243270e-01,4.133301123334145e-01,-3.292820531096401e-01,2.388217079160770e-01,-9.510210416232187e-02}, +{ 2.067343676302173e-01,-6.204509922189032e-01,1.463790190507434e+00,-9.317848090416813e+00,8.729228895553742e+00,-5.365948710553564e-01,7.905286856666137e-02,1.065223950701114e-02,-3.383962115548178e-02,3.880297135939630e-02,-3.767878269642050e-02,3.425396992826438e-02,-2.978589107826760e-02,2.455104916245406e-02,-1.815291523708955e-02,7.284611643152149e-03}, +{ 2.413009941702846e-01,-6.313586250645792e-01,9.640881580087886e-01,-1.393640107782619e+00,-5.973548837442655e+00,8.431147304368249e+00,-2.482955457496513e+00,1.396414773373564e+00,-9.525218256130333e-01,7.102863677672878e-01,-5.552938485344991e-01,4.444103588850743e-01,-3.569101536443676e-01,2.799676573734540e-01,-2.012199957901780e-01,7.983323742174298e-02}, +{ -3.603265835039368e-01,9.412256465983603e-01,-1.439023252012805e+00,2.211593318511996e+00,-4.056530476275535e+00,-2.175035225908119e+00,6.702021748009707e+00,-2.899362803105494e+00,1.822089589916089e+00,-1.307860573052142e+00,1.001059931607769e+00,-7.909353760999470e-01,6.300732283605538e-01,-4.917364963186684e-01,3.523946658847493e-01,-1.396473426125762e-01}, +{ 2.996183662860892e-01,-7.755031346104154e-01,1.155977222293971e+00,-1.680908678708500e+00,2.687925748119834e+00,-5.989689404287777e+00,1.472485522726964e+00,4.204473017567445e+00,-2.256536699935817e+00,1.518859242696799e+00,-1.124026357065453e+00,8.706040796909668e-01,-6.850205472973260e-01,5.305370093649502e-01,-3.785412854348260e-01,1.497458985930953e-01}, +{ -1.605124735997713e-01,4.128309211805956e-01,-6.049043547889448e-01,8.487408886225564e-01,-1.253782734058661e+00,2.185610514559647e+00,-6.800051131971129e+00,4.427122056809797e+00,1.558374716030402e+00,-1.046763094964206e+00,7.596675928735964e-01,-5.801993813239658e-01,4.522989496407128e-01,-3.482286814223741e-01,2.476135956364555e-01,-9.781738322471005e-02}, +{ 1.309204101562679e-02,-3.440237394195175e-02,5.303203820657990e-02,-8.059277838610959e-02,1.322462961906406e-01,-2.557458139019184e-01,7.021017873809757e-01,-6.277470788189998e+00,6.277470788189997e+00,-7.021017873809744e-01,2.557458139019159e-01,-1.322462961906379e-01,8.059277838610981e-02,-5.303203820658126e-02,3.440237394195142e-02,-1.309204101562501e-02}, +{ 9.781738322471115e-02,-2.476135956364545e-01,3.482286814223672e-01,-4.522989496407060e-01,5.801993813239612e-01,-7.596675928735918e-01,1.046763094964201e+00,-1.558374716030396e+00,-4.427122056809804e+00,6.800051131971131e+00,-2.185610514559643e+00,1.253782734058664e+00,-8.487408886225638e-01,6.049043547889537e-01,-4.128309211805967e-01,1.605124735997663e-01}, +{ -1.497458985930984e-01,3.785412854348253e-01,-5.305370093649421e-01,6.850205472973185e-01,-8.706040796909643e-01,1.124026357065456e+00,-1.518859242696800e+00,2.256536699935821e+00,-4.204473017567465e+00,-1.472485522726932e+00,5.989689404287761e+00,-2.687925748119840e+00,1.680908678708516e+00,-1.155977222293987e+00,7.755031346104162e-01,-2.996183662860838e-01}, +{ 1.396473426125787e-01,-3.523946658847491e-01,4.917364963186603e-01,-6.300732283605462e-01,7.909353760999441e-01,-1.001059931607768e+00,1.307860573052140e+00,-1.822089589916086e+00,2.899362803105497e+00,-6.702021748009716e+00,2.175035225908132e+00,4.056530476275533e+00,-2.211593318512015e+00,1.439023252012823e+00,-9.412256465983593e-01,3.603265835039305e-01}, +{ -7.983323742174586e-02,2.012199957901748e-01,-2.799676573734437e-01,3.569101536443588e-01,-4.444103588850673e-01,5.552938485344906e-01,-7.102863677672765e-01,9.525218256130148e-01,-1.396414773373540e+00,2.482955457496484e+00,-8.431147304368235e+00,5.973548837442690e+00,1.393640107782578e+00,-9.640881580087749e-01,6.313586250645631e-01,-2.413009941702728e-01}, +{ -7.284611643153249e-03,1.815291523709041e-02,-2.455104916245371e-02,2.978589107826529e-02,-3.425396992826181e-02,3.767878269642010e-02,-3.880297135939748e-02,3.383962115548146e-02,-1.065223950701437e-02,-7.905286856665210e-02,5.365948710553451e-01,-8.729228895553744e+00,9.317848090416838e+00,-1.463790190507457e+00,6.204509922189073e-01,-2.067343676302135e-01}, +{ 9.510210416232301e-02,-2.388217079160780e-01,3.292820531096370e-01,-4.133301123334153e-01,5.025321570243304e-01,-6.058502204068278e-01,7.340133992337934e-01,-9.032727005826688e-01,1.142254337718820e+00,-1.508761767537730e+00,2.140023185594717e+00,-3.419219831774270e+00,-6.858226951432869e+00,1.157478736569030e+01,-3.796753707809532e+00,1.226242397259471e+00}, +{ -1.584854423134805e-01,3.978440096884617e-01,-5.480449782392367e-01,6.868919926072394e-01,-8.332335632267455e-01,1.001237408236455e+00,-1.207304864796966e+00,1.475460272022205e+00,-1.846505297378481e+00,2.399330161291652e+00,-3.312458863097263e+00,5.099642785391024e+00,-1.012455750510938e+01,-1.697072040794039e+00,1.191663881785307e+01,-3.249382892134511e+00}, +{ 1.779465308205377e-01,-4.465380388419149e-01,6.145890756752181e-01,-7.691719684813350e-01,9.309951124122693e-01,-1.115153176214671e+00,1.338506445319819e+00,-1.624837802494194e+00,2.012802081242865e+00,-2.572881553350041e+00,3.451143009928474e+00,-5.009395582550757e+00,8.456876814775512e+00,-2.179468076985057e+01,9.902905148258231e+00,6.446894673350552e+00}, +{ -1.393873825158205e-01,3.497050474846450e-01,-4.810693161459848e-01,6.015560207615249e-01,-7.271833174995159e-01,8.694168916735159e-01,-1.040789395022380e+00,1.258576157020757e+00,-1.550110544125306e+00,1.963485603764166e+00,-2.593234937903740e+00,3.653880770006146e+00,-5.750602807827601e+00,1.141840911840930e+01,-5.165593031833765e+01,4.382327841025793e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==20 +const dfloat c_DI[20][16] = { +{ -4.520636358047584e+01,5.414136622090894e+01,-1.312818090804431e+01,6.749184280087572e+00,-4.322223781943642e+00,3.079292031606760e+00,-2.336470570957869e+00,1.846967952422478e+00,-1.500868920126367e+00,1.241865560692249e+00,-1.037797564517144e+00,8.682603512462266e-01,-7.184006778754917e-01,5.745857218903668e-01,-4.177176574934690e-01,1.665015425795453e-01}, +{ -8.900561273894978e+00,-6.508951873699165e+00,2.106882217473103e+01,-8.875801956606187e+00,5.374050183639135e+00,-3.737159882197897e+00,2.799807581572206e+00,-2.196674708461241e+00,1.776534943447110e+00,-1.465271976081663e+00,1.221800447611476e+00,-1.020634104230499e+00,8.435759460873212e-01,-6.742239303798149e-01,4.899462998994379e-01,-1.952578714362712e-01}, +{ 3.579764160189273e+00,-1.453464925303028e+01,5.869770050489117e+00,7.665717428236776e+00,-4.232156712378114e+00,2.832307650192188e+00,-2.080509926158299e+00,1.613698096410354e+00,-1.295640753305606e+00,1.063500706273297e+00,-8.838610764189232e-01,7.366375552255783e-01,-6.078776263139880e-01,4.853286841048698e-01,-3.524570178696023e-01,1.404280343533608e-01}, +{ -7.991632664986402e-01,2.687400167850383e+00,-1.224125274867730e+01,1.016826320929689e+01,5.078330776131008e-01,-5.919092870255781e-01,4.852950214694051e-01,-3.928613132282080e-01,3.222789560534586e-01,-2.678267583095465e-01,2.243089381665775e-01,-1.878869823153506e-01,1.555604337000526e-01,-1.244649719786875e-01,9.050222180395062e-02,-3.607669792050285e-02}, +{ -3.224124432561909e-01,8.482029214581754e-01,-1.220308131509688e+00,-7.581429716883570e+00,1.011922850497305e+01,-2.768640103900302e+00,1.520642689726694e+00,-1.024640104995223e+00,7.591142803760970e-01,-5.925003667933020e-01,4.763093427075678e-01,-3.881122631129453e-01,3.153974329056964e-01,-2.492870165559753e-01,1.799622824882476e-01,-7.152730762833126e-02}, +{ 5.764803460180474e-01,-1.547050724270625e+00,2.562495552701823e+00,-4.856352629604105e+00,-2.190072202892675e+00,7.536103909246777e+00,-3.314497161510026e+00,2.090170081522494e+00,-1.502932384734641e+00,1.154134203418283e+00,-9.187411233785269e-01,7.439413392496088e-01,-6.020936893479091e-01,4.746470545169348e-01,-3.421318431483136e-01,1.358992722128527e-01}, +{ -4.216645160528906e-01,1.114512155609446e+00,-1.762813286852084e+00,2.940855263076932e+00,-7.058963464252408e+00,2.610999224495895e+00,3.905818715395657e+00,-2.198769495066469e+00,1.504722406106588e+00,-1.125149527204153e+00,8.814218140634412e-01,-7.064460458622646e-01,5.679338947861163e-01,-4.458038177004999e-01,3.205416262313978e-01,-1.271949467747027e-01}, +{ 1.432275128814758e-01,-3.758699246225917e-01,5.816481568128447e-01,-9.146740727542291e-01,1.746935615365359e+00,-7.230245215156819e+00,5.817145729292193e+00,4.960291881139398e-01,-4.762402710077081e-01,3.814178653172930e-01,-3.064350487711841e-01,2.485240833745568e-01,-2.010515901074954e-01,1.583671644222192e-01,-1.140798428306538e-01,4.530064967079890e-02}, +{ 8.913378343154957e-02,-2.271221555677127e-01,3.245494862414390e-01,-4.324314574525344e-01,5.699985392677170e-01,-6.854653351177286e-01,-5.352201421773926e+00,6.915477201637624e+00,-1.796027668843581e+00,9.736269885488852e-01,-6.510796350179050e-01,4.766125298320087e-01,-3.625072564952294e-01,2.750265377543111e-01,-1.939658854353693e-01,7.637574899045305e-02}, +{ -1.980529279082642e-01,5.054066242275673e-01,-7.255563824875139e-01,9.787389546244649e-01,-1.341472536982065e+00,1.985841608705748e+00,-3.588503843554559e+00,-1.968132812817185e+00,5.974515084254429e+00,-2.578280832449072e+00,1.615660839191556e+00,-1.149884110608510e+00,8.619506306281256e-01,-6.486981331479473e-01,4.555454269519013e-01,-1.790775886286754e-01}, +{ 1.790775886286807e-01,-4.555454269518997e-01,6.486981331479350e-01,-8.619506306281148e-01,1.149884110608504e+00,-1.615660839191553e+00,2.578280832449064e+00,-5.974515084254430e+00,1.968132812817205e+00,3.588503843554543e+00,-1.985841608705740e+00,1.341472536982063e+00,-9.787389546244710e-01,7.255563824875231e-01,-5.054066242275663e-01,1.980529279082589e-01}, +{ -7.637574899045399e-02,1.939658854353697e-01,-2.750265377543085e-01,3.625072564952270e-01,-4.766125298320090e-01,6.510796350179078e-01,-9.736269885488872e-01,1.796027668843581e+00,-6.915477201637629e+00,5.352201421773926e+00,6.854653351177357e-01,-5.699985392677219e-01,4.324314574525399e-01,-3.245494862414438e-01,2.271221555677127e-01,-8.913378343154552e-02}, +{ -4.530064967080166e-02,1.140798428306555e-01,-1.583671644222210e-01,2.010515901074983e-01,-2.485240833745601e-01,3.064350487711890e-01,-3.814178653173012e-01,4.762402710077237e-01,-4.960291881139679e-01,-5.817145729292164e+00,7.230245215156820e+00,-1.746935615365382e+00,9.146740727542505e-01,-5.816481568128637e-01,3.758699246225987e-01,-1.432275128814739e-01}, +{ 1.271949467747059e-01,-3.205416262313996e-01,4.458038177004955e-01,-5.679338947861133e-01,7.064460458622679e-01,-8.814218140634482e-01,1.125149527204160e+00,-1.504722406106596e+00,2.198769495066490e+00,-3.905818715395703e+00,-2.610999224495834e+00,7.058963464252391e+00,-2.940855263076962e+00,1.762813286852115e+00,-1.114512155609451e+00,4.216645160528830e-01}, +{ -1.358992722128539e-01,3.421318431483136e-01,-4.746470545169265e-01,6.020936893479027e-01,-7.439413392496063e-01,9.187411233785251e-01,-1.154134203418280e+00,1.502932384734635e+00,-2.090170081522490e+00,3.314497161510022e+00,-7.536103909246773e+00,2.190072202892673e+00,4.856352629604123e+00,-2.562495552701851e+00,1.547050724270621e+00,-5.764803460180344e-01}, +{ 7.152730762833420e-02,-1.799622824882462e-01,2.492870165559684e-01,-3.153974329056916e-01,3.881122631129427e-01,-4.763093427075638e-01,5.925003667932974e-01,-7.591142803760925e-01,1.024640104995216e+00,-1.520642689726678e+00,2.768640103900275e+00,-1.011922850497303e+01,7.581429716883570e+00,1.220308131509688e+00,-8.482029214581662e-01,3.224124432561774e-01}, +{ 3.607669792049939e-02,-9.050222180395015e-02,1.244649719786868e-01,-1.555604337000524e-01,1.878869823153504e-01,-2.243089381665741e-01,2.678267583095438e-01,-3.222789560534567e-01,3.928613132282114e-01,-4.852950214694133e-01,5.919092870255885e-01,-5.078330776131191e-01,-1.016826320929691e+01,1.224125274867736e+01,-2.687400167850372e+00,7.991632664986120e-01}, +{ -1.404280343533606e-01,3.524570178695901e-01,-4.853286841048443e-01,6.078776263139600e-01,-7.366375552255564e-01,8.838610764189039e-01,-1.063500706273277e+00,1.295640753305587e+00,-1.613698096410330e+00,2.080509926158258e+00,-2.832307650192131e+00,4.232156712378044e+00,-7.665717428236683e+00,-5.869770050489283e+00,1.453464925303025e+01,-3.579764160189123e+00}, +{ 1.952578714362758e-01,-4.899462998994432e-01,6.742239303798130e-01,-8.435759460873155e-01,1.020634104230500e+00,-1.221800447611493e+00,1.465271976081679e+00,-1.776534943447121e+00,2.196674708461250e+00,-2.799807581572211e+00,3.737159882197907e+00,-5.374050183639185e+00,8.875801956606304e+00,-2.106882217473107e+01,6.508951873698983e+00,8.900561273895132e+00}, +{ -1.665015425795815e-01,4.177176574935694e-01,-5.745857218904978e-01,7.184006778756606e-01,-8.682603512464336e-01,1.037797564517387e+00,-1.241865560692514e+00,1.500868920126654e+00,-1.846967952422832e+00,2.336470570958342e+00,-3.079292031607390e+00,4.322223781944525e+00,-6.749184280088938e+00,1.312818090804671e+01,-5.414136622091133e+01,4.520636358047666e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==21 +const dfloat c_DI[21][16] = { +{ -4.642618613805242e+01,5.634007666814657e+01,-1.465210102189389e+01,7.646572199799120e+00,-4.924725169016542e+00,3.518127878878399e+00,-2.673502075098439e+00,2.115347238295155e+00,-1.719990678693738e+00,1.423753168896029e+00,-1.190133546525298e+00,9.959083933627391e-01,-8.241310629795453e-01,6.592111505261230e-01,-4.792659099881999e-01,1.910389043439384e-01}, +{ -1.134404690793299e+01,-2.951352259973338e+00,1.995591300756931e+01,-8.947069514075265e+00,5.511246706157426e+00,-3.861131646583412e+00,2.904022988136685e+00,-2.283718161778174e+00,1.849654935689299e+00,-1.527086104676570e+00,1.274209697400282e+00,-1.064920071908670e+00,8.804691857406141e-01,-7.038650789572932e-01,5.115528970736258e-01,-2.038796718815380e-01}, +{ 3.574183247907449e+00,-1.647212366819188e+01,9.665293433772863e+00,5.084843331636711e+00,-3.084901345283671e+00,2.130083715613314e+00,-1.588007790635395e+00,1.241937308784038e+00,-1.002253105514225e+00,8.254328004274391e-01,-6.875660907318691e-01,5.739402795628041e-01,-4.741316315068507e-01,3.788177538890500e-01,-2.752236590819384e-01,1.096754193521590e-01}, +{ -1.480308307864407e-01,8.192468318454925e-01,-1.165061939040400e+01,1.234679553183425e+01,-1.913791821548238e+00,8.572600406376409e-01,-5.190464633617028e-01,3.606901068656495e-01,-2.704082496167909e-01,2.121269355903420e-01,-1.709147260606806e-01,1.394099219864349e-01,-1.133423450719936e-01,8.960168873881147e-02,-6.468876130638078e-02,2.571153065761044e-02}, +{ -8.284557492844405e-01,2.309876171901657e+00,-4.324564287241797e+00,-4.551919056821451e+00,9.868221069896382e+00,-3.875541645427693e+00,2.356174293023952e+00,-1.663033537512836e+00,1.264160777739656e+00,-1.002411166083859e+00,8.141862375146350e-01,-6.680440596474497e-01,5.454340204816810e-01,-4.324312306759768e-01,3.127412978210680e-01,-1.243931356835249e-01}, +{ 6.970357818258972e-01,-1.903441466412654e+00,3.336823165591994e+00,-7.697575223931058e+00,2.088486162426622e+00,5.179141689802550e+00,-2.791381035000643e+00,1.878427157177964e+00,-1.393152208559033e+00,1.088757815365579e+00,-8.761858653699881e-01,7.145337551523482e-01,-5.810134413843158e-01,4.594185698897517e-01,-3.317418964514026e-01,1.318670398763886e-01}, +{ -2.307098517203107e-01,6.220467455683296e-01,-1.042352959783967e+00,2.005418564286236e+00,-8.053303187385735e+00,6.366617252975121e+00,6.638354779994128e-01,-5.913018015435362e-01,4.654105307688876e-01,-3.719439529716319e-01,3.025481099644814e-01,-2.481770852043952e-01,2.024934745272110e-01,-1.604407822281123e-01,1.159830311358444e-01,-4.612356638783330e-02}, +{ -1.485634933904952e-01,3.826612233241987e-01,-5.620306221497181e-01,7.852225641663683e-01,-1.065867531237634e+00,-5.445321539506979e+00,7.451551177721112e+00,-2.109631201586854e+00,1.173995549769082e+00,-7.965637895087934e-01,5.908744486088131e-01,-4.579881495338793e-01,3.604300948910104e-01,-2.791520327010372e-01,1.991715104271916e-01,-7.878820929238614e-02}, +{ 2.812162010788254e-01,-7.251335966914254e-01,1.069936089659600e+00,-1.523226389944399e+00,2.323927856049697e+00,-4.459134042062350e+00,-1.028348868150819e+00,5.713560007929256e+00,-2.653874831294873e+00,1.704048022637129e+00,-1.232853661823211e+00,9.428004758223701e-01,-7.361312200168861e-01,5.674470658963316e-01,-4.038030705580689e-01,1.595699614688230e-01}, +{ -1.945105094232935e-01,4.993050989608390e-01,-7.279167584243778e-01,1.011391583736148e+00,-1.466220239071208e+00,2.451076198496964e+00,-6.569718706848488e+00,3.435320994161399e+00,2.451126720767181e+00,-1.492548726216035e+00,1.047601506026020e+00,-7.872148387029113e-01,6.081323883804798e-01,-4.657359309685574e-01,3.302070333094512e-01,-1.302958141836088e-01}, +{ 1.309204101562648e-02,-3.440237394195318e-02,5.303203820658269e-02,-8.059277838611247e-02,1.322462961906438e-01,-2.557458139019242e-01,7.021017873809843e-01,-6.277470788190000e+00,6.277470788189990e+00,-7.021017873809653e-01,2.557458139019098e-01,-1.322462961906341e-01,8.059277838610622e-02,-5.303203820657881e-02,3.440237394194969e-02,-1.309204101562382e-02}, +{ 1.302958141836131e-01,-3.302070333094524e-01,4.657359309685537e-01,-6.081323883804777e-01,7.872148387029125e-01,-1.047601506026024e+00,1.492548726216040e+00,-2.451126720767197e+00,-3.435320994161384e+00,6.569718706848485e+00,-2.451076198496964e+00,1.466220239071211e+00,-1.011391583736157e+00,7.279167584243892e-01,-4.993050989608400e-01,1.945105094232906e-01}, +{ -1.595699614688280e-01,4.038030705580686e-01,-5.674470658963224e-01,7.361312200168774e-01,-9.428004758223675e-01,1.232853661823214e+00,-1.704048022637127e+00,2.653874831294869e+00,-5.713560007929269e+00,1.028348868150843e+00,4.459134042062338e+00,-2.323927856049700e+00,1.523226389944410e+00,-1.069936089659614e+00,7.251335966914255e-01,-2.812162010788181e-01}, +{ 7.878820929238767e-02,-1.991715104271922e-01,2.791520327010333e-01,-3.604300948910065e-01,4.579881495338778e-01,-5.908744486088124e-01,7.965637895087925e-01,-1.173995549769079e+00,2.109631201586854e+00,-7.451551177721113e+00,5.445321539506986e+00,1.065867531237623e+00,-7.852225641663676e-01,5.620306221497208e-01,-3.826612233241951e-01,1.485634933904901e-01}, +{ 4.612356638783660e-02,-1.159830311358444e-01,1.604407822281095e-01,-2.024934745272067e-01,2.481770852043930e-01,-3.025481099644842e-01,3.719439529716327e-01,-4.654105307688874e-01,5.913018015435394e-01,-6.638354779994188e-01,-6.366617252975113e+00,8.053303187385740e+00,-2.005418564286258e+00,1.042352959783986e+00,-6.220467455683326e-01,2.307098517203092e-01}, +{ -1.318670398763920e-01,3.317418964514013e-01,-4.594185698897433e-01,5.810134413843098e-01,-7.145337551523459e-01,8.761858653699871e-01,-1.088757815365576e+00,1.393152208559026e+00,-1.878427157177963e+00,2.791381035000643e+00,-5.179141689802544e+00,-2.088486162426637e+00,7.697575223931095e+00,-3.336823165592036e+00,1.903441466412652e+00,-6.970357818258787e-01}, +{ 1.243931356835285e-01,-3.127412978210666e-01,4.324312306759687e-01,-5.454340204816763e-01,6.680440596474476e-01,-8.141862375146344e-01,1.002411166083858e+00,-1.264160777739655e+00,1.663033537512831e+00,-2.356174293023939e+00,3.875541645427671e+00,-9.868221069896341e+00,4.551919056821366e+00,4.324564287241880e+00,-2.309876171901664e+00,8.284557492844266e-01}, +{ -2.571153065760892e-02,6.468876130638049e-02,-8.960168873880994e-02,1.133423450719909e-01,-1.394099219864334e-01,1.709147260606809e-01,-2.121269355903413e-01,2.704082496167866e-01,-3.606901068656395e-01,5.190464633616876e-01,-8.572600406376224e-01,1.913791821548216e+00,-1.234679553183427e+01,1.165061939040405e+01,-8.192468318454961e-01,1.480308307864339e-01}, +{ -1.096754193521554e-01,2.752236590819292e-01,-3.788177538890331e-01,4.741316315068325e-01,-5.739402795627871e-01,6.875660907318495e-01,-8.254328004274198e-01,1.002253105514206e+00,-1.241937308784010e+00,1.588007790635353e+00,-2.130083715613259e+00,3.084901345283590e+00,-5.084843331636584e+00,-9.665293433773069e+00,1.647212366819184e+01,-3.574183247907286e+00}, +{ 2.038796718815412e-01,-5.115528970736296e-01,7.038650789572884e-01,-8.804691857406101e-01,1.064920071908673e+00,-1.274209697400298e+00,1.527086104676582e+00,-1.849654935689305e+00,2.283718161778186e+00,-2.904022988136695e+00,3.861131646583412e+00,-5.511246706157465e+00,8.947069514075402e+00,-1.995591300756946e+01,2.951352259973410e+00,1.134404690793297e+01}, +{ -1.910389043439509e-01,4.792659099882051e-01,-6.592111505261153e-01,8.241310629795315e-01,-9.959083933627181e-01,1.190133546525269e+00,-1.423753168895996e+00,1.719990678693704e+00,-2.115347238295113e+00,2.673502075098380e+00,-3.518127878878305e+00,4.924725169016435e+00,-7.646572199799055e+00,1.465210102189393e+01,-5.634007666814555e+01,4.642618613805135e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==22 +const dfloat c_DI[22][16] = { +{ -4.750629152128783e+01,5.829198097443671e+01,-1.601356047819493e+01,8.453834849188191e+00,-5.468152361930896e+00,3.914442655434369e+00,-2.978093134393621e+00,2.357999126544919e+00,-1.918162640865318e+00,1.588282163634828e+00,-1.327949429598216e+00,1.111400230455193e+00,-9.197986976012951e-01,7.357857733879800e-01,-5.349602315956045e-01,2.132427223855058e-01}, +{ -1.373700906311712e+01,6.651752968930520e-01,1.857216836783839e+01,-8.750781641132262e+00,5.467459579508163e+00,-3.854255762170527e+00,2.908373931424052e+00,-2.291592617870262e+00,1.858337842925235e+00,-1.535530715547092e+00,1.281991106753611e+00,-1.071853006620959e+00,8.864477337771137e-01,-7.087757504290479e-01,5.151787718285394e-01,-2.053340740608902e-01}, +{ 3.249141432628909e+00,-1.771580993937812e+01,1.295856777526414e+01,2.575195431706872e+00,-1.821154244265210e+00,1.317866709347612e+00,-1.004025319726076e+00,7.946986392238765e-01,-6.460597232770486e-01,5.346387830858448e-01,-4.467928381936314e-01,3.737957275091918e-01,-3.092700256012107e-01,2.473511866093829e-01,-1.798182323483209e-01,7.167463741378735e-02}, +{ 6.214820096877107e-01,-1.556015061348971e+00,-1.003349437628485e+01,1.343140167265005e+01,-3.697748457669817e+00,2.025019671511611e+00,-1.358449480364471e+00,1.002579539452435e+00,-7.809867889835773e-01,6.285745736688014e-01,-5.154972910448312e-01,4.257100317442682e-01,-3.490923049178198e-01,2.775548648487384e-01,-2.010681967747653e-01,8.002959382548294e-02}, +{ -1.189627017869405e+00,3.419074529060922e+00,-7.286516080990590e+00,-8.868258083400694e-01,8.429193040711514e+00,-3.993509005919083e+00,2.566163176691943e+00,-1.858060540872867e+00,1.432581747789807e+00,-1.145914766376252e+00,9.360609086276009e-01,-7.709937446367694e-01,6.311225651571510e-01,-5.012169146666879e-01,3.628513100359466e-01,-1.443833984031603e-01}, +{ 5.705360607674613e-01,-1.593138182228609e+00,3.009729840472025e+00,-9.270600643288615e+00,5.852036091221350e+00,2.328066433109590e+00,-1.521391411687802e+00,1.093939835353020e+00,-8.381281677329999e-01,6.673538394182266e-01,-5.433787900963086e-01,4.465388607671231e-01,-3.649503188197789e-01,2.895255917281732e-01,-2.094673907795152e-01,8.332835179665889e-02}, +{ 1.040470595921389e-01,-2.634069652001493e-01,3.580275172107537e-01,-2.848198876808032e-01,-6.899162112506930e+00,8.235025072632801e+00,-1.833053035835521e+00,9.458596972238263e-01,-6.189114324750629e-01,4.506515794321379e-01,-3.470997790430030e-01,2.750879012754777e-01,-2.194964565387404e-01,1.714557014311035e-01,-1.229268680991682e-01,4.872200858113881e-02}, +{ -3.781397592611881e-01,9.889014329713944e-01,-1.516920460607276e+00,2.349562445729001e+00,-4.413314453067410e+00,-1.707684941755162e+00,6.496038107543424e+00,-2.903650702695703e+00,1.842855802928232e+00,-1.328877606475732e+00,1.019775023421894e+00,-8.069958668211826e-01,6.435130669888397e-01,-5.025428722535645e-01,3.602697903204538e-01,-1.427890069660212e-01}, +{ 2.615912666886136e-01,-6.798599155444723e-01,1.024771744238258e+00,-1.525478526419184e+00,2.573779576123065e+00,-6.910802625037944e+00,3.607927250418993e+00,2.586197146955519e+00,-1.573900756855273e+00,1.106840381392632e+00,-8.369823666081769e-01,6.563096080807689e-01,-5.202959716868061e-01,4.048179268012973e-01,-2.895932611443444e-01,1.146785225970545e-01}, +{ 6.727827398978998e-03,-1.517563913109850e-02,1.401183613459313e-02,3.057937906878101e-03,-7.148078999724798e-02,4.101131129666088e-01,-6.212329743946015e+00,6.587145582837013e+00,-1.007425080400067e+00,4.460984566459101e-01,-2.671309909598892e-01,1.827894364759131e-01,-1.331732737389689e-01,9.832807472077069e-02,-6.826874397358666e-02,2.671199706020770e-02}, +{ -1.854501354706611e-01,4.729827954510096e-01,-6.780298526598018e-01,9.121217909191860e-01,-1.243799170945015e+00,1.821785315357091e+00,-3.192427161315713e+00,-2.481574730009446e+00,6.191805291968078e+00,-2.554393609313824e+00,1.579546804546835e+00,-1.117315818557913e+00,8.347537611032912e-01,-6.270317885731830e-01,4.398724667372624e-01,-1.728459592371971e-01}, +{ 1.728459592372004e-01,-4.398724667372589e-01,6.270317885731687e-01,-8.347537611032770e-01,1.117315818557903e+00,-1.579546804546828e+00,2.554393609313812e+00,-6.191805291968100e+00,2.481574730009516e+00,3.192427161315655e+00,-1.821785315357065e+00,1.243799170945002e+00,-9.121217909191823e-01,6.780298526598033e-01,-4.729827954510039e-01,1.854501354706546e-01}, +{ -2.671199706020687e-02,6.826874397358172e-02,-9.832807472076166e-02,1.331732737389593e-01,-1.827894364759047e-01,2.671309909598808e-01,-4.460984566458949e-01,1.007425080400040e+00,-6.587145582837003e+00,6.212329743946036e+00,-4.101131129666308e-01,7.148078999726153e-02,-3.057937906888166e-03,-1.401183613458531e-02,1.517563913109294e-02,-6.727827398977382e-03}, +{ -1.146785225970603e-01,2.895932611443447e-01,-4.048179268012919e-01,5.202959716868001e-01,-6.563096080807668e-01,8.369823666081782e-01,-1.106840381392632e+00,1.573900756855275e+00,-2.586197146955530e+00,-3.607927250418972e+00,6.910802625037936e+00,-2.573779576123074e+00,1.525478526419200e+00,-1.024771744238275e+00,6.798599155444746e-01,-2.615912666886074e-01}, +{ 1.427890069660252e-01,-3.602697903204546e-01,5.025428722535591e-01,-6.435130669888350e-01,8.069958668211824e-01,-1.019775023421898e+00,1.328877606475733e+00,-1.842855802928230e+00,2.903650702695708e+00,-6.496038107543412e+00,1.707684941755125e+00,4.413314453067446e+00,-2.349562445729033e+00,1.516920460607304e+00,-9.889014329713994e-01,3.781397592611799e-01}, +{ -4.872200858113827e-02,1.229268680991704e-01,-1.714557014311049e-01,2.194964565387447e-01,-2.750879012754830e-01,3.470997790430085e-01,-4.506515794321457e-01,6.189114324750747e-01,-9.458596972238407e-01,1.833053035835540e+00,-8.235025072632803e+00,6.899162112506903e+00,2.848198876808323e-01,-3.580275172107718e-01,2.634069652001582e-01,-1.040470595921419e-01}, +{ -8.332835179666020e-02,2.094673907795123e-01,-2.895255917281625e-01,3.649503188197681e-01,-4.465388607671167e-01,5.433787900963022e-01,-6.673538394182180e-01,8.381281677329881e-01,-1.093939835353007e+00,1.521391411687785e+00,-2.328066433109559e+00,-5.852036091221398e+00,9.270600643288661e+00,-3.009729840472054e+00,1.593138182228602e+00,-5.705360607674445e-01}, +{ 1.443833984031608e-01,-3.628513100359426e-01,5.012169146666751e-01,-6.311225651571412e-01,7.709937446367626e-01,-9.360609086275911e-01,1.145914766376242e+00,-1.432581747789797e+00,1.858060540872853e+00,-2.566163176691914e+00,3.993509005919036e+00,-8.429193040711429e+00,8.868258083399216e-01,7.286516080990709e+00,-3.419074529060919e+00,1.189627017869373e+00}, +{ -8.002959382548602e-02,2.010681967747640e-01,-2.775548648487315e-01,3.490923049178141e-01,-4.257100317442660e-01,5.154972910448326e-01,-6.285745736687999e-01,7.809867889835750e-01,-1.002579539452430e+00,1.358449480364455e+00,-2.025019671511589e+00,3.697748457669806e+00,-1.343140167265009e+01,1.003349437628486e+01,1.556015061348984e+00,-6.214820096877024e-01}, +{ -7.167463741378954e-02,1.798182323483208e-01,-2.473511866093778e-01,3.092700256012058e-01,-3.737957275091885e-01,4.467928381936258e-01,-5.346387830858416e-01,6.460597232770531e-01,-7.946986392238836e-01,1.004025319726075e+00,-1.317866709347614e+00,1.821154244265219e+00,-2.575195431706905e+00,-1.295856777526415e+01,1.771580993937802e+01,-3.249141432628768e+00}, +{ 2.053340740608993e-01,-5.151787718285390e-01,7.087757504290366e-01,-8.864477337771075e-01,1.071853006620957e+00,-1.281991106753617e+00,1.535530715547097e+00,-1.858337842925235e+00,2.291592617870260e+00,-2.908373931424046e+00,3.854255762170514e+00,-5.467459579508170e+00,8.750781641132333e+00,-1.857216836783841e+01,-6.651752968930910e-01,1.373700906311712e+01}, +{ -2.132427223855027e-01,5.349602315956010e-01,-7.357857733879682e-01,9.197986976012833e-01,-1.111400230455179e+00,1.327949429598187e+00,-1.588282163634779e+00,1.918162640865261e+00,-2.357999126544855e+00,2.978093134393531e+00,-3.914442655434231e+00,5.468152361930740e+00,-8.453834849188054e+00,1.601356047819480e+01,-5.829198097443540e+01,4.750629152128657e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==23 +const dfloat c_DI[23][16] = { +{ -4.846638105279437e+01,6.003084850287078e+01,-1.723300151530248e+01,9.181122612244094e+00,-5.958847942527493e+00,4.272691824029951e+00,-3.253595912619826e+00,2.577559249360942e+00,-2.097518882257087e+00,1.737213986577915e+00,-1.452714652428643e+00,1.215963737719848e+00,-1.006418653668518e+00,8.051209886649434e-01,-5.853902961162201e-01,2.333480062461813e-01}, +{ -1.605238864431841e+01,4.265021586575806e+00,1.700819771405970e+01,-8.353414403134371e+00,5.283336048323570e+00,-3.744539159387530e+00,2.833673691884459e+00,-2.236532718685777e+00,1.815659108542296e+00,-1.501358624616695e+00,1.254091848460521e+00,-1.048895612631146e+00,8.676730632524450e-01,-6.938769103014697e-01,5.043983459876565e-01,-2.010453340110590e-01}, +{ 2.637677128763398e+00,-1.830232855235474e+01,1.569965013885234e+01,2.569646664550430e-01,-5.551088948644902e-01,4.782945596626492e-01,-3.907955186900807e-01,3.207733368780888e-01,-2.664518965053069e-01,2.235545383526881e-01,-1.885508053114019e-01,1.587425763142520e-01,-1.319070043723037e-01,1.057982502445932e-01,-7.704239720126463e-02,3.072987377653422e-02}, +{ 1.411835808308166e+00,-4.190803066914072e+00,-7.664906215765576e+00,1.357082480871975e+01,-4.832029488479124e+00,2.839402450472626e+00,-1.965745057171975e+00,1.475900643661716e+00,-1.161739683592539e+00,9.413693440905303e-01,-7.755593455279692e-01,6.424958687820408e-01,-5.280025317415707e-01,4.204033868580723e-01,-3.048104511581527e-01,1.213635294580774e-01}, +{ -1.340387963904575e+00,3.969462821747573e+00,-9.705160240172937e+00,2.852496147582916e+00,6.294241597010319e+00,-3.393971376247439e+00,2.274302324719677e+00,-1.679910421176429e+00,1.309894475890991e+00,-1.055130948106028e+00,8.658697694812925e-01,-7.153969795706866e-01,5.868449459698001e-01,-4.666965182887728e-01,3.381359330531412e-01,-1.345935679888418e-01}, +{ 2.473044703272215e-01,-7.307601261513355e-01,1.650421459497935e+00,-9.384708029913075e+00,8.527607551551766e+00,-2.997518180275371e-01,-6.336323057041220e-02,1.099115206780505e-01,-1.086639179057750e-01,9.780745666625504e-02,-8.542449663728539e-02,7.332856664450303e-02,-6.163248033278741e-02,4.977016373568287e-02,-3.637914307660668e-02,1.453205351340103e-02}, +{ 4.362375614930979e-01,-1.159881436912833e+00,1.865228407046546e+00,-3.213417872352757e+00,-4.144608355263758e+00,8.211566100183514e+00,-3.116659055859129e+00,1.882973990287772e+00,-1.327434774209282e+00,1.008190436037901e+00,-7.971402169549536e-01,6.426471774867911e-01,-5.186100178648804e-01,4.080732597369349e-01,-2.938247618287591e-01,1.166595589737954e-01}, +{ -4.238902918347052e-01,1.119941347636505e+00,-1.769314795137905e+00,2.943103299375164e+00,-6.991906474962347e+00,2.469102188618483e+00,4.008646501441675e+00,-2.240626418838461e+00,1.529570396523685e+00,-1.142333291676789e+00,8.942551086059152e-01,-7.164189314718553e-01,5.757900719835702e-01,-4.518904297130701e-01,3.248847391415909e-01,-1.289130196914561e-01}, +{ 5.616025976462307e-02,-1.506193149406351e-01,2.465602637090753e-01,-4.313867518014255e-01,1.008624928270969e+00,-6.964050941975573e+00,6.647607048298042e+00,-5.050355909265393e-01,1.179504995352432e-01,-3.109481151958141e-02,3.130721096601450e-03,6.949199433804179e-03,-1.013747047095063e-02,1.012051953335724e-02,-8.151053772380424e-03,3.372495765369469e-03}, +{ 2.235676073081188e-01,-5.746509342303783e-01,8.406747406880977e-01,-1.176022829852414e+00,1.727398230940265e+00,-2.963062359598839e+00,-2.984547336522756e+00,6.562332895615552e+00,-2.602268069790626e+00,1.593806655878556e+00,-1.128007998060533e+00,8.521611049237372e-01,-6.605126373223046e-01,5.069021151230472e-01,-3.598183604437531e-01,1.420471753442320e-01}, +{ -2.141278351544141e-01,5.489641993848257e-01,-7.976558353286339e-01,1.101187225161659e+00,-1.576806196045498e+00,2.564570255054207e+00,-6.244783729989273e+00,2.531450397814697e+00,3.187651562502723e+00,-1.826201535569508e+00,1.254684204299819e+00,-9.329977272749261e-01,7.165281747050649e-01,-5.468715950493448e-01,3.870009376772300e-01,-1.525925021886282e-01}, +{ 1.309204101562595e-02,-3.440237394195206e-02,5.303203820658111e-02,-8.059277838611044e-02,1.322462961906410e-01,-2.557458139019202e-01,7.021017873809787e-01,-6.277470788189999e+00,6.277470788189994e+00,-7.021017873809720e-01,2.557458139019140e-01,-1.322462961906368e-01,8.059277838610875e-02,-5.303203820658049e-02,3.440237394195084e-02,-1.309204101562417e-02}, +{ 1.525925021886312e-01,-3.870009376772300e-01,5.468715950493372e-01,-7.165281747050580e-01,9.329977272749230e-01,-1.254684204299820e+00,1.826201535569509e+00,-3.187651562502733e+00,-2.531450397814685e+00,6.244783729989269e+00,-2.564570255054204e+00,1.576806196045501e+00,-1.101187225161670e+00,7.976558353286467e-01,-5.489641993848264e-01,2.141278351544080e-01}, +{ -1.420471753442365e-01,3.598183604437499e-01,-5.069021151230343e-01,6.605126373222905e-01,-8.521611049237273e-01,1.128007998060528e+00,-1.593806655878546e+00,2.602268069790613e+00,-6.562332895615577e+00,2.984547336522824e+00,2.963062359598787e+00,-1.727398230940247e+00,1.176022829852411e+00,-8.406747406880997e-01,5.746509342303717e-01,-2.235676073081090e-01}, +{ -3.372495765368792e-03,8.151053772382134e-03,-1.012051953336005e-02,1.013747047095344e-02,-6.949199433806995e-03,-3.130721096595335e-03,3.109481151957324e-02,-1.179504995352311e-01,5.050355909265185e-01,-6.647607048298022e+00,6.964050941975575e+00,-1.008624928270989e+00,4.313867518014408e-01,-2.465602637090869e-01,1.506193149406416e-01,-5.616025976462381e-02}, +{ 1.289130196914605e-01,-3.248847391415907e-01,4.518904297130636e-01,-5.757900719835634e-01,7.164189314718520e-01,-8.942551086059153e-01,1.142333291676788e+00,-1.529570396523683e+00,2.240626418838465e+00,-4.008646501441679e+00,-2.469102188618481e+00,6.991906474962356e+00,-2.943103299375190e+00,1.769314795137933e+00,-1.119941347636507e+00,4.238902918346932e-01}, +{ -1.166595589737973e-01,2.938247618287577e-01,-4.080732597369285e-01,5.186100178648754e-01,-6.426471774867856e-01,7.971402169549445e-01,-1.008190436037893e+00,1.327434774209277e+00,-1.882973990287766e+00,3.116659055859114e+00,-8.211566100183509e+00,4.144608355263789e+00,3.213417872352735e+00,-1.865228407046551e+00,1.159881436912821e+00,-4.362375614930829e-01}, +{ -1.453205351340078e-02,3.637914307660627e-02,-4.977016373568135e-02,6.163248033278466e-02,-7.332856664450021e-02,8.542449663728344e-02,-9.780745666625480e-02,1.086639179057728e-01,-1.099115206780517e-01,6.336323057041689e-02,2.997518180275325e-01,-8.527607551551769e+00,9.384708029913092e+00,-1.650421459497953e+00,7.307601261513343e-01,-2.473044703272118e-01}, +{ 1.345935679888446e-01,-3.381359330531370e-01,4.666965182887619e-01,-5.868449459697891e-01,7.153969795706757e-01,-8.658697694812804e-01,1.055130948106014e+00,-1.309894475890979e+00,1.679910421176418e+00,-2.274302324719654e+00,3.393971376247405e+00,-6.294241597010279e+00,-2.852496147582989e+00,9.705160240173006e+00,-3.969462821747544e+00,1.340387963904527e+00}, +{ -1.213635294580773e-01,3.048104511581510e-01,-4.204033868580613e-01,5.280025317415593e-01,-6.424958687820346e-01,7.755593455279666e-01,-9.413693440905250e-01,1.161739683592530e+00,-1.475900643661702e+00,1.965745057171953e+00,-2.839402450472599e+00,4.832029488479104e+00,-1.357082480871976e+01,7.664906215765554e+00,4.190803066914078e+00,-1.411835808308134e+00}, +{ -3.072987377652936e-02,7.704239720125661e-02,-1.057982502445818e-01,1.319070043722918e-01,-1.587425763142387e-01,1.885508053113832e-01,-2.235545383526691e-01,2.664518965052886e-01,-3.207733368780654e-01,3.907955186900391e-01,-4.782945596625890e-01,5.551088948643997e-01,-2.569646664548789e-01,-1.569965013885259e+01,1.830232855235470e+01,-2.637677128763221e+00}, +{ 2.010453340110565e-01,-5.043983459876549e-01,6.938769103014576e-01,-8.676730632524332e-01,1.048895612631142e+00,-1.254091848460531e+00,1.501358624616701e+00,-1.815659108542293e+00,2.236532718685771e+00,-2.833673691884447e+00,3.744539159387519e+00,-5.283336048323582e+00,8.353414403134440e+00,-1.700819771405981e+01,-4.265021586575518e+00,1.605238864431818e+01}, +{ -2.333480062462086e-01,5.853902961162785e-01,-8.051209886650148e-01,1.006418653668606e+00,-1.215963737719945e+00,1.452714652428742e+00,-1.737213986578014e+00,2.097518882257197e+00,-2.577559249361080e+00,3.253595912620022e+00,-4.272691824030225e+00,5.958847942527919e+00,-9.181122612244858e+00,1.723300151530390e+01,-6.003084850287162e+01,4.846638105279431e+01} +}; +#endif +#if p_Nq==16 && p_cubNq==24 +const dfloat c_DI[24][16] = { +{ -4.932297577036705e+01,6.158524936700893e+01,-1.832816480480584e+01,9.837561643005804e+00,-6.402602792037315e+00,4.596975464697133e+00,-3.503109176475153e+00,2.776470861643689e+00,-2.260041365326263e+00,1.872186589676025e+00,-1.565796839849710e+00,1.310742424178767e+00,-1.084936668551877e+00,8.679729913967740e-01,-6.311057545267954e-01,2.515738303328761e-01}, +{ -1.827305060035444e+01,7.795102336103540e+00,1.533339114675282e+01,-7.808753942800936e+00,4.993053304294982e+00,-3.555974492312054e+00,2.697938178871181e+00,-2.132680131779717e+00,1.733055228331521e+00,-1.434000940807768e+00,1.198374469080050e+00,-1.002614806096130e+00,8.295721629906185e-01,-6.635056438087629e-01,4.823631147027772e-01,-1.922693831676894e-01}, +{ 1.780495761169578e+00,-1.829607953186667e+01,1.788808946914782e+01,-1.803516928309694e+00,6.389020565633587e-01,-3.320107743386733e-01,2.079823387570166e-01,-1.451010982699245e-01,1.082840581301265e-01,-8.439669038702867e-02,6.758040191473189e-02,-5.483703068931301e-02,4.440295780465528e-02,-3.500038054902298e-02,2.522302557788156e-02,-1.001763465484868e-02}, +{ 2.143944463968757e+00,-6.872561924049285e+00,-4.811482150046132e+00,1.295832768103859e+01,-5.381393935879230e+00,3.299465699964755e+00,-2.327837505490673e+00,1.765781800049552e+00,-1.398578855339715e+00,1.137853789084123e+00,-9.399861150728001e-01,7.801695626493553e-01,-6.419661412047828e-01,5.115771202242134e-01,-3.711023297255724e-01,1.477888398288408e-01}, +{ -1.265359799180592e+00,3.885265820016934e+00,-1.135416091024808e+01,6.262079471864984e+00,3.886335437229004e+00,-2.365723237843491e+00,1.652303402237509e+00,-1.245252723456285e+00,9.821728078073436e-01,-7.968376106740140e-01,6.570033400745767e-01,-5.445680014545059e-01,4.476845435571939e-01,-3.565357562653759e-01,2.585390154711265e-01,-1.029457991363261e-01}, +{ -1.820478689001739e-01,4.533435640813080e-01,-4.624096780839216e-01,-8.179812768546746e+00,9.932444599962539e+00,-2.300483225082301e+00,1.200920935148047e+00,-7.897875366310894e-01,5.770966873327171e-01,-4.465654904387855e-01,3.569576674893041e-01,-2.897422369793091e-01,2.348423279203395e-01,-1.852983309271037e-01,1.336325479231736e-01,-5.309119426799808e-02}, +{ 6.503869378215399e-01,-1.756631145507497e+00,2.970430125117112e+00,-6.031096932067341e+00,-5.934204420914686e-01,6.775005182111941e+00,-3.247277311505012e+00,2.102672414700723e+00,-1.530841934146733e+00,1.183817876511260e+00,-9.464547577185318e-01,7.685392317663386e-01,-6.231570246544514e-01,4.918401524675852e-01,-3.547727951895343e-01,1.409604223840704e-01}, +{ -2.747620600834438e-01,7.371028735162616e-01,-1.217419754453074e+00,2.261847596008557e+00,-8.045456444220074e+00,5.880975247523402e+00,1.155492159293659e+00,-8.649442142766717e-01,6.504734806251371e-01,-5.092259323370860e-01,4.095212452330433e-01,-3.336055769859494e-01,2.710045080307155e-01,-2.141330544625458e-01,1.545524305130249e-01,-6.142250392495795e-02}, +{ -2.050858082789221e-01,5.308433204725009e-01,-7.907553223354092e-01,1.143200481321978e+00,-1.741435786153253e+00,-4.803853156164612e+00,7.440870926719458e+00,-2.413518794205159e+00,1.396613068688985e+00,-9.650182714339079e-01,7.231653094368045e-01,-5.640389237416672e-01,4.456576038275908e-01,-3.460248609040721e-01,2.472401399874931e-01,-9.785992723780605e-02}, +{ 3.002264087320662e-01,-7.769106188446542e-01,1.157403311287179e+00,-1.680941466630850e+00,2.680629457084987e+00,-5.919284486800972e+00,1.339281439016475e+00,4.295264337690024e+00,-2.288796147991624e+00,1.536815775201067e+00,-1.135961512142186e+00,8.792537358969774e-01,-6.915423343769856e-01,5.354534961297431e-01,-3.819949350113669e-01,1.511035407601203e-01}, +{ -5.966950439063021e-02,1.551217918210526e-01,-2.336680012538624e-01,3.453452897647301e-01,-5.609459628378541e-01,1.186492242931020e+00,-6.658048742078154e+00,6.032519997170471e+00,-1.901927574960752e-01,-6.274786580967931e-02,9.238859290515331e-02,-8.808192504659532e-02,7.633765504591838e-02,-6.222671038493466e-02,4.560083746917658e-02,-1.822493780973736e-02}, +{ -1.734830127406635e-01,4.422446633860398e-01,-6.331616876978945e-01,8.497106533000411e-01,-1.153512329773492e+00,1.673901896225890e+00,-2.855978114202656e+00,-2.898412953663957e+00,6.348388723174404e+00,-2.512959766293864e+00,1.535397498465571e+00,-1.080132852119699e+00,8.045730668308084e-01,-6.033309483381089e-01,4.228528596174491e-01,-1.660976961698706e-01}, +{ 1.660976961698723e-01,-4.228528596174456e-01,6.033309483380952e-01,-8.045730668307945e-01,1.080132852119689e+00,-1.535397498465563e+00,2.512959766293852e+00,-6.348388723174415e+00,2.898412953664004e+00,2.855978114202617e+00,-1.673901896225872e+00,1.153512329773484e+00,-8.497106533000409e-01,6.331616876978983e-01,-4.422446633860360e-01,1.734830127406569e-01}, +{ 1.822493780973586e-02,-4.560083746917794e-02,6.222671038493648e-02,-7.633765504592127e-02,8.808192504659852e-02,-9.238859290515784e-02,6.274786580968926e-02,1.901927574960562e-01,-6.032519997170461e+00,6.658048742078167e+00,-1.186492242931035e+00,5.609459628378659e-01,-3.453452897647409e-01,2.336680012538724e-01,-1.551217918210581e-01,5.966950439063062e-02}, +{ -1.511035407601226e-01,3.819949350113657e-01,-5.354534961297334e-01,6.915423343769757e-01,-8.792537358969722e-01,1.135961512142184e+00,-1.536815775201062e+00,2.288796147991618e+00,-4.295264337690016e+00,-1.339281439016481e+00,5.919284486800974e+00,-2.680629457084994e+00,1.680941466630866e+00,-1.157403311287195e+00,7.769106188446543e-01,-3.002264087320604e-01}, +{ 9.785992723780568e-02,-2.472401399874930e-01,3.460248609040679e-01,-4.456576038275888e-01,5.640389237416689e-01,-7.231653094368072e-01,9.650182714339092e-01,-1.396613068688985e+00,2.413518794205165e+00,-7.440870926719459e+00,4.803853156164608e+00,1.741435786153255e+00,-1.143200481321987e+00,7.907553223354212e-01,-5.308433204725010e-01,2.050858082789201e-01}, +{ 6.142250392496179e-02,-1.545524305130252e-01,2.141330544625430e-01,-2.710045080307123e-01,3.336055769859498e-01,-4.095212452330500e-01,5.092259323370872e-01,-6.504734806251411e-01,8.649442142766862e-01,-1.155492159293675e+00,-5.880975247523385e+00,8.045456444220077e+00,-2.261847596008586e+00,1.217419754453099e+00,-7.371028735162665e-01,2.747620600834365e-01}, +{ -1.409604223840732e-01,3.547727951895334e-01,-4.918401524675765e-01,6.231570246544433e-01,-7.685392317663349e-01,9.464547577185320e-01,-1.183817876511257e+00,1.530841934146724e+00,-2.102672414700720e+00,3.247277311505008e+00,-6.775005182111941e+00,5.934204420914784e-01,6.031096932067356e+00,-2.970430125117144e+00,1.756631145507492e+00,-6.503869378215210e-01}, +{ 5.309119426800024e-02,-1.336325479231707e-01,1.852983309270961e-01,-2.348423279203316e-01,2.897422369793012e-01,-3.569576674892940e-01,4.465654904387762e-01,-5.770966873327108e-01,7.897875366310748e-01,-1.200920935148020e+00,2.300483225082258e+00,-9.932444599962514e+00,8.179812768546771e+00,4.624096780838876e-01,-4.533435640812848e-01,1.820478689001598e-01}, +{ 1.029457991363265e-01,-2.585390154711235e-01,3.565357562653660e-01,-4.476845435571852e-01,5.445680014544992e-01,-6.570033400745670e-01,7.968376106740021e-01,-9.821728078073332e-01,1.245252723456275e+00,-1.652303402237492e+00,2.365723237843464e+00,-3.886335437228963e+00,-6.262079471865070e+00,1.135416091024816e+01,-3.885265820016901e+00,1.265359799180546e+00}, +{ -1.477888398288449e-01,3.711023297255686e-01,-5.115771202241993e-01,6.419661412047697e-01,-7.801695626493478e-01,9.399861150727951e-01,-1.137853789084115e+00,1.398578855339707e+00,-1.765781800049536e+00,2.327837505490645e+00,-3.299465699964722e+00,5.381393935879195e+00,-1.295832768103856e+01,4.811482150046022e+00,6.872561924049323e+00,-2.143944463968702e+00}, +{ 1.001763465485839e-02,-2.522302557790214e-02,3.500038054904954e-02,-4.440295780468932e-02,5.483703068935165e-02,-6.758040191477685e-02,8.439669038707678e-02,-1.082840581301750e-01,1.451010982699805e-01,-2.079823387570974e-01,3.320107743387811e-01,-6.389020565635253e-01,1.803516928310008e+00,-1.788808946914821e+01,1.829607953186660e+01,-1.780495761169328e+00}, +{ 1.922693831676874e-01,-4.823631147027658e-01,6.635056438087390e-01,-8.295721629905947e-01,1.002614806096108e+00,-1.198374469080025e+00,1.434000940807740e+00,-1.733055228331486e+00,2.132680131779669e+00,-2.697938178871118e+00,3.555974492311966e+00,-4.993053304294873e+00,7.808753942800798e+00,-1.533339114675229e+01,-7.795102336104524e+00,1.827305060035497e+01}, +{ -2.515738303329087e-01,6.311057545268572e-01,-8.679729913968376e-01,1.084936668551943e+00,-1.310742424178839e+00,1.565796839849791e+00,-1.872186589676114e+00,2.260041365326366e+00,-2.776470861643825e+00,3.503109176475339e+00,-4.596975464697374e+00,6.402602792037674e+00,-9.837561643006426e+00,1.832816480480697e+01,-6.158524936700932e+01,4.932297577036670e+01} +}; +#endif diff --git a/okl/mesh/constantDifferentiationMatrices.h b/okl/mesh/constantDifferentiationMatrices.h new file mode 100644 index 000000000..ab0a8b4e1 --- /dev/null +++ b/okl/mesh/constantDifferentiationMatrices.h @@ -0,0 +1,261 @@ +#if p_cubNq==2 +const dfloat c_D[2][2] = { +{ -8.660254037844388e-01,8.660254037844388e-01}, +{ -8.660254037844388e-01,8.660254037844388e-01} +}; +#endif +#if p_cubNq==3 +const dfloat c_D[3][3] = { +{ -1.936491673103709e+00,2.581988897471611e+00,-6.454972243679026e-01}, +{ -6.454972243679031e-01,4.525071358092163e-16,6.454972243679027e-01}, +{ 6.454972243679027e-01,-2.581988897471611e+00,1.936491673103708e+00} +}; +#endif +#if p_cubNq==4 +const dfloat c_D[4][4] = { +{ -3.332000236352281e+00,4.860154415685196e+00,-2.108782348495180e+00,5.806281691622645e-01}, +{ -7.575576147992333e-01,-3.844143922232097e-01,1.470670231280717e+00,-3.286982242582740e-01}, +{ 3.286982242582739e-01,-1.470670231280717e+00,3.844143922232095e-01,7.575576147992334e-01}, +{ -5.806281691622641e-01,2.108782348495180e+00,-4.860154415685196e+00,3.332000236352281e+00} +}; +#endif +#if p_cubNq==5 +const dfloat c_D[5][5] = { +{ -5.067040595654539e+00,7.701952085172241e+00,-4.043543754387663e+00,1.960399115833278e+00,-5.517668509633162e-01}, +{ -9.602560236319582e-01,-7.583532171678755e-01,2.402750652164300e+00,-9.285580266438336e-01,2.444166152793676e-01}, +{ 3.011681597278315e-01,-1.435388242334742e+00,1.262167974425964e-15,1.435388242334740e+00,-3.011681597278306e-01}, +{ -2.444166152793679e-01,9.285580266438352e-01,-2.402750652164301e+00,7.583532171678785e-01,9.602560236319555e-01}, +{ 5.517668509633165e-01,-1.960399115833280e+00,4.043543754387667e+00,-7.701952085172239e+00,5.067040595654537e+00} +}; +#endif +#if p_cubNq==6 +const dfloat c_D[6][6] = { +{ -7.145327131771608e+00,1.110928369489405e+01,-6.402816680808855e+00,3.793561028968053e+00,-1.890911470859322e+00,5.362105595776783e-01}, +{ -1.223325472251408e+00,-1.174852281034098e+00,3.488553796847059e+00,-1.638343894637413e+00,7.561901119888436e-01,-2.082222609129840e-01}, +{ 3.244123181406702e-01,-1.605150401681950e+00,-2.530262787093281e-01,2.095388925791030e+00,-7.538333973657512e-01,1.922088338253287e-01}, +{ -1.922088338253292e-01,7.538333973657541e-01,-2.095388925791034e+00,2.530262787093338e-01,1.605150401681945e+00,-3.244123181406695e-01}, +{ 2.082222609129849e-01,-7.561901119888468e-01,1.638343894637418e+00,-3.488553796847059e+00,1.174852281034094e+00,1.223325472251409e+00}, +{ -5.362105595776780e-01,1.890911470859330e+00,-3.793561028968061e+00,6.402816680808853e+00,-1.110928369489405e+01,7.145327131771603e+00} +}; +#endif +#if p_cubNq==7 +const dfloat c_D[7][7] = { +{ -9.568182337465757e+00,1.508303411865755e+01,-9.172567853813947e+00,6.010333806311428e+00,-3.677702697481981e+00,1.851895449194525e+00,-5.268104854018216e-01}, +{ -1.538700050032437e+00,-1.647365679381668e+00,4.741330435251490e+00,-2.457069232590230e+00,1.387163356479284e+00,-6.742804749281588e-01,1.889216452017202e-01}, +{ 3.693934530448182e-01,-1.871687025373811e+00,-4.858735179073487e-01,2.820674389670841e+00,-1.231996977918924e+00,5.475964377198165e-01,-1.481067592353911e-01}, +{ -1.847014135806377e-01,7.401568549048141e-01,-2.152416541461877e+00,-8.384053974269786e-15,2.152416541461887e+00,-7.401568549048158e-01,1.847014135806379e-01}, +{ 1.481067592353899e-01,-5.475964377198138e-01,1.231996977918917e+00,-2.820674389670840e+00,4.858735179073544e-01,1.871687025373809e+00,-3.693934530448163e-01}, +{ -1.889216452017199e-01,6.742804749281572e-01,-1.387163356479280e+00,2.457069232590231e+00,-4.741330435251498e+00,1.647365679381676e+00,1.538700050032433e+00}, +{ 5.268104854018230e-01,-1.851895449194527e+00,3.677702697481978e+00,-6.010333806311440e+00,9.172567853813973e+00,-1.508303411865755e+01,9.568182337465746e+00} +}; +#endif +#if p_cubNq==8 +const dfloat c_D[8][8] = { +{ -1.233617700739016e+01,1.962363182777440e+01,-1.234714284064271e+01,8.584781486527866e+00,-5.831065317519066e+00,3.612822620384686e+00,-1.827526892720913e+00,5.206761235858988e-01}, +{ -1.903399269213866e+00,-2.180720939958538e+00,6.166016293603690e+00,-3.387036628665435e+00,2.119208563829787e+00,-1.264421785526866e+00,6.276152118553389e-01,-1.772614459241109e-01}, +{ 4.284887942362614e-01,-2.206109949908643e+00,-7.260583322364265e-01,3.631663393943801e+00,-1.752386002797822e+00,9.514161078659371e-01,-4.523915197606116e-01,1.253775086575040e-01}, +{ -1.930147883186554e-01,7.851103846579588e-01,-2.352848325643207e+00,-1.898218179097410e-01,2.725766481169760e+00,-1.135319556167891e+00,4.912296007186727e-01,-1.311019785068977e-01}, +{ 1.311019785068983e-01,-4.912296007186729e-01,1.135319556167888e+00,-2.725766481169755e+00,1.898218179097348e-01,2.352848325643209e+00,-7.851103846579564e-01,1.930147883186546e-01}, +{ -1.253775086575044e-01,4.523915197606125e-01,-9.514161078659362e-01,1.752386002797821e+00,-3.631663393943806e+00,7.260583322364398e-01,2.206109949908634e+00,-4.284887942362604e-01}, +{ 1.772614459241089e-01,-6.276152118553397e-01,1.264421785526868e+00,-2.119208563829790e+00,3.387036628665443e+00,-6.166016293603695e+00,2.180720939958542e+00,1.903399269213861e+00}, +{ -5.206761235859005e-01,1.827526892720920e+00,-3.612822620384695e+00,5.831065317519078e+00,-8.584781486527886e+00,1.234714284064272e+01,-1.962363182777438e+01,1.233617700739015e+01} +}; +#endif +#if p_cubNq==9 +const dfloat c_D[9][9] = { +{ -1.544959135849172e+01,2.473130086626485e+01,-1.592386114644575e+01,1.150485639046921e+01,-8.317162612305609e+00,5.731953832359281e+00,-3.572238100056592e+00,1.811185563493741e+00,-5.164434352874068e-01}, +{ -2.316092360147001e+00,-2.777032345944471e+00,7.764816168647416e+00,-4.429730819535023e+00,2.947504352855346e+00,-1.953863314712999e+00,1.192844219528976e+00,-5.980638706122354e-01,1.696179699199932e-01}, +{ 4.988980406533281e-01,-2.597680184472936e+00,-9.833208372446401e-01,4.535349788062876e+00,-2.323702892869143e+00,1.398481806078869e+00,-8.151667543409540e-01,3.990600324506156e-01,-1.119189983180160e-01}, +{ -2.096393253110600e-01,8.619061400751797e-01,-2.637782135089943e+00,-3.623511228575266e-01,3.352228710952995e+00,-1.542003765916531e+00,8.133640174860155e-01,-3.801691020122863e-01,1.044465826731569e-01}, +{ 1.282715436905586e-01,-4.854010043923289e-01,1.143858518955989e+00,-2.837247478170765e+00,-2.047805216953375e-15,2.837247478170770e+00,-1.143858518955992e+00,4.854010043923296e-01,-1.282715436905588e-01}, +{ -1.044465826731571e-01,3.801691020122847e-01,-8.133640174860112e-01,1.542003765916526e+00,-3.352228710952990e+00,3.623511228575206e-01,2.637782135089948e+00,-8.619061400751800e-01,2.096393253110599e-01}, +{ 1.119189983180164e-01,-3.990600324506146e-01,8.151667543409495e-01,-1.398481806078864e+00,2.323702892869138e+00,-4.535349788062876e+00,9.833208372446424e-01,2.597680184472935e+00,-4.988980406533260e-01}, +{ -1.696179699199920e-01,5.980638706122334e-01,-1.192844219528972e+00,1.953863314712994e+00,-2.947504352855345e+00,4.429730819535028e+00,-7.764816168647428e+00,2.777032345944483e+00,2.316092360146998e+00}, +{ 5.164434352874068e-01,-1.811185563493739e+00,3.572238100056591e+00,-5.731953832359276e+00,8.317162612305609e+00,-1.150485639046923e+01,1.592386114644578e+01,-2.473130086626485e+01,1.544959135849171e+01} +}; +#endif +#if p_cubNq==10 +const dfloat c_D[10][10] = { +{ -1.890857637430895e+01,3.040616677207318e+01,-1.990130782739118e+01,1.476423396898959e+01,-1.111885145792880e+01,8.170258889333290e+00,-5.670590455771260e+00,3.544920823561030e+00,-1.799650630048322e+00,5.133962914914280e-01}, +{ -2.776105894393461e+00,-3.437355569204852e+00,9.538844565880771e+00,-5.586025495567151e+00,3.870261734047303e+00,-2.733736967052654e+00,1.857054203877053e+00,-1.146619462076206e+00,5.779923405076633e-01,-1.643094560184647e-01}, +{ 5.793709957730089e-01,-3.041561131776954e+00,-1.261898660428509e+00,5.534686166475386e+00,-2.950218351752307e+00,1.889683906719852e+00,-1.223584808215754e+00,7.359331150601934e-01,-3.656117011555446e-01,1.032004693006279e-01}, +{ -2.318354216710728e-01,9.607213262742580e-01,-2.985291471697434e+00,-5.336274802956024e-01,4.040181237739737e+00,-1.974199521942891e+00,1.153680926869499e+00,-6.599755041199018e-01,3.193883699828727e-01,-8.904246113946412e-02}, +{ 1.321290416388208e-01,-5.037375639782228e-01,1.204252022008973e+00,-3.057527214433838e+00,-1.522487138263230e-01,3.358537162416500e+00,-1.494034155863658e+00,7.713516066610998e-01,-3.558121116786205e-01,9.708992705526942e-02}, +{ -9.708992705526820e-02,3.558121116786168e-01,-7.713516066610949e-01,1.494034155863650e+00,-3.358537162416491e+00,1.522487138263124e-01,3.057527214433847e+00,-1.204252022008978e+00,5.037375639782282e-01,-1.321290416388233e-01}, +{ 8.904246113946146e-02,-3.193883699828680e-01,6.599755041198964e-01,-1.153680926869491e+00,1.974199521942882e+00,-4.040181237739734e+00,5.336274802956029e-01,2.985291471697440e+00,-9.607213262742659e-01,2.318354216710753e-01}, +{ -1.032004693006258e-01,3.656117011555395e-01,-7.359331150601847e-01,1.223584808215743e+00,-1.889683906719837e+00,2.950218351752293e+00,-5.534686166475371e+00,1.261898660428483e+00,3.041561131776975e+00,-5.793709957730148e-01}, +{ 1.643094560184619e-01,-5.779923405076511e-01,1.146619462076185e+00,-1.857054203877026e+00,2.733736967052623e+00,-3.870261734047272e+00,5.586025495567124e+00,-9.538844565880762e+00,3.437355569204844e+00,2.776105894393474e+00}, +{ -5.133962914914125e-01,1.799650630048273e+00,-3.544920823560950e+00,5.670590455771153e+00,-8.170258889333164e+00,1.111885145792866e+01,-1.476423396898944e+01,1.990130782739106e+01,-3.040616677207317e+01,1.890857637430899e+01} +}; +#endif +#if p_cubNq==11 +const dfloat c_D[11][11] = { +{ -2.271321937232154e+01,3.664830407865720e+01,-2.427867530885342e+01,1.835930784567718e+01,-1.422649901393411e+01,1.090675310599355e+01,-8.080094039817057e+00,5.629610564092911e+00,-3.525544761427831e+00,1.791184843070727e+00,-5.111279411376088e-01}, +{ -3.283067117730984e+00,-4.162269296497124e+00,1.148871755894810e+01,-6.856445993512434e+00,4.886679872884685e+00,-3.599933966364751e+00,2.609030624553408e+00,-1.794208705489494e+00,1.114695287036893e+00,-5.636580779425445e-01,1.604598141142441e-01}, +{ 6.692728794156246e-01,-3.535280788539979e+00,-1.563903052455009e+00,6.631107038816361e+00,-3.634196689904084e+00,2.426116778209983e+00,-1.674453594513598e+00,1.120301138258529e+00,-6.847889154353525e-01,3.430113773027946e-01,-9.718617115527062e-02}, +{ -2.583851617230533e-01,1.077170487754078e+00,-3.385470739019516e+00,-7.105657342764693e-01,4.792856991452879e+00,-2.438336874049113e+00,1.516627100765264e+00,-9.632127304898380e-01,5.719628261560241e-01,-2.818761597850288e-01,7.922999321477420e-02}, +{ 1.399570774894561e-01,-5.366422195960848e-01,1.296961295914984e+00,-3.350271725289774e+00,-2.906606750636482e-01,3.926051286017769e+00,-1.854990523626573e+00,1.060142813057692e+00,-5.975740140655907e-01,2.865168215997740e-01,-7.949013643800584e-02}, +{ -9.581284903864011e-02,3.530180628848685e-01,-7.731463924817517e-01,1.521986193215410e+00,-3.505802234422296e+00,1.338969607473769e-14,3.505802234422281e+00,-1.521986193215408e+00,7.731463924817527e-01,-3.530180628848698e-01,9.581284903864049e-02}, +{ 7.949013643800606e-02,-2.865168215997737e-01,5.975740140655913e-01,-1.060142813057695e+00,1.854990523626580e+00,-3.926051286017769e+00,2.906606750636385e-01,3.350271725289778e+00,-1.296961295914987e+00,5.366422195960872e-01,-1.399570774894570e-01}, +{ -7.922999321477464e-02,2.818761597850285e-01,-5.719628261560243e-01,9.632127304898394e-01,-1.516627100765269e+00,2.438336874049113e+00,-4.792856991452872e+00,7.105657342764642e-01,3.385470739019520e+00,-1.077170487754080e+00,2.583851617230544e-01}, +{ 9.718617115527017e-02,-3.430113773027925e-01,6.847889154353489e-01,-1.120301138258529e+00,1.674453594513602e+00,-2.426116778209979e+00,3.634196689904074e+00,-6.631107038816352e+00,1.563903052454996e+00,3.535280788539986e+00,-6.692728794156237e-01}, +{ -1.604598141142430e-01,5.636580779425393e-01,-1.114695287036888e+00,1.794208705489492e+00,-2.609030624553407e+00,3.599933966364737e+00,-4.886679872884665e+00,6.856445993512422e+00,-1.148871755894810e+01,4.162269296497135e+00,3.283067117730981e+00}, +{ 5.111279411376035e-01,-1.791184843070705e+00,3.525544761427810e+00,-5.629610564092897e+00,8.080094039817061e+00,-1.090675310599353e+01,1.422649901393407e+01,-1.835930784567717e+01,2.427867530885344e+01,-3.664830407865723e+01,2.271321937232155e+01} +}; +#endif +#if p_cubNq==12 +const dfloat c_D[12][12] = { +{ -2.686357374861171e+01,4.345775921932029e+01,-2.905547418840264e+01,2.228789114987257e+01,-1.763430354279154e+01,1.392904517763852e+01,-1.077691074699266e+01,8.020415709703508e+00,-5.600712510073300e+00,3.511248203393345e+00,-1.784777604713352e+00,5.093928816569913e-01}, +{ -3.836755246488341e+00,-4.952113807536514e+00,1.361479959615465e+01,-8.241320111399194e+00,5.996352743992794e+00,-4.550265989507965e+00,3.443072223688135e+00,-2.528213938816728e+00,1.750558415736110e+00,-1.091566836121061e+00,5.530256130794632e-01,-1.575726627813467e-01}, +{ 7.682504445358964e-01,-4.077456527741155e+00,-1.890492119508359e+00,7.825383191336750e+00,-4.376929384625486e+00,3.008579967414336e+00,-2.166753274517042e+00,1.546791142335749e+00,-1.052736281911466e+00,6.494327357977396e-01,-3.269101605186064e-01,9.284026740164220e-02}, +{ -2.886713911641252e-01,1.209023616484368e+00,-3.833235730011737e+00,-8.965897591928506e-01,5.611703438163877e+00,-2.937697262790662e+00,1.905076007392582e+00,-1.289528975320760e+00,8.513276264597166e-01,-5.156790704601157e-01,2.568115833448785e-01,-7.254008290517097e-02}, +{ 1.505525813505521e-01,-5.798561941536764e-01,1.413268181926972e+00,-3.699046909199136e+00,-4.253862657062266e-01,4.542957436889303e+00,-2.235228633321833e+00,1.359318061019224e+00,-8.500143001219838e-01,4.994439054073628e-01,-2.444820335223281e-01,6.847416943177083e-02}, +{ -9.790370989170739e-02,3.622578623856177e-01,-7.997688939110750e-01,1.594226681936099e+00,-3.740128949425786e+00,-1.272287893469674e-01,3.992544848399699e+00,-1.840220481087494e+00,1.033844787402107e+00,-5.759866410424886e-01,2.741114446207213e-01,-7.574816003872524e-02}, +{ 7.574816003872605e-02,-2.741114446207196e-01,5.759866410424845e-01,-1.033844787402100e+00,1.840220481087489e+00,-3.992544848399696e+00,1.272287893469638e-01,3.740128949425791e+00,-1.594226681936106e+00,7.997688939110783e-01,-3.622578623856194e-01,9.790370989170875e-02}, +{ -6.847416943176862e-02,2.444820335223251e-01,-4.994439054073584e-01,8.500143001219767e-01,-1.359318061019216e+00,2.235228633321825e+00,-4.542957436889293e+00,4.253862657062124e-01,3.699046909199148e+00,-1.413268181926977e+00,5.798561941536792e-01,-1.505525813505525e-01}, +{ 7.254008290517028e-02,-2.568115833448757e-01,5.156790704601096e-01,-8.513276264597078e-01,1.289528975320751e+00,-1.905076007392573e+00,2.937697262790651e+00,-5.611703438163867e+00,8.965897591928397e-01,3.833235730011745e+00,-1.209023616484370e+00,2.886713911641259e-01}, +{ -9.284026740163942e-02,3.269101605186036e-01,-6.494327357977322e-01,1.052736281911457e+00,-1.546791142335739e+00,2.166753274517029e+00,-3.008579967414323e+00,4.376929384625475e+00,-7.825383191336749e+00,1.890492119508355e+00,4.077456527741164e+00,-7.682504445359014e-01}, +{ 1.575726627813423e-01,-5.530256130794525e-01,1.091566836121044e+00,-1.750558415736089e+00,2.528213938816700e+00,-3.443072223688104e+00,4.550265989507933e+00,-5.996352743992768e+00,8.241320111399180e+00,-1.361479959615463e+01,4.952113807536501e+00,3.836755246488345e+00}, +{ -5.093928816569822e-01,1.784777604713341e+00,-3.511248203393312e+00,5.600712510073241e+00,-8.020415709703435e+00,1.077691074699258e+01,-1.392904517763845e+01,1.763430354279148e+01,-2.228789114987253e+01,2.905547418840262e+01,-4.345775921932032e+01,2.686357374861176e+01} +}; +#endif +#if p_cubNq==13 +const dfloat c_D[13][13] = { +{ -3.135967366643347e+01,5.083456231181062e+01,-3.423139336823537e+01,2.654859221977765e+01,-2.133858526578766e+01,1.722935369822144e+01,-1.374642703137280e+01,1.069137807158008e+01,-7.978687973412647e+00,5.579479355273741e+00,-3.500368663016606e+00,1.779805882419766e+00,-5.080355708247405e-01}, +{ -4.437032371497986e+00,-5.807100473056290e+00,1.591731807891887e+01,-9.740860017426495e+00,7.199055145868580e+00,-5.583457666591261e+00,4.355919261957253e+00,-3.341836265770851e+00,2.472102560573588e+00,-1.718751529181573e+00,1.074195793528608e+00,-5.449006890415193e-01,1.553481717190731e-01}, +{ 8.760931085164583e-01,-4.667260842241157e+00,-2.242348561636867e+00,9.117961336672600e+00,-5.179198612769088e+00,3.637620614358351e+00,-2.700117847856348e+00,2.013026917012579e+00,-1.462876375036595e+00,1.005481001078663e+00,-6.237695438289491e-01,3.149746671630744e-01,-8.958586143272340e-02}, +{ -3.223507705052295e-01,1.355036548825231e+00,-4.325728671512745e+00,-1.093570308509685e+00,6.497442583331667e+00,-3.474105775016822e+00,2.320801734444340e+00,-1.639482621997812e+00,1.154678631812157e+00,-7.783926429123954e-01,4.770186924936609e-01,-2.390929688162776e-01,6.774556836390960e-02}, +{ 1.633077542149610e-01,-6.312235241716204e-01,1.548738386275721e+00,-4.095400873319614e+00,-5.614199959045091e-01,5.210474317155057e+00,-2.638948277827933e+00,1.673261711304842e+00,-1.114845220674179e+00,7.278051043126285e-01,-4.374446638924567e-01,2.167575131431601e-01,-6.106223061605775e-02}, +{ -1.021657843447355e-01,3.793204053069177e-01,-8.428071324746875e-01,1.696650182923245e+00,-4.037128402476873e+00,-2.433847310183128e-01,4.520553087343277e+00,-2.169589749572489e+00,1.296460162416492e+00,-8.006746687206421e-01,4.664019762877326e-01,-2.270325598394739e-01,6.339721416954922e-02}, +{ 7.510319318152307e-02,-2.726558901227321e-01,5.764021657313689e-01,-1.044285856129311e+00,1.883901820285866e+00,-4.165082980336289e+00,-1.057102328945130e-15,4.165082980336281e+00,-1.883901820285853e+00,1.044285856129306e+00,-5.764021657313652e-01,2.726558901227297e-01,-7.510319318152180e-02}, +{ -6.339721416955127e-02,2.270325598394767e-01,-4.664019762877368e-01,8.006746687206490e-01,-1.296460162416506e+00,2.169589749572504e+00,-4.520553087343301e+00,2.433847310183521e-01,4.037128402476850e+00,-1.696650182923240e+00,8.428071324746844e-01,-3.793204053069154e-01,1.021657843447340e-01}, +{ 6.106223061605887e-02,-2.167575131431634e-01,4.374446638924621e-01,-7.278051043126359e-01,1.114845220674192e+00,-1.673261711304855e+00,2.638948277827947e+00,-5.210474317155055e+00,5.614199959044937e-01,4.095400873319618e+00,-1.548738386275721e+00,6.312235241716199e-01,-1.633077542149612e-01}, +{ -6.774556836390960e-02,2.390929688162813e-01,-4.770186924936668e-01,7.783926429124016e-01,-1.154678631812170e+00,1.639482621997827e+00,-2.320801734444353e+00,3.474105775016822e+00,-6.497442583331662e+00,1.093570308509689e+00,4.325728671512739e+00,-1.355036548825227e+00,3.223507705052281e-01}, +{ 8.958586143272584e-02,-3.149746671630789e-01,6.237695438289570e-01,-1.005481001078674e+00,1.462876375036607e+00,-2.013026917012595e+00,2.700117847856364e+00,-3.637620614358356e+00,5.179198612769087e+00,-9.117961336672602e+00,2.242348561636873e+00,4.667260842241149e+00,-8.760931085164574e-01}, +{ -1.553481717190719e-01,5.449006890415266e-01,-1.074195793528621e+00,1.718751529181589e+00,-2.472102560573621e+00,3.341836265770891e+00,-4.355919261957283e+00,5.583457666591269e+00,-7.199055145868584e+00,9.740860017426517e+00,-1.591731807891888e+01,5.807100473056290e+00,4.437032371497980e+00}, +{ 5.080355708247382e-01,-1.779805882419782e+00,3.500368663016651e+00,-5.579479355273810e+00,7.978687973412766e+00,-1.069137807158021e+01,1.374642703137287e+01,-1.722935369822142e+01,2.133858526578763e+01,-2.654859221977766e+01,3.423139336823534e+01,-5.083456231181054e+01,3.135967366643342e+01} +}; +#endif +#if p_cubNq==14 +const dfloat c_D[14][14] = { +{ -3.620154181773982e+01,5.877873357001366e+01,-3.980622720727229e+01,3.114049004070324e+01,-2.533691293859317e+01,2.080256509070064e+01,-1.697916054067125e+01,1.362611778512091e+01,-1.063192733389453e+01,7.948264582536493e+00,-5.563368078405301e+00,3.491880800690144e+00,-1.775867423975853e+00,5.069534707871316e-01}, +{ -5.083808571331648e+00,-6.727366514103726e+00,1.839642114516916e+01,-1.135520764673373e+01,8.494653347325984e+00,-6.698716416367973e+00,5.345562008309503e+00,-4.231003214619173e+00,3.271762832104269e+00,-2.431193464540774e+00,1.694726835633297e+00,-1.060775212554693e+00,5.385407299939172e-01,-1.535958582844131e-01}, +{ 9.926684125579616e-01,-5.304171575063344e+00,-2.619898004292840e+00,1.050911608383711e+01,-6.041501805640608e+00,4.313613028551082e+00,-3.274411231721480e+00,2.517792214252410e+00,-1.911954056150715e+00,1.403897903013791e+00,-9.708282178999365e-01,6.044477818048813e-01,-3.058493652414499e-01,8.707883199313668e-02}, +{ -3.592184257308720e-01,1.514465131301731e+00,-4.861229973466714e+00,-1.302607711702517e+00,7.450465512865557e+00,-4.048648675830290e+00,2.764925845922920e+00,-2.013647222012831e+00,1.481146490353292e+00,-1.065917346649217e+00,7.274918691863952e-01,-4.490786089232446e-01,2.260288653009543e-01,-6.417575061516310e-02}, +{ 1.778851221722569e-01,-6.895450158075915e-01,1.700895577407071e+00,-4.534570656289889e+00,-7.014777652842340e-01,5.929108445082805e+00,-3.068427356257258e+00,2.004551683032155e+00,-1.393777024861222e+00,9.704052853668219e-01,-6.487483921372988e-01,3.952467136793350e-01,-1.973497054433409e-01,5.580308934038822e-02}, +{ -1.079961359367937e-01,4.020813743110108e-01,-8.980061781224409e-01,1.822083422090563e+00,-4.384244782367551e+00,-3.552927949503761e-01,5.090599816544552e+00,-2.515194445593182e+00,1.566846191765717e+00,-1.030620321019654e+00,6.665859850649779e-01,-3.980298054891378e-01,1.963831298691438e-01,-5.519545616682872e-02}, +{ 7.636043189736411e-02,-2.779563299717549e-01,5.905167186206014e-01,-1.077959605889319e+00,1.965540876907218e+00,-4.409910664596058e+00,-1.093314891739502e-01,4.627275344456462e+00,-2.178875419180592e+00,1.284054603684263e+00,-7.850591613665959e-01,4.540658736218569e-01,-2.200019612168254e-01,6.128078220733098e-02}, +{ -6.128078220733121e-02,2.200019612168248e-01,-4.540658736218559e-01,7.850591613665955e-01,-1.284054603684262e+00,2.178875419180588e+00,-4.627275344456456e+00,1.093314891739432e-01,4.409910664596067e+00,-1.965540876907221e+00,1.077959605889319e+00,-5.905167186206006e-01,2.779563299717537e-01,-7.636043189736400e-02}, +{ 5.519545616682976e-02,-1.963831298691450e-01,3.980298054891380e-01,-6.665859850649765e-01,1.030620321019650e+00,-1.566846191765710e+00,2.515194445593176e+00,-5.090599816544547e+00,3.552927949503658e-01,4.384244782367554e+00,-1.822083422090559e+00,8.980061781224390e-01,-4.020813743110082e-01,1.079961359367949e-01}, +{ -5.580308934038813e-02,1.973497054433424e-01,-3.952467136793369e-01,6.487483921372992e-01,-9.704052853668205e-01,1.393777024861220e+00,-2.004551683032153e+00,3.068427356257261e+00,-5.929108445082828e+00,7.014777652842719e-01,4.534570656289871e+00,-1.700895577407071e+00,6.895450158075890e-01,-1.778851221722569e-01}, +{ 6.417575061516323e-02,-2.260288653009552e-01,4.490786089232447e-01,-7.274918691863957e-01,1.065917346649218e+00,-1.481146490353290e+00,2.013647222012831e+00,-2.764925845922922e+00,4.048648675830297e+00,-7.450465512865538e+00,1.302607711702469e+00,4.861229973466735e+00,-1.514465131301730e+00,3.592184257308728e-01}, +{ -8.707883199313793e-02,3.058493652414506e-01,-6.044477818048782e-01,9.708282178999365e-01,-1.403897903013789e+00,1.911954056150710e+00,-2.517792214252407e+00,3.274411231721482e+00,-4.313613028551091e+00,6.041501805640604e+00,-1.050911608383714e+01,2.619898004292903e+00,5.304171575063313e+00,-9.926684125579581e-01}, +{ 1.535958582844135e-01,-5.385407299939183e-01,1.060775212554686e+00,-1.694726835633303e+00,2.431193464540784e+00,-3.271762832104273e+00,4.231003214619186e+00,-5.345562008309523e+00,6.698716416368010e+00,-8.494653347326013e+00,1.135520764673377e+01,-1.839642114516915e+01,6.727366514103648e+00,5.083808571331684e+00}, +{ -5.069534707871313e-01,1.775867423975847e+00,-3.491880800690122e+00,5.563368078405306e+00,-7.948264582536494e+00,1.063192733389449e+01,-1.362611778512089e+01,1.697916054067126e+01,-2.080256509070069e+01,2.533691293859316e+01,-3.114049004070323e+01,3.980622720727223e+01,-5.877873357001358e+01,3.620154181773983e+01} +}; +#endif +#if p_cubNq==15 +const dfloat c_D[15][15] = { +{ -4.138919375805648e+01,6.729028696336295e+01,-4.577983525836606e+01,3.606295486642926e+01,-2.962762555903556e+01,2.464519648687645e+01,-2.046869904658545e+01,1.681372522520004e+01,-1.354265174944035e+01,1.058886322144685e+01,-7.925338347066512e+00,5.550822671551741e+00,-3.485121549671978e+00,1.772692540293917e+00,-5.060767069388161e-01}, +{ -5.777023097369748e+00,-7.713004311653004e+00,2.105220879618719e+01,-1.308446074984389e+01,9.883063547313910e+00,-7.895525117515303e+00,6.410693005478693e+00,-5.193105851533364e+00,4.144851272547692e+00,-3.220785691878870e+00,2.400257833771531e+00,-1.676064523931198e+00,1.050167509619723e+00,-5.334622790467950e-01,1.521896578534281e-01}, +{ 1.117889367511101e+00,-5.987844936594558e+00,-3.023417968499415e+00,1.199902439977988e+01,-6.964168273351111e+00,5.036817327326014e+00,-3.889585817008052e+00,3.060400488092823e+00,-2.398140821162950e+00,1.840848942000290e+00,-1.360439676678693e+00,9.445011387786204e-01,-5.894790370402735e-01,2.986974082354492e-01,-8.510254138911977e-02}, +{ -3.991452490558682e-01,1.686839496298806e+00,-5.438648514816385e+00,-1.524387296869582e+00,8.470998327505203e+00,-4.662008327979165e+00,3.238181552080966e+00,-2.412477977913325e+00,1.830457524098862e+00,-1.376494901975599e+00,1.003433072595241e+00,-6.902094997884687e-01,4.281022810283607e-01,-2.160770620482883e-01,6.143657683924220e-02}, +{ 1.940835223173911e-01,-7.541047604765029e-01,1.868256381573063e+00,-5.013681692333392e+00,-8.471513129786324e-01,6.699078764634274e+00,-3.524994708978988e+00,2.354770399618393e+00,-1.688063603619838e+00,1.227342006798175e+00,-8.756994193179417e-01,5.938962364350098e-01,-3.649610417120374e-01,1.831462329623569e-01,-5.191700492133089e-02}, +{ -1.150607252775599e-01,4.293625071207450e-01,-9.630003005171446e-01,1.966519291956166e+00,-4.774392159422995e+00,-4.666475025525899e-01,5.702824741676341e+00,-2.879731503324405e+00,1.848341015200013e+00,-1.268548246015782e+00,8.747190860216157e-01,-5.806304042334651e-01,3.519560010118664e-01,-1.751478969392386e-01,4.943609529643433e-02}, +{ 7.891922227114057e-02,-2.879027495786581e-01,6.141455566392634e-01,-1.128038946976225e+00,2.074718633940846e+00,-4.709640516613670e+00,-2.096818175983105e-01,5.126826867715294e+00,-2.485162412403491e+00,1.526440339309119e+00,-9.935495803118135e-01,6.376502814186773e-01,-3.786540775298739e-01,1.861443180856204e-01,-5.221511836791803e-02}, +{ -6.092965832989262e-02,2.191998478765999e-01,-4.541700165869145e-01,7.898752369301227e-01,-1.302631412587913e+00,2.235228736584660e+00,-4.818600179315561e+00,2.830546171511902e-14,4.818600179315532e+00,-2.235228736584662e+00,1.302631412587917e+00,-7.898752369301247e-01,4.541700165869144e-01,-2.191998478766012e-01,6.092965832989452e-02}, +{ 5.221511836791759e-02,-1.861443180856201e-01,3.786540775298744e-01,-6.376502814186769e-01,9.935495803118111e-01,-1.526440339309116e+00,2.485162412403499e+00,-5.126826867715280e+00,2.096818175982680e-01,4.709640516613698e+00,-2.074718633940858e+00,1.128038946976230e+00,-6.141455566392647e-01,2.879027495786601e-01,-7.891922227114279e-02}, +{ -4.943609529643278e-02,1.751478969392366e-01,-3.519560010118662e-01,5.806304042334630e-01,-8.747190860216102e-01,1.268548246015777e+00,-1.848341015200011e+00,2.879731503324392e+00,-5.702824741676327e+00,4.666475025525871e-01,4.774392159423001e+00,-1.966519291956166e+00,9.630003005171426e-01,-4.293625071207451e-01,1.150607252775611e-01}, +{ 5.191700492132889e-02,-1.831462329623543e-01,3.649610417120370e-01,-5.938962364350074e-01,8.756994193179373e-01,-1.227342006798170e+00,1.688063603619835e+00,-2.354770399618380e+00,3.524994708978971e+00,-6.699078764634266e+00,8.471513129786178e-01,5.013681692333396e+00,-1.868256381573054e+00,7.541047604765010e-01,-1.940835223173918e-01}, +{ -6.143657683923864e-02,2.160770620482853e-01,-4.281022810283594e-01,6.902094997884668e-01,-1.003433072595235e+00,1.376494901975592e+00,-1.830457524098863e+00,2.412477977913317e+00,-3.238181552080957e+00,4.662008327979171e+00,-8.470998327505230e+00,1.524387296869631e+00,5.438648514816353e+00,-1.686839496298802e+00,3.991452490558696e-01}, +{ 8.510254138911222e-02,-2.986974082354468e-01,5.894790370402753e-01,-9.445011387786164e-01,1.360439676678684e+00,-1.840848942000285e+00,2.398140821162954e+00,-3.060400488092819e+00,3.889585817008049e+00,-5.036817327326031e+00,6.964168273351143e+00,-1.199902439977989e+01,3.023417968499419e+00,5.987844936594557e+00,-1.117889367511108e+00}, +{ -1.521896578534285e-01,5.334622790467943e-01,-1.050167509619724e+00,1.676064523931184e+00,-2.400257833771511e+00,3.220785691878860e+00,-4.144851272547690e+00,5.193105851533343e+00,-6.410693005478669e+00,7.895525117515302e+00,-9.883063547313922e+00,1.308446074984386e+01,-2.105220879618710e+01,7.713004311652887e+00,5.777023097369813e+00}, +{ 5.060767069388135e-01,-1.772692540293914e+00,3.485121549671961e+00,-5.550822671551678e+00,7.925338347066445e+00,-1.058886322144678e+01,1.354265174944031e+01,-1.681372522519996e+01,2.046869904658532e+01,-2.464519648687638e+01,2.962762555903553e+01,-3.606295486642912e+01,4.577983525836584e+01,-6.729028696336304e+01,4.138919375805666e+01} +}; +#endif +#if p_cubNq==16 +const dfloat c_D[16][16] = { +{ -4.692264044219286e+01,7.636923239320666e+01,-5.215211890049053e+01,4.131554403478738e+01,-3.420955600593403e+01,2.875480499509683e+01,-2.421056421597340e+01,2.024664832141448e+01,-1.669876742416053e+01,1.348238673758765e+01,-1.055662625108551e+01,7.907593583255951e+00,-5.540843760657276e+00,3.479645274585247e+00,-1.770094643761947e+00,5.053563043218851e-01}, +{ -6.516633666353528e+00,-8.764077985126461e+00,2.388475059147461e+01,-1.492868838411791e+01,1.136423089693367e+01,-9.173533123088520e+00,7.550426147098019e+00,-6.226391285606645e+00,5.088276322607364e+00,-4.082372050802674e+00,3.182293079362822e+00,-2.376189537454585e+00,1.661237251669491e+00,-1.041623568236628e+00,5.293385784998014e-01,-1.510432668588315e-01}, +{ 1.251696784219230e+00,-6.718046516836637e+00,-3.453096179431577e+00,1.358780481934194e+01,-7.947423455960847e+00,5.807417829677239e+00,-4.545629586909287e+00,3.640435655398446e+00,-2.920320748862424e+00,2.314062893750595e+00,-1.788381286793166e+00,1.327269080234139e+00,-9.239403213934057e-01,5.776131897981055e-01,-2.929767073617601e-01,8.351455112940949e-02}, +{ -4.420462414534705e-01,1.871849798231634e+00,-6.057261847018600e+00,-1.759355912363998e+00,9.559178845009226e+00,-5.314632528258635e+00,3.741063971824841e+00,-2.836316463472622e+00,2.202543686559685e+00,-1.709262983454926e+00,1.302520504691636e+00,-9.573107381993899e-01,6.618971169215481e-01,-4.118802508652337e-01,2.082960360844961e-01,-5.928299423619294e-02}, +{ 2.117762237558470e-01,-8.244511518381895e-01,2.049882358281325e+00,-5.530904030781690e+00,-9.994294958167478e-01,7.520479819081497e+00,-4.009471606840120e+00,2.724932006594627e+00,-1.998584094543819e+00,1.498950642872362e+00,-1.117415829360347e+00,8.092235371233550e-01,-5.538963028588981e-01,3.423430860757254e-01,-1.723875745668703e-01,4.895241282194469e-02}, +{ -1.231607624520884e-01,4.604618915361178e-01,-1.036377169807821e+00,2.127555788721557e+00,-5.203286386913050e+00,-5.796062004288317e-01,6.357168034348612e+00,-3.264739617946319e+00,2.142950547886158e+00,-1.516303005240099e+00,1.091663066487478e+00,-7.731201616523953e-01,5.214255218870638e-01,-3.191500234531913e-01,1.597339510398329e-01,-4.521547401302477e-02}, +{ 8.244744142593244e-02,-3.013275592809711e-01,6.449694396165221e-01,-1.190729684091340e+00,2.205614723601482e+00,-5.054451353996397e+00,-3.058582839168755e-01,5.663366376257934e+00,-2.805863949744519e+00,1.775545793426224e+00,-1.205580808387421e+00,8.245743908576423e-01,-5.440351161184608e-01,3.283373225389692e-01,-1.629220897178144e-01,4.591335752909395e-02}, +{ -6.174396576819276e-02,2.225219762790982e-01,-4.625594544439364e-01,8.084289187171141e-01,-1.342354703400715e+00,2.324491639251098e+00,-5.071585680062424e+00,-9.587803702635105e-02,5.262464920192373e+00,-2.512671524728426e+00,1.525778841445075e+00,-9.845415419394284e-01,6.277860858913468e-01,-3.710605268883048e-01,1.818474379819558e-01,-5.092438550028361e-02}, +{ 5.092438550028219e-02,-1.818474379819546e-01,3.710605268883022e-01,-6.277860858913427e-01,9.845415419394217e-01,-1.525778841445065e+00,2.512671524728423e+00,-5.262464920192375e+00,9.587803702635246e-02,5.071585680062434e+00,-2.324491639251116e+00,1.342354703400725e+00,-8.084289187171206e-01,4.625594544439401e-01,-2.225219762790996e-01,6.174396576819277e-02}, +{ -4.591335752909267e-02,1.629220897178131e-01,-3.283373225389664e-01,5.440351161184577e-01,-8.245743908576365e-01,1.205580808387412e+00,-1.775545793426218e+00,2.805863949744511e+00,-5.663366376257915e+00,3.058582839168434e-01,5.054451353996424e+00,-2.205614723601494e+00,1.190729684091348e+00,-6.449694396165259e-01,3.013275592809733e-01,-8.244744142593416e-02}, +{ 4.521547401302485e-02,-1.597339510398313e-01,3.191500234531868e-01,-5.214255218870574e-01,7.731201616523858e-01,-1.091663066487465e+00,1.516303005240088e+00,-2.142950547886143e+00,3.264739617946296e+00,-6.357168034348595e+00,5.796062004288176e-01,5.203286386913065e+00,-2.127555788721565e+00,1.036377169807823e+00,-4.604618915361187e-01,1.231607624520878e-01}, +{ -4.895241282194071e-02,1.723875745668667e-01,-3.423430860757192e-01,5.538963028588901e-01,-8.092235371233447e-01,1.117415829360333e+00,-1.498950642872349e+00,1.998584094543804e+00,-2.724932006594605e+00,4.009471606840104e+00,-7.520479819081503e+00,9.994294958167580e-01,5.530904030781693e+00,-2.049882358281328e+00,8.244511518381893e-01,-2.117762237558465e-01}, +{ 5.928299423619038e-02,-2.082960360844920e-01,4.118802508652269e-01,-6.618971169215373e-01,9.573107381993736e-01,-1.302520504691619e+00,1.709262983454911e+00,-2.202543686559666e+00,2.836316463472598e+00,-3.741063971824823e+00,5.314632528258629e+00,-9.559178845009210e+00,1.759355912363956e+00,6.057261847018622e+00,-1.871849798231633e+00,4.420462414534727e-01}, +{ -8.351455112940731e-02,2.929767073617550e-01,-5.776131897980928e-01,9.239403213933899e-01,-1.327269080234120e+00,1.788381286793143e+00,-2.314062893750573e+00,2.920320748862406e+00,-3.640435655398426e+00,4.545629586909273e+00,-5.807417829677243e+00,7.947423455960859e+00,-1.358780481934200e+01,3.453096179431657e+00,6.718046516836607e+00,-1.251696784219231e+00}, +{ 1.510432668588297e-01,-5.293385784997979e-01,1.041623568236609e+00,-1.661237251669463e+00,2.376189537454548e+00,-3.182293079362784e+00,4.082372050802650e+00,-5.088276322607347e+00,6.226391285606618e+00,-7.550426147097999e+00,9.173533123088541e+00,-1.136423089693370e+01,1.492868838411798e+01,-2.388475059147462e+01,8.764077985126372e+00,6.516633666353571e+00}, +{ -5.053563043218858e-01,1.770094643761930e+00,-3.479645274585178e+00,5.540843760657143e+00,-7.907593583255768e+00,1.055662625108527e+01,-1.348238673758740e+01,1.669876742416029e+01,-2.024664832141417e+01,2.421056421597309e+01,-2.875480499509662e+01,3.420955600593383e+01,-4.131554403478722e+01,5.215211890049017e+01,-7.636923239320635e+01,4.692264044219288e+01} +}; +#endif +#if p_cubNq==17 +const dfloat c_D[17][17] = { +{ -5.280188976692783e+01,8.601557703419192e+01,-5.892300720798516e+01,4.689793890140826e+01,-3.908186424716755e+01,3.312963422579934e+01,-2.820154473399150e+01,2.391950108501537e+01,-2.009173904768324e+01,1.661576333634216e+01,-1.343745939765693e+01,1.053183739598054e+01,-7.893552467485380e+00,5.532763569899423e+00,-3.475142737968683e+00,1.767941153938000e+00,-5.047570957087357e-01}, +{ -7.302610088038413e+00,-9.880633209819663e+00,2.689409616933903e+01,-1.688794044036253e+01,1.293811818210388e+01,-1.053249507369306e+01,8.764140996081970e+00,-7.329638359622118e+00,6.099908091680863e+00,-5.012473658304136e+00,4.035317780862202e+00,-3.152372344296712e+00,2.357032296201234e+00,-1.649236181315328e+00,1.034631750614878e+00,-5.259418051174141e-01,1.500958936853265e-01}, +{ 1.394049175708222e+00,-7.494611728511044e+00,-3.909063579087469e+00,1.527553933795601e+01,-8.991426124610109e+00,6.625547723818917e+00,-5.242544813415290e+00,4.257631479013873e+00,-3.477771252279166e+00,2.822091438739225e+00,-2.252058827733832e+00,1.748265484206282e+00,-1.301248625162991e+00,9.075234206463296e-01,-5.680274478622883e-01,2.883221359819674e-01,-8.221779740863477e-02}, +{ -4.878633999117431e-01,2.069284308996908e+00,-6.716574619531205e+00,-2.007816033335262e+00,1.071509483623250e+01,-6.006825932790750e+00,4.273917306096036e+00,-3.285416408466766e+00,2.597410409235029e+00,-2.063758143980819e+00,1.623480543419651e+00,-1.247697501163249e+00,9.220410684789364e-01,-6.397838010667632e-01,3.990332936132490e-01,-2.020813943463062e-01,5.755546852055757e-02}, +{ 2.308797675589147e-01,-9.002864482744839e-01,2.245153271074936e+00,-6.085021854166526e+00,-1.158955941380678e+00,8.393349878081960e+00,-4.522389878706174e+00,3.115711709835320e+00,-2.325956592236370e+00,1.785563525089897e+00,-1.373609284594266e+00,1.039741039820955e+00,-7.602583646750116e-01,5.236201954238732e-01,-3.249209376552751e-01,1.640118141253691e-01,-4.663189932243987e-02}, +{ -1.321711514010716e-01,4.949374214762628e-01,-1.117245536842534e+00,2.303671756737308e+00,-5.668194636766145e+00,-6.955051763231868e-01,7.053516103568676e+00,-3.671151295453493e+00,2.451939866648523e+00,-1.775125828953753e+00,1.318234358770087e+00,-9.752471790125155e-01,7.021576213483903e-01,-4.785032106042925e-01,2.948045793140088e-01,-1.481346090079157e-01,4.201691650165119e-02}, +{ 8.674776741948895e-02,-3.175365772877107e-01,6.816075150298706e-01,-1.263767092393689e+00,2.354739152166875e+00,-5.438398643189212e+00,-4.006584990280069e-01,6.236453649370719e+00,-3.142707925110340e+00,2.033792447052659e+00,-1.423561454360487e+00,1.016384430527282e+00,-7.152173184898938e-01,4.800517040862664e-01,-2.928005912977134e-01,1.462049728519981e-01,-4.133353734810915e-02}, +{ -6.339254301246205e-02,2.288061001097154e-01,-4.769369113146061e-01,8.370126032567489e-01,-1.397760357136095e+00,2.438755757992435e+00,-5.373266079768804e+00,-1.843571748328360e-01,5.740482495605556e+00,-2.801368702870911e+00,1.752295227936268e+00,-1.179220900506984e+00,8.010336459051799e-01,-5.257755370473270e-01,3.161287163705952e-01,-1.564721877654007e-01,4.403584707892799e-02}, +{ 5.072327986415626e-02,-1.813894755217104e-01,3.711056974359563e-01,-6.303560048147909e-01,9.939872188286909e-01,-1.551599324451481e+00,2.579338917433510e+00,-5.468297562396319e+00,-2.425607852750267e-15,5.468297562396318e+00,-2.579338917433501e+00,1.551599324451475e+00,-9.939872188286921e-01,6.303560048147890e-01,-3.711056974359564e-01,1.813894755217129e-01,-5.072327986415671e-02}, +{ -4.403584707892610e-02,1.564721877653965e-01,-3.161287163705927e-01,5.257755370473248e-01,-8.010336459051772e-01,1.179220900506990e+00,-1.752295227936274e+00,2.801368702870914e+00,-5.740482495605563e+00,1.843571748328451e-01,5.373266079768790e+00,-2.438755757992425e+00,1.397760357136098e+00,-8.370126032567516e-01,4.769369113146083e-01,-2.288061001097196e-01,6.339254301246283e-02}, +{ 4.133353734810855e-02,-1.462049728519957e-01,2.928005912977126e-01,-4.800517040862668e-01,7.152173184898940e-01,-1.016384430527288e+00,1.423561454360498e+00,-2.033792447052671e+00,3.142707925110355e+00,-6.236453649370741e+00,4.006584990280422e-01,5.438398643189196e+00,-2.354739152166891e+00,1.263767092393700e+00,-6.816075150298784e-01,3.175365772877208e-01,-8.674776741949411e-02}, +{ -4.201691650165051e-02,1.481346090079151e-01,-2.948045793140096e-01,4.785032106042937e-01,-7.021576213483912e-01,9.752471790125214e-01,-1.318234358770096e+00,1.775125828953758e+00,-2.451939866648531e+00,3.671151295453496e+00,-7.053516103568645e+00,6.955051763231401e-01,5.668194636766179e+00,-2.303671756737323e+00,1.117245536842542e+00,-4.949374214762737e-01,1.321711514010745e-01}, +{ 4.663189932243902e-02,-1.640118141253667e-01,3.249209376552735e-01,-5.236201954238706e-01,7.602583646750087e-01,-1.039741039820954e+00,1.373609284594266e+00,-1.785563525089892e+00,2.325956592236363e+00,-3.115711709835306e+00,4.522389878706137e+00,-8.393349878081921e+00,1.158955941380642e+00,6.085021854166540e+00,-2.245153271074936e+00,9.002864482744972e-01,-2.308797675589191e-01}, +{ -5.755546852055715e-02,2.020813943463027e-01,-3.990332936132471e-01,6.397838010667605e-01,-9.220410684789337e-01,1.247697501163246e+00,-1.623480543419652e+00,2.063758143980817e+00,-2.597410409235021e+00,3.285416408466753e+00,-4.273917306096012e+00,6.006825932790735e+00,-1.071509483623253e+01,2.007816033335354e+00,6.716574619531174e+00,-2.069284308996939e+00,4.878633999117515e-01}, +{ 8.221779740863422e-02,-2.883221359819621e-01,5.680274478622832e-01,-9.075234206463247e-01,1.301248625162989e+00,-1.748265484206282e+00,2.252058827733833e+00,-2.822091438739220e+00,3.477771252279150e+00,-4.257631479013856e+00,5.242544813415253e+00,-6.625547723818873e+00,8.991426124610095e+00,-1.527553933795589e+01,3.909063579087229e+00,7.494611728511206e+00,-1.394049175708264e+00}, +{ -1.500958936853225e-01,5.259418051173954e-01,-1.034631750614854e+00,1.649236181315302e+00,-2.357032296201195e+00,3.152372344296663e+00,-4.035317780862147e+00,5.012473658304059e+00,-6.099908091680757e+00,7.329638359621988e+00,-8.764140996081787e+00,1.053249507369286e+01,-1.293811818210371e+01,1.688794044036227e+01,-2.689409616933886e+01,9.880633209819585e+00,7.302610088038501e+00}, +{ 5.047570957087189e-01,-1.767941153937920e+00,3.475142737968578e+00,-5.532763569899299e+00,7.893552467485190e+00,-1.053183739598036e+01,1.343745939765665e+01,-1.661576333634178e+01,2.009173904768276e+01,-2.391950108501474e+01,2.820154473399067e+01,-3.312963422579843e+01,3.908186424716672e+01,-4.689793890140719e+01,5.892300720798433e+01,-8.601557703419203e+01,5.280188976692813e+01} +}; +#endif +#if p_cubNq==18 +const dfloat c_D[18][18] = { +{ -5.902694754180544e+01,9.622932618720758e+01,-6.609244810066970e+01,5.280990529841009e+01,-4.424393294609128e+01,3.776839423436984e+01,-3.243928607719447e+01,2.782834824542994e+01,-2.371537117132983e+01,1.997961748496838e+01,-1.655395884837023e+01,1.340307387774471e+01,-1.051234469283206e+01,7.882233915960317e+00,-5.526120954685501e+00,3.471393478773787e+00,-1.766135681473806e+00,5.042532915877161e-01}, +{ -8.134930326399386e+00,-1.106270326020665e+01,3.008028174420289e+01,-1.896225368172749e+01,1.460469937866073e+01,-1.197223471253925e+01,1.005139133043266e+01,-8.501971438846420e+00,7.178229913089082e+00,-6.008644786902613e+00,4.955541938570297e+00,-3.998821316207560e+00,3.128567710734384e+00,-2.341495827133139e+00,1.639369990193120e+00,-1.028831879167734e+00,5.231088844595275e-01,-1.493036612124523e-01}, +{ 1.544916719172207e+00,-8.317422087233094e+00,-4.391413911992216e+00,1.706228624372006e+01,-1.009629084759676e+01,7.491305040050003e+00,-5.980339076840714e+00,4.911810119166495e+00,-4.070003022256589e+00,3.363961464077160e+00,-2.749766368670773e+00,2.204658144851077e+00,-1.716737227340968e+00,1.280385017102148e+00,-8.941744267404906e-01,5.601597785875744e-01,-2.844796823583107e-01,8.114412430319204e-02}, +{ -5.365559397167929e-01,2.278993978680752e+00,-7.416237123705567e+00,-2.269978900701272e+00,1.193880430316499e+01,-6.738802174768555e+00,4.836987031138949e+00,-3.759967179441045e+00,3.015087762478816e+00,-2.439713837489520e+00,1.965552911279501e+00,-1.559805741634584e+00,1.205612775513361e+00,-8.943206695674099e-01,6.221188440086397e-01,-3.886589102970714e-01,1.970290240384370e-01,-5.614615298163252e-02}, +{ 2.513371580229903e-01,-9.814071182409266e-01,2.453644237302063e+00,-6.675198576420068e+00,-1.326165852183438e+00,9.317701184484259e+00,-5.064107411898614e+00,3.527574056686452e+00,-2.670622256489710e+00,2.087459087486912e+00,-1.644207190727275e+00,1.284734720116829e+00,-9.821791608676260e-01,7.228702706180005e-01,-5.000306486954305e-01,3.111647005977807e-01,-1.573439214666398e-01,4.477672167444160e-02}, +{ -1.420095640188981e-01,5.324979044481829e-01,-1.205017406864377e+00,2.493860534481423e+00,-6.167297200931485e+00,-8.152121707957325e-01,7.791751559686560e+00,-4.099560003336445e+00,2.776140701570322e+00,-2.045867234353381e+00,1.555079701886489e+00,-1.187139869623631e+00,8.932226765741359e-01,-6.500949826249928e-01,4.461668472739557e-01,-2.761465767177086e-01,1.391516111979860e-01,-3.952652785240530e-02}, +{ 9.169619336037356e-02,-3.360932637168348e-01,7.231903014890642e-01,-1.345721912001155e+00,2.519880543845876e+00,-5.857685388908863e+00,-4.958103642030005e-01,6.845693913018501e+00,-3.496718397823939e+00,2.302685281343936e+00,-1.649072880609731e+00,1.214325657816318e+00,-8.924683770545231e-01,6.392790993372602e-01,-4.339612141752236e-01,2.666048476464364e-01,-1.337105344924200e-01,3.788649512792534e-02}, +{ -6.567892862710721e-02,2.373623751982920e-01,-4.959375193083867e-01,8.734198667411914e-01,-1.465587646747234e+00,2.573278576106461e+00,-5.715787680327399e+00,-2.689502218500035e-01,6.251733365639867e+00,-3.103222538555818e+00,1.985023193020366e+00,-1.376887511290283e+00,9.761177486719900e-01,-6.831124474496196e-01,4.565872200240088e-01,-2.776394604984466e-01,1.383513474956322e-01,-3.906973824351002e-02}, +{ 5.128099943276743e-02,-1.836105053684402e-01,3.765026624880327e-01,-6.416923979229793e-01,1.016567085428561e+00,-1.596536877534610e+00,2.674897801725173e+00,-5.727806940649081e+00,-8.538868468304837e-02,5.897964294666035e+00,-2.843157018245523e+00,1.761493805439093e+00,-1.176562298999229e+00,7.945871773371532e-01,-5.192372315349862e-01,3.111890681176819e-01,-1.536939216576094e-01,4.320298196100812e-02}, +{ -4.320298196100861e-02,1.536939216576101e-01,-3.111890681176837e-01,5.192372315349842e-01,-7.945871773371481e-01,1.176562298999226e+00,-1.761493805439091e+00,2.843157018245503e+00,-5.897964294666004e+00,8.538868468301541e-02,5.727806940649098e+00,-2.674897801725166e+00,1.596536877534604e+00,-1.016567085428563e+00,6.416923979229767e-01,-3.765026624880275e-01,1.836105053684385e-01,-5.128099943276595e-02}, +{ 3.906973824350825e-02,-1.383513474956322e-01,2.776394604984487e-01,-4.565872200240090e-01,6.831124474496173e-01,-9.761177486719892e-01,1.376887511290280e+00,-1.985023193020355e+00,3.103222538555808e+00,-6.251733365639889e+00,2.689502218500474e-01,5.715787680327368e+00,-2.573278576106449e+00,1.465587646747234e+00,-8.734198667411847e-01,4.959375193083779e-01,-2.373623751982889e-01,6.567892862710677e-02}, +{ -3.788649512792769e-02,1.337105344924227e-01,-2.666048476464423e-01,4.339612141752265e-01,-6.392790993372580e-01,8.924683770545248e-01,-1.214325657816322e+00,1.649072880609730e+00,-2.302685281343935e+00,3.496718397823960e+00,-6.845693913018524e+00,4.958103642030203e-01,5.857685388908857e+00,-2.519880543845886e+00,1.345721912001151e+00,-7.231903014890546e-01,3.360932637168325e-01,-9.169619336037432e-02}, +{ 3.952652785240568e-02,-1.391516111979882e-01,2.761465767177119e-01,-4.461668472739573e-01,6.500949826249902e-01,-8.932226765741365e-01,1.187139869623632e+00,-1.555079701886484e+00,2.045867234353376e+00,-2.776140701570328e+00,4.099560003336450e+00,-7.791751559686535e+00,8.152121707956893e-01,6.167297200931518e+00,-2.493860534481423e+00,1.205017406864364e+00,-5.324979044481802e-01,1.420095640188950e-01}, +{ -4.477672167444458e-02,1.573439214666396e-01,-3.111647005977825e-01,5.000306486954328e-01,-7.228702706179956e-01,9.821791608676220e-01,-1.284734720116828e+00,1.644207190727266e+00,-2.087459087486899e+00,2.670622256489708e+00,-3.527574056686447e+00,5.064107411898587e+00,-9.317701184484241e+00,1.326165852183418e+00,6.675198576420053e+00,-2.453644237302020e+00,9.814071182409160e-01,-2.513371580229864e-01}, +{ 5.614615298163328e-02,-1.970290240384407e-01,3.886589102970777e-01,-6.221188440086470e-01,8.943206695674152e-01,-1.205612775513363e+00,1.559805741634597e+00,-1.965552911279508e+00,2.439713837489522e+00,-3.015087762478847e+00,3.759967179441071e+00,-4.836987031138969e+00,6.738802174768606e+00,-1.193880430316512e+01,2.269978900701467e+00,7.416237123705454e+00,-2.278993978680740e+00,5.365559397167939e-01}, +{ -8.114412430318829e-02,2.844796823583161e-01,-5.601597785875881e-01,8.941744267405070e-01,-1.280385017102165e+00,1.716737227340991e+00,-2.204658144851114e+00,2.749766368670806e+00,-3.363961464077195e+00,4.070003022256648e+00,-4.911810119166568e+00,5.980339076840785e+00,-7.491305040050100e+00,1.009629084759689e+01,-1.706228624372003e+01,4.391413911992071e+00,8.317422087233144e+00,-1.544916719172213e+00}, +{ 1.493036612124765e-01,-5.231088844595349e-01,1.028831879167748e+00,-1.639369990193138e+00,2.341495827133153e+00,-3.128567710734409e+00,3.998821316207596e+00,-4.955541938570327e+00,6.008644786902639e+00,-7.178229913089144e+00,8.501971438846486e+00,-1.005139133043270e+01,1.197223471253932e+01,-1.460469937866085e+01,1.896225368172744e+01,-3.008028174420276e+01,1.106270326020663e+01,8.134930326399390e+00}, +{ -5.042532915877658e-01,1.766135681473822e+00,-3.471393478773839e+00,5.526120954685547e+00,-7.882233915960343e+00,1.051234469283213e+01,-1.340307387774480e+01,1.655395884837029e+01,-1.997961748496843e+01,2.371537117132997e+01,-2.782834824543004e+01,3.243928607719451e+01,-3.776839423437001e+01,4.424393294609154e+01,-5.280990529840986e+01,6.609244810066930e+01,-9.622932618720743e+01,5.902694754180539e+01} +}; +#endif +#if p_cubNq==19 +const dfloat c_D[19][19] = { +{ -6.559781811826279e+01,1.070104838368316e+02,-7.366040263064602e+01,5.905126798000214e+01,-4.969530006282068e+01,4.267011993253342e+01,-3.692202859104480e+01,3.197025548918202e+01,-2.756506397928632e+01,2.356704137776031e+01,-1.989603438561966e+01,1.650675692579339e+01,-1.337616986314764e+01,1.049672389062579e+01,-7.872964733683045e+00,5.520588470052094e+00,-3.468236649633244e+00,1.764606738022213e+00,-5.038256266588084e-01}, +{ -9.013577984549771e+00,-1.231031285558784e+01,3.344333425230365e+01,-2.115165567740820e+01,1.636395581253609e+01,-1.349262256036111e+01,1.141184889654811e+01,-9.742747527093520e+00,8.322133045478971e+00,-7.069112163934414e+00,5.940256326196308e+00,-4.911495756046434e+00,3.969836488186113e+00,-3.109264329230074e+00,2.328696694624535e+00,-1.631150301415314e+00,1.023963841747857e+00,-5.207204280533843e-01,1.486342260584318e-01}, +{ 1.704277504894152e+00,-9.186390440458904e+00,-4.900215811890150e+00,1.894808795301105e+01,-1.126210205590571e+01,8.404763163581842e+00,-6.759021412968801e+00,5.602848554593376e+00,-4.696672838264985e+00,3.938992931449567e+00,-3.280327202209585e+00,2.694540574542609e+00,-2.167396909199513e+00,1.691406777484670e+00,-1.263352869199341e+00,8.831530358472846e-01,-5.536142741234198e-01,2.812677580603444e-01,-8.024443924448543e-02}, +{ -5.880944449793216e-01,2.500871198148796e+00,-8.155996267949048e+00,-2.545995990659757e+00,1.323034680468735e+01,-7.510714959499448e+00,5.430452178605511e+00,-4.260111523450313e+00,3.455611672051690e+00,-2.836968446662104e+00,2.328254828240700e+00,-1.892651650314655e+00,1.510840880735788e+00,-1.172414541379925e+00,8.720505153347835e-01,-6.077441261403208e-01,3.801435206687349e-01,-1.928594560575402e-01,5.497980861908034e-02}, +{ 2.731082734502842e-01,-1.067669766108172e+00,2.675055363661724e+00,-7.300841467923055e+00,-1.501362994659098e+00,1.029353399423320e+01,-5.634872909039303e+00,3.960847797627535e+00,-3.032901674481695e+00,2.404859416937184e+00,-1.929217783591448e+00,1.543834539155252e+00,-1.218650953239664e+00,9.379837885383532e-01,-6.935138092719608e-01,4.812196277594333e-01,-3.000806467720588e-01,1.519363095188734e-01,-4.326710579537822e-02}, +{ -1.526197916960393e-01,5.729443904754458e-01,-1.299290460036708e+00,2.697431492174349e+00,-6.699339031774896e+00,-9.393128718726027e-01,8.571769211532541e+00,-4.550358999719759e+00,3.116119540783457e+00,-2.329123598169667e+00,1.802690404254970e+00,-1.409010118129779e+00,1.094286936583180e+00,-8.325766534494401e-01,6.104678343946324e-01,-4.210661598602506e-01,2.614741959118688e-01,-1.320303408746717e-01,3.754401947337031e-02}, +{ 9.721119076672929e-02,-3.567104467087024e-01,7.691451532860092e-01,-1.435650916600412e+00,2.699574358938076e+00,-6.309779535798522e+00,-5.924332173984762e-01,7.490760522310794e+00,-3.868533532330167e+00,2.583201681866658e+00,-1.883180272435562e+00,1.419322564885199e+00,-1.076262354618171e+00,8.055174081745632e-01,-5.838355041839142e-01,3.994216363438605e-01,-2.466396725359165e-01,1.240887572117885e-01,-3.521782117383327e-02}, +{ -6.847981560929578e-02,2.477576259433223e-01,-5.187034488976510e-01,9.162618281872703e-01,-1.543778861904674e+00,2.725054196157547e+00,-6.094129513119038e+00,-3.518210864770575e-01,6.795470517952767e+00,-3.419332943952504e+00,2.225705461487503e+00,-1.579458947042469e+00,1.154693906652145e+00,-8.438079138534890e-01,6.017244917246725e-01,-4.070702026679419e-01,2.494565890173945e-01,-1.248991134138299e-01,3.535722981532812e-02}, +{ 5.240153708194288e-02,-1.878230043470989e-01,3.858949606125779e-01,-6.596174009074515e-01,1.049116730334988e+00,-1.656197551360370e+00,2.793191285232573e+00,-6.030982644485293e+00,-1.645910882301161e-01,6.359089625387008e+00,-3.118010868723490e+00,1.975314435476530e+00,-1.359709740533273e+00,9.581183886922634e-01,-6.673393569778210e-01,4.444241842552078e-01,-2.695230815695168e-01,1.340662043830425e-01,-3.782261432170557e-02}, +{ -4.308394219016342e-02,1.534279031958033e-01,-3.112356582984921e-01,5.207712807558822e-01,-7.999825690658552e-01,1.190463029808830e+00,-1.793651133164656e+00,2.918335951817316e+00,-6.115335590594745e+00,-1.416684050645421e-14,6.115335590594768e+00,-2.918335951817325e+00,1.793651133164659e+00,-1.190463029808834e+00,7.999825690658580e-01,-5.207712807558801e-01,3.112356582984919e-01,-1.534279031958050e-01,4.308394219016227e-02}, +{ 3.782261432170504e-02,-1.340662043830416e-01,2.695230815695175e-01,-4.444241842552089e-01,6.673393569778172e-01,-9.581183886922551e-01,1.359709740533265e+00,-1.975314435476518e+00,3.118010868723470e+00,-6.359089625386993e+00,1.645910882300996e-01,6.030982644485301e+00,-2.793191285232569e+00,1.656197551360368e+00,-1.049116730334985e+00,6.596174009074461e-01,-3.858949606125751e-01,1.878230043470995e-01,-5.240153708194324e-02}, +{ -3.535722981532737e-02,1.248991134138296e-01,-2.494565890173948e-01,4.070702026679428e-01,-6.017244917246711e-01,8.438079138534873e-01,-1.154693906652139e+00,1.579458947042460e+00,-2.225705461487493e+00,3.419332943952497e+00,-6.795470517952785e+00,3.518210864770838e-01,6.094129513119027e+00,-2.725054196157537e+00,1.543778861904667e+00,-9.162618281872648e-01,5.187034488976465e-01,-2.477576259433220e-01,6.847981560929413e-02}, +{ 3.521782117383403e-02,-1.240887572117887e-01,2.466396725359159e-01,-3.994216363438610e-01,5.838355041839135e-01,-8.055174081745616e-01,1.076262354618167e+00,-1.419322564885191e+00,1.883180272435556e+00,-2.583201681866656e+00,3.868533532330175e+00,-7.490760522310794e+00,5.924332173984596e-01,6.309779535798522e+00,-2.699574358938069e+00,1.435650916600407e+00,-7.691451532860036e-01,3.567104467087022e-01,-9.721119076672606e-02}, +{ -3.754401947337380e-02,1.320303408746719e-01,-2.614741959118687e-01,4.210661598602463e-01,-6.104678343946321e-01,8.325766534494462e-01,-1.094286936583178e+00,1.409010118129773e+00,-1.802690404254965e+00,2.329123598169670e+00,-3.116119540783472e+00,4.550358999719776e+00,-8.571769211532576e+00,9.393128718726653e-01,6.699339031774873e+00,-2.697431492174348e+00,1.299290460036701e+00,-5.729443904754478e-01,1.526197916960392e-01}, +{ 4.326710579537746e-02,-1.519363095188728e-01,3.000806467720605e-01,-4.812196277594368e-01,6.935138092719633e-01,-9.379837885383524e-01,1.218650953239662e+00,-1.543834539155251e+00,1.929217783591443e+00,-2.404859416937192e+00,3.032901674481707e+00,-3.960847797627546e+00,5.634872909039315e+00,-1.029353399423315e+01,1.501362994659007e+00,7.300841467923076e+00,-2.675055363661706e+00,1.067669766108174e+00,-2.731082734502767e-01}, +{ -5.497980861907903e-02,1.928594560575387e-01,-3.801435206687325e-01,6.077441261403232e-01,-8.720505153347899e-01,1.172414541379929e+00,-1.510840880735791e+00,1.892651650314656e+00,-2.328254828240701e+00,2.836968446662119e+00,-3.455611672051708e+00,4.260111523450338e+00,-5.430452178605550e+00,7.510714959499484e+00,-1.323034680468746e+01,2.545995990659971e+00,8.155996267948957e+00,-2.500871198148806e+00,5.880944449793047e-01}, +{ 8.024443924449431e-02,-2.812677580603463e-01,5.536142741234231e-01,-8.831530358472954e-01,1.263352869199348e+00,-1.691406777484679e+00,2.167396909199523e+00,-2.694540574542621e+00,3.280327202209593e+00,-3.938992931449575e+00,4.696672838265018e+00,-5.602848554593421e+00,6.759021412968839e+00,-8.404763163581860e+00,1.126210205590575e+01,-1.894808795301092e+01,4.900215811889808e+00,9.186390440459052e+00,-1.704277504894137e+00}, +{ -1.486342260584390e-01,5.207204280533866e-01,-1.023963841747860e+00,1.631150301415311e+00,-2.328696694624533e+00,3.109264329230075e+00,-3.969836488186102e+00,4.911495756046399e+00,-5.940256326196282e+00,7.069112163934422e+00,-8.322133045478973e+00,9.742747527093526e+00,-1.141184889654810e+01,1.349262256036106e+01,-1.636395581253607e+01,2.115165567740803e+01,-3.344333425230374e+01,1.231031285558834e+01,9.013577984549533e+00}, +{ 5.038256266587950e-01,-1.764606738022247e+00,3.468236649633289e+00,-5.520588470052127e+00,7.872964733683181e+00,-1.049672389062604e+01,1.337616986314783e+01,-1.650675692579354e+01,1.989603438561989e+01,-2.356704137776066e+01,2.756506397928680e+01,-3.197025548918259e+01,3.692202859104543e+01,-4.267011993253408e+01,4.969530006282147e+01,-5.905126798000261e+01,7.366040263064687e+01,-1.070104838368321e+02,6.559781811826242e+01} +}; +#endif diff --git a/okl/mesh/constantInterpolationMatrices.h b/okl/mesh/constantInterpolationMatrices.h new file mode 100644 index 000000000..00f79ab55 --- /dev/null +++ b/okl/mesh/constantInterpolationMatrices.h @@ -0,0 +1,1497 @@ +#if p_Nq==2 && p_cubNq==2 +const dfloat c_I[2][2] = { +{ 7.886751345948129e-01,2.113248654051871e-01}, +{ 2.113248654051871e-01,7.886751345948129e-01} +}; +#endif +#if p_Nq==2 && p_cubNq==3 +const dfloat c_I[3][2] = { +{ 8.872983346207417e-01,1.127016653792583e-01}, +{ 5.000000000000000e-01,5.000000000000000e-01}, +{ 1.127016653792582e-01,8.872983346207418e-01} +}; +#endif +#if p_Nq==3 && p_cubNq==3 +const dfloat c_I[3][3] = { +{ 6.872983346207415e-01,4.000000000000001e-01,-8.729833462074173e-02}, +{ 0.000000000000000e+00,1.000000000000000e+00,0.000000000000000e+00}, +{ -8.729833462074171e-02,3.999999999999999e-01,6.872983346207419e-01} +}; +#endif +#if p_Nq==3 && p_cubNq==4 +const dfloat c_I[4][3] = { +{ 8.013460293699309e-01,2.584442528541907e-01,-5.979028222412171e-02}, +{ 2.277840767909521e-01,8.844128900029521e-01,-1.121969667939042e-01}, +{ -1.121969667939041e-01,8.844128900029521e-01,2.277840767909521e-01}, +{ -5.979028222412175e-02,2.584442528541907e-01,8.013460293699310e-01} +}; +#endif +#if p_Nq==3 && p_cubNq==5 +const dfloat c_I[5][3] = { +{ 8.636708795620424e-01,1.788380868145793e-01,-4.250896637662162e-02}, +{ 4.142092540156868e-01,7.100508020743097e-01,-1.242600560899964e-01}, +{ 0.000000000000000e+00,1.000000000000000e+00,-1.110223024625157e-16}, +{ -1.242600560899964e-01,7.100508020743098e-01,4.142092540156866e-01}, +{ -4.250896637662141e-02,1.788380868145787e-01,8.636708795620427e-01} +}; +#endif +#if p_Nq==4 && p_cubNq==4 +const dfloat c_I[4][4] = { +{ 6.299431661034456e-01,4.725587471138180e-01,-1.495034310460796e-01,4.700151782881605e-02}, +{ -7.069479527385591e-02,9.729761862582632e-01,1.325399262454269e-01,-3.482131722983418e-02}, +{ -3.482131722983417e-02,1.325399262454271e-01,9.729761862582630e-01,-7.069479527385593e-02}, +{ 4.700151782881598e-02,-1.495034310460796e-01,4.725587471138181e-01,6.299431661034456e-01} +}; +#endif +#if p_Nq==4 && p_cubNq==5 +const dfloat c_I[5][4] = { +{ 7.400289499867194e-01,3.382587987733499e-01,-1.147111902551258e-01,3.642344149505652e-02}, +{ 8.649005029831025e-02,9.781189357260486e-01,-9.055543313316150e-02,2.594644710880267e-02}, +{ -1.250000000000000e-01,6.250000000000002e-01,6.249999999999998e-01,-1.250000000000001e-01}, +{ 2.594644710880271e-02,-9.055543313316132e-02,9.781189357260485e-01,8.649005029831010e-02}, +{ 3.642344149505650e-02,-1.147111902551254e-01,3.382587987733489e-01,7.400289499867201e-01} +}; +#endif +#if p_Nq==4 && p_cubNq==6 +const dfloat c_I[6][4] = { +{ 8.086169815356239e-01,2.516267987615978e-01,-8.850104240942550e-02,2.825726211220384e-02}, +{ 2.462720621498499e-01,8.718189146469669e-01,-1.683162305858341e-01,5.022525378901731e-02}, +{ -1.107487223383050e-01,9.039054209873039e-01,2.749206840539369e-01,-6.807738270293580e-02}, +{ -6.807738270293573e-02,2.749206840539375e-01,9.039054209873032e-01,-1.107487223383050e-01}, +{ 5.022525378901738e-02,-1.683162305858342e-01,8.718189146469666e-01,2.462720621498502e-01}, +{ 2.825726211220375e-02,-8.850104240942519e-02,2.516267987615972e-01,8.086169815356242e-01} +}; +#endif +#if p_Nq==5 && p_cubNq==5 +const dfloat c_I[5][5] = { +{ 5.933706960199464e-01,5.164351986496180e-01,-1.638236393966081e-01,8.322282221996002e-02,-2.920507749291612e-02}, +{ -1.004825688050832e-01,9.313661019373954e-01,2.296672607957849e-01,-9.069490469997687e-02,3.014411077187973e-02}, +{ -4.930380657631324e-32,1.821670890584683e-16,9.999999999999999e-01,-1.821670890584682e-16,6.374520230305539e-17}, +{ 3.014411077187984e-02,-9.069490469997710e-02,2.296672607957854e-01,9.313661019373950e-01,-1.004825688050832e-01}, +{ -2.920507749291598e-02,8.322282221995989e-02,-1.638236393966077e-01,5.164351986496165e-01,5.933706960199473e-01} +}; +#endif +#if p_Nq==5 && p_cubNq==6 +const dfloat c_I[6][5] = { +{ 6.952211757042361e-01,3.943144038467445e-01,-1.342631882800180e-01,6.902223453632833e-02,-2.429462580729100e-02}, +{ 8.290910921565864e-03,9.997474117778138e-01,-1.132826345325630e-02,4.980806950970695e-03,-1.690866197094023e-03}, +{ -9.610913838780943e-02,4.104063461116276e-01,8.177677602732188e-01,-1.911433795320234e-01,5.907841153498634e-02}, +{ 5.907841153498647e-02,-1.911433795320233e-01,8.177677602732194e-01,4.104063461116266e-01,-9.610913838780913e-02}, +{ -1.690866197093757e-03,4.980806950970432e-03,-1.132826345325616e-02,9.997474117778136e-01,8.290910921566027e-03}, +{ -2.429462580729100e-02,6.902223453632819e-02,-1.342631882800177e-01,3.943144038467435e-01,6.952211757042369e-01} +}; +#endif +#if p_Nq==5 && p_cubNq==7 +const dfloat c_I[7][5] = { +{ 7.643937937285442e-01,3.082665410350870e-01,-1.093000994560194e-01,5.659843178759577e-02,-1.995866709520774e-02}, +{ 1.370626239500499e-01,9.514732616641499e-01,-1.273991423835427e-01,5.920536387827324e-02,-2.034210710893036e-02}, +{ -1.317289831653047e-01,7.339940232440431e-01,5.142678271004546e-01,-1.722057227387882e-01,5.567285555959519e-02}, +{ 2.220446049250315e-16,-5.546977718884092e-16,1.000000000000000e+00,5.546977718884090e-16,-1.410903275752519e-16}, +{ 5.567285555959525e-02,-1.722057227387886e-01,5.142678271004557e-01,7.339940232440424e-01,-1.317289831653048e-01}, +{ -2.034210710893020e-02,5.920536387827308e-02,-1.273991423835422e-01,9.514732616641502e-01,1.370626239500491e-01}, +{ -1.995866709520775e-02,5.659843178759581e-02,-1.093000994560197e-01,3.082665410350874e-01,7.643937937285442e-01} +}; +#endif +#if p_Nq==5 && p_cubNq==8 +const dfloat c_I[8][5] = { +{ 8.130042174737380e-01,2.464712508044187e-01,-8.965213694051827e-02,4.664592398038567e-02,-1.646925531802392e-02}, +{ 2.581325644914395e-01,8.623847241881027e-01,-1.756887568231306e-01,8.438501684176192e-02,-2.921354869817341e-02}, +{ -1.068998860510706e-01,9.165640551816816e-01,2.573670959599271e-01,-1.002790223883672e-01,3.324775729782904e-02}, +{ -7.501468548961804e-02,3.033131294598592e-01,8.904809203108437e-01,-1.705392091219726e-01,5.175984484088777e-02}, +{ 5.175984484088783e-02,-1.705392091219730e-01,8.904809203108436e-01,3.033131294598597e-01,-7.501468548961809e-02}, +{ 3.324775729782924e-02,-1.002790223883676e-01,2.573670959599278e-01,9.165640551816812e-01,-1.068998860510707e-01}, +{ -2.921354869817330e-02,8.438501684176183e-02,-1.756887568231307e-01,8.623847241881025e-01,2.581325644914396e-01}, +{ -1.646925531802371e-02,4.664592398038551e-02,-8.965213694051781e-02,2.464712508044172e-01,8.130042174737389e-01} +}; +#endif +#if p_Nq==6 && p_cubNq==6 +const dfloat c_I[6][6] = { +{ 5.681003371911977e-01,5.460052990461834e-01,-1.710005937792825e-01,9.089101441009001e-02,-5.384842269849474e-02,1.985236583030600e-02}, +{ -1.149129888893825e-01,8.932544371211688e-01,2.987272976255440e-01,-1.186707513571107e-01,6.503760749312376e-02,-2.343560199334330e-02}, +{ 2.097413460326787e-02,-7.227772737120583e-02,9.883752612168266e-01,8.794580976416334e-02,-3.791030585117006e-02,1.289282763811805e-02}, +{ 1.289282763811805e-02,-3.791030585117017e-02,8.794580976416405e-02,9.883752612168261e-01,-7.227772737120618e-02,2.097413460326810e-02}, +{ -2.343560199334324e-02,6.503760749312346e-02,-1.186707513571104e-01,2.987272976255428e-01,8.932544371211694e-01,-1.149129888893821e-01}, +{ 1.985236583030604e-02,-5.384842269849478e-02,9.089101441009001e-02,-1.710005937792824e-01,5.460052990461821e-01,5.681003371911990e-01} +}; +#endif +#if p_Nq==6 && p_cubNq==7 +const dfloat c_I[7][6] = { +{ 6.613801132697956e-01,4.357362309394600e-01,-1.462685224347596e-01,7.866897648617333e-02,-4.678573164611031e-02,1.726893338544077e-02}, +{ -3.795409189277543e-02,9.936129295815763e-01,6.202307653341386e-02,-2.756343724074758e-02,1.551446787435584e-02,-5.632944855822967e-03}, +{ -6.468864365152202e-02,2.549431027449786e-01,9.193227088517445e-01,-1.604494243763332e-01,7.821173349851360e-02,-2.733947706738130e-02}, +{ 6.250000000000017e-02,-1.946486423538425e-01,6.321486423538415e-01,6.321486423538435e-01,-1.946486423538427e-01,6.250000000000006e-02}, +{ -2.733947706738137e-02,7.821173349851350e-02,-1.604494243763330e-01,9.193227088517447e-01,2.549431027449779e-01,-6.468864365152181e-02}, +{ -5.632944855823033e-03,1.551446787435588e-02,-2.756343724074787e-02,6.202307653341441e-02,9.936129295815760e-01,-3.795409189277543e-02}, +{ 1.726893338544089e-02,-4.678573164611061e-02,7.866897648617364e-02,-1.462685224347604e-01,4.357362309394608e-01,6.613801132697956e-01} +}; +#endif +#if p_Nq==6 && p_cubNq==8 +const dfloat c_I[8][6] = { +{ 7.286932192353572e-01,3.531455605723031e-01,-1.236635133187770e-01,6.702420996760580e-02,-3.996082014716593e-02,1.476134369067709e-02}, +{ 6.441507565198301e-02,9.872345947467756e-01,-7.388270651774427e-02,3.492584267165040e-02,-1.998283194393247e-02,7.290025391267893e-03}, +{ -1.205882945854512e-01,5.691527543107484e-01,6.868992194178711e-01,-2.035888509000692e-01,1.056302664716023e-01,-3.750509471470150e-02}, +{ 4.088031273045199e-02,-1.367506889625265e-01,9.460354747981566e-01,2.054841216266148e-01,-8.385647894695793e-02,2.820725875426101e-02}, +{ 2.820725875426104e-02,-8.385647894695757e-02,2.054841216266138e-01,9.460354747981570e-01,-1.367506889625260e-01,4.088031273045181e-02}, +{ -3.750509471470136e-02,1.056302664716024e-01,-2.035888509000693e-01,6.868992194178712e-01,5.691527543107480e-01,-1.205882945854508e-01}, +{ 7.290025391267978e-03,-1.998283194393278e-02,3.492584267165075e-02,-7.388270651774524e-02,9.872345947467757e-01,6.441507565198355e-02}, +{ 1.476134369067696e-02,-3.996082014716581e-02,6.702420996760562e-02,-1.236635133187768e-01,3.531455605723011e-01,7.286932192353589e-01} +}; +#endif +#if p_Nq==6 && p_cubNq==9 +const dfloat c_I[9][6] = { +{ 7.783859592397122e-01,2.907425227594495e-01,-1.046950406378952e-01,5.704461519592489e-02,-3.407033538455918e-02,1.259227882736776e-02}, +{ 1.691240123787461e-01,9.309379734857437e-01,-1.452478897561335e-01,7.135034785808783e-02,-4.126826052138115e-02,1.510381655493688e-02}, +{ -1.305520612867672e-01,7.928707844907442e-01,4.437673169451030e-01,-1.620490677625707e-01,8.724854277152083e-02,-3.128551515803027e-02}, +{ -1.984889495580083e-02,7.250048845307433e-02,9.916271342593204e-01,-6.348833121764488e-02,2.933819560100882e-02,-1.012859213995770e-02}, +{ 6.250000000000000e-02,-1.946486423538424e-01,6.321486423538419e-01,6.321486423538430e-01,-1.946486423538426e-01,6.250000000000000e-02}, +{ -1.012859213995776e-02,2.933819560100904e-02,-6.348833121764563e-02,9.916271342593204e-01,7.250048845307516e-02,-1.984889495580122e-02}, +{ -3.128551515803024e-02,8.724854277152072e-02,-1.620490677625707e-01,4.437673169451026e-01,7.928707844907443e-01,-1.305520612867668e-01}, +{ 1.510381655493709e-02,-4.126826052138143e-02,7.135034785808801e-02,-1.452478897561338e-01,9.309379734857441e-01,1.691240123787461e-01}, +{ 1.259227882736796e-02,-3.407033538455928e-02,5.704461519592507e-02,-1.046950406378954e-01,2.907425227594495e-01,7.783859592397122e-01} +}; +#endif +#if p_Nq==7 && p_cubNq==7 +const dfloat c_I[7][7] = { +{ 5.496224156230661e-01,5.672889949264005e-01,-1.753729392504739e-01,9.430812000369378e-02,-5.939846606960709e-02,3.790276420569619e-02,-1.435088943877566e-02}, +{ -1.225480094508263e-01,8.610722772402529e-01,3.497685964998545e-01,-1.366894418469783e-01,7.879817845165070e-02,-4.858952596347067e-02,1.818792506951715e-02}, +{ 3.401809093350913e-02,-1.148336270801370e-01,9.659784214378074e-01,1.593672944505577e-01,-6.957880421214432e-02,3.942575103319104e-02,-1.437712656278395e-02}, +{ 0.000000000000000e+00,3.284858003315452e-16,-7.336515836066076e-16,1.000000000000000e+00,7.355211236263767e-16,-3.299828092647675e-16,1.507866011067049e-16}, +{ -1.437712656278400e-02,3.942575103319136e-02,-6.957880421214473e-02,1.593672944505586e-01,9.659784214378073e-01,-1.148336270801377e-01,3.401809093350920e-02}, +{ 1.818792506951739e-02,-4.858952596347052e-02,7.879817845165034e-02,-1.366894418469778e-01,3.497685964998545e-01,8.610722772402515e-01,-1.225480094508254e-01}, +{ -1.435088943877583e-02,3.790276420569637e-02,-5.939846606960734e-02,9.430812000369440e-02,-1.753729392504755e-01,5.672889949264017e-01,5.496224156230661e-01} +}; +#endif +#if p_Nq==7 && p_cubNq==8 +const dfloat c_I[8][7] = { +{ 6.350438437625402e-01,4.674709615755651e-01,-1.545102241229813e-01,8.403356797504626e-02,-5.313177191953877e-02,3.395788465603239e-02,-1.286426192666382e-02}, +{ -6.686280177625584e-02,9.768280677763566e-01,1.248776379178705e-01,-5.460934792907768e-02,3.234816598450935e-02,-2.014876303866046e-02,7.567041065257554e-03}, +{ -3.850058335269067e-02,1.445528523606540e-01,9.703754681309464e-01,-1.112306143208939e-01,5.531519567788890e-02,-3.248668159787811e-02,1.197436310197334e-02}, +{ 5.464946614535715e-02,-1.663516843701490e-01,4.707874429368521e-01,7.784764797450096e-01,-2.059984830313434e-01,1.061447002946480e-01,-3.770792172037468e-02}, +{ -3.770792172037507e-02,1.061447002946480e-01,-2.059984830313434e-01,7.784764797450089e-01,4.707874429368531e-01,-1.663516843701492e-01,5.464946614535733e-02}, +{ 1.197436310197353e-02,-3.248668159787769e-02,5.531519567788833e-02,-1.112306143208935e-01,9.703754681309467e-01,1.445528523606531e-01,-3.850058335269032e-02}, +{ 7.567041065257296e-03,-2.014876303866020e-02,3.234816598450877e-02,-5.460934792907669e-02,1.248776379178687e-01,9.768280677763563e-01,-6.686280177625444e-02}, +{ -1.286426192666357e-02,3.395788465603240e-02,-5.313177191953859e-02,8.403356797504627e-02,-1.545102241229815e-01,4.674709615755626e-01,6.350438437625425e-01} +}; +#endif +#if p_Nq==7 && p_cubNq==9 +const dfloat c_I[9][7] = { +{ 6.995098678471079e-01,3.893122348117259e-01,-1.343122465329718e-01,7.361500956703058e-02,-4.666890777843523e-02,2.986030902724026e-02,-1.131626694169767e-02}, +{ 1.467769306253532e-02,9.992284227915851e-01,-1.973608507271650e-02,9.211848917571264e-03,-5.553569626621691e-03,3.482498371392189e-03,-1.310808443745606e-03}, +{ -9.993398920103869e-02,4.295917446720899e-01,8.049979562125282e-01,-2.015735745589448e-01,1.075016215674348e-01,-6.453195455459175e-02,2.394819586252242e-02}, +{ 5.932582155455607e-02,-1.910361535956981e-01,8.348274595976652e-01,3.956334686460114e-01,-1.522025571158335e-01,8.372503411117974e-02,-3.027307319788083e-02}, +{ -1.110223024625157e-16,1.743377758688096e-16,-3.501909555650235e-16,1.000000000000000e+00,2.835168343454810e-16,-1.958893480767334e-16,1.110223024625157e-16}, +{ -3.027307319788075e-02,8.372503411117961e-02,-1.522025571158332e-01,3.956334686460102e-01,8.348274595976664e-01,-1.910361535956981e-01,5.932582155455590e-02}, +{ 2.394819586252256e-02,-6.453195455459153e-02,1.075016215674344e-01,-2.015735745589443e-01,8.049979562125278e-01,4.295917446720897e-01,-9.993398920103863e-02}, +{ -1.310808443745648e-03,3.482498371392362e-03,-5.553569626621989e-03,9.211848917571874e-03,-1.973608507271760e-02,9.992284227915850e-01,1.467769306253608e-02}, +{ -1.131626694169779e-02,2.986030902724041e-02,-4.666890777843535e-02,7.361500956703096e-02,-1.343122465329725e-01,3.893122348117258e-01,6.995098678471084e-01} +}; +#endif +#if p_Nq==7 && p_cubNq==10 +const dfloat c_I[10][7] = { +{ 7.489343251781180e-01,3.279324886725764e-01,-1.165086225984580e-01,6.421083217930700e-02,-4.078555812352368e-02,2.611684971410366e-02,-9.900315022123459e-03}, +{ 1.038742715016278e-01,9.700172103233357e-01,-1.065199400523905e-01,5.184894433881465e-02,-3.163982895398518e-02,1.993460732872982e-02,-7.515264486132225e-03}, +{ -1.295428488760914e-01,6.639486925463425e-01,5.938954902128136e-01,-1.956060663622887e-01,1.089050154126601e-01,-6.632932975375655e-02,2.472904682032015e-02}, +{ 2.055158558069897e-02,-7.075150784255495e-02,9.889880621331241e-01,8.597874887058576e-02,-3.886197239354870e-02,2.221888752023224e-02,-8.123803868537449e-03}, +{ 4.651486350686462e-02,-1.400969177462280e-01,3.725577701041141e-01,8.509725820102596e-01,-1.929812289552385e-01,9.749274326869788e-02,-3.445981218846955e-02}, +{ -3.445981218846983e-02,9.749274326869795e-02,-1.929812289552386e-01,8.509725820102585e-01,3.725577701041159e-01,-1.400969177462286e-01,4.651486350686462e-02}, +{ -8.123803868537560e-03,2.221888752023215e-02,-3.886197239354838e-02,8.597874887058532e-02,9.889880621331243e-01,-7.075150784255460e-02,2.055158558069875e-02}, +{ 2.472904682032012e-02,-6.632932975375631e-02,1.089050154126595e-01,-1.956060663622875e-01,5.938954902128109e-01,6.639486925463436e-01,-1.295428488760903e-01}, +{ -7.515264486132267e-03,1.993460732872976e-02,-3.163982895398509e-02,5.184894433881510e-02,-1.065199400523915e-01,9.700172103233357e-01,1.038742715016283e-01}, +{ -9.900315022123907e-03,2.611684971410416e-02,-4.078555812352450e-02,6.421083217930823e-02,-1.165086225984604e-01,3.279324886725830e-01,7.489343251781134e-01} +}; +#endif +#if p_Nq==7 && p_cubNq==11 +const dfloat c_I[11][7] = { +{ 7.874576129149565e-01,2.792864192014037e-01,-1.013430818106370e-01,5.608180457523022e-02,-3.567336720008248e-02,2.285695572471294e-02,-8.666343405584066e-03}, +{ 1.910957934552211e-01,9.154989987499278e-01,-1.553867720798033e-01,7.785466185129319e-02,-4.792709444275085e-02,3.030116189389346e-02,-1.143674942778151e-02}, +{ -1.274667934428669e-01,8.287380392372876e-01,3.963647319580240e-01,-1.507484832917080e-01,8.638140834236636e-02,-5.314961759327649e-02,1.988071479017364e-02}, +{ -3.389428800994423e-02,1.263162253318635e-01,9.767765990870809e-01,-1.004817004614098e-01,4.967930963243856e-02,-2.912614086365335e-02,1.072999528362442e-02}, +{ 6.404608888025454e-02,-2.011799340321685e-01,7.067911298454514e-01,5.554037984517750e-01,-1.907759921221282e-01,1.025650969658738e-01,-3.685018798905809e-02}, +{ 0.000000000000000e+00,-2.509490844243652e-16,4.745195025781237e-16,1.000000000000000e+00,-5.051179410184131e-16,2.298319219294550e-16,0.000000000000000e+00}, +{ -3.685018798905826e-02,1.025650969658740e-01,-1.907759921221280e-01,5.554037984517746e-01,7.067911298454521e-01,-2.011799340321686e-01,6.404608888025420e-02}, +{ 1.072999528362467e-02,-2.912614086365337e-02,4.967930963243816e-02,-1.004817004614095e-01,9.767765990870804e-01,1.263162253318638e-01,-3.389428800994415e-02}, +{ 1.988071479017350e-02,-5.314961759327619e-02,8.638140834236546e-02,-1.507484832917065e-01,3.963647319580216e-01,8.287380392372879e-01,-1.274667934428659e-01}, +{ -1.143674942778150e-02,3.030116189389365e-02,-4.792709444275112e-02,7.785466185129400e-02,-1.553867720798047e-01,9.154989987499276e-01,1.910957934552221e-01}, +{ -8.666343405584087e-03,2.285695572471297e-02,-3.567336720008257e-02,5.608180457523047e-02,-1.013430818106374e-01,2.792864192014038e-01,7.874576129149569e-01} +}; +#endif +#if p_Nq==8 && p_cubNq==8 +const dfloat c_I[8][8] = { +{ 5.355342505720099e-01,5.833338986139706e-01,-1.783117095445903e-01,9.623288030752708e-02,-6.179092624737878e-02,4.234811659541887e-02,-2.819497837947593e-02,1.084846808251884e-02}, +{ -1.268188705192677e-01,8.342973267412824e-01,3.888165421740908e-01,-1.491943684673626e-01,8.711220248284390e-02,-5.740146953079425e-02,3.754106590734196e-02,-1.435242878813459e-02}, +{ 4.248856454305550e-02,-1.414351904597506e-01,9.415999436891997e-01,2.166409038490179e-01,-9.323094155363872e-02,5.576597963277860e-02,-3.504395556278680e-02,1.321469586212445e-02}, +{ -9.260211868486055e-03,2.668371360042684e-02,-5.724050704923754e-02,9.935181852430504e-01,6.543089000328367e-02,-3.014872476127998e-02,1.740616545110563e-02,-6.389510618863017e-03}, +{ -6.389510618862879e-03,1.740616545110546e-02,-3.014872476127998e-02,6.543089000328370e-02,9.935181852430504e-01,-5.724050704923744e-02,2.668371360042675e-02,-9.260211868485932e-03}, +{ 1.321469586212420e-02,-3.504395556278646e-02,5.576597963277836e-02,-9.323094155363876e-02,2.166409038490181e-01,9.415999436891995e-01,-1.414351904597506e-01,4.248856454305556e-02}, +{ -1.435242878813454e-02,3.754106590734203e-02,-5.740146953079406e-02,8.711220248284353e-02,-1.491943684673626e-01,3.888165421740901e-01,8.342973267412833e-01,-1.268188705192678e-01}, +{ 1.084846808251893e-02,-2.819497837947593e-02,4.234811659541900e-02,-6.179092624737875e-02,9.623288030752700e-02,-1.783117095445901e-01,5.833338986139678e-01,5.355342505720121e-01} +}; +#endif +#if p_Nq==8 && p_cubNq==9 +const dfloat c_I[9][8] = { +{ 6.140199424134869e-01,4.924933510874336e-01,-1.604972830785227e-01,8.755063945380108e-02,-5.642552470063956e-02,3.873475837799463e-02,-2.580914381943445e-02,9.933260265880345e-03}, +{ -8.576211976259800e-02,9.565208064674118e-01,1.778741554177247e-01,-7.625081278625126e-02,4.571647336183470e-02,-3.044001166598239e-02,2.000059478327952e-02,-7.659085815419058e-03}, +{ -1.801202465982603e-02,6.546846920862762e-02,9.931316132781199e-01,-5.856886186586573e-02,2.876734415527790e-02,-1.785985536508347e-02,1.138971948601422e-02,-4.316404237264418e-03}, +{ 4.366503554225169e-02,-1.309060500845320e-01,3.409688113100978e-01,8.722924385549529e-01,-1.879358782544943e-01,9.955853952912198e-02,-5.992450656959268e-02,2.228160997219482e-02}, +{ -3.906250000000017e-02,1.088400233988091e-01,-2.040293534695561e-01,6.342518300707468e-01,6.342518300707476e-01,-2.040293534695564e-01,1.088400233988091e-01,-3.906250000000005e-02}, +{ 2.228160997219469e-02,-5.992450656959257e-02,9.955853952912161e-02,-1.879358782544939e-01,8.722924385549522e-01,3.409688113100984e-01,-1.309060500845319e-01,4.366503554225141e-02}, +{ -4.316404237264468e-03,1.138971948601432e-02,-1.785985536508388e-02,2.876734415527831e-02,-5.856886186586636e-02,9.931316132781201e-01,6.546846920862821e-02,-1.801202465982622e-02}, +{ -7.659085815419117e-03,2.000059478327984e-02,-3.044001166598264e-02,4.571647336183480e-02,-7.625081278625151e-02,1.778741554177254e-01,9.565208064674117e-01,-8.576211976259850e-02}, +{ 9.933260265880334e-03,-2.580914381943435e-02,3.873475837799452e-02,-5.642552470063941e-02,8.755063945380111e-02,-1.604972830785227e-01,4.924933510874334e-01,6.140199424134870e-01} +}; +#endif +#if p_Nq==8 && p_cubNq==10 +const dfloat c_I[10][8] = { +{ 6.753202332849673e-01,4.189375202774526e-01,-1.424879209484234e-01,7.831996363276472e-02,-5.061166829344464e-02,3.478510115163170e-02,-2.319042448852868e-02,8.927195383580569e-03}, +{ -2.033697832508445e-02,9.983084589880242e-01,3.102491432480985e-02,-1.422125507601435e-02,8.680299708263681e-03,-5.821857234863330e-03,3.837790349924596e-03,-1.471372735060233e-03}, +{ -7.821578661000543e-02,3.166739008401062e-01,8.835542489150554e-01,-1.812652107581687e-01,9.588591118632776e-02,-6.096719929291192e-02,3.926511698138051e-02,-1.493098126178378e-02}, +{ 6.423655929561191e-02,-2.016797050153989e-01,7.105630890353555e-01,5.519468065202032e-01,-1.924540310533173e-01,1.097317570744844e-01,-6.773644412421088e-02,2.539196826727196e-02}, +{ -2.016277465916426e-02,5.766362452581369e-02,-1.197692540051788e-01,9.651559489434690e-01,1.628245029085570e-01,-7.161591351391351e-02,4.084114338375718e-02,-1.493727758334053e-02}, +{ -1.493727758334037e-02,4.084114338375691e-02,-7.161591351391305e-02,1.628245029085558e-01,9.651559489434696e-01,-1.197692540051783e-01,5.766362452581332e-02,-2.016277465916386e-02}, +{ 2.539196826727211e-02,-6.773644412421065e-02,1.097317570744837e-01,-1.924540310533162e-01,5.519468065202017e-01,7.105630890353563e-01,-2.016797050153984e-01,6.423655929561167e-02}, +{ -1.493098126178380e-02,3.926511698138076e-02,-6.096719929291216e-02,9.588591118632810e-02,-1.812652107581699e-01,8.835542489150549e-01,3.166739008401081e-01,-7.821578661000601e-02}, +{ -1.471372735060201e-03,3.837790349924758e-03,-5.821857234863075e-03,8.680299708263490e-03,-1.422125507601458e-02,3.102491432481062e-02,9.983084589880242e-01,-2.033697832508518e-02}, +{ 8.927195383580801e-03,-2.319042448852906e-02,3.478510115163205e-02,-5.061166829344522e-02,7.831996363276579e-02,-1.424879209484255e-01,4.189375202774606e-01,6.753202332849605e-01} +}; +#endif +#if p_Nq==8 && p_cubNq==11 +const dfloat c_I[11][8] = { +{ 7.237402583181961e-01,3.594022728672484e-01,-1.259855551900917e-01,6.963871341461686e-02,-4.509136838400905e-02,3.101860639015305e-02,-2.068803120349565e-02,7.965103787381897e-03}, +{ 5.531778268025274e-02,9.903528175578387e-01,-6.537047429217836e-02,3.132519251968489e-02,-1.936501990268358e-02,1.305684834653218e-02,-8.627819572488597e-03,3.310672663042086e-03}, +{ -1.177775587422398e-01,5.452183913970147e-01,7.094427940393746e-01,-2.073652834489831e-01,1.149679549631315e-01,-7.430758894514662e-02,4.819079694935798e-02,-1.836950621250923e-02}, +{ 4.544484725287043e-02,-1.505296451624560e-01,9.302857774782992e-01,2.397370390158299e-01,-1.019635758343289e-01,6.080549289413305e-02,-3.816651891009656e-02,1.438658326574893e-02}, +{ 2.340594864551932e-02,-6.896009769134076e-02,1.640164942195122e-01,9.644434256227109e-01,-1.213381963631953e-01,6.135207509175340e-02,-3.638672648046699e-02,1.346707695550713e-02}, +{ -3.906250000000012e-02,1.088400233988092e-01,-2.040293534695563e-01,6.342518300707481e-01,6.342518300707464e-01,-2.040293534695563e-01,1.088400233988091e-01,-3.906249999999999e-02}, +{ 1.346707695550691e-02,-3.638672648046679e-02,6.135207509175292e-02,-1.213381963631947e-01,9.644434256227106e-01,1.640164942195124e-01,-6.896009769134076e-02,2.340594864551935e-02}, +{ 1.438658326574876e-02,-3.816651891009634e-02,6.080549289413256e-02,-1.019635758343282e-01,2.397370390158285e-01,9.302857774782998e-01,-1.505296451624552e-01,4.544484725287015e-02}, +{ -1.836950621250934e-02,4.819079694935793e-02,-7.430758894514641e-02,1.149679549631317e-01,-2.073652834489837e-01,7.094427940393739e-01,5.452183913970159e-01,-1.177775587422401e-01}, +{ 3.310672663042111e-03,-8.627819572488513e-03,1.305684834653211e-02,-1.936501990268361e-02,3.132519251968505e-02,-6.537047429217847e-02,9.903528175578385e-01,5.531778268025282e-02}, +{ 7.965103787382050e-03,-2.068803120349569e-02,3.101860639015296e-02,-4.509136838400894e-02,6.963871341461689e-02,-1.259855551900917e-01,3.594022728672483e-01,7.237402583181962e-01} +}; +#endif +#if p_Nq==8 && p_cubNq==12 +const dfloat c_I[12][8] = { +{ 7.624631483837585e-01,3.109546936311265e-01,-1.114525411960764e-01,6.186853031843344e-02,-4.012116015551428e-02,2.761839470281960e-02,-1.842614853160369e-02,7.095082847056397e-03}, +{ 1.326567140510431e-01,9.542172777829953e-01,-1.258256687548950e-01,6.221110323876358e-02,-3.882230748517822e-02,2.628000342218146e-02,-1.739711512905519e-02,6.679992874145049e-03}, +{ -1.320916129732376e-01,7.249279338237450e-01,5.271174104794968e-01,-1.842469653575637e-01,1.054833394669561e-01,-6.898754371513810e-02,4.497009265366572e-02,-1.717265437792421e-02}, +{ 3.434991611644454e-03,-1.210579772237747e-02,9.997253205439004e-01,1.274374959530465e-02,-6.047291330131175e-03,3.715823621390770e-03,-2.359849511420886e-03,8.930531916894933e-04}, +{ 5.627920190048103e-02,-1.714922938556915e-01,4.911586086266639e-01,7.626624665840881e-01,-2.094960760067484e-01,1.145923921244732e-01,-6.971476830095488e-02,2.601046892768859e-02}, +{ -2.639762375462995e-02,7.513441852282852e-02,-1.529926722490539e-01,9.334840978629134e-01,2.345783043097852e-01,-9.954338742894317e-02,5.625861331841402e-02,-2.052175058131412e-02}, +{ -2.052175058131426e-02,5.625861331841394e-02,-9.954338742894314e-02,2.345783043097851e-01,9.334840978629140e-01,-1.529926722490541e-01,7.513441852282846e-02,-2.639762375462997e-02}, +{ 2.601046892768855e-02,-6.971476830095481e-02,1.145923921244727e-01,-2.094960760067477e-01,7.626624665840871e-01,4.911586086266649e-01,-1.714922938556915e-01,5.627920190048091e-02}, +{ 8.930531916894794e-04,-2.359849511420501e-03,3.715823621390352e-03,-6.047291330130571e-03,1.274374959530343e-02,9.997253205439006e-01,-1.210579772237663e-02,3.434991611643978e-03}, +{ -1.717265437792434e-02,4.497009265366588e-02,-6.898754371513821e-02,1.054833394669563e-01,-1.842469653575646e-01,5.271174104794979e-01,7.249279338237445e-01,-1.320916129732375e-01}, +{ 6.679992874145224e-03,-1.739711512905521e-02,2.628000342218147e-02,-3.882230748517807e-02,6.221110323876332e-02,-1.258256687548945e-01,9.542172777829956e-01,1.326567140510422e-01}, +{ 7.095082847056289e-03,-1.842614853160382e-02,2.761839470281975e-02,-4.012116015551483e-02,6.186853031843452e-02,-1.114525411960783e-01,3.109546936311314e-01,7.624631483837550e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==9 +const dfloat c_I[9][9] = { +{ 5.244429122844280e-01,5.958566985583839e-01,-1.804126288089369e-01,9.746304972297780e-02,-6.307578945894252e-02,4.429527662267451e-02,-3.190536244565979e-02,2.181997829412040e-02,-8.484134769045262e-03}, +{ -1.292847952946989e-01,8.119520222296919e-01,4.195549838225497e-01,-1.583015387188154e-01,9.273173897441650e-02,-6.243009352483582e-02,4.404135172136838e-02,-2.980959816446436e-02,1.154592895478820e-02}, +{ 4.820922384829929e-02,-1.588603833110272e-01,9.182382830560551e-01,2.630275076567237e-01,-1.111328520013887e-01,6.740852655710215e-02,-4.540458347114949e-02,3.006714315519038e-02,-1.155286548980521e-02}, +{ -1.595298560699732e-02,4.572157820117682e-02,-9.602554563833028e-02,9.795710516542204e-01,1.215858913287557e-01,-5.538507617261743e-02,3.384184787434381e-02,-2.149732991841784e-02,8.140568277866092e-03}, +{ 2.220446049250313e-16,-1.850346063343578e-16,2.758957802184714e-16,-5.080368779773549e-16,1.000000000000000e+00,5.300535822189961e-16,-2.954953839672239e-16,2.002519345521415e-16,-2.220446049250313e-16}, +{ 8.140568277866023e-03,-2.149732991841790e-02,3.384184787434372e-02,-5.538507617261725e-02,1.215858913287550e-01,9.795710516542208e-01,-9.602554563832981e-02,4.572157820117653e-02,-1.595298560699711e-02}, +{ -1.155286548980516e-02,3.006714315519027e-02,-4.540458347114917e-02,6.740852655710215e-02,-1.111328520013887e-01,2.630275076567239e-01,9.182382830560554e-01,-1.588603833110275e-01,4.820922384829900e-02}, +{ 1.154592895478823e-02,-2.980959816446403e-02,4.404135172136814e-02,-6.243009352483591e-02,9.273173897441629e-02,-1.583015387188148e-01,4.195549838225488e-01,8.119520222296913e-01,-1.292847952946981e-01}, +{ -8.484134769045359e-03,2.181997829412043e-02,-3.190536244565973e-02,4.429527662267456e-02,-6.307578945894265e-02,9.746304972297823e-02,-1.804126288089384e-01,5.958566985583842e-01,5.244429122844287e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==10 +const dfloat c_I[10][9] = { +{ 5.968750899412025e-01,5.126924425292181e-01,-1.650141466731764e-01,9.004967026397198e-02,-5.848438995251541e-02,4.113714801771441e-02,-2.965492821020259e-02,2.028932717806745e-02,-7.890213094280014e-03}, +{ -9.857080589805101e-02,9.357505619587616e-01,2.225652749858444e-01,-9.357919993563933e-02,5.623053061642966e-02,-3.824493497522750e-02,2.711293575226440e-02,-1.839592324821341e-02,7.131560743831068e-03}, +{ -2.205690440727270e-03,7.833021632501576e-03,9.998901407465379e-01,-7.895230803956392e-03,3.806320760973370e-03,-2.395332772515018e-03,1.638693309454764e-03,-1.092977107637671e-03,4.210546753688088e-04}, +{ 3.269865783524817e-02,-9.696845469104790e-02,2.389162059403330e-01,9.309988151853480e-01,-1.563391192585770e-01,8.214389660172550e-02,-5.244602424176317e-02,3.392142354122778e-02,-1.292540091249438e-02}, +{ -3.601277525360747e-02,9.963726249715044e-02,-1.823949859960125e-01,5.052443260261779e-01,7.529607781243607e-01,-2.114196411430797e-01,1.166517890299632e-01,-7.134625695460721e-02,2.667950366965460e-02}, +{ 2.667950366965477e-02,-7.134625695460735e-02,1.166517890299631e-01,-2.114196411430796e-01,7.529607781243587e-01,5.052443260261803e-01,-1.823949859960134e-01,9.963726249715084e-02,-3.601277525360735e-02}, +{ -1.292540091249439e-02,3.392142354122765e-02,-5.244602424176312e-02,8.214389660172579e-02,-1.563391192585770e-01,9.309988151853472e-01,2.389162059403341e-01,-9.696845469104844e-02,3.269865783524820e-02}, +{ 4.210546753689037e-04,-1.092977107637619e-03,1.638693309454934e-03,-2.395332772515235e-03,3.806320760973444e-03,-7.895230803957131e-03,9.998901407465376e-01,7.833021632502470e-03,-2.205690440727374e-03}, +{ 7.131560743831068e-03,-1.839592324821344e-02,2.711293575226464e-02,-3.824493497522750e-02,5.623053061642944e-02,-9.357919993563898e-02,2.225652749858434e-01,9.357505619587613e-01,-9.857080589804984e-02}, +{ -7.890213094280327e-03,2.028932717806761e-02,-2.965492821020280e-02,4.113714801771480e-02,-5.848438995251639e-02,9.004967026397365e-02,-1.650141466731798e-01,5.126924425292277e-01,5.968750899411955e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==11 +const dfloat c_I[11][9] = { +{ 6.550006942915111e-01,4.435716694926817e-01,-1.489197736246804e-01,8.187124424844040e-02,-5.331243279483602e-02,3.754431313825739e-02,-2.708152612254725e-02,1.853440375065812e-02,-7.208592379485047e-03}, +{ -4.561330210722980e-02,9.904378374334051e-01,7.716474143714247e-02,-3.472174559977322e-02,2.123816331526432e-02,-1.455173558448099e-02,1.035324506283427e-02,-7.037080409917945e-03,2.729876452755804e-03}, +{ -5.834800558352571e-02,2.265938863115020e-01,9.345516899095446e-01,-1.514943153167281e-01,7.886310158332817e-02,-5.085996468024255e-02,3.517221788418225e-02,-2.357901991872596e-02,9.100409810665538e-03}, +{ 6.191326584222923e-02,-1.909179563084407e-01,5.920934000867233e-01,6.741163330778485e-01,-2.097664945321859e-01,1.191862909108876e-01,-7.824585058808910e-02,5.122104117637982e-02,-1.960002966535260e-02}, +{ -3.218253714354979e-02,9.104799364136079e-02,-1.812958102869985e-01,8.871892731048824e-01,3.189542974593478e-01,-1.312206253851253e-01,7.806241955210010e-02,-4.907187222798872e-02,1.851686128597127e-02}, +{ 1.110223024625157e-16,1.103655041684964e-16,-2.570590060887539e-16,5.229791390943358e-16,1.000000000000000e+00,-5.955302078893015e-16,3.216450669416013e-16,-1.605107672508741e-16,1.110223024625157e-16}, +{ 1.851686128597130e-02,-4.907187222798875e-02,7.806241955209993e-02,-1.312206253851251e-01,3.189542974593474e-01,8.871892731048827e-01,-1.812958102869988e-01,9.104799364136075e-02,-3.218253714354960e-02}, +{ -1.960002966535246e-02,5.122104117637958e-02,-7.824585058808901e-02,1.191862909108875e-01,-2.097664945321854e-01,6.741163330778477e-01,5.920934000867242e-01,-1.909179563084411e-01,6.191326584222908e-02}, +{ 9.100409810665594e-03,-2.357901991872587e-02,3.517221788418226e-02,-5.085996468024219e-02,7.886310158332768e-02,-1.514943153167280e-01,9.345516899095435e-01,2.265938863115027e-01,-5.834800558352553e-02}, +{ 2.729876452755681e-03,-7.037080409917960e-03,1.035324506283435e-02,-1.455173558448103e-02,2.123816331526391e-02,-3.472174559977208e-02,7.716474143713985e-02,9.904378374334055e-01,-4.561330210722830e-02}, +{ -7.208592379485276e-03,1.853440375065811e-02,-2.708152612254723e-02,3.754431313825739e-02,-5.331243279483620e-02,8.187124424844087e-02,-1.489197736246816e-01,4.435716694926819e-01,6.550006942915120e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==12 +const dfloat c_I[12][9] = { +{ 7.020314287390872e-01,3.862596866239101e-01,-1.337049613674030e-01,7.391980287709596e-02,-4.823111774818368e-02,3.399713084766436e-02,-2.453440063922472e-02,1.679516757204334e-02,-6.532736904989619e-03}, +{ 1.860360348662671e-02,9.987776662295925e-01,-2.471136816781408e-02,1.164390068376399e-02,-7.215304999245129e-03,4.970940074420752e-03,-3.546298854468705e-03,2.413654981312763e-03,-9.367934341888095e-04}, +{ -1.020525359825462e-01,4.413872512969982e-01,7.962178026881802e-01,-2.038582453534801e-01,1.115427814178285e-01,-7.319068429469827e-02,5.101444790875414e-02,-3.432822269920411e-02,1.326740501816746e-02}, +{ 5.858942296647351e-02,-1.888914378981254e-01,8.458263174908256e-01,3.808520530855855e-01,-1.505578635443582e-01,8.984010430103719e-02,-6.011288262366329e-02,3.968677500096978e-02,-1.523248877874467e-02}, +{ 2.110902762877620e-03,-6.123413963659671e-03,1.356117423623847e-02,9.996922521675758e-01,-1.326765252897739e-02,6.447214070441479e-03,-4.014490629365485e-03,2.569606446048263e-03,-9.755925611790372e-04}, +{ -3.215253333194479e-02,8.863726361030715e-02,-1.601984964017338e-01,4.175419508172293e-01,8.213516043079926e-01,-2.033918195926612e-01,1.101942304489617e-01,-6.697786900355383e-02,2.499566914540280e-02}, +{ 2.499566914540285e-02,-6.697786900355403e-02,1.101942304489618e-01,-2.033918195926614e-01,8.213516043079924e-01,4.175419508172303e-01,-1.601984964017343e-01,8.863726361030734e-02,-3.215253333194501e-02}, +{ -9.755925611791838e-04,2.569606446048019e-03,-4.014490629365396e-03,6.447214070441683e-03,-1.326765252897773e-02,9.996922521675757e-01,1.356117423623947e-02,-6.123413963660286e-03,2.110902762877526e-03}, +{ -1.523248877874495e-02,3.968677500096976e-02,-6.011288262366297e-02,8.984010430103706e-02,-1.505578635443582e-01,3.808520530855850e-01,8.458263174908267e-01,-1.888914378981257e-01,5.858942296647311e-02}, +{ 1.326740501816726e-02,-3.432822269920404e-02,5.101444790875409e-02,-7.319068429469790e-02,1.115427814178279e-01,-2.038582453534794e-01,7.962178026881799e-01,4.413872512969972e-01,-1.020525359825451e-01}, +{ -9.367934341888429e-04,2.413654981312847e-03,-3.546298854468792e-03,4.970940074420788e-03,-7.215304999245269e-03,1.164390068376448e-02,-2.471136816781561e-02,9.987776662295930e-01,1.860360348662746e-02}, +{ -6.532736904989900e-03,1.679516757204370e-02,-2.453440063922518e-02,3.399713084766477e-02,-4.823111774818446e-02,7.391980287709732e-02,-1.337049613674060e-01,3.862596866239165e-01,7.020314287390832e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==13 +const dfloat c_I[13][9] = { +{ 7.404475907912537e-01,3.386021610704071e-01,-1.199323020197517e-01,6.659437264416380e-02,-4.351940796573645e-02,3.069803123139131e-02,-2.216173683041853e-02,1.517378026659463e-02,-5.902489187903801e-03}, +{ 8.676588106395255e-02,9.781920140921242e-01,-9.349347860945788e-02,4.553612549137539e-02,-2.849534423819983e-02,1.971468784394599e-02,-1.409404396185827e-02,9.602596710325238e-03,-3.728438392207323e-03}, +{ -1.265731270545656e-01,6.243853307109479e-01,6.347349162507985e-01,-2.022826327327860e-01,1.145848553804635e-01,-7.615120501939938e-02,5.339310810255313e-02,-3.603173573455842e-02,1.394049009654617e-02}, +{ 3.019009594140376e-02,-1.023869346944421e-01,9.743984174976127e-01,1.365578262880203e-01,-6.147438152342872e-02,3.792397505349503e-02,-2.572500415111067e-02,1.709043375455580e-02,-6.574428166106167e-03}, +{ 3.938388142174842e-02,-1.174848907829351e-01,2.985856363144430e-01,8.984535449322657e-01,-1.771153646720800e-01,9.451055203470982e-02,-6.066081082393378e-02,3.932269133854467e-02,-1.499523976276278e-02}, +{ -3.825691000529785e-02,1.073657062209784e-01,-2.071819645481469e-01,7.837235542300931e-01,4.671878933589012e-01,-1.751555612813812e-01,1.019716060343389e-01,-6.358060137303619e-02,2.392627736355030e-02}, +{ 0.000000000000000e+00,1.540607963182910e-16,-2.989020734892950e-16,4.781523557433928e-16,1.000000000000000e+00,-5.180402052024006e-16,3.344108415231502e-16,-1.816301586741357e-16,0.000000000000000e+00}, +{ 2.392627736355035e-02,-6.358060137303650e-02,1.019716060343392e-01,-1.751555612813817e-01,4.671878933589045e-01,7.837235542300908e-01,-2.071819645481476e-01,1.073657062209787e-01,-3.825691000529780e-02}, +{ -1.499523976276285e-02,3.932269133854434e-02,-6.066081082393331e-02,9.451055203470952e-02,-1.771153646720794e-01,8.984535449322654e-01,2.985856363144435e-01,-1.174848907829354e-01,3.938388142174829e-02}, +{ -6.574428166106205e-03,1.709043375455586e-02,-2.572500415111053e-02,3.792397505349515e-02,-6.147438152342907e-02,1.365578262880206e-01,9.743984174976128e-01,-1.023869346944424e-01,3.019009594140381e-02}, +{ 1.394049009654608e-02,-3.603173573455821e-02,5.339310810255293e-02,-7.615120501939902e-02,1.145848553804626e-01,-2.022826327327850e-01,6.347349162507968e-01,6.243853307109483e-01,-1.265731270545645e-01}, +{ -3.728438392207600e-03,9.602596710325477e-03,-1.409404396185861e-02,1.971468784394649e-02,-2.849534423820067e-02,4.553612549137699e-02,-9.349347860946133e-02,9.781920140921239e-01,8.676588106395522e-02}, +{ -5.902489187903839e-03,1.517378026659452e-02,-2.216173683041830e-02,3.069803123139090e-02,-4.351940796573599e-02,6.659437264416307e-02,-1.199323020197506e-01,3.386021610704018e-01,7.404475907912584e-01} +}; +#endif +#if p_Nq==9 && p_cubNq==14 +const dfloat c_I[14][9] = { +{ 7.721326518650051e-01,2.987610143518875e-01,-1.077166814556528e-01,6.001772446420022e-02,-3.927041344755902e-02,2.771675612242414e-02,-2.001536446377731e-02,1.370623900723985e-02,-5.331926443767575e-03}, +{ 1.544392416288436e-01,9.407434027928868e-01,-1.382956217070503e-01,6.904380825582709e-02,-4.353609956266314e-02,3.022073988312889e-02,-2.164058540786276e-02,1.475642627459173e-02,-5.731312157701972e-03}, +{ -1.318886647679813e-01,7.666809344627836e-01,4.776020276743291e-01,-1.734241249900716e-01,1.007577854553471e-01,-6.761494252707030e-02,4.762568198696605e-02,-3.221142684038523e-02,1.247272954608263e-02}, +{ -1.025696896218919e-02,3.684795219771805e-02,9.977084688175143e-01,-3.494095173536317e-02,1.706698635991974e-02,-1.078340311125569e-02,7.389973969392450e-03,-4.932988287271320e-03,1.900930751534816e-03}, +{ 6.122636731525555e-02,-1.884060221751225e-01,5.761860833598305e-01,6.889653546208401e-01,-2.106602553702886e-01,1.193273592985672e-01,-7.824847722495566e-02,5.119684756370417e-02,-1.958725738783075e-02}, +{ -1.782321592154790e-02,5.101460373521348e-02,-1.065471606442861e-01,9.739050446398573e-01,1.390784367975979e-01,-6.281868932393572e-02,3.829349616521983e-02,-2.430234372110553e-02,9.199828272986904e-03}, +{ -2.866067138942419e-02,7.881453311002821e-02,-1.412098567449225e-01,3.539450851219568e-01,8.652103854163348e-01,-1.916031610164213e-01,1.023468340731013e-01,-6.191397555657513e-02,2.307082698592205e-02}, +{ 2.307082698592233e-02,-6.191397555657531e-02,1.023468340731013e-01,-1.916031610164215e-01,8.652103854163347e-01,3.539450851219573e-01,-1.412098567449227e-01,7.881453311002828e-02,-2.866067138942435e-02}, +{ 9.199828272986932e-03,-2.430234372110576e-02,3.829349616522010e-02,-6.281868932393579e-02,1.390784367975978e-01,9.739050446398575e-01,-1.065471606442862e-01,5.101460373521347e-02,-1.782321592154799e-02}, +{ -1.958725738783063e-02,5.119684756370396e-02,-7.824847722495554e-02,1.193273592985670e-01,-2.106602553702879e-01,6.889653546208413e-01,5.761860833598290e-01,-1.884060221751225e-01,6.122636731525545e-02}, +{ 1.900930751534857e-03,-4.932988287271391e-03,7.389973969392756e-03,-1.078340311125583e-02,1.706698635991968e-02,-3.494095173536338e-02,9.977084688175143e-01,3.684795219771832e-02,-1.025696896218937e-02}, +{ 1.247272954608258e-02,-3.221142684038516e-02,4.762568198696576e-02,-6.761494252706995e-02,1.007577854553466e-01,-1.734241249900710e-01,4.776020276743287e-01,7.666809344627825e-01,-1.318886647679801e-01}, +{ -5.731312157702076e-03,1.475642627459169e-02,-2.164058540786249e-02,3.022073988312888e-02,-4.353609956266375e-02,6.904380825582843e-02,-1.382956217070533e-01,9.407434027928869e-01,1.544392416288456e-01}, +{ -5.331926443767683e-03,1.370623900723992e-02,-2.001536446377747e-02,2.771675612242433e-02,-3.927041344755922e-02,6.001772446420044e-02,-1.077166814556534e-01,2.987610143518876e-01,7.721326518650055e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==10 +const dfloat c_I[10][10] = { +{ 5.154867662656436e-01,6.058989631943574e-01,-1.819814464369726e-01,9.831352268064393e-02,-6.386149895063113e-02,4.533078185567573e-02,-3.358633347965074e-02,2.498410231796022e-02,-1.739918175316877e-02,6.814324306142396e-03}, +{ -1.307231036250440e-01,7.931432245336335e-01,4.443300291302421e-01,-1.651754022118850e-01,9.677345114011041e-02,-6.572626467376091e-02,4.761451974970499e-02,-3.498747678693480e-02,2.420879013896095e-02,-9.457767395027128e-03}, +{ 5.220348640377254e-02,-1.707045005278797e-01,8.968409512320626e-01,3.011186047971880e-01,-1.249726196660501e-01,7.606620344206952e-02,-5.242284426673962e-02,3.754121728057165e-02,-2.563586900036511e-02,9.965370305370091e-03}, +{ -2.087413041692032e-02,5.958810226470487e-02,-1.232085284596143e-01,9.628742077560756e-01,1.693570676361684e-01,-7.584659895353155e-02,4.704861570730013e-02,-3.209880461870785e-02,2.141137119294175e-02,-8.251302108416847e-03}, +{ 4.999599473097471e-03,-1.352375870271576e-02,2.294782581011236e-02,-4.688086529363892e-02,9.958662837744131e-01,5.200265864033946e-02,-2.461103201119363e-02,1.525031145067189e-02,-9.754898545022074e-03,3.703875403936019e-03}, +{ 3.703875403936130e-03,-9.754898545021760e-03,1.525031145067135e-02,-2.461103201119254e-02,5.200265864033697e-02,9.958662837744132e-01,-4.688086529363673e-02,2.294782581011119e-02,-1.352375870271510e-02,4.999599473097360e-03}, +{ -8.251302108416958e-03,2.141137119294163e-02,-3.209880461870770e-02,4.704861570729995e-02,-7.584659895353103e-02,1.693570676361669e-01,9.628742077560761e-01,-1.232085284596134e-01,5.958810226470444e-02,-2.087413041691999e-02}, +{ 9.965370305370258e-03,-2.563586900036482e-02,3.754121728057159e-02,-5.242284426673936e-02,7.606620344206898e-02,-1.249726196660499e-01,3.011186047971867e-01,8.968409512320638e-01,-1.707045005278797e-01,5.220348640377259e-02}, +{ -9.457767395026906e-03,2.420879013896082e-02,-3.498747678693490e-02,4.761451974970551e-02,-6.572626467376116e-02,9.677345114011049e-02,-1.651754022118855e-01,4.443300291302444e-01,7.931432245336323e-01,-1.307231036250450e-01}, +{ 6.814324306142372e-03,-1.739918175316890e-02,2.498410231796014e-02,-3.358633347965062e-02,4.533078185567553e-02,-6.386149895063131e-02,9.831352268064456e-02,-1.819814464369736e-01,6.058989631943681e-01,5.154867662656337e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==11 +const dfloat c_I[11][10] = { +{ 5.826405846218484e-01,5.293193413315870e-01,-1.685206358810891e-01,9.191399229242325e-02,-5.990464866185705e-02,4.258779348172183e-02,-3.157971174233693e-02,2.350204699265669e-02,-1.637099758964618e-02,6.412235154692122e-03}, +{ -1.075076002732948e-01,9.158160622682420e-01,2.604710666836640e-01,-1.075813352045528e-01,6.458150918470876e-02,-4.429538735066637e-02,3.224613636841406e-02,-2.375695571432437e-02,1.646064611742537e-02,-6.434142079615728e-03}, +{ 1.001933851287046e-02,-3.496639503890561e-02,9.975806013086694e-01,3.885939155851053e-02,-1.837577488952621e-02,1.159216108849772e-02,-8.113216528038779e-03,5.855298750922609e-03,-4.014098904591346e-03,1.562694141591214e-03}, +{ 2.282479921992142e-02,-6.713690018139094e-02,1.589534244195882e-01,9.664988749272160e-01,-1.191035217311578e-01,6.157569436368102e-02,-3.991078965475575e-02,2.776004559114177e-02,-1.868732787197907e-02,7.225700917735112e-03}, +{ -3.113095451984765e-02,8.568620574904746e-02,-1.541669449468092e-01,3.956003338556882e-01,8.373172960068405e-01,-2.007768552812587e-01,1.102868558325789e-01,-7.174317540476897e-02,4.683905106728389e-02,-1.791181235875433e-02}, +{ 2.734374999999989e-02,-7.283186257766791e-02,1.177435823540573e-01,-2.074090936574366e-01,6.351536238810480e-01,6.351536238810465e-01,-2.074090936574361e-01,1.177435823540569e-01,-7.283186257766780e-02,2.734374999999989e-02}, +{ -1.791181235875416e-02,4.683905106728401e-02,-7.174317540476921e-02,1.102868558325792e-01,-2.007768552812592e-01,8.373172960068397e-01,3.956003338556892e-01,-1.541669449468094e-01,8.568620574904762e-02,-3.113095451984782e-02}, +{ 7.225700917735334e-03,-1.868732787197915e-02,2.776004559114187e-02,-3.991078965475622e-02,6.157569436368161e-02,-1.191035217311584e-01,9.664988749272158e-01,1.589534244195895e-01,-6.713690018139166e-02,2.282479921992153e-02}, +{ 1.562694141591173e-03,-4.014098904591430e-03,5.855298750922605e-03,-8.113216528038559e-03,1.159216108849770e-02,-1.837577488952627e-02,3.885939155851019e-02,9.975806013086695e-01,-3.496639503890525e-02,1.001933851287029e-02}, +{ -6.434142079615679e-03,1.646064611742515e-02,-2.375695571432437e-02,3.224613636841428e-02,-4.429538735066678e-02,6.458150918470928e-02,-1.075813352045531e-01,2.604710666836648e-01,9.158160622682423e-01,-1.075076002732961e-01}, +{ 6.412235154691914e-03,-1.637099758964622e-02,2.350204699265672e-02,-3.157971174233692e-02,4.258779348172162e-02,-5.990464866185680e-02,9.191399229242292e-02,-1.685206358810884e-01,5.293193413315871e-01,5.826405846218480e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==12 +const dfloat c_I[12][10] = { +{ 6.377227007036271e-01,4.643335745781212e-01,-1.540790292071470e-01,8.464294857588436e-02,-5.530648764425020e-02,3.936528942527003e-02,-2.920837985957818e-02,2.174481166755880e-02,-1.514974179225534e-02,5.934313552769173e-03}, +{ -6.426597363274200e-02,9.789546446489460e-01,1.185565074622602e-01,-5.241376541297763e-02,3.201919547088150e-02,-2.212183622595491e-02,1.616328609716050e-02,-1.193173104662574e-02,8.275816764795810e-03,-3.236144125743579e-03}, +{ -4.110575553289753e-02,1.548185656178315e-01,9.665902146731673e-01,-1.174340149287519e-01,6.005750675066700e-02,-3.882902711074417e-02,2.747824762374924e-02,-1.994383917020920e-02,1.371208203792058e-02,-5.343979960733103e-03}, +{ 5.609355993900283e-02,-1.706629246147094e-01,4.862202525501220e-01,7.671290862485306e-01,-2.105786159257142e-01,1.180876327865069e-01,-7.877879800280882e-02,5.553230186000596e-02,-3.762609165873784e-02,1.458359681780191e-02}, +{ -3.807546523357214e-02,1.068573208272035e-01,-2.064250160507805e-01,7.925852915866163e-01,4.562245437344595e-01,-1.733401391702514e-01,1.031720774186612e-01,-6.919520396473007e-02,4.579386785565252e-02,-1.759727700325914e-02}, +{ 1.163316534779640e-02,-3.137880094299848e-02,5.276398375949957e-02,-1.045982618143961e-01,9.756052004327656e-01,1.344818691325719e-01,-6.116291296890877e-02,3.746824786787024e-02,-2.385621838444387e-02,9.043727570243576e-03}, +{ 9.043727570243409e-03,-2.385621838444380e-02,3.746824786787008e-02,-6.116291296890829e-02,1.344818691325708e-01,9.756052004327660e-01,-1.045982618143951e-01,5.276398375949895e-02,-3.137880094299823e-02,1.163316534779624e-02}, +{ -1.759727700325919e-02,4.579386785565236e-02,-6.919520396473003e-02,1.031720774186613e-01,-1.733401391702512e-01,4.562245437344577e-01,7.925852915866177e-01,-2.064250160507801e-01,1.068573208272033e-01,-3.807546523357186e-02}, +{ 1.458359681780186e-02,-3.762609165873784e-02,5.553230186000632e-02,-7.877879800280911e-02,1.180876327865072e-01,-2.105786159257149e-01,7.671290862485306e-01,4.862202525501227e-01,-1.706629246147101e-01,5.609355993900322e-02}, +{ -5.343979960732770e-03,1.371208203792047e-02,-1.994383917020951e-02,2.747824762374931e-02,-3.882902711074367e-02,6.005750675066641e-02,-1.174340149287514e-01,9.665902146731681e-01,1.548185656178304e-01,-4.110575553289719e-02}, +{ -3.236144125743468e-03,8.275816764795709e-03,-1.193173104662597e-02,1.616328609716100e-02,-2.212183622595552e-02,3.201919547088224e-02,-5.241376541297870e-02,1.185565074622626e-01,9.789546446489458e-01,-6.426597363274367e-02}, +{ 5.934313552769188e-03,-1.514974179225538e-02,2.174481166755891e-02,-2.920837985957856e-02,3.936528942527039e-02,-5.530648764425074e-02,8.464294857588522e-02,-1.540790292071488e-01,4.643335745781287e-01,6.377227007036210e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==13 +const dfloat c_I[13][10] = { +{ 6.831817501416485e-01,4.093805104750549e-01,-1.400743904311238e-01,7.737778647700413e-02,-5.065987287812379e-02,3.609129924177523e-02,-2.679222835966445e-02,1.995148122726040e-02,-1.390232956540903e-02,5.445993671577963e-03}, +{ -9.586768200657840e-03,9.996403626372847e-01,1.405307832519657e-02,-6.513389606903422e-03,4.031288327915096e-03,-2.800701819384877e-03,2.052107890432951e-03,-1.517192263200812e-03,1.053170125181947e-03,-4.119554158643693e-04}, +{ -8.560126035255716e-02,3.526805498512755e-01,8.603429652161589e-01,-1.902476844077130e-01,1.024817947774273e-01,-6.744438309670561e-02,4.812357506058823e-02,-3.507845950395550e-02,2.417083988553516e-02,-9.427937430053823e-03}, +{ 6.381645441618097e-02,-2.016757797543456e-01,7.529998430850257e-01,5.032165163352509e-01,-1.836739302342100e-01,1.084971791264239e-01,-7.385786618868262e-02,5.257145231970248e-02,-3.579103247273265e-02,1.389716336738703e-02}, +{ -1.447880078095504e-02,4.151987649974218e-02,-8.750960636728518e-02,9.835366140533958e-01,1.082449608256488e-01,-4.994766870798877e-02,3.124686100002357e-02,-2.139568794630626e-02,1.429619118806435e-02,-5.512739764339358e-03}, +{ -2.062703817230971e-02,5.642002122588920e-02,-9.934049523516109e-02,2.325314999227388e-01,9.349690013710330e-01,-1.539927714454274e-01,8.123256199314134e-02,-5.209930022450548e-02,3.380688905831621e-02,-1.290036849371479e-02}, +{ 2.734374999999983e-02,-7.283186257766781e-02,1.177435823540571e-01,-2.074090936574365e-01,6.351536238810479e-01,6.351536238810465e-01,-2.074090936574360e-01,1.177435823540570e-01,-7.283186257766792e-02,2.734374999999994e-02}, +{ -1.290036849371468e-02,3.380688905831585e-02,-5.209930022450504e-02,8.123256199314088e-02,-1.539927714454263e-01,9.349690013710342e-01,2.325314999227363e-01,-9.934049523516020e-02,5.642002122588858e-02,-2.062703817230960e-02}, +{ -5.512739764339414e-03,1.429619118806435e-02,-2.139568794630638e-02,3.124686100002372e-02,-4.994766870798889e-02,1.082449608256485e-01,9.835366140533960e-01,-8.750960636728487e-02,4.151987649974209e-02,-1.447880078095520e-02}, +{ 1.389716336738700e-02,-3.579103247273261e-02,5.257145231970264e-02,-7.385786618868279e-02,1.084971791264241e-01,-1.836739302342104e-01,5.032165163352511e-01,7.529998430850259e-01,-2.016757797543461e-01,6.381645441618122e-02}, +{ -9.427937430053657e-03,2.417083988553497e-02,-3.507845950395540e-02,4.812357506058821e-02,-6.744438309670552e-02,1.024817947774272e-01,-1.902476844077129e-01,8.603429652161591e-01,3.526805498512759e-01,-8.560126035255777e-02}, +{ -4.119554158640293e-04,1.053170125181579e-03,-1.517192263200760e-03,2.052107890433039e-03,-2.800701819385000e-03,4.031288327915202e-03,-6.513389606903357e-03,1.405307832519639e-02,9.996403626372848e-01,-9.586768200657736e-03}, +{ 5.445993671577942e-03,-1.390232956540877e-02,1.995148122726011e-02,-2.679222835966416e-02,3.609129924177477e-02,-5.065987287812324e-02,7.737778647700308e-02,-1.400743904311213e-01,4.093805104750485e-01,6.831817501416531e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==14 +const dfloat c_I[14][10] = { +{ 7.209767891285487e-01,3.628568155036386e-01,-1.271021391191930e-01,7.052024998977879e-02,-4.624314521158467e-02,3.296897716267530e-02,-2.448393619072022e-02,1.823651681997675e-02,-1.270880007427615e-02,4.978671991155824e-03}, +{ 5.036955161961763e-02,9.918887202892184e-01,-6.046194052316288e-02,2.900644577432747e-02,-1.813403528576482e-02,1.265330653011143e-02,-9.291855225150025e-03,6.878126828501011e-03,-4.777557518390456e-03,1.869237510692255e-03}, +{ -1.160475940439368e-01,5.319277307887073e-01,7.214066917630195e-01,-2.081310126105587e-01,1.163075197446580e-01,-7.756985178242502e-02,5.569977828086244e-02,-4.073637625126158e-02,2.811775137434176e-02,-1.097463726340671e-02}, +{ 4.766069669059506e-02,-1.571774769813764e-01,9.209629864232346e-01,2.580580034439383e-01,-1.096110368830913e-01,6.711280600760287e-02,-4.636821881714268e-02,3.324673145485663e-02,-2.271747936000076e-02,8.832988021383742e-03}, +{ 2.065478268604201e-02,-6.065726716357455e-02,1.424963746538745e-01,9.724886878071035e-01,-1.098367114756021e-01,5.648487873408815e-02,-3.654634341910943e-02,2.539949219651111e-02,-1.709167694850932e-02,6.607782929176215e-03}, +{ -3.901027667602747e-02,1.083500964940521e-01,-2.013474086517034e-01,6.063159513106310e-01,6.628909282397591e-01,-2.105214672244527e-01,1.208106556545246e-01,-7.987414151717949e-02,5.252163827288590e-02,-2.013597590248967e-02}, +{ 1.588685893066111e-02,-4.276908571722927e-02,7.147118401544993e-02,-1.388853549524651e-01,9.506838807883824e-01,1.990310811900090e-01,-8.766431496952332e-02,5.323180671303954e-02,-3.377441565532395e-02,1.278835965699965e-02}, +{ 1.278835965699943e-02,-3.377441565532381e-02,5.323180671303925e-02,-8.766431496952298e-02,1.990310811900083e-01,9.506838807883825e-01,-1.388853549524642e-01,7.147118401544934e-02,-4.276908571722910e-02,1.588685893066122e-02}, +{ -2.013597590248961e-02,5.252163827288584e-02,-7.987414151717950e-02,1.208106556545249e-01,-2.105214672244531e-01,6.628909282397590e-01,6.063159513106315e-01,-2.013474086517034e-01,1.083500964940520e-01,-3.901027667602774e-02}, +{ 6.607782929176298e-03,-1.709167694850903e-02,2.539949219651071e-02,-3.654634341910892e-02,5.648487873408743e-02,-1.098367114756016e-01,9.724886878071040e-01,1.424963746538736e-01,-6.065726716357423e-02,2.065478268604182e-02}, +{ 8.832988021383770e-03,-2.271747936000069e-02,3.324673145485681e-02,-4.636821881714275e-02,6.711280600760279e-02,-1.096110368830915e-01,2.580580034439378e-01,9.209629864232355e-01,-1.571774769813766e-01,4.766069669059492e-02}, +{ -1.097463726340631e-02,2.811775137434160e-02,-4.073637625126159e-02,5.569977828086235e-02,-7.756985178242494e-02,1.163075197446581e-01,-2.081310126105588e-01,7.214066917630216e-01,5.319277307887055e-01,-1.160475940439374e-01}, +{ 1.869237510692456e-03,-4.777557518390818e-03,6.878126828501419e-03,-9.291855225149803e-03,1.265330653011065e-02,-1.813403528576429e-02,2.900644577432754e-02,-6.046194052316278e-02,9.918887202892185e-01,5.036955161961731e-02}, +{ 4.978671991155870e-03,-1.270880007427622e-02,1.823651681997673e-02,-2.448393619072028e-02,3.296897716267535e-02,-4.624314521158457e-02,7.052024998977849e-02,-1.271021391191924e-01,3.628568155036387e-01,7.209767891285483e-01} +}; +#endif +#if p_Nq==10 && p_cubNq==15 +const dfloat c_I[15][10] = { +{ 7.526445900479319e-01,3.233291811844189e-01,-1.153591473592308e-01,6.423083968270824e-02,-4.217270938291041e-02,3.008490248623245e-02,-2.234919012836712e-02,1.664941568658662e-02,-1.160385825604121e-02,4.545976038671509e-03}, +{ 1.115481612640302e-01,9.660604843132188e-01,-1.121360963086408e-01,5.522063422775922e-02,-3.479672073944579e-02,2.436426239191430e-02,-1.792364474631579e-02,1.328062917695614e-02,-9.229506328184280e-03,3.611796748708086e-03}, +{ -1.306170066078081e-01,6.808129656716869e-01,5.763631012693701e-01,-1.941108959110043e-01,1.114592849799695e-01,-7.510578274024900e-02,5.419952533887287e-02,-3.974401887656863e-02,2.747041646212648e-02,-1.072758958639587e-02}, +{ 1.626681532767171e-02,-5.627190531238779e-02,9.933607715496154e-01,6.592938488838798e-02,-3.078027308215601e-02,1.934417016202094e-02,-1.351621107817505e-02,9.746379167418261e-03,-6.678760349252131e-03,2.599628726856706e-03}, +{ 4.965584829035283e-02,-1.496949842026203e-01,4.038780255970415e-01,8.300072285165654e-01,-2.016027231970079e-01,1.110882599043969e-01,-7.362959232617643e-02,5.174393684427624e-02,-3.500686389072010e-02,1.356086446389185e-02}, +{ -3.337089509694191e-02,9.425131964846811e-02,-1.866289638233383e-01,8.748915694352541e-01,3.391383240230198e-01,-1.387472182000872e-01,8.404404412063499e-02,-5.677033466826189e-02,3.769396018205453e-02,-1.450180562080228e-02}, +{ -1.147803038645662e-02,3.126149836144157e-02,-5.425707948261768e-02,1.201103613026455e-01,9.800977426063395e-01,-9.605165750563552e-02,4.894317627867038e-02,-3.103032192405168e-02,2.003731403181200e-02,-7.633003282147388e-03}, +{ 2.734374999999972e-02,-7.283186257766784e-02,1.177435823540571e-01,-2.074090936574366e-01,6.351536238810482e-01,6.351536238810462e-01,-2.074090936574360e-01,1.177435823540570e-01,-7.283186257766787e-02,2.734374999999994e-02}, +{ -7.633003282147444e-03,2.003731403181219e-02,-3.103032192405227e-02,4.894317627867121e-02,-9.605165750563688e-02,9.800977426063393e-01,1.201103613026476e-01,-5.425707948261858e-02,3.126149836144180e-02,-1.147803038645678e-02}, +{ -1.450180562080217e-02,3.769396018205454e-02,-5.677033466826203e-02,8.404404412063521e-02,-1.387472182000876e-01,3.391383240230195e-01,8.748915694352546e-01,-1.866289638233381e-01,9.425131964846799e-02,-3.337089509694202e-02}, +{ 1.356086446389182e-02,-3.500686389071998e-02,5.174393684427621e-02,-7.362959232617654e-02,1.110882599043971e-01,-2.016027231970084e-01,8.300072285165656e-01,4.038780255970419e-01,-1.496949842026207e-01,4.965584829035297e-02}, +{ 2.599628726856595e-03,-6.678760349252463e-03,9.746379167418584e-03,-1.351621107817550e-02,1.934417016202189e-02,-3.078027308215704e-02,6.592938488838972e-02,9.933607715496151e-01,-5.627190531238936e-02,1.626681532767238e-02}, +{ -1.072758958639597e-02,2.747041646212645e-02,-3.974401887656857e-02,5.419952533887290e-02,-7.510578274024925e-02,1.114592849799699e-01,-1.941108959110047e-01,5.763631012693716e-01,6.808129656716864e-01,-1.306170066078087e-01}, +{ 3.611796748708040e-03,-9.229506328184534e-03,1.328062917695622e-02,-1.792364474631575e-02,2.436426239191378e-02,-3.479672073944506e-02,5.522063422775869e-02,-1.121360963086399e-01,9.660604843132193e-01,1.115481612640291e-01}, +{ 4.545976038671629e-03,-1.160385825604149e-02,1.664941568658714e-02,-2.234919012836764e-02,3.008490248623301e-02,-4.217270938291123e-02,6.423083968270936e-02,-1.153591473592329e-01,3.233291811844255e-01,7.526445900479266e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==11 +const dfloat c_I[11][11] = { +{ 5.081048249011929e-01,6.141291834135808e-01,-1.831921783429969e-01,9.893381995913701e-02,-6.438510918421751e-02,4.595127331137519e-02,-3.449089058547024e-02,2.647227811054990e-02,-2.013522330073379e-02,1.420395558984243e-02,-5.591933872259761e-03}, +{ -1.315497308740086e-01,7.771528092679020e-01,4.646950893036642e-01,-1.705117085453590e-01,9.980387664515367e-02,-6.805701503023287e-02,4.989300241270402e-02,-3.778516592603681e-02,2.851732113383040e-02,-2.003149998634021e-02,7.873021598723301e-03}, +{ 5.507101879567905e-02,-1.789961756726006e-01,8.775870237460236e-01,3.328334960722411e-01,-1.358906061529600e-01,8.270436640648821e-02,-5.753919757034099e-02,4.237326806552601e-02,-3.147991185316360e-02,2.192602356814137e-02,-8.589305405034116e-03}, +{ -2.456264483301265e-02,6.990292595692719e-02,-1.428079944919613e-01,9.456113206385860e-01,2.100881754355276e-01,-9.246662933253591e-02,5.758167271408559e-02,-4.023654160842159e-02,2.907335642711573e-02,-1.995949573867184e-02,7.775854832361241e-03}, +{ 8.911701663246799e-03,-2.405461580697662e-02,4.055997073048743e-02,-8.131387853332293e-02,9.863623701183348e-01,9.813553820921050e-02,-4.574104080776885e-02,2.880270367605660e-02,-1.981540468471582e-02,1.328017980632797e-02,-5.127524370879891e-03}, +{ 1.110223024625157e-16,-2.909419206188850e-16,4.423614327288554e-16,-7.487040316997927e-16,1.510372776854457e-15,9.999999999999999e-01,-1.320724139568699e-15,6.285052300828812e-16,-4.001571194972847e-16,2.999791832412605e-16,-5.551115123125783e-17}, +{ -5.127524370879752e-03,1.328017980632774e-02,-1.981540468471537e-02,2.880270367605602e-02,-4.574104080776799e-02,9.813553820920917e-02,9.863623701183343e-01,-8.131387853332143e-02,4.055997073048671e-02,-2.405461580697603e-02,8.911701663246660e-03}, +{ 7.775854832361540e-03,-1.995949573867169e-02,2.907335642711547e-02,-4.023654160842156e-02,5.758167271408524e-02,-9.246662933253580e-02,2.100881754355267e-01,9.456113206385865e-01,-1.428079944919608e-01,6.990292595692700e-02,-2.456264483301262e-02}, +{ -8.589305405034081e-03,2.192602356814150e-02,-3.147991185316380e-02,4.237326806552590e-02,-5.753919757034059e-02,8.270436640648787e-02,-1.358906061529596e-01,3.328334960722404e-01,8.775870237460244e-01,-1.789961756726008e-01,5.507101879567891e-02}, +{ 7.873021598723287e-03,-2.003149998634042e-02,2.851732113383044e-02,-3.778516592603641e-02,4.989300241270395e-02,-6.805701503023351e-02,9.980387664515383e-02,-1.705117085453583e-01,4.646950893036633e-01,7.771528092679020e-01,-1.315497308740084e-01}, +{ -5.591933872259855e-03,1.420395558984212e-02,-2.013522330073359e-02,2.647227811054993e-02,-3.449089058547006e-02,4.595127331137497e-02,-6.438510918421761e-02,9.893381995913716e-02,-1.831921783429976e-01,6.141291834135812e-01,5.081048249011931e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==12 +const dfloat c_I[12][11] = { +{ 5.706413675486145e-01,5.432323091446096e-01,-1.713058767363286e-01,9.335244998710271e-02,-6.094540759832855e-02,4.356036538081569e-02,-3.272206891488447e-02,2.512612419303105e-02,-1.911643974647471e-02,1.348726653083440e-02,-5.310089788991591e-03}, +{ -1.138913061191341e-01,8.972236368539173e-01,2.928711004002528e-01,-1.190232804839161e-01,7.130201791200078e-02,-4.908012764556102e-02,3.615144807351478e-02,-2.745075578689654e-02,2.074929671849840e-02,-1.458708162168170e-02,5.735051699005192e-03}, +{ 1.955628152534794e-02,-6.732920704636056e-02,9.901838588982058e-01,8.120795840052905e-02,-3.769796594249175e-02,2.374988109501724e-02,-1.677295862264850e-02,1.244861385294244e-02,-9.288343378366679e-03,6.484308068349010e-03,-2.542426850524030e-03}, +{ 1.429408716697883e-02,-4.177820681955700e-02,9.599317614863478e-02,9.866669004245111e-01,-8.036651716283418e-02,4.081296430147396e-02,-2.653412026689030e-02,1.890428648783165e-02,-1.379685430145300e-02,9.520560976241911e-03,-3.716276954937762e-03}, +{ -2.572627051722354e-02,7.052998976884796e-02,-1.252378397660307e-01,3.043010250211480e-01,8.963254201459006e-01,-1.796636354894553e-01,9.735111034592561e-02,-6.437929795529869e-02,4.528327278502480e-02,-3.067364372783311e-02,1.188986938899440e-02}, +{ 2.590005678844709e-02,-6.878280309729522e-02,1.102661093434955e-01,-1.901899811926308e-01,5.277598649859647e-01,7.351434964093992e-01,-2.137717685775707e-01,1.211987629531177e-01,-7.990721200529352e-02,5.251841193049026e-02,-2.013493753812440e-02}, +{ -2.013493753812456e-02,5.251841193048983e-02,-7.990721200529302e-02,1.211987629531174e-01,-2.137717685775695e-01,7.351434964093977e-01,5.277598649859648e-01,-1.901899811926302e-01,1.102661093434952e-01,-6.878280309729480e-02,2.590005678844703e-02}, +{ 1.188986938899439e-02,-3.067364372783319e-02,4.528327278502504e-02,-6.437929795529905e-02,9.735111034592606e-02,-1.796636354894567e-01,8.963254201459001e-01,3.043010250211504e-01,-1.252378397660315e-01,7.052998976884851e-02,-2.572627051722408e-02}, +{ -3.716276954937925e-03,9.520560976242161e-03,-1.379685430145313e-02,1.890428648783194e-02,-2.653412026689088e-02,4.081296430147430e-02,-8.036651716283467e-02,9.866669004245111e-01,9.599317614863548e-02,-4.177820681955752e-02,1.429408716697899e-02}, +{ -2.542426850524190e-03,6.484308068349100e-03,-9.288343378366811e-03,1.244861385294249e-02,-1.677295862264866e-02,2.374988109501741e-02,-3.769796594249222e-02,8.120795840053061e-02,9.901838588982055e-01,-6.732920704636158e-02,1.955628152534827e-02}, +{ 5.735051699005289e-03,-1.458708162168200e-02,2.074929671849858e-02,-2.745075578689662e-02,3.615144807351536e-02,-4.908012764556180e-02,7.130201791200086e-02,-1.190232804839158e-01,2.928711004002537e-01,8.972236368539164e-01,-1.138913061191338e-01}, +{ -5.310089788991723e-03,1.348726653083448e-02,-1.911643974647481e-02,2.512612419303119e-02,-3.272206891488469e-02,4.356036538081609e-02,-6.094540759832926e-02,9.335244998710354e-02,-1.713058767363309e-01,5.432323091446178e-01,5.706413675486082e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==13 +const dfloat c_I[13][11] = { +{ 6.228692188668665e-01,4.820429148373097e-01,-1.582856906370168e-01,8.685782361826198e-02,-5.684518937586010e-02,4.067644727067465e-02,-3.057456482805935e-02,2.348551448028481e-02,-1.787198751152122e-02,1.261072469383529e-02,-4.965211414775570e-03}, +{ -7.829693713181056e-02,9.657556299002965e-01,1.555080931788787e-01,-6.762999182957918e-02,4.121282657453115e-02,-2.857111508975055e-02,2.112140513283475e-02,-1.607079153540900e-02,1.216184049499207e-02,-8.555476977754554e-03,3.364517282770606e-03}, +{ -2.646552595282470e-02,9.736806830567662e-02,9.856213545979243e-01,-8.206862260458203e-02,4.123870444727952e-02,-2.662096111237605e-02,1.900908685368577e-02,-1.419090918399425e-02,1.062305525286518e-02,-7.429106316164296e-03,2.914855712509771e-03}, +{ 4.884872017475328e-02,-1.470815046270958e-01,3.943784362169525e-01,8.368126195708002e-01,-2.002283198397232e-01,1.105198674272529e-01,-7.397593161356607e-02,5.343722933267534e-02,-3.928605865596565e-02,2.721261611798251e-02,-1.063767410406595e-02}, +{ -3.985910527811029e-02,1.111713065938061e-01,-2.099185476147694e-01,6.955026673457344e-01,5.717051620128267e-01,-1.991694659502351e-01,1.173249097044255e-01,-8.009521954881541e-02,5.720360465427291e-02,-3.904149107664233e-02,1.517617915750699e-02}, +{ 1.961231844691447e-02,-5.267218151689149e-02,8.740392333658323e-02,-1.665155357404832e-01,9.180787227969588e-01,2.661142417824078e-01,-1.139272153706989e-01,7.005909100936540e-02,-4.771108343963406e-02,3.182347070833438e-02,-1.226575201285648e-02}, +{ 0.000000000000000e+00,-2.554538126065738e-16,4.118567832930163e-16,-7.209939690336753e-16,1.368953987602601e-15,1.000000000000000e+00,-1.351333855331500e-15,7.014965971079535e-16,-3.914711514422228e-16,2.375616499235981e-16,-1.110223024625157e-16}, +{ -1.226575201285651e-02,3.182347070833440e-02,-4.771108343963397e-02,7.005909100936568e-02,-1.139272153706993e-01,2.661142417824104e-01,9.180787227969561e-01,-1.665155357404832e-01,8.740392333658321e-02,-5.267218151689140e-02,1.961231844691455e-02}, +{ 1.517617915750688e-02,-3.904149107664231e-02,5.720360465427292e-02,-8.009521954881578e-02,1.173249097044255e-01,-1.991694659502352e-01,5.717051620128266e-01,6.955026673457348e-01,-2.099185476147697e-01,1.111713065938064e-01,-3.985910527811029e-02}, +{ -1.063767410406578e-02,2.721261611798280e-02,-3.928605865596582e-02,5.343722933267506e-02,-7.397593161356625e-02,1.105198674272534e-01,-2.002283198397230e-01,8.368126195708000e-01,3.943784362169526e-01,-1.470815046270961e-01,4.884872017475315e-02}, +{ 2.914855712509681e-03,-7.429106316164407e-03,1.062305525286578e-02,-1.419090918399487e-02,1.900908685368556e-02,-2.662096111237572e-02,4.123870444727944e-02,-8.206862260458200e-02,9.856213545979244e-01,9.736806830567646e-02,-2.646552595282450e-02}, +{ 3.364517282770582e-03,-8.555476977754477e-03,1.216184049499178e-02,-1.607079153540845e-02,2.112140513283462e-02,-2.857111508975060e-02,4.121282657453063e-02,-6.762999182957795e-02,1.555080931788759e-01,9.657556299002974e-01,-7.829693713180937e-02}, +{ -4.965211414775579e-03,1.261072469383505e-02,-1.787198751152089e-02,2.348551448028447e-02,-3.057456482805878e-02,4.067644727067404e-02,-5.684518937585979e-02,8.685782361826132e-02,-1.582856906370150e-01,4.820429148373024e-01,6.228692188668727e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==14 +const dfloat c_I[14][11] = { +{ 6.666914293445740e-01,4.294513850980600e-01,-1.453904399182545e-01,8.021869655833616e-02,-5.260286448594236e-02,3.767523050039484e-02,-2.833263651678413e-02,2.176960606728343e-02,-1.656899795838452e-02,1.169239825162230e-02,-4.603806940905137e-03}, +{ -3.154380053327355e-02,9.957274416411085e-01,5.031515374259149e-02,-2.295722499271483e-02,1.417308344577303e-02,-9.880146077134611e-03,7.324809244689578e-03,-5.582258975349170e-03,4.228419465909673e-03,-2.976082084946686e-03,1.170605123346547e-03}, +{ -6.989003704807917e-02,2.776482363264111e-01,9.070802945105053e-01,-1.702786096675832e-01,9.026714036243555e-02,-5.932584098485083e-02,4.271913473721749e-02,-3.203535649792779e-02,2.404223979456587e-02,-1.683671214984938e-02,6.609510617155083e-03}, +{ 6.403964240911353e-02,-1.993028058141915e-01,6.610919018545681e-01,6.059979947262983e-01,-2.031641695909418e-01,1.183979516235378e-01,-8.091730030952486e-02,5.905387640689524e-02,-4.365601931383173e-02,3.032743672849223e-02,-1.186850872041527e-02}, +{ -2.614602862455956e-02,7.431525820721188e-02,-1.510403020139422e-01,9.365256732861413e-01,2.293717144690628e-01,-9.995564744204288e-02,6.207705559698481e-02,-4.332671348477314e-02,3.128745595009878e-02,-2.147298297818902e-02,8.364517034007354e-03}, +{ -8.539895170671885e-03,2.321892744319930e-02,-4.008924693478564e-02,8.726199935023907e-02,9.889929319685253e-01,-7.404295835151786e-02,3.756428635972532e-02,-2.428590192072993e-02,1.690508102154854e-02,-1.139327051676459e-02,4.408046751232193e-03}, +{ 2.380925381145371e-02,-6.313103574866892e-02,1.007305718059480e-01,-1.715716797553613e-01,4.494103365733116e-01,7.986171629788914e-01,-2.088980166129713e-01,1.165013193934029e-01,-7.634072707333660e-02,5.003842201609007e-02,-1.916560738875950e-02}, +{ -1.916560738875939e-02,5.003842201608978e-02,-7.634072707333615e-02,1.165013193934023e-01,-2.088980166129702e-01,7.986171629788903e-01,4.494103365733111e-01,-1.715716797553605e-01,1.007305718059476e-01,-6.313103574866866e-02,2.380925381145366e-02}, +{ 4.408046751232464e-03,-1.139327051676466e-02,1.690508102154888e-02,-2.428590192073032e-02,3.756428635972552e-02,-7.404295835151864e-02,9.889929319685251e-01,8.726199935024036e-02,-4.008924693478631e-02,2.321892744319977e-02,-8.539895170672045e-03}, +{ 8.364517034007243e-03,-2.147298297818913e-02,3.128745595009887e-02,-4.332671348477340e-02,6.207705559698518e-02,-9.995564744204366e-02,2.293717144690648e-01,9.365256732861404e-01,-1.510403020139429e-01,7.431525820721235e-02,-2.614602862455999e-02}, +{ -1.186850872041519e-02,3.032743672849234e-02,-4.365601931383190e-02,5.905387640689554e-02,-8.091730030952506e-02,1.183979516235378e-01,-2.031641695909415e-01,6.059979947262975e-01,6.610919018545693e-01,-1.993028058141922e-01,6.403964240911343e-02}, +{ 6.609510617155312e-03,-1.683671214984939e-02,2.404223979456535e-02,-3.203535649792731e-02,4.271913473721714e-02,-5.932584098485050e-02,9.026714036243515e-02,-1.702786096675823e-01,9.070802945105063e-01,2.776482363264091e-01,-6.989003704807861e-02}, +{ 1.170605123346519e-03,-2.976082084946775e-03,4.228419465909685e-03,-5.582258975348640e-03,7.324809244689217e-03,-9.880146077134666e-03,1.417308344577293e-02,-2.295722499271435e-02,5.031515374258988e-02,9.957274416411087e-01,-3.154380053327263e-02}, +{ -4.603806940904967e-03,1.169239825162216e-02,-1.656899795838458e-02,2.176960606728361e-02,-2.833263651678400e-02,3.767523050039462e-02,-5.260286448594235e-02,8.021869655833613e-02,-1.453904399182547e-01,4.294513850980600e-01,6.666914293445740e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==15 +const dfloat c_I[15][11] = { +{ 7.036756034639666e-01,3.842494891381966e-01,-1.332109883753972e-01,7.382147782279966e-02,-4.848440717200827e-02,3.475123136810815e-02,-2.614415312043564e-02,2.009271254166353e-02,-1.529478597700121e-02,1.079402349318156e-02,-4.250203183073701e-03}, +{ 2.121099338949761e-02,9.984251127813160e-01,-2.793942326304276e-02,1.320914576211810e-02,-8.238281698705406e-03,5.768274729927900e-03,-4.286170531482241e-03,3.270733122239054e-03,-2.479364127464724e-03,1.745766566035862e-03,-6.867867304394090e-04}, +{ -1.033958345536636e-01,4.491647978920374e-01,7.902387927362453e-01,-2.048627305319437e-01,1.128483048895986e-01,-7.518890430851938e-02,5.449719719539203e-02,-4.101354560400348e-02,3.084264101316998e-02,-2.162263036249295e-02,8.491911634179831e-03}, +{ 5.794720211099047e-02,-1.870842290226995e-01,8.530141233206832e-01,3.705424624080323e-01,-1.479796551381559e-01,8.957649942489576e-02,-6.217839610733750e-02,4.573620765900246e-02,-3.395644879969861e-02,2.364287971099773e-02,-9.260645566710461e-03}, +{ 3.617024169526667e-03,-1.049557369158134e-02,2.331929055367995e-02,9.991093224151597e-01,-2.239737442787014e-02,1.104387153391828e-02,-7.111868336313108e-03,5.044620753280629e-03,-3.673253403371619e-03,2.531738975220410e-03,-9.877985416494048e-04}, +{ -3.312637367740770e-02,9.128003701413417e-02,-1.649692778222635e-01,4.332893349090227e-01,8.102214294943427e-01,-2.069066474596359e-01,1.155518085627475e-01,-7.726693092136593e-02,5.463338423446910e-02,-3.710230963171298e-02,1.439554529766987e-02}, +{ 2.485862822236851e-02,-6.653337511538525e-02,1.092279254301183e-01,-2.014731379482927e-01,8.341097917196516e-01,4.010537802846122e-01,-1.587210768736452e-01,9.569639101241735e-02,-6.463725152792087e-02,4.294955745729186e-02,-1.653123266121581e-02}, +{ 0.000000000000000e+00,-2.258210522371657e-16,3.597566144618557e-16,-7.870200690214702e-16,1.592993523032929e-15,1.000000000000000e+00,-1.612884496067945e-15,8.267703145419572e-16,-4.178341836242768e-16,2.859126263250758e-16,-1.110223024625157e-16}, +{ -1.653123266121559e-02,4.294955745729133e-02,-6.463725152792019e-02,9.569639101241649e-02,-1.587210768736437e-01,4.010537802846095e-01,8.341097917196524e-01,-2.014731379482911e-01,1.092279254301174e-01,-6.653337511538464e-02,2.485862822236806e-02}, +{ 1.439554529767037e-02,-3.710230963171310e-02,5.463338423446926e-02,-7.726693092136594e-02,1.155518085627473e-01,-2.069066474596368e-01,8.102214294943428e-01,4.332893349090234e-01,-1.649692778222637e-01,9.128003701413435e-02,-3.312637367740776e-02}, +{ -9.877985416492097e-04,2.531738975220279e-03,-3.673253403371491e-03,5.044620753280272e-03,-7.111868336313294e-03,1.104387153391850e-02,-2.239737442787019e-02,9.991093224151602e-01,2.331929055367986e-02,-1.049557369158137e-02,3.617024169526472e-03}, +{ -9.260645566710551e-03,2.364287971099801e-02,-3.395644879969901e-02,4.573620765900283e-02,-6.217839610733791e-02,8.957649942489637e-02,-1.479796551381565e-01,3.705424624080347e-01,8.530141233206817e-01,-1.870842290227005e-01,5.794720211099071e-02}, +{ 8.491911634179866e-03,-2.162263036249294e-02,3.084264101317000e-02,-4.101354560400350e-02,5.449719719539171e-02,-7.518890430851935e-02,1.128483048895989e-01,-2.048627305319435e-01,7.902387927362455e-01,4.491647978920364e-01,-1.033958345536631e-01}, +{ -6.867867304395196e-04,1.745766566035650e-03,-2.479364127464513e-03,3.270733122238926e-03,-4.286170531482016e-03,5.768274729927497e-03,-8.238281698705352e-03,1.320914576211842e-02,-2.793942326304344e-02,9.984251127813162e-01,2.121099338949813e-02}, +{ -4.250203183073765e-03,1.079402349318151e-02,-1.529478597700126e-02,2.009271254166371e-02,-2.614415312043556e-02,3.475123136810836e-02,-4.848440717200930e-02,7.382147782280112e-02,-1.332109883754002e-01,3.842494891382043e-01,7.036756034639610e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==16 +const dfloat c_I[16][11] = { +{ 7.350855194368920e-01,3.453110527954979e-01,-1.219904362422669e-01,6.784522264239309e-02,-4.461699939615035e-02,3.199871386182227e-02,-2.408126828086024e-02,1.851083164295929e-02,-1.409222065381311e-02,9.945948698120309e-03,-3.916364504594327e-03}, +{ 7.634978246935735e-02,9.826563986015249e-01,-8.480944964157526e-02,4.120352140018057e-02,-2.590729069519573e-02,1.820439638487219e-02,-1.355207477498765e-02,1.035238053874021e-02,-7.852409981133634e-03,5.530894667280691e-03,-2.176148969063674e-03}, +{ -1.241523798409527e-01,5.990973362196249e-01,6.595936087857278e-01,-2.051203814056521e-01,1.162803060610645e-01,-7.831049931566031e-02,5.705667553235703e-02,-4.306293792443403e-02,3.243685060869567e-02,-2.276043495611150e-02,8.941856235340780e-03}, +{ 3.565810388537882e-02,-1.199067399240145e-01,9.623338366578655e-01,1.694011146620689e-01,-7.537267814901578e-02,4.691662599258728e-02,-3.295852842113407e-02,2.439337049621268e-02,-1.817260127258945e-02,1.267604632537479e-02,-4.968550252734245e-03}, +{ 3.444762503018264e-02,-1.022386630731453e-01,2.534946560502238e-01,9.236111646047084e-01,-1.623226036993190e-01,8.656876866911045e-02,-5.722973135299685e-02,4.109480475635611e-02,-3.011641145477485e-02,2.082652318900412e-02,-8.136132719349577e-03}, +{ -3.950255105130249e-02,1.104406548059716e-01,-2.104124481901543e-01,7.375481188118108e-01,5.241239037089964e-01,-1.899455023875375e-01,1.128236445630454e-01,-7.728187316407004e-02,5.528582926736475e-02,-3.776391459739407e-02,1.468413823326950e-02}, +{ 4.940381339437090e-03,-1.335743427778205e-02,2.264434069581420e-02,-4.621884261357917e-02,9.959976948219442e-01,5.121367475151450e-02,-2.441785482570094e-02,1.548007864213434e-02,-1.068147725447037e-02,7.168744727908636e-03,-2.769306007220480e-03}, +{ 2.175764142899705e-02,-5.762475640413712e-02,9.163001079378656e-02,-1.546678301826039e-01,3.896180951210359e-01,8.421181281663759e-01,-2.001535407639001e-01,1.101530952089582e-01,-7.183231423737101e-02,4.698335978042928e-02,-1.798188891157076e-02}, +{ -1.798188891157060e-02,4.698335978042885e-02,-7.183231423737049e-02,1.101530952089577e-01,-2.001535407638990e-01,8.421181281663750e-01,3.896180951210354e-01,-1.546678301826031e-01,9.163001079378616e-02,-5.762475640413678e-02,2.175764142899689e-02}, +{ -2.769306007220022e-03,7.168744727908174e-03,-1.068147725446965e-02,1.548007864213356e-02,-2.441785482570026e-02,5.121367475151217e-02,9.959976948219442e-01,-4.621884261357676e-02,2.264434069581304e-02,-1.335743427778111e-02,4.940381339436743e-03}, +{ 1.468413823326939e-02,-3.776391459739407e-02,5.528582926736482e-02,-7.728187316407008e-02,1.128236445630452e-01,-1.899455023875376e-01,5.241239037089955e-01,7.375481188118119e-01,-2.104124481901545e-01,1.104406548059719e-01,-3.950255105130249e-02}, +{ -8.136132719349598e-03,2.082652318900437e-02,-3.011641145477509e-02,4.109480475635625e-02,-5.722973135299695e-02,8.656876866911023e-02,-1.623226036993187e-01,9.236111646047089e-01,2.534946560502236e-01,-1.022386630731455e-01,3.444762503018244e-02}, +{ -4.968550252734481e-03,1.267604632537491e-02,-1.817260127258945e-02,2.439337049621278e-02,-3.295852842113436e-02,4.691662599258778e-02,-7.537267814901621e-02,1.694011146620697e-01,9.623338366578656e-01,-1.199067399240154e-01,3.565810388537901e-02}, +{ 8.941856235340884e-03,-2.276043495611157e-02,3.243685060869561e-02,-4.306293792443387e-02,5.705667553235683e-02,-7.831049931566036e-02,1.162803060610645e-01,-2.051203814056519e-01,6.595936087857314e-01,5.990973362196207e-01,-1.241523798409521e-01}, +{ -2.176148969063540e-03,5.530894667280271e-03,-7.852409981133318e-03,1.035238053874027e-02,-1.355207477498747e-02,1.820439638487181e-02,-2.590729069519514e-02,4.120352140017947e-02,-8.480944964157401e-02,9.826563986015260e-01,7.634978246935559e-02}, +{ -3.916364504594420e-03,9.945948698120667e-03,-1.409222065381359e-02,1.851083164296008e-02,-2.408126828086083e-02,3.199871386182306e-02,-4.461699939615160e-02,6.784522264239494e-02,-1.219904362422704e-01,3.453110527955092e-01,7.350855194368828e-01} +}; +#endif +#if p_Nq==11 && p_cubNq==17 +const dfloat c_I[17][11] = { +{ 7.619315811464084e-01,3.116514397196330e-01,-1.117927454106849e-01,6.235745097327313e-02,-4.105212111701909e-02,2.945691595188686e-02,-2.217443853053827e-02,1.704778460615297e-02,-1.297962002053516e-02,9.161173291337511e-03,-3.607420609914322e-03}, +{ 1.314734981190437e-01,9.549254732808847e-01,-1.251958133650991e-01,6.214311470959652e-02,-3.933190819013010e-02,2.771837859602851e-02,-2.066628284097465e-02,1.580070111441875e-02,-1.199114818421336e-02,8.448405211175448e-03,-3.324418450730567e-03}, +{ -1.320996391968198e-01,7.225198335304361e-01,5.300920248815486e-01,-1.855025819527315e-01,1.075169397652042e-01,-7.303234087081611e-02,5.343686672775434e-02,-4.042549623865728e-02,3.049115900979210e-02,-2.141078285730538e-02,8.414017201594765e-03}, +{ 4.234230555587887e-03,-1.489618337651761e-02,9.995822320194848e-01,1.579914935459337e-02,-7.565033757471192e-03,4.810173697244235e-03,-3.411171056010894e-03,2.537234688136616e-03,-1.895424008971191e-03,1.324080578059816e-03,-5.192886941358392e-04}, +{ 5.610413289075944e-02,-1.706557076110585e-01,4.859328317507915e-01,7.674450500510220e-01,-2.107972547623093e-01,1.186667266247803e-01,-8.001670628296691e-02,5.800886877674837e-02,-4.272930631355640e-02,2.962753663781842e-02,-1.158617176202903e-02}, +{ -2.715325469312591e-02,7.711431177343216e-02,-1.562024710459345e-01,9.301358402969619e-01,2.422886847960935e-01,-1.048747291950034e-01,6.501373312318315e-02,-4.534058783115306e-02,3.272863004830976e-02,-2.245751017572204e-02,8.747352902958486e-03}, +{ -2.019482520662935e-02,5.520058442419199e-02,-9.702613224370210e-02,2.261298908216440e-01,9.381535513907050e-01,-1.515776986485073e-01,8.043832070492674e-02,-5.280342629161248e-02,3.701344875523856e-02,-2.502987500960744e-02,9.696161303352449e-03}, +{ 2.707039899837027e-02,-7.227317608398286e-02,1.177437323032472e-01,-2.123871706788521e-01,7.532512088404493e-01,5.063027324267860e-01,-1.862692225105532e-01,1.104461838256373e-01,-7.409658886322897e-02,4.908255044455809e-02,-1.887064870243113e-02}, +{ 0.000000000000000e+00,-1.149383427025859e-16,2.360021414095731e-16,-3.790158691012836e-16,8.487172111519828e-16,1.000000000000000e+00,-8.400817001740101e-16,4.084191313438337e-16,-3.030163694606237e-16,1.937791255564813e-16,0.000000000000000e+00}, +{ -1.887064870243099e-02,4.908255044455784e-02,-7.409658886322860e-02,1.104461838256369e-01,-1.862692225105526e-01,5.063027324267861e-01,7.532512088404479e-01,-2.123871706788512e-01,1.177437323032470e-01,-7.227317608398262e-02,2.707039899837022e-02}, +{ 9.696161303352380e-03,-2.502987500960735e-02,3.701344875523872e-02,-5.280342629161251e-02,8.043832070492637e-02,-1.515776986485078e-01,9.381535513907056e-01,2.261298908216437e-01,-9.702613224370202e-02,5.520058442419219e-02,-2.019482520662928e-02}, +{ 8.747352902958402e-03,-2.245751017572189e-02,3.272863004830947e-02,-4.534058783115285e-02,6.501373312318279e-02,-1.048747291950031e-01,2.422886847960918e-01,9.301358402969628e-01,-1.562024710459338e-01,7.711431177343198e-02,-2.715325469312573e-02}, +{ -1.158617176202910e-02,2.962753663781854e-02,-4.272930631355670e-02,5.800886877674848e-02,-8.001670628296702e-02,1.186667266247809e-01,-2.107972547623096e-01,7.674450500510208e-01,4.859328317507929e-01,-1.706557076110591e-01,5.610413289075979e-02}, +{ -5.192886941359776e-04,1.324080578060024e-03,-1.895424008971469e-03,2.537234688136703e-03,-3.411171056011034e-03,4.810173697244569e-03,-7.565033757471957e-03,1.579914935459507e-02,9.995822320194848e-01,-1.489618337651918e-02,4.234230555588359e-03}, +{ 8.414017201595236e-03,-2.141078285730559e-02,3.049115900979248e-02,-4.042549623865782e-02,5.343686672775429e-02,-7.303234087081570e-02,1.075169397652036e-01,-1.855025819527302e-01,5.300920248815443e-01,7.225198335304395e-01,-1.320996391968198e-01}, +{ -3.324418450730476e-03,8.448405211175422e-03,-1.199114818421340e-02,1.580070111441897e-02,-2.066628284097442e-02,2.771837859602816e-02,-3.933190819012990e-02,6.214311470959598e-02,-1.251958133650988e-01,9.549254732808856e-01,1.314734981190430e-01}, +{ -3.607420609914209e-03,9.161173291337633e-03,-1.297962002053566e-02,1.704778460615363e-02,-2.217443853053891e-02,2.945691595188776e-02,-4.105212111702031e-02,6.235745097327511e-02,-1.117927454106892e-01,3.116514397196462e-01,7.619315811463979e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==12 +const dfloat c_I[12][12] = { +{ 5.019164041245392e-01,6.209959140141242e-01,-1.841512206971929e-01,9.940421193553202e-02,-6.475590199932242e-02,4.635512796603666e-02,-3.503261321820807e-02,2.728990492396322e-02,-2.146922465000268e-02,1.659464613408724e-02,-1.181781979606144e-02,4.670571262504931e-03}, +{ -1.319999363979834e-01,7.634233777949329e-01,4.817146252216118e-01,-1.747506134229288e-01,1.021470517883991e-01,-6.978795570554829e-02,5.147331552807630e-02,-3.953674357601834e-02,3.083913626693234e-02,-2.371344730092262e-02,1.683811032182447e-02,-6.646920518375461e-03}, +{ 5.717807517608126e-02,-1.849402609667591e-01,8.603699800507318e-01,3.595808107728564e-01,-1.446606454791853e-01,8.791815947695640e-02,-6.142931118825373e-02,4.581552411492033e-02,-3.512464447043665e-02,2.673210827490028e-02,-1.887326788689482e-02,7.433472125083039e-03}, +{ -2.737861539596954e-02,7.772705346845647e-02,-1.572813182942002e-01,9.287964515693872e-01,2.450240030923576e-01,-1.060859647394716e-01,6.606128664494680e-02,-4.662550260137551e-02,3.467832952206706e-02,-2.593719629320162e-02,1.813955732756585e-02,-7.118084300562474e-03}, +{ 1.199646679120869e-02,-3.232803864568125e-02,5.424308618560786e-02,-1.071586407957148e-01,9.742753652734651e-01,1.387816047835083e-01,-6.363640218526202e-02,4.024693390387515e-02,-2.838174048015264e-02,2.062816943984657e-02,-1.421119218271429e-02,5.544387912013345e-03}, +{ -3.048657324396083e-03,8.003573189800274e-03,-1.240733029470075e-02,1.967445238389168e-02,-3.959369460815068e-02,9.971344127374986e-01,4.311564614016821e-02,-2.069586613987032e-02,1.317433984708245e-02,-9.117142214171115e-03,6.130319958917105e-03,-2.370053676069232e-03}, +{ -2.370053676069033e-03,6.130319958916709e-03,-9.117142214170997e-03,1.317433984708240e-02,-2.069586613986966e-02,4.311564614016660e-02,9.971344127374991e-01,-3.959369460814973e-02,1.967445238389094e-02,-1.240733029470024e-02,8.003573189800002e-03,-3.048657324396136e-03}, +{ 5.544387912013280e-03,-1.421119218271426e-02,2.062816943984674e-02,-2.838174048015277e-02,4.024693390387506e-02,-6.363640218526159e-02,1.387816047835067e-01,9.742753652734660e-01,-1.071586407957139e-01,5.424308618560717e-02,-3.232803864568098e-02,1.199646679120863e-02}, +{ -7.118084300562544e-03,1.813955732756585e-02,-2.593719629320142e-02,3.467832952206696e-02,-4.662550260137507e-02,6.606128664494611e-02,-1.060859647394708e-01,2.450240030923554e-01,9.287964515693873e-01,-1.572813182941984e-01,7.772705346845581e-02,-2.737861539596921e-02}, +{ 7.433472125083267e-03,-1.887326788689494e-02,2.673210827490057e-02,-3.512464447043726e-02,4.581552411492057e-02,-6.142931118825378e-02,8.791815947695691e-02,-1.446606454791863e-01,3.595808107728576e-01,8.603699800507314e-01,-1.849402609667597e-01,5.717807517608182e-02}, +{ -6.646920518375510e-03,1.683811032182457e-02,-2.371344730092260e-02,3.083913626693195e-02,-3.953674357601816e-02,5.147331552807627e-02,-6.978795570554859e-02,1.021470517884002e-01,-1.747506134229302e-01,4.817146252216153e-01,7.634233777949317e-01,-1.319999363979850e-01}, +{ 4.670571262505141e-03,-1.181781979606160e-02,1.659464613408767e-02,-2.146922465000332e-02,2.728990492396360e-02,-3.503261321820863e-02,4.635512796603725e-02,-6.475590199932296e-02,9.940421193553228e-02,-1.841512206971933e-01,6.209959140141331e-01,5.019164041245308e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==13 +const dfloat c_I[13][12] = { +{ 5.603940157942685e-01,5.550380706348198e-01,-1.735606863857827e-01,9.449078476625082e-02,-6.173986945895071e-02,4.425789989901281e-02,-3.347294055047818e-02,2.608656506663621e-02,-2.052816908315785e-02,1.586992081183702e-02,-1.130278088142775e-02,4.467189386971963e-03}, +{ -1.185389261578744e-01,8.801163156409332e-01,3.207927813158102e-01,-1.284799245008408e-01,7.678165229676147e-02,-5.292994029020950e-02,3.921703520244131e-02,-3.020058412829583e-02,2.359348151902625e-02,-1.815905056184374e-02,1.290092552964333e-02,-5.093765865551494e-03}, +{ 2.707612017826174e-02,-9.221161897232058e-02,9.800276636899632e-01,1.192658267404009e-01,-5.441845399828601e-02,3.419157172246844e-02,-2.423782479325060e-02,1.821548564483622e-02,-1.402650252534998e-02,1.070278967253682e-02,-7.567162597406551e-03,2.982105238146209e-03}, +{ 7.049059677191607e-03,-2.049807734414752e-02,4.601024332640152e-02,9.966763940054971e-01,-4.228202301888717e-02,2.109577874451603e-02,-1.369898312358754e-02,9.854158204943937e-03,-7.403946458944734e-03,5.569534690765598e-03,-3.907196312583116e-03,1.535057608834386e-03}, +{ -2.042480902933798e-02,5.582024278884220e-02,-9.809592768203672e-02,2.287954811644293e-01,9.369122012492656e-01,-1.528338303807686e-01,8.148928976598355e-02,-5.409390997387802e-02,3.901174614838986e-02,-2.869021443175241e-02,1.988638665361020e-02,-7.776656272746968e-03}, +{ 2.337996692142291e-02,-6.194833926566721e-02,9.866394857735693e-02,-1.674398614377767e-01,4.333610512594011e-01,8.108987587176562e-01,-2.074807212033780e-01,1.163027618467164e-01,-7.804714082512967e-02,5.534278599310231e-02,-3.765528992140105e-02,1.462207933769694e-02}, +{ -2.050781250000014e-02,5.338924668599516e-02,-8.082637279563824e-02,1.213558294955848e-01,-2.090335758180522e-01,6.356226849321110e-01,6.356226849321098e-01,-2.090335758180522e-01,1.213558294955846e-01,-8.082637279563791e-02,5.338924668599510e-02,-2.050781250000008e-02}, +{ 1.462207933769689e-02,-3.765528992140116e-02,5.534278599310244e-02,-7.804714082512937e-02,1.163027618467160e-01,-2.074807212033775e-01,8.108987587176586e-01,4.333610512593979e-01,-1.674398614377760e-01,9.866394857735611e-02,-6.194833926566681e-02,2.337996692142282e-02}, +{ -7.776656272747033e-03,1.988638665361027e-02,-2.869021443175226e-02,3.901174614838936e-02,-5.409390997387746e-02,8.148928976598309e-02,-1.528338303807684e-01,9.369122012492656e-01,2.287954811644289e-01,-9.809592768203605e-02,5.582024278884207e-02,-2.042480902933804e-02}, +{ 1.535057608834141e-03,-3.907196312583218e-03,5.569534690765818e-03,-7.403946458944773e-03,9.854158204944116e-03,-1.369898312358796e-02,2.109577874451643e-02,-4.228202301888866e-02,9.966763940054973e-01,4.601024332640286e-02,-2.049807734414814e-02,7.049059677191881e-03}, +{ 2.982105238146047e-03,-7.567162597406415e-03,1.070278967253685e-02,-1.402650252534985e-02,1.821548564483583e-02,-2.423782479325035e-02,3.419157172246814e-02,-5.441845399828595e-02,1.192658267404008e-01,9.800276636899634e-01,-9.221161897232044e-02,2.707612017826197e-02}, +{ -5.093765865551547e-03,1.290092552964336e-02,-1.815905056184373e-02,2.359348151902601e-02,-3.020058412829557e-02,3.921703520244060e-02,-5.292994029020861e-02,7.678165229676136e-02,-1.284799245008409e-01,3.207927813158096e-01,8.801163156409348e-01,-1.185389261578754e-01}, +{ 4.467189386971998e-03,-1.130278088142789e-02,1.586992081183725e-02,-2.052816908315792e-02,2.608656506663598e-02,-3.347294055047808e-02,4.425789989901236e-02,-6.173986945894986e-02,9.449078476624956e-02,-1.735606863857797e-01,5.550380706348118e-01,5.603940157942744e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==14 +const dfloat c_I[14][12] = { +{ 6.099745919326937e-01,4.973099551884862e-01,-1.617643749746474e-01,8.866033386165585e-02,-5.806804274570518e-02,4.167205929925971e-02,-3.153622796272119e-02,2.458596332078415e-02,-1.935154121960070e-02,1.496229957961035e-02,-1.065716771376485e-02,4.212151433949341e-03}, +{ -8.902830207694014e-02,9.519274159942585e-01,1.884730884693850e-01,-8.073200088280762e-02,4.905686576164481e-02,-3.405258696013962e-02,2.532012908863934e-02,-1.953843181544363e-02,1.528269969370350e-02,-1.177131011178064e-02,8.366314577642628e-03,-3.303881738162108e-03}, +{ -1.413728294082652e-02,5.105725093128494e-02,9.957209627895480e-01,-4.708010048410467e-02,2.326185510868618e-02,-1.497029873177896e-02,1.072805917342503e-02,-8.109680191216400e-03,6.266001876756223e-03,-4.790882288693494e-03,3.391079589701734e-03,-1.336964832782108e-03}, +{ 4.127896322736023e-02,-1.232621935679713e-01,3.157933806318734e-01,8.883701109326123e-01,-1.826754800965880e-01,9.919756250273638e-02,-6.631345367438016e-02,4.836759434697850e-02,-3.661820092511527e-02,2.766574083674848e-02,-1.945428140813139e-02,7.650257193876905e-03}, +{ -3.906657180575662e-02,1.084248556109780e-01,-2.011280618242421e-01,6.029418791244703e-01,6.664563280804420e-01,-2.116484277683015e-01,1.229674091546512e-01,-8.430532586993582e-02,6.177251514416808e-02,-4.582077288075848e-02,3.190417072340893e-02,-1.249799768912397e-02}, +{ 2.448863169498034e-02,-6.554407881775384e-02,1.076442202323580e-01,-1.990202131350262e-01,8.443659996174220e-01,3.865598655604807e-01,-1.548727778679911e-01,9.448735837574591e-02,-6.559463034057937e-02,4.729486525334075e-02,-3.244956357786613e-02,1.264032300488892e-02}, +{ -7.412905724121920e-03,1.943577541581564e-02,-3.001827218599383e-02,4.718187984803913e-02,-9.237850245078513e-02,9.819593715093889e-01,1.144029473789542e-01,-5.304740776484929e-02,3.342018029209186e-02,-2.302268717410437e-02,1.544674783247392e-02,-5.967126976909243e-03}, +{ -5.967126976909176e-03,1.544674783247372e-02,-2.302268717410441e-02,3.342018029209191e-02,-5.304740776484906e-02,1.144029473789534e-01,9.819593715093893e-01,-9.237850245078445e-02,4.718187984803850e-02,-3.001827218599348e-02,1.943577541581547e-02,-7.412905724121669e-03}, +{ 1.264032300488864e-02,-3.244956357786621e-02,4.729486525334115e-02,-6.559463034057973e-02,9.448735837574611e-02,-1.548727778679914e-01,3.865598655604807e-01,8.443659996174224e-01,-1.990202131350261e-01,1.076442202323576e-01,-6.554407881775377e-02,2.448863169498035e-02}, +{ -1.249799768912424e-02,3.190417072340906e-02,-4.582077288075850e-02,6.177251514416788e-02,-8.430532586993553e-02,1.229674091546506e-01,-2.116484277683010e-01,6.664563280804429e-01,6.029418791244684e-01,-2.011280618242406e-01,1.084248556109776e-01,-3.906657180575668e-02}, +{ 7.650257193876991e-03,-1.945428140813153e-02,2.766574083674885e-02,-3.661820092511546e-02,4.836759434697861e-02,-6.631345367438060e-02,9.919756250273709e-02,-1.826754800965898e-01,8.883701109326115e-01,3.157933806318760e-01,-1.232621935679724e-01,4.127896322736085e-02}, +{ -1.336964832782067e-03,3.391079589701629e-03,-4.790882288693339e-03,6.266001876756025e-03,-8.109680191216401e-03,1.072805917342496e-02,-1.497029873177865e-02,2.326185510868536e-02,-4.708010048410312e-02,9.957209627895485e-01,5.105725093128346e-02,-1.413728294082629e-02}, +{ -3.303881738162349e-03,8.366314577642617e-03,-1.177131011178044e-02,1.528269969370333e-02,-1.953843181544348e-02,2.532012908863875e-02,-3.405258696013927e-02,4.905686576164543e-02,-8.073200088280852e-02,1.884730884693859e-01,9.519274159942592e-01,-8.902830207694132e-02}, +{ 4.212151433949428e-03,-1.065716771376507e-02,1.496229957961065e-02,-1.935154121960098e-02,2.458596332078434e-02,-3.153622796272153e-02,4.167205929925991e-02,-5.806804274570509e-02,8.866033386165548e-02,-1.617643749746464e-01,4.973099551884864e-01,6.099745919326928e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==15 +const dfloat c_I[15][12] = { +{ 6.521624623019628e-01,4.470113024591535e-01,-1.498733562042901e-01,8.258422381652730e-02,-5.419219721689818e-02,3.892557049681102e-02,-2.947214945410797e-02,2.298341381676375e-02,-1.809339022395410e-02,1.399104020875481e-02,-9.965983625370843e-03,3.939063624647778e-03}, +{ -4.887024641988410e-02,9.888407196882655e-01,8.388839591940389e-02,-3.771513413433718e-02,2.321502397914722e-02,-1.620303934451514e-02,1.208212613607727e-02,-9.338467562596991e-03,7.311649228772010e-03,-5.635090393039549e-03,4.006423339022330e-03,-1.582360436315248e-03}, +{ -5.551404328400016e-02,2.143789444436453e-01,9.406200144642968e-01,-1.465583018203616e-01,7.648397175993189e-02,-5.011505335764083e-02,3.621651792206539e-02,-2.750297598311513e-02,2.130769701788392e-02,-1.631771596909254e-02,1.156031948582912e-02,-4.559374679442131e-03}, +{ 6.126135471509359e-02,-1.883586640645724e-01,5.746629679291637e-01,6.906841788691174e-01,-2.116181647321014e-01,1.215462305332894e-01,-8.299092325830135e-02,6.117000537649409e-02,-4.658263076783322e-02,3.531356292279075e-02,-2.487820371964044e-02,9.790286196499886e-03}, +{ -3.362775095797359e-02,9.491094839301563e-02,-1.876116513230702e-01,8.728201144008336e-01,3.427008142964079e-01,-1.405579181136384e-01,8.630232829445514e-02,-6.053875445256585e-02,4.488193248614482e-02,-3.350855184004943e-02,2.341212757536546e-02,-9.183638758925090e-03}, +{ 2.178762907402900e-03,-5.895779318600100e-03,1.002582641296354e-02,-2.070792613589570e-02,9.992444803199280e-01,2.168762608688227e-02,-1.052695125983343e-02,6.772898399922372e-03,-4.813401440546636e-03,3.512569268410331e-03,-2.424918206998018e-03,9.468129663643999e-04}, +{ 1.736098886047464e-02,-4.587064951898840e-02,7.244911866146894e-02,-1.203127802356511e-01,2.847471924151084e-01,9.080037857407663e-01,-1.737821930782882e-01,9.401282956365806e-02,-6.226917802818822e-02,4.388132930686880e-02,-2.976567231022463e-02,1.154522862299525e-02}, +{ -2.050781250000000e-02,5.338924668599515e-02,-8.082637279563833e-02,1.213558294955848e-01,-2.090335758180522e-01,6.356226849321114e-01,6.356226849321095e-01,-2.090335758180522e-01,1.213558294955846e-01,-8.082637279563777e-02,5.338924668599507e-02,-2.050781250000000e-02}, +{ 1.154522862299537e-02,-2.976567231022502e-02,4.388132930686928e-02,-6.226917802818878e-02,9.401282956365878e-02,-1.737821930782893e-01,9.080037857407655e-01,2.847471924151109e-01,-1.203127802356521e-01,7.244911866146911e-02,-4.587064951898875e-02,1.736098886047504e-02}, +{ 9.468129663642703e-04,-2.424918206998043e-03,3.512569268410400e-03,-4.813401440546645e-03,6.772898399922556e-03,-1.052695125983376e-02,2.168762608688255e-02,9.992444803199279e-01,-2.070792613589566e-02,1.002582641296347e-02,-5.895779318600094e-03,2.178762907403033e-03}, +{ -9.183638758925014e-03,2.341212757536536e-02,-3.350855184004936e-02,4.488193248614469e-02,-6.053875445256544e-02,8.630232829445458e-02,-1.405579181136378e-01,3.427008142964066e-01,8.728201144008334e-01,-1.876116513230687e-01,9.491094839301517e-02,-3.362775095797353e-02}, +{ 9.790286196499659e-03,-2.487820371964047e-02,3.531356292279098e-02,-4.658263076783315e-02,6.117000537649407e-02,-8.299092325830133e-02,1.215462305332897e-01,-2.116181647321030e-01,6.906841788691203e-01,5.746629679291618e-01,-1.883586640645726e-01,6.126135471509418e-02}, +{ -4.559374679442422e-03,1.156031948582908e-02,-1.631771596909216e-02,2.130769701788343e-02,-2.750297598311514e-02,3.621651792206545e-02,-5.011505335764083e-02,7.648397175993202e-02,-1.465583018203614e-01,9.406200144642974e-01,2.143789444436448e-01,-5.551404328400023e-02}, +{ -1.582360436315480e-03,4.006423339022172e-03,-5.635090393039359e-03,7.311649228771602e-03,-9.338467562596649e-03,1.208212613607639e-02,-1.620303934451455e-02,2.321502397914773e-02,-3.771513413433816e-02,8.388839591940610e-02,9.888407196882657e-01,-4.887024641988568e-02}, +{ 3.939063624647979e-03,-9.965983625371169e-03,1.399104020875528e-02,-1.809339022395455e-02,2.298341381676416e-02,-2.947214945410840e-02,3.892557049681146e-02,-5.419219721689888e-02,8.258422381652800e-02,-1.498733562042913e-01,4.470113024591619e-01,6.521624623019556e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==16 +const dfloat c_I[16][12] = { +{ 6.882273443425490e-01,4.032210635587743e-01,-1.384540716774436e-01,7.662487752851876e-02,-5.036053148921860e-02,3.620002962202334e-02,-2.741952740877263e-02,2.138777784780731e-02,-1.683970510046099e-02,1.302277169328412e-02,-9.276743974295705e-03,3.666715057234615e-03}, +{ -2.378176089451667e-03,9.999784779275857e-01,3.396871377776381e-03,-1.583705183138938e-03,9.848385285867256e-04,-6.904104846773604e-04,5.160045301565660e-04,-3.993587010251463e-04,3.129346400961053e-04,-2.412973515747022e-04,1.716044361881330e-04,-6.778363052174743e-05}, +{ -9.020347805437337e-02,3.761346997097528e-01,8.443453167238231e-01,-1.950215660841880e-01,1.058984241766511e-01,-7.036045557711657e-02,5.118648099860901e-02,-3.901419549445587e-02,3.029164593369032e-02,-2.322793576820857e-02,1.646780988339301e-02,-6.496746447576864e-03}, +{ 6.301820606523165e-02,-2.000954382304416e-01,7.792020436810861e-01,4.711274928798300e-01,-1.763868406759652e-01,1.054166748915532e-01,-7.313944488696736e-02,5.435057177027430e-02,-4.158124182021556e-02,3.160727581453755e-02,-2.230016805890479e-02,8.780868569981668e-03}, +{ -1.039652155592965e-02,2.988365610732002e-02,-6.369300498038125e-02,9.918900160376286e-01,7.409610360441005e-02,-3.493303205734841e-02,2.228715852873881e-02,-1.590173183136537e-02,1.189544760522905e-02,-8.925941982109499e-03,6.253389305494352e-03,-2.455538781686873e-03}, +{ -2.412080590763521e-02,6.605009796342461e-02,-1.168477800924772e-01,2.799940295215687e-01,9.102780686292208e-01,-1.721181243452830e-01,9.305848083740197e-02,-6.208211311151351e-02,4.488028085381569e-02,-3.304850482085497e-02,2.292270907121367e-02,-8.966338598881518e-03}, +{ 2.772421877311191e-02,-7.386458597774957e-02,1.196103532406448e-01,-2.123436376454268e-01,6.877901260699006e-01,5.811456531402401e-01,-2.015884947171465e-01,1.190797053243304e-01,-8.156605538278694e-02,5.841819369984861e-02,-3.994614845265650e-02,1.554067192768973e-02}, +{ -1.043287185909121e-02,2.732784298685267e-02,-4.209336151706271e-02,6.574069583317863e-02,-1.262641313656907e-01,9.619591213920751e-01,1.725654241121828e-01,-7.774798175722061e-02,4.857821253536432e-02,-3.334474146484569e-02,2.233417526345225e-02,-8.622384159194956e-03}, +{ -8.622384159194767e-03,2.233417526345208e-02,-3.334474146484576e-02,4.857821253536417e-02,-7.774798175722000e-02,1.725654241121812e-01,9.619591213920763e-01,-1.262641313656905e-01,6.574069583317799e-02,-4.209336151706215e-02,2.732784298685239e-02,-1.043287185909102e-02}, +{ 1.554067192768945e-02,-3.994614845265662e-02,5.841819369984906e-02,-8.156605538278718e-02,1.190797053243304e-01,-2.015884947171467e-01,5.811456531402387e-01,6.877901260699029e-01,-2.123436376454269e-01,1.196103532406445e-01,-7.386458597774942e-02,2.772421877311186e-02}, +{ -8.966338598881404e-03,2.292270907121366e-02,-3.304850482085500e-02,4.488028085381551e-02,-6.208211311151309e-02,9.305848083740183e-02,-1.721181243452830e-01,9.102780686292196e-01,2.799940295215699e-01,-1.168477800924770e-01,6.605009796342459e-02,-2.412080590763562e-02}, +{ -2.455538781686827e-03,6.253389305494247e-03,-8.925941982109272e-03,1.189544760522886e-02,-1.590173183136528e-02,2.228715852873859e-02,-3.493303205734810e-02,7.409610360440903e-02,9.918900160376285e-01,-6.369300498038008e-02,2.988365610731954e-02,-1.039652155592911e-02}, +{ 8.780868569981629e-03,-2.230016805890473e-02,3.160727581453788e-02,-4.158124182021573e-02,5.435057177027419e-02,-7.313944488696732e-02,1.054166748915537e-01,-1.763868406759667e-01,4.711274928798315e-01,7.792020436810859e-01,-2.000954382304422e-01,6.301820606523187e-02}, +{ -6.496746447576848e-03,1.646780988339304e-02,-2.322793576820851e-02,3.029164593369016e-02,-3.901419549445556e-02,5.118648099860840e-02,-7.036045557711616e-02,1.058984241766510e-01,-1.950215660841876e-01,8.443453167238270e-01,3.761346997097484e-01,-9.020347805437330e-02}, +{ -6.778363052179414e-05,1.716044361882125e-04,-2.412973515749707e-04,3.129346400962640e-04,-3.993587010249994e-04,5.160045301561163e-04,-6.904104846774058e-04,9.848385285878062e-04,-1.583705183141071e-03,3.396871377781030e-03,9.999784779275859e-01,-2.378176089455094e-03}, +{ 3.666715057234815e-03,-9.276743974296040e-03,1.302277169328458e-02,-1.683970510046147e-02,2.138777784780798e-02,-2.741952740877346e-02,3.620002962202433e-02,-5.036053148921974e-02,7.662487752852032e-02,-1.384540716774461e-01,4.032210635587871e-01,6.882273443425377e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==17 +const dfloat c_I[17][12] = { +{ 7.192169440463921e-01,3.650474683312186e-01,-1.277725318267793e-01,7.096779080507751e-02,-4.670316263398083e-02,3.359163286125069e-02,-2.545230116959379e-02,1.985722005765801e-02,-1.563651804071585e-02,1.209320398710419e-02,-8.614931464625636e-03,3.405185046994391e-03}, +{ 4.726312856949239e-02,9.927936442293724e-01,-5.728855989722225e-02,2.747210720272178e-02,-1.722529048240936e-02,1.211926393668521e-02,-9.074970848954773e-03,7.031239861583820e-03,-5.513314447408918e-03,4.252932841269525e-03,-3.025269092422300e-03,1.195088127292431e-03}, +{ -1.148950092163486e-01,5.234792769094767e-01,7.288497406107711e-01,-2.083249645532708e-01,1.165686074723738e-01,-7.830822289023391e-02,5.727497992260696e-02,-4.378558867296307e-02,3.405681518217304e-02,-2.614302724285311e-02,1.854556980109454e-02,-7.318177322826700e-03}, +{ 4.897069783197941e-02,-1.610894031996001e-01,9.148133648613241e-01,2.695817685647709e-01,-1.140677622756894e-01,7.021137519232463e-02,-4.932494781217774e-02,3.689271386779507e-02,-2.833026977425305e-02,2.158193493948130e-02,-1.524526932821752e-02,6.005797132262447e-03}, +{ 1.889499574267326e-02,-5.539662574096428e-02,1.292387475991107e-01,9.769809679581752e-01,-1.020426538333286e-01,5.253335135381872e-02,-3.446897060366810e-02,2.491606186456306e-02,-1.877052388011833e-02,1.414132483421423e-02,-9.928716995105401e-03,3.902041700629843e-03}, +{ -3.876970257536675e-02,1.075285386755183e-01,-1.989787945695717e-01,5.887146690447046e-01,6.799125464874241e-01,-2.125845229171270e-01,1.231603194314326e-01,-8.434204961115108e-02,6.176422178259365e-02,-4.580033578083477e-02,3.188467248823085e-02,-1.248956245585280e-02}, +{ 1.720818854587449e-02,-4.625955208956142e-02,7.701946470408846e-02,-1.484511844041323e-01,9.413088229225273e-01,2.200982535040504e-01,-9.686740079522188e-02,6.055767243608752e-02,-4.248562836780756e-02,3.079746347284237e-02,-2.118828443255388e-02,8.262184503806735e-03}, +{ 1.158707719176369e-02,-3.055242963461318e-02,4.796631621777794e-02,-7.845474546645216e-02,1.753689928481349e-01,9.608002734630314e-01,-1.278819330346733e-01,6.706528267885012e-02,-4.393757613207522e-02,3.080547867950370e-02,-2.084403420117141e-02,8.077297389923433e-03}, +{ -2.050781250000003e-02,5.338924668599512e-02,-8.082637279563827e-02,1.213558294955849e-01,-2.090335758180521e-01,6.356226849321103e-01,6.356226849321105e-01,-2.090335758180521e-01,1.213558294955844e-01,-8.082637279563784e-02,5.338924668599514e-02,-2.050781250000008e-02}, +{ 8.077297389923563e-03,-2.084403420117153e-02,3.080547867950377e-02,-4.393757613207523e-02,6.706528267885001e-02,-1.278819330346732e-01,9.608002734630322e-01,1.753689928481342e-01,-7.845474546645217e-02,4.796631621777786e-02,-3.055242963461312e-02,1.158707719176373e-02}, +{ 8.262184503807054e-03,-2.118828443255408e-02,3.079746347284293e-02,-4.248562836780804e-02,6.055767243608794e-02,-9.686740079522278e-02,2.200982535040516e-01,9.413088229225268e-01,-1.484511844041327e-01,7.701946470408851e-02,-4.625955208956144e-02,1.720818854587429e-02}, +{ -1.248956245585262e-02,3.188467248823091e-02,-4.580033578083482e-02,6.176422178259339e-02,-8.434204961115067e-02,1.231603194314317e-01,-2.125845229171265e-01,6.799125464874214e-01,5.887146690447069e-01,-1.989787945695710e-01,1.075285386755182e-01,-3.876970257536695e-02}, +{ 3.902041700629869e-03,-9.928716995105604e-03,1.414132483421457e-02,-1.877052388011876e-02,2.491606186456346e-02,-3.446897060366855e-02,5.253335135381978e-02,-1.020426538333312e-01,9.769809679581751e-01,1.292387475991132e-01,-5.539662574096536e-02,1.889499574267349e-02}, +{ 6.005797132262292e-03,-1.524526932821760e-02,2.158193493948179e-02,-2.833026977425365e-02,3.689271386779498e-02,-4.932494781217776e-02,7.021137519232523e-02,-1.140677622756907e-01,2.695817685647728e-01,9.148133648613235e-01,-1.610894031996009e-01,4.897069783197998e-02}, +{ -7.318177322826760e-03,1.854556980109457e-02,-2.614302724285296e-02,3.405681518217248e-02,-4.378558867296303e-02,5.727497992260731e-02,-7.830822289023434e-02,1.165686074723745e-01,-2.083249645532711e-01,7.288497406107686e-01,5.234792769094808e-01,-1.148950092163501e-01}, +{ 1.195088127292354e-03,-3.025269092422226e-03,4.252932841269636e-03,-5.513314447409350e-03,7.031239861584240e-03,-9.074970848955109e-03,1.211926393668509e-02,-1.722529048240863e-02,2.747210720272029e-02,-5.728855989721896e-02,9.927936442293729e-01,4.726312856948973e-02}, +{ 3.405185046994561e-03,-8.614931464626005e-03,1.209320398710461e-02,-1.563651804071641e-02,1.985722005765857e-02,-2.545230116959461e-02,3.359163286125184e-02,-4.670316263398205e-02,7.096779080507924e-02,-1.277725318267827e-01,3.650474683312334e-01,7.192169440463796e-01} +}; +#endif +#if p_Nq==12 && p_cubNq==18 +const dfloat c_I[18][12] = { +{ 7.459871197833838e-01,3.316867348413837e-01,-1.179298884174735e-01,6.569763016972320e-02,-4.328207355492543e-02,3.114701967991654e-02,-2.360664446180328e-02,1.842034407749943e-02,-1.450653708925501e-02,1.121998647469942e-02,-7.993154128475254e-03,3.159462625326422e-03}, +{ 9.782062442679672e-02,9.730354568426142e-01,-1.021900102037668e-01,5.011237970724671e-02,-3.163449047698449e-02,2.232393798214220e-02,-1.674271550595409e-02,1.298408705058184e-02,-1.018674486096434e-02,7.860667684767919e-03,-5.592659832206065e-03,2.209467185726192e-03}, +{ -1.287038973652634e-01,6.501758170143148e-01,6.087103231616995e-01,-1.993104598359153e-01,1.141624321289298e-01,-7.737788209539419e-02,5.684419642483992e-02,-4.356380250148022e-02,3.393437987633271e-02,-2.607221801657771e-02,1.850453671591139e-02,-7.303425507397213e-03}, +{ 2.418939529305075e-02,-8.272506249582444e-02,9.844412456489886e-01,1.040958153519803e-01,-4.784241323426455e-02,3.012247226683110e-02,-2.137314515428234e-02,1.607054808541789e-02,-1.237839178825563e-02,9.446822850730020e-03,-6.679789616948340e-03,2.632502792576543e-03}, +{ 4.429735311978530e-02,-1.326798115911041e-01,3.456295811489461e-01,8.697047418145629e-01,-1.903714077977177e-01,1.041244505065732e-01,-6.979020074310975e-02,5.096888913737878e-02,-3.861507709032983e-02,2.918639807930895e-02,-2.052818757196659e-02,8.073270987672554e-03}, +{ -3.667290428961760e-02,1.031189536499098e-01,-2.008807371310013e-01,8.270564680937538e-01,4.102991443929295e-01,-1.614511625184032e-01,9.812823460242470e-02,-6.853556589331689e-02,5.069583052079742e-02,-3.780156709654384e-02,2.639385323406300e-02,-1.035054756499528e-02}, +{ -5.081145631971654e-03,1.379097006364848e-02,-2.368267974098387e-02,5.062265608979833e-02,9.960721753056301e-01,-4.596827104475153e-02,2.307184442863678e-02,-1.500368561961514e-02,1.071572056917995e-02,-7.840064917519339e-03,5.419693572062565e-03,-2.117213074114572e-03}, +{ 2.630886630039487e-02,-6.986317904134121e-02,1.120086294142277e-01,-1.934712010509978e-01,5.428739572787132e-01,7.222643447526533e-01,-2.144508949367920e-01,1.230432278340951e-01,-8.331115857031404e-02,5.933062650756488e-02,-4.045511466522646e-02,1.572189617702224e-02}, +{ -1.258294371341761e-02,3.293585143804015e-02,-5.062692149310738e-02,7.868817151187298e-02,-1.489989933027328e-01,9.413402016677042e-01,2.202190838640953e-01,-9.683613428382536e-02,6.009754954536385e-02,-4.113214277201849e-02,2.751251072698619e-02,-1.061623318896099e-02}, +{ -1.061623318896089e-02,2.751251072698594e-02,-4.113214277201860e-02,6.009754954536377e-02,-9.683613428382468e-02,2.202190838640939e-01,9.413402016677048e-01,-1.489989933027320e-01,7.868817151187209e-02,-5.062692149310678e-02,3.293585143803988e-02,-1.258294371341749e-02}, +{ 1.572189617702218e-02,-4.045511466522674e-02,5.933062650756538e-02,-8.331115857031436e-02,1.230432278340952e-01,-2.144508949367924e-01,7.222643447526553e-01,5.428739572787121e-01,-1.934712010509979e-01,1.120086294142273e-01,-6.986317904134125e-02,2.630886630039509e-02}, +{ -2.117213074114436e-03,5.419693572062411e-03,-7.840064917519006e-03,1.071572056917954e-02,-1.500368561961447e-02,2.307184442863595e-02,-4.596827104475045e-02,9.960721753056303e-01,5.062265608979697e-02,-2.368267974098317e-02,1.379097006364815e-02,-5.081145631971693e-03}, +{ -1.035054756499511e-02,2.639385323406298e-02,-3.780156709654362e-02,5.069583052079733e-02,-6.853556589331671e-02,9.812823460242408e-02,-1.614511625184025e-01,4.102991443929276e-01,8.270564680937542e-01,-2.008807371309997e-01,1.031189536499092e-01,-3.667290428961758e-02}, +{ 8.073270987672375e-03,-2.052818757196659e-02,2.918639807930906e-02,-3.861507709033017e-02,5.096888913737873e-02,-6.979020074310945e-02,1.041244505065738e-01,-1.903714077977189e-01,8.697047418145634e-01,3.456295811489460e-01,-1.326798115911043e-01,4.429735311978588e-02}, +{ 2.632502792576562e-03,-6.679789616948544e-03,9.446822850730684e-03,-1.237839178825659e-02,1.607054808541841e-02,-2.137314515428277e-02,3.012247226683213e-02,-4.784241323426670e-02,1.040958153519847e-01,9.844412456489875e-01,-8.272506249582715e-02,2.418939529305185e-02}, +{ -7.303425507397293e-03,1.850453671591165e-02,-2.607221801657785e-02,3.393437987633242e-02,-4.356380250148008e-02,5.684419642483980e-02,-7.737788209539428e-02,1.141624321289306e-01,-1.993104598359159e-01,6.087103231616984e-01,6.501758170143175e-01,-1.287038973652650e-01}, +{ 2.209467185726288e-03,-5.592659832206304e-03,7.860667684768115e-03,-1.018674486096468e-02,1.298408705058234e-02,-1.674271550595507e-02,2.232393798214302e-02,-3.163449047698470e-02,5.011237970724597e-02,-1.021900102037653e-01,9.730354568426142e-01,9.782062442679614e-02}, +{ 3.159462625326627e-03,-7.993154128475346e-03,1.121998647469943e-02,-1.450653708925504e-02,1.842034407749938e-02,-2.360664446180308e-02,3.114701967991618e-02,-4.328207355492523e-02,6.569763016972253e-02,-1.179298884174717e-01,3.316867348413795e-01,7.459871197833867e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==13 +const dfloat c_I[13][13] = { +{ 4.966541984977693e-01,6.268112333750365e-01,-1.849271946315004e-01,9.977170894554586e-02,-6.503054114628654e-02,4.663446864614598e-02,-3.538252456393649e-02,2.778461632876750e-02,-2.221949965296691e-02,1.779929114084909e-02,-1.392466765044426e-02,9.987997178743921e-03,-3.959086467723480e-03}, +{ -1.322128148544343e-01,7.515252122974930e-01,4.961401313754205e-01,-1.781830013737518e-01,1.040030633348472e-01,-7.111914920687472e-02,5.263135240496547e-02,-4.073214556232476e-02,3.228038441089176e-02,-2.570997226323359e-02,2.004052229982314e-02,-1.434493282204991e-02,5.681349959228118e-03}, +{ 5.875681753986520e-02,-1.892836984469139e-01,8.449889133949112e-01,3.824020403954938e-01,-1.518183359605115e-01,9.209500235732283e-02,-6.447480356571590e-02,4.840132151248421e-02,-3.766457120778625e-02,2.965870315400987e-02,-2.295595617198251e-02,1.636591563403487e-02,-6.471348635211951e-03}, +{ -2.956493451685995e-02,8.376691561955144e-02,-1.681878723566684e-01,9.128787352892488e-01,2.752042516836289e-01,-1.173628663237213e-01,7.297139483865826e-02,-5.173738729472759e-02,3.899406898878592e-02,-3.012319932976354e-02,2.304702009425215e-02,-1.632441488444598e-02,6.438288192061205e-03}, +{ 1.445402619915252e-02,-3.889984013570730e-02,6.501305264926403e-02,-1.269280960465973e-01,9.611282154111985e-01,1.745799258405581e-01,-7.879027413265349e-02,4.983999732988989e-02,-3.551548826622723e-02,2.658919914685570e-02,-1.997601992381039e-02,1.400860887927714e-02,-5.503306951200005e-03}, +{ -5.553592690907007e-03,1.456461056627763e-02,-2.251586645002912e-02,3.549257190431028e-02,-7.024952138960985e-02,9.902482534331144e-01,8.220566878233848e-02,-3.886438152492342e-02,2.486037221024541e-02,-1.764587569917415e-02,1.287816153441415e-02,-8.893676459919786e-03,3.473275783862984e-03}, +{ 0.000000000000000e+00,5.011507874541444e-17,-4.582854220441720e-17,4.760605791123289e-18,-1.458746229354359e-16,2.529177906008857e-16,1.000000000000000e+00,-2.470623412757891e-16,1.403050855930400e-16,3.191521892358811e-19,4.147353304286376e-17,-4.680875201782755e-17,-2.220446049250313e-16}, +{ 3.473275783862989e-03,-8.893676459920188e-03,1.287816153441493e-02,-1.764587569917502e-02,2.486037221024628e-02,-3.886438152492457e-02,8.220566878234109e-02,9.902482534331134e-01,-7.024952138961096e-02,3.549257190431070e-02,-2.251586645002967e-02,1.456461056627823e-02,-5.553592690907275e-03}, +{ -5.503306951200068e-03,1.400860887927730e-02,-1.997601992381053e-02,2.658919914685582e-02,-3.551548826622758e-02,4.983999732989027e-02,-7.879027413265383e-02,1.745799258405583e-01,9.611282154111984e-01,-1.269280960465971e-01,6.501305264926414e-02,-3.889984013570770e-02,1.445402619915250e-02}, +{ 6.438288192060996e-03,-1.632441488444594e-02,2.304702009425231e-02,-3.012319932976399e-02,3.899406898878646e-02,-5.173738729472784e-02,7.297139483865850e-02,-1.173628663237218e-01,2.752042516836303e-01,9.128787352892481e-01,-1.681878723566693e-01,8.376691561955249e-02,-2.956493451686060e-02}, +{ -6.471348635211642e-03,1.636591563403486e-02,-2.295595617198256e-02,2.965870315401027e-02,-3.766457120778665e-02,4.840132151248453e-02,-6.447480356571637e-02,9.209500235732331e-02,-1.518183359605124e-01,3.824020403954962e-01,8.449889133949109e-01,-1.892836984469164e-01,5.875681753986601e-02}, +{ 5.681349959228088e-03,-1.434493282204960e-02,2.004052229982279e-02,-2.570997226323327e-02,3.228038441089150e-02,-4.073214556232466e-02,5.263135240496557e-02,-7.111914920687429e-02,1.040030633348462e-01,-1.781830013737504e-01,4.961401313754198e-01,7.515252122974936e-01,-1.322128148544354e-01}, +{ -3.959086467723361e-03,9.987997178743749e-03,-1.392466765044417e-02,1.779929114084901e-02,-2.221949965296683e-02,2.778461632876739e-02,-3.538252456393609e-02,4.663446864614514e-02,-6.503054114628511e-02,9.977170894554357e-02,-1.849271946314967e-01,6.268112333750281e-01,4.966541984977753e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==14 +const dfloat c_I[14][13] = { +{ 5.515439363512669e-01,5.651767944343866e-01,-1.754157350917031e-01,9.540990382997248e-02,-6.236458530145037e-02,4.478208048465975e-02,-3.400162875168509e-02,2.671174745199816e-02,-2.136730608686303e-02,1.711960662841305e-02,-1.339441170335383e-02,9.608258988963569e-03,-3.808661234605026e-03}, +{ -1.219748126264120e-01,8.644719496011302e-01,3.450488483707588e-01,-1.363808033789511e-01,8.130449586937487e-02,-5.607424198253338e-02,4.167811497109310e-02,-3.233591009732273e-02,2.566572892332385e-02,-2.046158974485617e-02,1.595921512392408e-02,-1.142753711278603e-02,4.526542083256510e-03}, +{ 3.307151173515328e-02,-1.116328003652196e-01,9.684916423741153e-01,1.533821650459114e-01,-6.887844709886819e-02,4.314070543099593e-02,-3.062465754693483e-02,2.316015465823345e-02,-1.810095556020634e-02,1.429166222966951e-02,-1.108003651866054e-02,7.906633124297274e-03,-3.127577508486740e-03}, +{ 9.331201864807963e-04,-2.702139905919118e-03,5.952700545451896e-03,9.999399802280117e-01,-5.924415813902724e-03,2.906374350095301e-03,-1.882007874698388e-03,1.359210696847895e-03,-1.034735560329895e-03,8.040658307702770e-04,-6.173577049113815e-04,4.381407952252223e-04,-1.729357731216263e-04}, +{ -1.550051263898442e-02,4.225441661250432e-02,-7.363638144585057e-02,1.663645874783578e-01,9.641756415223868e-01,-1.232543504098135e-01,6.464519155830209e-02,-4.287692917879676e-02,3.123782373755598e-02,-2.367026507331691e-02,1.790613616728871e-02,-1.260421092887093e-02,4.958852599237212e-03}, +{ 2.039502561249501e-02,-5.394075205114768e-02,8.546344965122682e-02,-1.431800968861910e-01,3.518096256658871e-01,8.676454483692579e-01,-1.929052461938543e-01,1.065865568042387e-01,-7.185028175999274e-02,5.228085921891097e-02,-3.866228027737485e-02,2.688501385644482e-02,-1.052732200990078e-02}, +{ -1.973196650105806e-02,5.129352921395832e-02,-7.735084416128986e-02,1.152177474099421e-01,-1.948320811112234e-01,5.436653873014317e-01,7.220242515113264e-01,-2.148724916360421e-01,1.235839404202301e-01,-8.386754038848974e-02,5.983592486607595e-02,-4.084939220749639e-02,1.588353528263513e-02}, +{ 1.588353528263504e-02,-4.084939220749634e-02,5.983592486607607e-02,-8.386754038849020e-02,1.235839404202307e-01,-2.148724916360427e-01,7.220242515113254e-01,5.436653873014331e-01,-1.948320811112230e-01,1.152177474099418e-01,-7.735084416128987e-02,5.129352921395872e-02,-1.973196650105864e-02}, +{ -1.052732200990074e-02,2.688501385644480e-02,-3.866228027737513e-02,5.228085921891181e-02,-7.185028175999365e-02,1.065865568042393e-01,-1.929052461938553e-01,8.676454483692578e-01,3.518096256658878e-01,-1.431800968861910e-01,8.546344965122714e-02,-5.394075205114832e-02,2.039502561249536e-02}, +{ 4.958852599237527e-03,-1.260421092887057e-02,1.790613616728853e-02,-2.367026507331706e-02,3.123782373755592e-02,-4.287692917879675e-02,6.464519155830172e-02,-1.232543504098123e-01,9.641756415223879e-01,1.663645874783552e-01,-7.363638144585000e-02,4.225441661250417e-02,-1.550051263898406e-02}, +{ -1.729357731214498e-04,4.381407952252479e-04,-6.173577049115382e-04,8.040658307704743e-04,-1.034735560330245e-03,1.359210696848047e-03,-1.882007874698507e-03,2.906374350095667e-03,-5.924415813902735e-03,9.999399802280113e-01,5.952700545452122e-03,-2.702139905919227e-03,9.331201864808784e-04}, +{ -3.127577508486679e-03,7.906633124297555e-03,-1.108003651866115e-02,1.429166222967017e-02,-1.810095556020705e-02,2.316015465823427e-02,-3.062465754693578e-02,4.314070543099723e-02,-6.887844709887038e-02,1.533821650459164e-01,9.684916423741144e-01,-1.116328003652235e-01,3.307151173515465e-02}, +{ 4.526542083256563e-03,-1.142753711278598e-02,1.595921512392398e-02,-2.046158974485582e-02,2.566572892332355e-02,-3.233591009732246e-02,4.167811497109276e-02,-5.607424198253304e-02,8.130449586937492e-02,-1.363808033789513e-01,3.450488483707604e-01,8.644719496011296e-01,-1.219748126264132e-01}, +{ -3.808661234604993e-03,9.608258988963402e-03,-1.339441170335377e-02,1.711960662841297e-02,-2.136730608686290e-02,2.671174745199799e-02,-3.400162875168478e-02,4.478208048465923e-02,-6.236458530144932e-02,9.540990382997094e-02,-1.754157350917015e-01,5.651767944343871e-01,5.515439363512656e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==15 +const dfloat c_I[15][13] = { +{ 5.986824254606107e-01,5.105963662124758e-01,-1.646768154687260e-01,9.014931938246706e-02,-5.906104435577232e-02,4.245543894213248e-02,-3.225394002182380e-02,2.534760440043517e-02,-2.028055213011487e-02,1.625118777975767e-02,-1.271609341599147e-02,9.122147223773644e-03,-3.616044009224032e-03}, +{ -9.735537783320818e-02,9.380915097898311e-01,2.179262754802436e-01,-9.205135217831686e-02,5.577292256230346e-02,-3.872470527620126e-02,2.888241045398354e-02,-2.245319726356635e-02,1.784360217542532e-02,-1.423667980841467e-02,1.110949437318026e-02,-7.957153782163695e-03,3.152251306903628e-03}, +{ -3.773178855648788e-03,1.342412340269770e-02,9.996815317859098e-01,-1.337937468271381e-02,6.506233795746000e-03,-4.172035838046138e-03,2.993275328469445e-03,-2.276746264940177e-03,1.785498347431750e-03,-1.412739863485181e-03,1.096705420889857e-03,-7.831841842309158e-04,3.098916079204844e-04}, +{ 3.394158813488808e-02,-1.006686872240990e-01,2.489192953790359e-01,9.260332355472921e-01,-1.607190031541480e-01,8.588681771229775e-02,-5.723772122243492e-02,4.190949692624228e-02,-3.214944487387351e-02,2.509670482745193e-02,-1.932222991283928e-02,1.373421350611759e-02,-5.424265645931022e-03}, +{ -3.675245940436646e-02,1.015936544049713e-01,-1.857940719150530e-01,5.179609498556215e-01,7.430135782688403e-01,-2.138439645489541e-01,1.224178936838797e-01,-8.386429040032586e-02,6.208619494660239e-02,-4.747010078193068e-02,3.609849919385862e-02,-2.548287064415233e-02,1.003698734100861e-02}, +{ 2.700117465208727e-02,-7.206986049835569e-02,1.173829425044842e-01,-2.120058309524838e-01,7.648074989274423e-01,4.926532456739114e-01,-1.839804878726300e-01,1.109165054897636e-01,-7.739841755942072e-02,5.730935007058939e-02,-4.278933428024449e-02,2.990696522312833e-02,-1.173375137827163e-02}, +{ -1.294222526584433e-02,3.386158982509992e-02,-5.199628274241015e-02,8.067752771293431e-02,-1.522840046849961e-01,9.378342723407894e-01,2.277838717727101e-01,-1.001221084653866e-01,6.271093931546623e-02,-4.409388775987325e-02,3.202314866947509e-02,-2.205954801933641e-02,8.606707301371736e-03}, +{ 1.110223024625157e-16,7.511011487598128e-17,-1.036979629396523e-16,2.437231077283910e-17,-2.488904254062834e-16,5.374503050268821e-16,9.999999999999998e-01,-5.304659467336102e-16,2.422471018682712e-16,-1.831319424597006e-17,9.850332412271235e-17,-7.116634058677126e-17,1.110223024625156e-16}, +{ 8.606707301371684e-03,-2.205954801933609e-02,3.202314866947482e-02,-4.409388775987309e-02,6.271093931546573e-02,-1.001221084653855e-01,2.277838717727071e-01,9.378342723407911e-01,-1.522840046849943e-01,8.067752771293325e-02,-5.199628274240973e-02,3.386158982509981e-02,-1.294222526584453e-02}, +{ -1.173375137827152e-02,2.990696522312825e-02,-4.278933428024467e-02,5.730935007058986e-02,-7.739841755942112e-02,1.109165054897641e-01,-1.839804878726306e-01,4.926532456739114e-01,7.648074989274423e-01,-2.120058309524834e-01,1.173829425044843e-01,-7.206986049835636e-02,2.700117465208749e-02}, +{ 1.003698734100830e-02,-2.548287064415230e-02,3.609849919385880e-02,-4.747010078193094e-02,6.208619494660300e-02,-8.386429040032641e-02,1.224178936838800e-01,-2.138439645489544e-01,7.430135782688411e-01,5.179609498556207e-01,-1.857940719150534e-01,1.015936544049722e-01,-3.675245940436677e-02}, +{ -5.424265645931042e-03,1.373421350611735e-02,-1.932222991283922e-02,2.509670482745185e-02,-3.214944487387333e-02,4.190949692624224e-02,-5.723772122243434e-02,8.588681771229713e-02,-1.607190031541466e-01,9.260332355472939e-01,2.489192953790321e-01,-1.006686872240982e-01,3.394158813488821e-02}, +{ 3.098916079205694e-04,-7.831841842307078e-04,1.096705420889399e-03,-1.412739863484655e-03,1.785498347431068e-03,-2.276746264939460e-03,2.993275328468475e-03,-4.172035838044808e-03,6.506233795743860e-03,-1.337937468271000e-02,9.996815317859101e-01,1.342412340269398e-02,-3.773178855647692e-03}, +{ 3.152251306903678e-03,-7.957153782163341e-03,1.110949437317989e-02,-1.423667980841460e-02,1.784360217542542e-02,-2.245319726356654e-02,2.888241045398371e-02,-3.872470527620134e-02,5.577292256230398e-02,-9.205135217831810e-02,2.179262754802473e-01,9.380915097898298e-01,-9.735537783320974e-02}, +{ -3.616044009223995e-03,9.122147223773632e-03,-1.271609341599162e-02,1.625118777975791e-02,-2.028055213011515e-02,2.534760440043561e-02,-3.225394002182406e-02,4.245543894213227e-02,-5.906104435577166e-02,9.014931938246606e-02,-1.646768154687263e-01,5.105963662124852e-01,5.986824254606021e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==16 +const dfloat c_I[16][13] = { +{ 6.392768230119900e-01,4.624857375522477e-01,-1.536892650382099e-01,8.457626922905584e-02,-5.551365791176343e-02,3.994056090335934e-02,-3.035793142009432e-02,2.386442082825993e-02,-1.909730966526010e-02,1.530481100444894e-02,-1.197645938592809e-02,8.591918968163657e-03,-3.405918076269448e-03}, +{ -6.270494127604490e-02,9.801554449357202e-01,1.148046534996095e-01,-5.090687495239397e-02,3.123886926217329e-02,-2.180714247670942e-02,1.631018194077393e-02,-1.270016727253177e-02,1.010301366530274e-02,-8.065947058970445e-03,6.296737450017786e-03,-4.511066513178189e-03,1.787238796231389e-03}, +{ -4.264491849609414e-02,1.609954867550092e-01,9.641883244385869e-01,-1.208589744855592e-01,6.212954229085806e-02,-4.056006308823931e-02,2.934409998993691e-02,-2.242224055670683e-02,1.763261467829104e-02,-1.397537685916808e-02,1.086056598112260e-02,-7.760485705280694e-03,3.071425057243327e-03}, +{ 5.679355896285436e-02,-1.729146497189622e-01,4.956359504063331e-01,7.595924761912342e-01,-2.115460837836945e-01,1.197436911005386e-01,-8.151849851722053e-02,6.032138693344673e-02,-4.655132067218216e-02,3.647070371071046e-02,-2.814087228469845e-02,2.002719263260256e-02,-7.913534960962238e-03}, +{ -3.786329348152945e-02,1.062570285202056e-01,-2.054829758265046e-01,8.016098725637334e-01,4.448482087031063e-01,-1.712521179275260e-01,1.038027606441396e-01,-7.279103007858054e-02,5.454245245933142e-02,-4.199159225254514e-02,3.206269867045567e-02,-2.268488630528724e-02,8.942874311001136e-03}, +{ 1.084058716906794e-02,-2.922255641360254e-02,4.909326751098613e-02,-9.745765289615531e-02,9.794354336258180e-01,1.228033910262926e-01,-5.686502552470050e-02,3.624006605127865e-02,-2.591182864059697e-02,1.943451803507864e-02,-1.461587578738194e-02,1.025540750133840e-02,-4.029731657422989e-03}, +{ 1.001251314882174e-02,-2.637968863991280e-02,4.132765401806788e-02,-6.728784941465703e-02,1.482740444465318e-01,9.710410785718648e-01,-1.132288578813063e-01,5.910967953651090e-02,-3.904247586747344e-02,2.812964820901238e-02,-2.069198291911935e-02,1.434870087192519e-02,-5.612464080265812e-03}, +{ -1.849040316256466e-02,4.802783380833411e-02,-7.226706503358454e-02,1.071300364845868e-01,-1.790141444012911e-01,4.732071520992022e-01,7.807215059398747e-01,-2.120356602060137e-01,1.201530059896162e-01,-8.107094562303112e-02,5.767907598770509e-02,-3.932202875898322e-02,1.528163687614935e-02}, +{ 1.528163687614943e-02,-3.932202875898320e-02,5.767907598770541e-02,-8.107094562303205e-02,1.201530059896173e-01,-2.120356602060146e-01,7.807215059398730e-01,4.732071520992043e-01,-1.790141444012913e-01,1.071300364845868e-01,-7.226706503358479e-02,4.802783380833466e-02,-1.849040316256496e-02}, +{ -5.612464080265847e-03,1.434870087192535e-02,-2.069198291911971e-02,2.812964820901295e-02,-3.904247586747427e-02,5.910967953651172e-02,-1.132288578813081e-01,9.710410785718641e-01,1.482740444465347e-01,-6.728784941465819e-02,4.132765401806882e-02,-2.637968863991363e-02,1.001251314882218e-02}, +{ -4.029731657423010e-03,1.025540750133846e-02,-1.461587578738194e-02,1.943451803507880e-02,-2.591182864059723e-02,3.624006605127862e-02,-5.686502552470016e-02,1.228033910262919e-01,9.794354336258179e-01,-9.745765289615460e-02,4.909326751098580e-02,-2.922255641360266e-02,1.084058716906828e-02}, +{ 8.942874311000854e-03,-2.268488630528724e-02,3.206269867045580e-02,-4.199159225254551e-02,5.454245245933192e-02,-7.279103007858108e-02,1.038027606441404e-01,-1.712521179275266e-01,4.448482087031079e-01,8.016098725637320e-01,-2.054829758265051e-01,1.062570285202067e-01,-3.786329348153034e-02}, +{ -7.913534960962233e-03,2.002719263260248e-02,-2.814087228469852e-02,3.647070371071074e-02,-4.655132067218233e-02,6.032138693344699e-02,-8.151849851722044e-02,1.197436911005385e-01,-2.115460837836950e-01,7.595924761912368e-01,4.956359504063312e-01,-1.729146497189633e-01,5.679355896285489e-02}, +{ 3.071425057243171e-03,-7.760485705280311e-03,1.086056598112210e-02,-1.397537685916725e-02,1.763261467829010e-02,-2.242224055670598e-02,2.934409998993587e-02,-4.056006308823775e-02,6.212954229085526e-02,-1.208589744855539e-01,9.641883244385895e-01,1.609954867550016e-01,-4.264491849609227e-02}, +{ 1.787238796231598e-03,-4.511066513178394e-03,6.296737450018038e-03,-8.065947058970723e-03,1.010301366530320e-02,-1.270016727253238e-02,1.631018194077482e-02,-2.180714247671025e-02,3.123886926217457e-02,-5.090687495239656e-02,1.148046534996159e-01,9.801554449357184e-01,-6.270494127604816e-02}, +{ -3.405918076269554e-03,8.591918968163784e-03,-1.197645938592837e-02,1.530481100444928e-02,-1.909730966526025e-02,2.386442082826034e-02,-3.035793142009478e-02,3.994056090335982e-02,-5.551365791176375e-02,8.457626922905646e-02,-1.536892650382122e-01,4.624857375522624e-01,6.392768230119769e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==17 +const dfloat c_I[17][13] = { +{ 6.743671931694989e-01,4.201353710938754e-01,-1.429853988983901e-01,7.902578937660190e-02,-5.195092302531437e-02,3.740458736715394e-02,-2.844174877465327e-02,2.236344293712357e-02,-1.789885476934984e-02,1.434574229956890e-02,-1.122664634809281e-02,8.054286204242190e-03,-3.192840632264410e-03}, +{ -2.163856112970917e-02,9.980754051054000e-01,3.319519987190926e-02,-1.527304117410606e-02,9.468423286592084e-03,-6.638782607043314e-03,4.976758232973116e-03,-3.880426738213402e-03,3.089459357980929e-03,-2.467841963832301e-03,1.927180243199760e-03,-1.380921780606279e-03,5.471492954552758e-04}, +{ -7.731619835946295e-02,3.121647998336324e-01,8.864995684263525e-01,-1.805964585107872e-01,9.670521169096068e-02,-6.402434427353101e-02,4.663077593485809e-02,-3.576393937548843e-02,2.818754071152000e-02,-2.237247729051329e-02,1.740131484191048e-02,-1.244040831359245e-02,4.924614684141299e-03}, +{ 6.445363658298482e-02,-2.019296950786656e-01,7.047843801802837e-01,5.591146123240578e-01,-1.956997653794641e-01,1.154330938412840e-01,-7.987709175484710e-02,5.960085572987686e-02,-4.621612418360144e-02,3.631383091869677e-02,-2.806975839621397e-02,1.999665967848898e-02,-7.904634462880594e-03}, +{ -2.117133635212286e-02,6.039101147837658e-02,-1.246405280981737e-01,9.618830411551325e-01,1.720923479346613e-01,-7.742461700505703e-02,4.885071683147833e-02,-3.486268318460731e-02,2.636821893880194e-02,-2.041159371184104e-02,1.563592689366112e-02,-1.108263803287093e-02,4.372133152561116e-03}, +{ -1.427963266906463e-02,3.890452907312215e-02,-6.767164238468357e-02,1.517982060574592e-01,9.696207605771106e-01,-1.152541121332712e-01,6.016647330517169e-02,-3.984153096685997e-02,2.900360597156579e-02,-2.196775045359881e-02,1.661404416819582e-02,-1.169309839823446e-02,4.600147853087492e-03}, +{ 2.622782211300165e-02,-6.962108666837916e-02,1.115141900468803e-01,-1.922658679504988e-01,5.362330239229741e-01,7.281868880564167e-01,-2.147563434479780e-01,1.236179518555050e-01,-8.465004543531583e-02,6.207777688697817e-02,-4.610336555937222e-02,3.213184622581522e-02,-1.259279004602715e-02}, +{ -1.716442001909037e-02,4.483145907327037e-02,-6.850819784712871e-02,1.051170812921827e-01,-1.922113355727420e-01,8.688628182679178e-01,3.502138428718815e-01,-1.438102621477716e-01,8.847080339456452e-02,-6.172092159374673e-02,4.464549128195188e-02,-3.069159602571342e-02,1.196523702442405e-02}, +{ 6.162975822039155e-33,-7.701278857676514e-17,8.439055008340252e-17,-8.202001351902309e-17,1.143911010055117e-16,-3.056089969760702e-16,9.999999999999999e-01,3.245837957804765e-16,-1.324393915188681e-16,9.848115600372775e-17,-9.850311737651180e-17,8.772706188943888e-17,-6.162975822039155e-33}, +{ 1.196523702442421e-02,-3.069159602571318e-02,4.464549128195189e-02,-6.172092159374709e-02,8.847080339456478e-02,-1.438102621477720e-01,3.502138428718817e-01,8.688628182679174e-01,-1.922113355727413e-01,1.051170812921822e-01,-6.850819784712853e-02,4.483145907327064e-02,-1.716442001909047e-02}, +{ -1.259279004602718e-02,3.213184622581539e-02,-4.610336555937260e-02,6.207777688697878e-02,-8.465004543531655e-02,1.236179518555056e-01,-2.147563434479789e-01,7.281868880564178e-01,5.362330239229731e-01,-1.922658679504982e-01,1.115141900468802e-01,-6.962108666837967e-02,2.622782211300196e-02}, +{ 4.600147853087669e-03,-1.169309839823441e-02,1.661404416819594e-02,-2.196775045359931e-02,2.900360597156610e-02,-3.984153096686070e-02,6.016647330517286e-02,-1.152541121332724e-01,9.696207605771100e-01,1.517982060574606e-01,-6.767164238468448e-02,3.890452907312296e-02,-1.427963266906478e-02}, +{ 4.372133152561068e-03,-1.108263803287099e-02,1.563592689366118e-02,-2.041159371184128e-02,2.636821893880207e-02,-3.486268318460734e-02,4.885071683147856e-02,-7.742461700505698e-02,1.720923479346610e-01,9.618830411551321e-01,-1.246405280981738e-01,6.039101147837725e-02,-2.117133635212285e-02}, +{ -7.904634462880880e-03,1.999665967848903e-02,-2.806975839621424e-02,3.631383091869708e-02,-4.621612418360172e-02,5.960085572987768e-02,-7.987709175484799e-02,1.154330938412847e-01,-1.956997653794651e-01,5.591146123240619e-01,7.047843801802809e-01,-2.019296950786674e-01,6.445363658298595e-02}, +{ 4.924614684141446e-03,-1.244040831359248e-02,1.740131484191052e-02,-2.237247729051366e-02,2.818754071152013e-02,-3.576393937548849e-02,4.663077593485819e-02,-6.402434427353045e-02,9.670521169095957e-02,-1.805964585107858e-01,8.864995684263512e-01,3.121647998336335e-01,-7.731619835946366e-02}, +{ 5.471492954554000e-04,-1.380921780606430e-03,1.927180243199840e-03,-2.467841963832419e-03,3.089459357981288e-03,-3.880426738213756e-03,4.976758232973625e-03,-6.638782607044050e-03,9.468423286593560e-03,-1.527304117410869e-02,3.319519987191410e-02,9.980754051053996e-01,-2.163856112971207e-02}, +{ -3.192840632264327e-03,8.054286204242339e-03,-1.122664634809326e-02,1.434574229956931e-02,-1.789885476935035e-02,2.236344293712396e-02,-2.844174877465379e-02,3.740458736715491e-02,-5.195092302531508e-02,7.902578937660285e-02,-1.429853988983931e-01,4.201353710938920e-01,6.743671931694846e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==18 +const dfloat c_I[18][13] = { +{ 7.048290475224602e-01,3.828336972488290e-01,-1.328391401670755e-01,7.368233388386830e-02,-4.850111857281365e-02,3.494215996359668e-02,-2.657828313094893e-02,2.090241453552312e-02,-1.673161870428811e-02,1.341130629557421e-02,-1.049591815469528e-02,7.530266038989568e-03,-2.985146759019485e-03}, +{ 2.305951828211111e-02,9.981501335787708e-01,-3.019268808967118e-02,1.429878505158487e-02,-8.938609950995088e-03,6.290113185596997e-03,-4.724405692831251e-03,3.687791635769490e-03,-2.938135616883878e-03,2.348008902980270e-03,-1.834113078013112e-03,1.314445420023527e-03,-5.208436284426180e-04}, +{ -1.043224652519347e-01,4.546450482712230e-01,7.859582877704389e-01,-2.054338784477047e-01,1.134825643538059e-01,-7.598088984409777e-02,5.564191173279236e-02,-4.280597319751198e-02,3.380047740546790e-02,-2.685879878038569e-02,2.090596210610670e-02,-1.495212398757970e-02,5.919877869379621e-03}, +{ 5.744417477377600e-02,-1.856741089299145e-01,8.580185978444972e-01,3.631579644663772e-01,-1.458621818991914e-01,8.873076896373150e-02,-6.219373686273329e-02,4.671824972146742e-02,-3.636821174375073e-02,2.864436372183870e-02,-2.217392974440819e-02,1.580963012995373e-02,-6.251580441643541e-03}, +{ 4.705103769394225e-03,-1.365926072723933e-02,3.043453178440770e-02,9.985043727789970e-01,-2.883321218583965e-02,1.430546634477882e-02,-9.296924817139236e-03,6.725733237332539e-03,-5.124922536678319e-03,3.984648912978406e-03,-3.060412643881623e-03,2.172389570269366e-03,-8.575134873798201e-04}, +{ -3.373647813608229e-02,9.296780361833001e-02,-1.681600169848090e-01,4.446414517728318e-01,8.018398405883641e-01,-2.086871645665280e-01,1.176132553107849e-01,-8.008575907113120e-02,5.910747443518120e-02,-4.511425219624771e-02,3.427218400129065e-02,-2.418011045565777e-02,9.521771683673387e-03}, +{ 2.458170369074598e-02,-6.577628485279884e-02,1.079622787989429e-01,-1.994201179502591e-01,8.432579779743323e-01,3.883037316064907e-01,-1.556776264844802e-01,9.543393655062346e-02,-6.707021833273406e-02,4.984638383821615e-02,-3.729438483020404e-02,2.609533063310684e-02,-1.024271064198227e-02}, +{ 7.962516524884185e-04,-2.092085446847753e-03,3.251468082087096e-03,-5.191166829200119e-03,1.069982856038088e-02,9.998125759863883e-01,-1.048338548519798e-02,5.185450147899836e-03,-3.364080243767847e-03,2.403364478645849e-03,-1.759982334053062e-03,1.217592874016571e-03,-4.758314428401752e-04}, +{ -1.719661520890097e-02,4.464012568815471e-02,-6.705708521173072e-02,9.904595923323146e-02,-1.640423013429948e-01,4.173781843089904e-01,8.229831692124761e-01,-2.055417764577846e-01,1.150479037491748e-01,-7.726380565417641e-02,5.484652258646082e-02,-3.734907004449520e-02,1.450878914159453e-02}, +{ 1.450878914159421e-02,-3.734907004449520e-02,5.484652258646111e-02,-7.726380565417723e-02,1.150479037491757e-01,-2.055417764577855e-01,8.229831692124747e-01,4.173781843089926e-01,-1.640423013429952e-01,9.904595923323163e-02,-6.705708521173104e-02,4.464012568815527e-02,-1.719661520890121e-02}, +{ -4.758314428401697e-04,1.217592874016457e-03,-1.759982334052996e-03,2.403364478645811e-03,-3.364080243767869e-03,5.185450147899557e-03,-1.048338548519741e-02,9.998125759863886e-01,1.069982856038026e-02,-5.191166829199862e-03,3.251468082086943e-03,-2.092085446847809e-03,7.962516524885015e-04}, +{ -1.024271064198222e-02,2.609533063310700e-02,-3.729438483020443e-02,4.984638383821667e-02,-6.707021833273449e-02,9.543393655062408e-02,-1.556776264844813e-01,3.883037316064922e-01,8.432579779743317e-01,-1.994201179502594e-01,1.079622787989432e-01,-6.577628485279956e-02,2.458170369074660e-02}, +{ 9.521771683673352e-03,-2.418011045565771e-02,3.427218400129077e-02,-4.511425219624780e-02,5.910747443518129e-02,-8.008575907113186e-02,1.176132553107857e-01,-2.086871645665281e-01,8.018398405883641e-01,4.446414517728316e-01,-1.681600169848095e-01,9.296780361833083e-02,-3.373647813608285e-02}, +{ -8.575134873796053e-04,2.172389570269219e-03,-3.060412643881730e-03,3.984648912978493e-03,-5.124922536678337e-03,6.725733237332296e-03,-9.296924817138423e-03,1.430546634477807e-02,-2.883321218583777e-02,9.985043727789966e-01,3.043453178440570e-02,-1.365926072723845e-02,4.705103769394090e-03}, +{ -6.251580441643388e-03,1.580963012995387e-02,-2.217392974440865e-02,2.864436372183938e-02,-3.636821174375165e-02,4.671824972146865e-02,-6.219373686273445e-02,8.873076896373297e-02,-1.458621818991944e-01,3.631579644663857e-01,8.580185978444927e-01,-1.856741089299180e-01,5.744417477377728e-02}, +{ 5.919877869379725e-03,-1.495212398757958e-02,2.090596210610648e-02,-2.685879878038554e-02,3.380047740546804e-02,-4.280597319751188e-02,5.564191173279193e-02,-7.598088984409734e-02,1.134825643538050e-01,-2.054338784477029e-01,7.859582877704383e-01,4.546450482712229e-01,-1.043224652519351e-01}, +{ -5.208436284427249e-04,1.314445420023682e-03,-1.834113078013317e-03,2.348008902980270e-03,-2.938135616883871e-03,3.687791635769358e-03,-4.724405692830895e-03,6.290113185596454e-03,-8.938609950994143e-03,1.429878505158333e-02,-3.019268808966903e-02,9.981501335787706e-01,2.305951828211028e-02}, +{ -2.985146759019286e-03,7.530266038989367e-03,-1.049591815469537e-02,1.341130629557404e-02,-1.673161870428779e-02,2.090241453552291e-02,-2.657828313094855e-02,3.494215996359604e-02,-4.850111857281278e-02,7.368233388386668e-02,-1.328391401670732e-01,3.828336972488246e-01,7.048290475224634e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==19 +const dfloat c_I[19][13] = { +{ 7.313910804459847e-01,3.499213132070687e-01,-1.233752230384090e-01,6.864039989152691e-02,-4.523196972589811e-02,3.260388542093795e-02,-2.480676984025659e-02,1.951253765607490e-02,-1.562075075562266e-02,1.252175049932342e-02,-9.800164754928399e-03,7.031276788687401e-03,-2.787365794489224e-03}, +{ 6.934726537959565e-02,9.854233113388429e-01,-7.863848532835074e-02,3.811366819468957e-02,-2.399101882721520e-02,1.693394852432540e-02,-1.273931379696046e-02,9.953495103407924e-03,-7.934814380187812e-03,6.343486826996090e-03,-4.956295593352473e-03,3.552488686051022e-03,-1.407736127841888e-03}, +{ -1.222628845857416e-01,5.815733733936658e-01,6.763022169524141e-01,-2.065438635475587e-01,1.169174340664863e-01,-7.900113405426143e-02,5.811564215344949e-02,-4.482338639156923e-02,3.544860302750417e-02,-2.819608699178726e-02,2.196029117912421e-02,-1.571168109692846e-02,6.221475895202541e-03}, +{ 3.914366317351744e-02,-1.309027964935362e-01,9.528010896270540e-01,1.923788823516099e-01,-8.474613569626813e-02,5.279731987874750e-02,-3.739151322375067e-02,2.824197974791501e-02,-2.205622457940203e-02,1.740652193534310e-02,-1.349108518358804e-02,9.625585436884198e-03,-3.807286974526063e-03}, +{ 3.091709782134041e-02,-9.146744182385882e-02,2.232582985927087e-01,9.388445078939734e-01,-1.503803000169577e-01,7.979305017340098e-02,-5.304307030256713e-02,3.879062643678795e-02,-2.973643989886723e-02,2.320346272817210e-02,-1.786013512011770e-02,1.269317648411895e-02,-5.012832968133907e-03}, +{ -3.989793870536916e-02,1.112947994330931e-01,-2.103861907300632e-01,7.043107945870988e-01,5.622051337206165e-01,-1.979463820972626e-01,1.176414015981188e-01,-8.181652987498456e-02,6.104108162762163e-02,-4.687792176945256e-02,3.574100214946188e-02,-2.526674311558648e-02,9.957493176707924e-03}, +{ 8.161169412459484e-03,-2.202516705047324e-02,3.713898630265212e-02,-7.462580968281828e-02,9.887627903101568e-01,8.855220626655481e-02,-4.169563718537869e-02,2.670540252348009e-02,-1.913812309128055e-02,1.437173147937119e-02,-1.081593272831529e-02,7.592007767475824e-03,-2.983624323884290e-03}, +{ 1.978536396602370e-02,-5.231395890056477e-02,8.281872598624769e-02,-1.384587149791403e-01,3.373225367595734e-01,8.768602534404066e-01,-1.893512369636587e-01,1.042547827554894e-01,-7.018635524202449e-02,5.103723166901674e-02,-3.772942771931650e-02,2.623149400842928e-02,-1.027069478048214e-02}, +{ -1.938968389252973e-02,5.057779546341411e-02,-7.700741792805939e-02,1.171813227786301e-01,-2.094503451692519e-01,7.986948822769264e-01,4.500493579902813e-01,-1.733854089922641e-01,1.050237319672240e-01,-7.278677488324763e-02,5.247433147851484e-02,-3.601226391396822e-02,1.403047282433014e-02}, +{ 5.551115123125783e-17,-1.325610399005183e-16,1.198135726965946e-16,-2.939374969665239e-16,4.836164696099873e-16,-9.695181973033953e-16,1.000000000000000e+00,9.919873258006848e-16,-5.049884674309508e-16,3.134300633707123e-16,-1.365250588738221e-16,1.452484146810952e-16,-5.551115123125783e-17}, +{ 1.403047282432974e-02,-3.601226391396826e-02,5.247433147851525e-02,-7.278677488324825e-02,1.050237319672246e-01,-1.733854089922642e-01,4.500493579902802e-01,7.986948822769271e-01,-2.094503451692512e-01,1.171813227786295e-01,-7.700741792805910e-02,5.057779546341423e-02,-1.938968389252988e-02}, +{ -1.027069478048218e-02,2.623149400842931e-02,-3.772942771931655e-02,5.103723166901693e-02,-7.018635524202514e-02,1.042547827554900e-01,-1.893512369636595e-01,8.768602534404076e-01,3.373225367595727e-01,-1.384587149791398e-01,8.281872598624765e-02,-5.231395890056522e-02,1.978536396602401e-02}, +{ -2.983624323884328e-03,7.592007767475948e-03,-1.081593272831535e-02,1.437173147937120e-02,-1.913812309128090e-02,2.670540252348046e-02,-4.169563718537900e-02,8.855220626655559e-02,9.887627903101563e-01,-7.462580968281847e-02,3.713898630265213e-02,-2.202516705047349e-02,8.161169412459980e-03}, +{ 9.957493176707577e-03,-2.526674311558641e-02,3.574100214946217e-02,-4.687792176945306e-02,6.104108162762198e-02,-8.181652987498524e-02,1.176414015981196e-01,-1.979463820972632e-01,5.622051337206199e-01,7.043107945870959e-01,-2.103861907300635e-01,1.112947994330941e-01,-3.989793870536987e-02}, +{ -5.012832968133849e-03,1.269317648411887e-02,-1.786013512011789e-02,2.320346272817229e-02,-2.973643989886711e-02,3.879062643678803e-02,-5.304307030256707e-02,7.979305017340091e-02,-1.503803000169575e-01,9.388445078939733e-01,2.232582985927083e-01,-9.146744182385913e-02,3.091709782134080e-02}, +{ -3.807286974526279e-03,9.625585436884564e-03,-1.349108518358872e-02,1.740652193534386e-02,-2.205622457940316e-02,2.824197974791625e-02,-3.739151322375137e-02,5.279731987874896e-02,-8.474613569627151e-02,1.923788823516168e-01,9.528010896270521e-01,-1.309027964935407e-01,3.914366317351917e-02}, +{ 6.221475895202597e-03,-1.571168109692841e-02,2.196029117912429e-02,-2.819608699178740e-02,3.544860302750428e-02,-4.482338639156886e-02,5.811564215344924e-02,-7.900113405426112e-02,1.169174340664855e-01,-2.065438635475570e-01,6.763022169524114e-01,5.815733733936679e-01,-1.222628845857424e-01}, +{ -1.407736127841855e-03,3.552488686050826e-03,-4.956295593352404e-03,6.343486826996010e-03,-7.934814380187722e-03,9.953495103407661e-03,-1.273931379695973e-02,1.693394852432429e-02,-2.399101882721327e-02,3.811366819468626e-02,-7.863848532834518e-02,9.854233113388439e-01,6.934726537959118e-02}, +{ -2.787365794489349e-03,7.031276788687303e-03,-9.800164754928448e-03,1.252175049932337e-02,-1.562075075562254e-02,1.951253765607470e-02,-2.480676984025635e-02,3.260388542093761e-02,-4.523196972589699e-02,6.864039989152469e-02,-1.233752230384064e-01,3.499213132070641e-01,7.313910804459881e-01} +}; +#endif +#if p_Nq==13 && p_cubNq==20 +const dfloat c_I[20][13] = { +{ 7.546568770156682e-01,3.208107950765345e-01,-1.146330861164525e-01,6.394111486856517e-02,-4.217484912910907e-02,3.041379255195144e-02,-2.314607224624208e-02,1.820892099422297e-02,-1.457848230326720e-02,1.168694970567095e-02,-9.147149227969012e-03,6.562902458918596e-03,-2.601713648492024e-03}, +{ 1.157826229764926e-01,9.637940916823501e-01,-1.150726843851579e-01,5.683516112113916e-02,-3.598308761044063e-02,2.546396425041077e-02,-1.918262358435559e-02,1.499986017618826e-02,-1.196372815711054e-02,9.567474554712505e-03,-7.476774799592360e-03,5.359697495462928e-03,-2.123973720099227e-03}, +{ -1.310764639571166e-01,6.899358549197095e-01,5.665460052787533e-01,-1.926154929369297e-01,1.111932136161861e-01,-7.570525627106378e-02,5.590239587096848e-02,-4.320937353429551e-02,3.421723795021058e-02,-2.723932066069143e-02,2.122615807693608e-02,-1.519095755863837e-02,6.015999205971338e-03}, +{ 1.378603539177797e-02,-4.784674765555513e-02,9.953156383776113e-01,5.489088661165106e-02,-2.583819357445112e-02,1.640510442264487e-02,-1.171696392410754e-02,8.890287336180060e-03,-6.961847795009073e-03,5.503409925976145e-03,-4.269874543400288e-03,3.048245880550486e-03,-1.205980453868611e-03}, +{ 5.120736107591575e-02,-1.546023131314929e-01,4.214202067338926e-01,8.174271453027152e-01,-2.046261894165571e-01,1.140141669142515e-01,-7.715324708655311e-02,5.691954271711484e-02,-4.385065714011544e-02,3.431914768145233e-02,-2.646396849854084e-02,1.882706650441114e-02,-7.438261656493955e-03}, +{ -3.231251027703792e-02,9.131450439845969e-02,-1.814539809907500e-01,8.877852092853544e-01,3.186571017099552e-01,-1.326524162235551e-01,8.196005416152687e-02,-5.795006278595347e-02,4.361212709506740e-02,-3.366170378012039e-02,2.574111600029242e-02,-1.822747852148438e-02,7.188039928245228e-03}, +{ -1.340500341290726e-02,3.650719981970866e-02,-6.341786850941399e-02,1.415460831204004e-01,9.732304201482578e-01,-1.093567268975678e-01,5.689498882180205e-02,-3.763124713588548e-02,2.737916425817953e-02,-2.073092480659962e-02,1.567582807933939e-02,-1.103169289657106e-02,4.339779411257582e-03}, +{ 2.776852321333537e-02,-7.391653298525247e-02,1.193944821536858e-01,-2.106417722731336e-01,6.608587601254412e-01,6.098176356592887e-01,-2.062950352521631e-01,1.217878769769773e-01,-8.424807705859537e-02,6.210225528593417e-02,-4.625270709041374e-02,3.228469165264697e-02,-1.266010040775128e-02}, +{ -6.301106976518823e-03,1.652131587493447e-02,-2.552446717504494e-02,4.017413914483178e-02,-7.914409242153185e-02,9.873033713593518e-01,9.469255608221060e-02,-4.449653927872699e-02,2.841097229874093e-02,-2.014928663748423e-02,1.469878687329313e-02,-1.014872133308098e-02,3.963072189025167e-03}, +{ -1.597015075189322e-02,4.143649084842898e-02,-6.216254367413532e-02,9.155528416830357e-02,-1.505957620813187e-01,3.724307111930765e-01,8.542981168862196e-01,-1.974789000727354e-01,1.093951145735111e-01,-7.318300231027489e-02,5.185326468264134e-02,-3.527824941933835e-02,1.369962595751472e-02}, +{ 1.369962595751440e-02,-3.527824941933817e-02,5.185326468264145e-02,-7.318300231027541e-02,1.093951145735118e-01,-1.974789000727360e-01,8.542981168862195e-01,3.724307111930766e-01,-1.505957620813182e-01,9.155528416830322e-02,-6.216254367413541e-02,4.143649084842937e-02,-1.597015075189313e-02}, +{ 3.963072189025025e-03,-1.014872133308083e-02,1.469878687329292e-02,-2.014928663748421e-02,2.841097229874075e-02,-4.449653927872631e-02,9.469255608220907e-02,9.873033713593520e-01,-7.914409242153048e-02,4.017413914483083e-02,-2.552446717504436e-02,1.652131587493435e-02,-6.301106976518889e-03}, +{ -1.266010040775145e-02,3.228469165264717e-02,-4.625270709041409e-02,6.210225528593461e-02,-8.424807705859608e-02,1.217878769769782e-01,-2.062950352521641e-01,6.098176356592911e-01,6.608587601254391e-01,-2.106417722731332e-01,1.193944821536860e-01,-7.391653298525318e-02,2.776852321333576e-02}, +{ 4.339779411257376e-03,-1.103169289657076e-02,1.567582807933935e-02,-2.073092480659950e-02,2.737916425817910e-02,-3.763124713588534e-02,5.689498882180160e-02,-1.093567268975665e-01,9.732304201482591e-01,1.415460831203977e-01,-6.341786850941313e-02,3.650719981970833e-02,-1.340500341290742e-02}, +{ 7.188039928245148e-03,-1.822747852148422e-02,2.574111600029236e-02,-3.366170378012082e-02,4.361212709506807e-02,-5.795006278595356e-02,8.196005416152681e-02,-1.326524162235553e-01,3.186571017099556e-01,8.877852092853541e-01,-1.814539809907503e-01,9.131450439846056e-02,-3.231251027703855e-02}, +{ -7.438261656493847e-03,1.882706650441104e-02,-2.646396849854120e-02,3.431914768145251e-02,-4.385065714011557e-02,5.691954271711527e-02,-7.715324708655316e-02,1.140141669142515e-01,-2.046261894165571e-01,8.174271453027161e-01,4.214202067338920e-01,-1.546023131314939e-01,5.120736107591638e-02}, +{ -1.205980453868133e-03,3.048245880550614e-03,-4.269874543400546e-03,5.503409925976510e-03,-6.961847795009920e-03,8.890287336180740e-03,-1.171696392410792e-02,1.640510442264592e-02,-2.583819357445336e-02,5.489088661165513e-02,9.953156383776111e-01,-4.784674765555878e-02,1.378603539177905e-02}, +{ 6.015999205971234e-03,-1.519095755863821e-02,2.122615807693594e-02,-2.723932066069134e-02,3.421723795021048e-02,-4.320937353429538e-02,5.590239587096841e-02,-7.570525627106363e-02,1.111932136161856e-01,-1.926154929369288e-01,5.665460052787559e-01,6.899358549197074e-01,-1.310764639571177e-01}, +{ -2.123973720099093e-03,5.359697495462942e-03,-7.476774799592576e-03,9.567474554712586e-03,-1.196372815711048e-02,1.499986017618809e-02,-1.918262358435540e-02,2.546396425041042e-02,-3.598308761043974e-02,5.683516112113755e-02,-1.150726843851558e-01,9.637940916823495e-01,1.157826229764921e-01}, +{ -2.601713648491710e-03,6.562902458918147e-03,-9.147149227968474e-03,1.168694970567029e-02,-1.457848230326629e-02,1.820892099422151e-02,-2.314607224624025e-02,3.041379255194914e-02,-4.217484912910517e-02,6.394111486855925e-02,-1.146330861164430e-01,3.208107950765072e-01,7.546568770156895e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==14 +const dfloat c_I[14][14] = { +{ 4.921250602817405e-01,6.317989081803328e-01,-1.855662494207711e-01,1.000656804008770e-01,-6.524108768101324e-02,4.683688593142965e-02,-3.562178870607788e-02,2.810520910827731e-02,-2.268033056328971e-02,1.849273578502624e-02,-1.501815697412296e-02,1.185835712592064e-02,-8.553570392551557e-03,3.398346924222187e-03}, +{ -1.322736036729205e-01,7.411257110173393e-01,5.085160742462114e-01,-1.810079864944392e-01,1.055023487998643e-01,-7.217041214147482e-02,5.351375351374154e-02,-4.159653860992264e-02,3.325390319522695e-02,-2.694814651625248e-02,2.179630449008190e-02,-1.716549394748408e-02,1.236282113373399e-02,-4.908735013705682e-03}, +{ 5.995929016540594e-02,-1.925066386355137e-01,8.312253806828148e-01,4.020769928138838e-01,-1.577425532764325e-01,9.549736388497951e-02,-6.691277432650533e-02,5.041290294523618e-02,-3.954637450222768e-02,3.166228257757334e-02,-2.540844879527647e-02,1.991002132273496e-02,-1.429780334672068e-02,5.670358490047936e-03}, +{ -3.128824452952417e-02,8.850260303954791e-02,-1.765498034739210e-01,8.980277647394400e-01,3.014688794679057e-01,-1.267975228531991e-01,7.867095644030636e-02,-5.589430821059836e-02,4.241724686311356e-02,-3.327870016030487e-02,2.636431487407025e-02,-2.049308857357716e-02,1.464857315433245e-02,-5.798670777591614e-03}, +{ 1.643334570436416e-02,-4.417954810666368e-02,7.359807458657441e-02,-1.423028303125594e-01,9.477621943920580e-01,2.061872689653194e-01,-9.167791108320404e-02,5.790529573758267e-02,-4.146097298232930e-02,3.146862461137171e-02,-2.443397168853693e-02,1.876084373603289e-02,-1.331769381636093e-02,5.257280256350990e-03}, +{ -7.619094657576686e-03,1.996507919416235e-02,-3.079625653762993e-02,4.831295379652090e-02,-9.434555106679045e-02,9.811162801578872e-01,1.174504253174876e-01,-5.469507667842262e-02,3.500225940617766e-02,-2.511468940504084e-02,1.889016539407182e-02,-1.423655439019561e-02,1.000281564275034e-02,-3.932756173401631e-03}, +{ 2.017039090083270e-03,-5.208396498867482e-03,7.713101593391184e-03,-1.105863437858452e-02,1.711593307107694e-02,-3.423453830851540e-02,9.978967719311276e-01,3.680925650710247e-02,-1.782200980841449e-02,1.150039601514185e-02,-8.203080430570459e-03,6.004983192969155e-03,-4.154467118053430e-03,1.623645142113497e-03}, +{ 1.623645142113498e-03,-4.154467118053301e-03,6.004983192969081e-03,-8.203080430570086e-03,1.150039601514110e-02,-1.782200980841393e-02,3.680925650710180e-02,9.978967719311277e-01,-3.423453830851533e-02,1.711593307107729e-02,-1.105863437858466e-02,7.713101593391239e-03,-5.208396498867618e-03,2.017039090083276e-03}, +{ -3.932756173401523e-03,1.000281564275022e-02,-1.423655439019553e-02,1.889016539407207e-02,-2.511468940504079e-02,3.500225940617718e-02,-5.469507667842235e-02,1.174504253174868e-01,9.811162801578874e-01,-9.434555106678981e-02,4.831295379652047e-02,-3.079625653762965e-02,1.996507919416216e-02,-7.619094657576592e-03}, +{ 5.257280256350676e-03,-1.331769381636089e-02,1.876084373603321e-02,-2.443397168853737e-02,3.146862461137191e-02,-4.146097298232952e-02,5.790529573758306e-02,-9.167791108320408e-02,2.061872689653199e-01,9.477621943920569e-01,-1.423028303125588e-01,7.359807458657437e-02,-4.417954810666366e-02,1.643334570436425e-02}, +{ -5.798670777591643e-03,1.464857315433238e-02,-2.049308857357713e-02,2.636431487407033e-02,-3.327870016030467e-02,4.241724686311330e-02,-5.589430821059847e-02,7.867095644030622e-02,-1.267975228531986e-01,3.014688794679042e-01,8.980277647394409e-01,-1.765498034739208e-01,8.850260303954777e-02,-3.128824452952393e-02}, +{ 5.670358490048190e-03,-1.429780334672079e-02,1.991002132273523e-02,-2.540844879527694e-02,3.166228257757346e-02,-3.954637450222766e-02,5.041290294523679e-02,-6.691277432650623e-02,9.549736388498042e-02,-1.577425532764337e-01,4.020769928138869e-01,8.312253806828134e-01,-1.925066386355147e-01,5.995929016540585e-02}, +{ -4.908735013705923e-03,1.236282113373416e-02,-1.716549394748398e-02,2.179630449008173e-02,-2.694814651625228e-02,3.325390319522661e-02,-4.159653860992208e-02,5.351375351374098e-02,-7.217041214147422e-02,1.055023487998636e-01,-1.810079864944376e-01,5.085160742462085e-01,7.411257110173400e-01,-1.322736036729196e-01}, +{ 3.398346924222306e-03,-8.553570392551548e-03,1.185835712592074e-02,-1.501815697412322e-02,1.849273578502648e-02,-2.268033056328987e-02,2.810520910827734e-02,-3.562178870607795e-02,4.683688593142996e-02,-6.524108768101335e-02,1.000656804008772e-01,-1.855662494207719e-01,6.317989081803328e-01,4.921250602817412e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==15 +const dfloat c_I[15][14] = { +{ 5.438254885938876e-01,5.739750928406536e-01,-1.769632131588694e-01,9.616440327715026e-02,-6.286712468404891e-02,4.518964574497396e-02,-3.439268098174268e-02,2.714668532689127e-02,-2.191259704792765e-02,1.786985726935596e-02,-1.451398531325499e-02,1.146111723845646e-02,-8.267397018206168e-03,3.284707912680613e-03}, +{ -1.245458871882756e-01,8.501972639792585e-01,3.662810004526577e-01,-1.430486717704106e-01,8.507960797050054e-02,-5.867608968173706e-02,4.368888263584987e-02,-3.404122890209112e-02,2.725461202444094e-02,-2.210792753151362e-02,1.789285008464193e-02,-1.409714581493745e-02,1.015538139187022e-02,-4.032647650254261e-03}, +{ 3.790311410573799e-02,-1.269971876908718e-01,9.563996703292112e-01,1.839752802109090e-01,-8.141175535650354e-02,5.082642300992253e-02,-3.609051302123688e-02,2.738484282478990e-02,-2.157293716396982e-02,1.731817993981433e-02,-1.392149172627470e-02,1.092078184454902e-02,-7.847394372099789e-03,3.112987066022573e-03}, +{ -4.226978212864557e-03,1.219799737040174e-02,-2.646231041569969e-02,9.987273291592514e-01,2.822474224822468e-02,-1.362806418875984e-02,8.794645599084763e-03,-6.360870885627435e-03,4.874466316596647e-03,-3.846767643901082e-03,3.058725007202791e-03,-2.382997878765942e-03,1.705604440819932e-03,-6.755209159635441e-04}, +{ -1.105075209519635e-02,3.006071070238004e-02,-5.202549992674196e-02,1.145817205595091e-01,9.818046325342062e-01,-9.275512543459367e-02,4.787528085080373e-02,-3.166870978914071e-02,2.317092592531238e-02,-1.779832510591519e-02,1.391866189343809e-02,-1.073322095135346e-02,7.637628363385947e-03,-3.017927526094369e-03}, +{ 1.729266729024879e-02,-4.566734851802589e-02,7.204668094358492e-02,-1.194423603337576e-01,2.818962229756007e-01,9.097038358993396e-01,-1.731272419645072e-01,9.422076675557332e-02,-6.348922121915708e-02,4.669117673402828e-02,-3.559991856007561e-02,2.704134425109008e-02,-1.908138319468969e-02,7.514778940747409e-03}, +{ -1.829059711657331e-02,4.749038832160803e-02,-7.139115793799421e-02,1.056606751586359e-01,-1.760492810547127e-01,4.609871229629262e-01,7.904177881852193e-01,-2.112476462278466e-01,1.199509070422509e-01,-8.182609002765737e-02,5.994697442067883e-02,-4.451829631659341e-02,3.103263986134119e-02,-1.216342727128285e-02}, +{ 1.611328124999990e-02,-4.139732081171958e-02,6.047960321705034e-02,-8.436165461443224e-02,1.232142737669564e-01,-2.099458793904556e-01,6.358976965826020e-01,6.358976965825996e-01,-2.099458793904560e-01,1.232142737669568e-01,-8.436165461443201e-02,6.047960321705029e-02,-4.139732081171965e-02,1.611328124999977e-02}, +{ -1.216342727128299e-02,3.103263986134128e-02,-4.451829631659353e-02,5.994697442067919e-02,-8.182609002765782e-02,1.199509070422512e-01,-2.112476462278469e-01,7.904177881852176e-01,4.609871229629287e-01,-1.760492810547127e-01,1.056606751586356e-01,-7.139115793799435e-02,4.749038832160812e-02,-1.829059711657336e-02}, +{ 7.514778940747278e-03,-1.908138319468967e-02,2.704134425109013e-02,-3.559991856007565e-02,4.669117673402846e-02,-6.348922121915727e-02,9.422076675557349e-02,-1.731272419645072e-01,9.097038358993387e-01,2.818962229756012e-01,-1.194423603337575e-01,7.204668094358505e-02,-4.566734851802601e-02,1.729266729024897e-02}, +{ -3.017927526094316e-03,7.637628363386106e-03,-1.073322095135362e-02,1.391866189343824e-02,-1.779832510591543e-02,2.317092592531267e-02,-3.166870978914109e-02,4.787528085080445e-02,-9.275512543459533e-02,9.818046325342064e-01,1.145817205595106e-01,-5.202549992674273e-02,3.006071070238054e-02,-1.105075209519652e-02}, +{ -6.755209159636138e-04,1.705604440819882e-03,-2.382997878766083e-03,3.058725007202993e-03,-3.846767643900854e-03,4.874466316597032e-03,-6.360870885628063e-03,8.794645599085184e-03,-1.362806418876069e-02,2.822474224822670e-02,9.987273291592514e-01,-2.646231041570191e-02,1.219799737040274e-02,-4.226978212864764e-03}, +{ 3.112987066022463e-03,-7.847394372100037e-03,1.092078184454944e-02,-1.392149172627505e-02,1.731817993981440e-02,-2.157293716396990e-02,2.738484282478997e-02,-3.609051302123760e-02,5.082642300992348e-02,-8.141175535650429e-02,1.839752802109109e-01,9.563996703292107e-01,-1.269971876908728e-01,3.790311410573838e-02}, +{ -4.032647650254434e-03,1.015538139187043e-02,-1.409714581493746e-02,1.789285008464167e-02,-2.210792753151343e-02,2.725461202444062e-02,-3.404122890209113e-02,4.368888263585009e-02,-5.867608968173660e-02,8.507960797049971e-02,-1.430486717704090e-01,3.662810004526559e-01,8.501972639792583e-01,-1.245458871882748e-01}, +{ 3.284707912680651e-03,-8.267397018206050e-03,1.146111723845635e-02,-1.451398531325490e-02,1.786985726935596e-02,-2.191259704792783e-02,2.714668532689166e-02,-3.439268098174318e-02,4.518964574497456e-02,-6.286712468404941e-02,9.616440327715081e-02,-1.769632131588718e-01,5.739750928406633e-01,5.438254885938799e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==16 +const dfloat c_I[16][14] = { +{ 5.887161694568449e-01,5.222567423035746e-01,-1.671419591325827e-01,9.139495612950557e-02,-5.988103461527748e-02,4.308777419583477e-02,-3.281156071227905e-02,2.590748738593360e-02,-2.091685859686589e-02,1.706026384009993e-02,-1.385774521261925e-02,1.094358206049584e-02,-7.894359788348126e-03,3.136542685683362e-03}, +{ -1.038984354879769e-01,9.245976394101304e-01,2.443124806940390e-01,-1.018736324678410e-01,6.155078506216365e-02,-4.272611822605178e-02,3.191968842278312e-02,-2.491949446314452e-02,1.997574508236142e-02,-1.621643442050291e-02,1.313148328789302e-02,-1.034930835685215e-02,7.456952051852483e-03,-2.961350588853778e-03}, +{ 4.954925482890812e-03,-1.741150301705759e-02,9.994260753944071e-01,1.857712662833148e-02,-8.899807860241579e-03,5.685633328333149e-03,-4.079267464553468e-03,3.112736600590516e-03,-2.460417684267466e-03,1.979400322378531e-03,-1.593387592210609e-03,1.251047512011176e-03,-8.994314213052051e-04,3.568697706930860e-04}, +{ 2.709659667501466e-02,-7.991624990939493e-02,1.920680564099796e-01,9.531063821528732e-01,-1.362862227410259e-01,7.171622768286473e-02,-4.761186409913525e-02,3.490440365195981e-02,-2.695120223115657e-02,2.136731470874307e-02,-1.703962076917079e-02,1.329949026562943e-02,-9.528913626449434e-03,3.775601829268438e-03}, +{ -3.360778587941748e-02,9.259195728712560e-02,-1.673707985721918e-01,4.415310153938346e-01,8.042312693484210e-01,-2.084197174538303e-01,1.175537651995896e-01,-8.030206788784688e-02,5.969928676565475e-02,-4.627577772630393e-02,3.638890269630038e-02,-2.815558599883625e-02,2.007330920161449e-02,-7.937772374113944e-03}, +{ 2.781843821865741e-02,-7.408250842928008e-02,1.198418254670393e-01,-2.124208627711466e-01,6.854027146998206e-01,5.839617981052568e-01,-2.025791565576452e-01,1.205160681490062e-01,-8.408035498423182e-02,6.293898756143136e-02,-4.847526030675533e-02,3.704075668463079e-02,-2.622329171546905e-02,1.034084587868557e-02}, +{ -1.674838750935159e-02,4.374468471885441e-02,-6.685568338574918e-02,1.026555793614478e-01,-1.883091134745137e-01,8.790598564526145e-01,3.342263609777890e-01,-1.388118525850620e-01,8.605826193643899e-02,-6.087393421043308e-02,4.543841477321415e-02,-3.409646569043579e-02,2.390050333975691e-02,-9.388224704570574e-03}, +{ 5.060078581455228e-03,-1.305728005639425e-02,1.930119865916798e-02,-2.756890472141971e-02,4.231894251408869e-02,-8.255085283899945e-02,9.861152223209533e-01,9.947434898740108e-02,-4.671484966300110e-02,2.986384445757989e-02,-2.121024252448693e-02,1.549183120386117e-02,-1.070530541309062e-02,4.181968492884726e-03}, +{ 4.181968492884526e-03,-1.070530541309044e-02,1.549183120386100e-02,-2.121024252448656e-02,2.986384445757916e-02,-4.671484966300025e-02,9.947434898739965e-02,9.861152223209543e-01,-8.255085283899941e-02,4.231894251408876e-02,-2.756890472141963e-02,1.930119865916792e-02,-1.305728005639428e-02,5.060078581455234e-03}, +{ -9.388224704570562e-03,2.390050333975678e-02,-3.409646569043553e-02,4.543841477321404e-02,-6.087393421043281e-02,8.605826193643826e-02,-1.388118525850613e-01,3.342263609777860e-01,8.790598564526165e-01,-1.883091134745125e-01,1.026555793614468e-01,-6.685568338574872e-02,4.374468471885423e-02,-1.674838750935132e-02}, +{ 1.034084587868588e-02,-2.622329171546896e-02,3.704075668463067e-02,-4.847526030675549e-02,6.293898756143156e-02,-8.408035498423157e-02,1.205160681490058e-01,-2.025791565576447e-01,5.839617981052545e-01,6.854027146998216e-01,-2.124208627711456e-01,1.198418254670390e-01,-7.408250842927994e-02,2.781843821865727e-02}, +{ -7.937772374114048e-03,2.007330920161452e-02,-2.815558599883618e-02,3.638890269630058e-02,-4.627577772630427e-02,5.969928676565495e-02,-8.030206788784709e-02,1.175537651995901e-01,-2.084197174538314e-01,8.042312693484217e-01,4.415310153938352e-01,-1.673707985721924e-01,9.259195728712598e-02,-3.360778587941753e-02}, +{ 3.775601829268442e-03,-9.528913626449547e-03,1.329949026562952e-02,-1.703962076917075e-02,2.136731470874316e-02,-2.695120223115677e-02,3.490440365195992e-02,-4.761186409913509e-02,7.171622768286483e-02,-1.362862227410260e-01,9.531063821528739e-01,1.920680564099788e-01,-7.991624990939490e-02,2.709659667501445e-02}, +{ 3.568697706930590e-04,-8.994314213054437e-04,1.251047512011725e-03,-1.593387592211261e-03,1.979400322378984e-03,-2.460417684267845e-03,3.112736600591224e-03,-4.079267464554929e-03,5.685633328334880e-03,-8.899807860243749e-03,1.857712662833642e-02,9.994260753944065e-01,-1.741150301706191e-02,4.954925482892329e-03}, +{ -2.961350588853981e-03,7.456952051852688e-03,-1.034930835685228e-02,1.313148328789313e-02,-1.621643442050307e-02,1.997574508236174e-02,-2.491949446314488e-02,3.191968842278348e-02,-4.272611822605187e-02,6.155078506216388e-02,-1.018736324678414e-01,2.443124806940412e-01,9.245976394101282e-01,-1.038984354879769e-01}, +{ 3.136542685683530e-03,-7.894359788348114e-03,1.094358206049585e-02,-1.385774521261937e-02,1.706026384010036e-02,-2.091685859686634e-02,2.590748738593412e-02,-3.281156071227972e-02,4.308777419583584e-02,-5.988103461527873e-02,9.139495612950754e-02,-1.671419591325869e-01,5.222567423035901e-01,5.887161694568319e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==17 +const dfloat c_I[17][14] = { +{ 6.277788348748461e-01,4.762131285179824e-01,-1.569651894213162e-01,8.627030517960828e-02,-5.662665034569914e-02,4.078106706871998e-02,-3.106959774382238e-02,2.453899806906454e-02,-1.981555842929679e-02,1.616394086063358e-02,-1.313071615148127e-02,1.036997079015457e-02,-7.480797923377469e-03,2.972264653983948e-03}, +{ -7.386969100515751e-02,9.704434623727000e-01,1.432033401838275e-01,-6.268505738081090e-02,3.835075311266933e-02,-2.676256556625693e-02,2.004868005076219e-02,-1.567701320123745e-02,1.257951751463576e-02,-1.021883295596989e-02,8.278424783730099e-03,-6.526287324228674e-03,4.703126947391834e-03,-1.867857532055292e-03}, +{ -3.125525171094408e-02,1.158329236691510e-01,9.802321337980524e-01,-9.436409883976164e-02,4.781842878959071e-02,-3.109675397819279e-02,2.249568255122082e-02,-1.724374418732477e-02,1.366760384135951e-02,-1.101487064664290e-02,8.876935652103613e-03,-6.974796774905426e-03,5.016594693358481e-03,-1.990786857064979e-03}, +{ 5.147198668858341e-02,-1.554456459797366e-01,4.245159721880014e-01,8.151629078194842e-01,-2.050986015169600e-01,1.144473464178485e-01,-7.761683508822122e-02,5.750355784011513e-02,-4.466878835581080e-02,3.554549914577466e-02,-2.841305320894406e-02,2.220935940538380e-02,-1.592626212378798e-02,6.312556768269564e-03}, +{ -3.970995615650537e-02,1.109140924710422e-01,-2.107172904491441e-01,7.285994876896544e-01,5.347792709605427e-01,-1.927405626303241e-01,1.152856671512077e-01,-8.063102982215316e-02,6.067911137482086e-02,-4.737057375027864e-02,3.741286025636657e-02,-2.902576766502444e-02,2.072526954653037e-02,-8.200578976735052e-03}, +{ 1.738863676903760e-02,-4.672566151659989e-02,7.772343124319726e-02,-1.495784757673235e-01,9.401905880290882e-01,2.226254912787708e-01,-9.814857293798800e-02,6.184523362995717e-02,-4.423455367975145e-02,3.355418214846809e-02,-2.604433737128757e-02,1.999317345609350e-02,-1.419084100862351e-02,5.601705726961515e-03}, +{ 2.929359378784752e-03,-7.699928369906968e-03,1.198336447138756e-02,-1.920428869073820e-02,4.012959354282317e-02,9.975057569002250e-01,-3.714337656725533e-02,1.866050872264829e-02,-1.222920825492542e-02,8.872827760481310e-03,-6.714419384832921e-03,5.077991333406278e-03,-3.574657043875218e-03,1.406476201777773e-03}, +{ -1.457480611131121e-02,3.778741083208383e-02,-5.657765282964830e-02,8.301621503357209e-02,-1.354686863373417e-01,3.255428243654203e-01,8.844298521817321e-01,-1.864530801004317e-01,1.025829970867805e-01,-6.914517653530192e-02,5.035619236507504e-02,-3.727466409329729e-02,2.593860340744386e-02,-1.016002926477568e-02}, +{ 1.611328125000001e-02,-4.139732081171971e-02,6.047960321705052e-02,-8.436165461443222e-02,1.232142737669564e-01,-2.099458793904556e-01,6.358976965826010e-01,6.358976965826006e-01,-2.099458793904561e-01,1.232142737669568e-01,-8.436165461443212e-02,6.047960321705027e-02,-4.139732081171965e-02,1.611328124999988e-02}, +{ -1.016002926477583e-02,2.593860340744404e-02,-3.727466409329738e-02,5.035619236507535e-02,-6.914517653530218e-02,1.025829970867803e-01,-1.864530801004318e-01,8.844298521817331e-01,3.255428243654191e-01,-1.354686863373410e-01,8.301621503357146e-02,-5.657765282964786e-02,3.778741083208359e-02,-1.457480611131100e-02}, +{ 1.406476201777534e-03,-3.574657043875101e-03,5.077991333406417e-03,-6.714419384832538e-03,8.872827760480722e-03,-1.222920825492526e-02,1.866050872264802e-02,-3.714337656725491e-02,9.975057569002248e-01,4.012959354282277e-02,-1.920428869073783e-02,1.198336447138721e-02,-7.699928369906808e-03,2.929359378784873e-03}, +{ 5.601705726961659e-03,-1.419084100862322e-02,1.999317345609317e-02,-2.604433737128750e-02,3.355418214846759e-02,-4.423455367975065e-02,6.184523362995613e-02,-9.814857293798615e-02,2.226254912787668e-01,9.401905880290893e-01,-1.495784757673211e-01,7.772343124319604e-02,-4.672566151659911e-02,1.738863676903715e-02}, +{ -8.200578976734936e-03,2.072526954653023e-02,-2.902576766502447e-02,3.741286025636665e-02,-4.737057375027862e-02,6.067911137482106e-02,-8.063102982215326e-02,1.152856671512080e-01,-1.927405626303246e-01,5.347792709605408e-01,7.285994876896569e-01,-2.107172904491449e-01,1.109140924710426e-01,-3.970995615650540e-02}, +{ 6.312556768269403e-03,-1.592626212378812e-02,2.220935940538398e-02,-2.841305320894424e-02,3.554549914577437e-02,-4.466878835581042e-02,5.750355784011527e-02,-7.761683508822154e-02,1.144473464178487e-01,-2.050986015169598e-01,8.151629078194859e-01,4.245159721879995e-01,-1.554456459797364e-01,5.147198668858347e-02}, +{ -1.990786857065316e-03,5.016594693358570e-03,-6.974796774905471e-03,8.876935652103540e-03,-1.101487064664338e-02,1.366760384135996e-02,-1.724374418732491e-02,2.249568255122129e-02,-3.109675397819325e-02,4.781842878959125e-02,-9.436409883976254e-02,9.802321337980504e-01,1.158329236691542e-01,-3.125525171094447e-02}, +{ -1.867857532055400e-03,4.703126947392123e-03,-6.526287324228958e-03,8.278424783730282e-03,-1.021883295597011e-02,1.257951751463600e-02,-1.567701320123749e-02,2.004868005076215e-02,-2.676256556625648e-02,3.835075311266880e-02,-6.268505738081059e-02,1.432033401838279e-01,9.704434623726990e-01,-7.386969100515720e-02}, +{ 2.972264653983703e-03,-7.480797923377519e-03,1.036997079015461e-02,-1.313071615148157e-02,1.616394086063399e-02,-1.981555842929753e-02,2.453899806906533e-02,-3.106959774382343e-02,4.078106706872197e-02,-5.662665034570129e-02,8.627030517961104e-02,-1.569651894213215e-01,4.762131285180006e-01,6.277788348748313e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==18 +const dfloat c_I[18][14] = { +{ 6.618742866247922e-01,4.352924518567312e-01,-1.469272345163083e-01,8.109757453418324e-02,-5.331271093124729e-02,3.842213010950980e-02,-2.928397729162765e-02,2.313420038641829e-02,-1.868400015644889e-02,1.524243172049360e-02,-1.238295539960691e-02,9.779848031840141e-03,-7.055266386095049e-03,2.803221417365581e-03}, +{ -3.750041947887309e-02,9.937920523093549e-01,6.130169429624426e-02,-2.785434350818648e-02,1.721529452175308e-02,-1.206589779502894e-02,9.059586514169219e-03,-7.093606173689163e-03,5.696825515841930e-03,-4.630296047057341e-03,3.752430850549482e-03,-2.958915695783296e-03,2.132613882977585e-03,-8.470191922720987e-04}, +{ -6.515852670404756e-02,2.563128679782268e-01,9.190396821584336e-01,-1.630937751098122e-01,8.616495864137037e-02,-5.682857466348822e-02,4.138608980926214e-02,-3.184231474984568e-02,2.529599198342597e-02,-2.041600500476477e-02,1.646897999929065e-02,-1.294787368438451e-02,9.315982139758178e-03,-3.697482793424681e-03}, +{ 6.343720393004547e-02,-1.965706993733012e-01,6.328006395968347e-01,6.349668436468768e-01,-2.071880208499761e-01,1.206252673356805e-01,-8.316925367558838e-02,6.213697666057332e-02,-4.850361526624145e-02,3.871404731100098e-02,-3.100575954332488e-02,2.426557219119832e-02,-1.741299655954739e-02,6.903794595769255e-03}, +{ -2.899352867707296e-02,8.218384376436956e-02,-1.653271055074607e-01,9.174356927724256e-01,2.668525433844743e-01,-1.143882286263634e-01,7.132779481209132e-02,-5.078942485270501e-02,3.858936210938552e-02,-3.029712651537420e-02,2.401291832818596e-02,-1.867049815767421e-02,1.334787906947869e-02,-5.284121903760403e-03}, +{ -4.908247350555480e-03,1.331671490865460e-02,-2.284846583872813e-02,4.876157473117509e-02,9.963475778368577e-01,-4.445491000264364e-02,2.237038074751721e-02,-1.467217690907144e-02,1.069124860745208e-02,-8.193437016485889e-03,6.398588510516673e-03,-4.930062083996089e-03,3.506515859429238e-03,-1.385302000121840e-03}, +{ 2.206651629190667e-02,-5.839809559159781e-02,9.271022068400717e-02,-1.562014539088663e-01,3.935262382250546e-01,8.397361027814624e-01,-2.017241848107844e-01,1.128481934626804e-01,-7.682769464675282e-02,5.679208547965085e-02,-4.342744754251586e-02,3.304318937435564e-02,-2.333835150569738e-02,9.194681707096974e-03}, +{ -2.031310062272826e-02,5.293108085467876e-02,-8.036525553142566e-02,1.215771287841947e-01,-2.141783940815258e-01,7.470423535931481e-01,5.146741543932840e-01,-1.894701129363730e-01,1.141246319477642e-01,-7.973137984069283e-02,5.912807765331269e-02,-4.420712234054277e-02,3.092682907634767e-02,-1.213889094944173e-02}, +{ 7.283307063027317e-03,-1.878447698647522e-02,2.772843777368617e-02,-3.949279366403489e-02,6.024781395634465e-02,-1.153959433797278e-01,9.697610393126749e-01,1.521708706621689e-01,-6.962885619959401e-02,4.417166243848807e-02,-3.126314115279358e-02,2.279302367786495e-02,-1.573587174170868e-02,6.144928240079122e-03}, +{ 6.144928240078970e-03,-1.573587174170848e-02,2.279302367786473e-02,-3.126314115279336e-02,4.417166243848736e-02,-6.962885619959305e-02,1.521708706621676e-01,9.697610393126755e-01,-1.153959433797275e-01,6.024781395634443e-02,-3.949279366403449e-02,2.772843777368589e-02,-1.878447698647512e-02,7.283307063027386e-03}, +{ -1.213889094944180e-02,3.092682907634757e-02,-4.420712234054289e-02,5.912807765331324e-02,-7.973137984069334e-02,1.141246319477642e-01,-1.894701129363733e-01,5.146741543932859e-01,7.470423535931467e-01,-2.141783940815257e-01,1.215771287841943e-01,-8.036525553142554e-02,5.293108085467857e-02,-2.031310062272795e-02}, +{ 9.194681707097044e-03,-2.333835150569733e-02,3.304318937435558e-02,-4.342744754251584e-02,5.679208547965092e-02,-7.682769464675258e-02,1.128481934626796e-01,-2.017241848107844e-01,8.397361027814629e-01,3.935262382250536e-01,-1.562014539088653e-01,9.271022068400678e-02,-5.839809559159767e-02,2.206651629190648e-02}, +{ -1.385302000121890e-03,3.506515859429351e-03,-4.930062083996178e-03,6.398588510517026e-03,-8.193437016485883e-03,1.069124860745247e-02,-1.467217690907232e-02,2.237038074751795e-02,-4.445491000264576e-02,9.963475778368577e-01,4.876157473117732e-02,-2.284846583872916e-02,1.331671490865518e-02,-4.908247350555808e-03}, +{ -5.284121903760510e-03,1.334787906947878e-02,-1.867049815767445e-02,2.401291832818605e-02,-3.029712651537414e-02,3.858936210938575e-02,-5.078942485270514e-02,7.132779481209152e-02,-1.143882286263639e-01,2.668525433844753e-01,9.174356927724252e-01,-1.653271055074614e-01,8.218384376437013e-02,-2.899352867707325e-02}, +{ 6.903794595769147e-03,-1.741299655954762e-02,2.426557219119853e-02,-3.100575954332509e-02,3.871404731100110e-02,-4.850361526624151e-02,6.213697666057372e-02,-8.316925367558931e-02,1.206252673356814e-01,-2.071880208499770e-01,6.349668436468826e-01,6.328006395968299e-01,-1.965706993733011e-01,6.343720393004534e-02}, +{ -3.697482793424745e-03,9.315982139758238e-03,-1.294787368438436e-02,1.646897999929040e-02,-2.041600500476472e-02,2.529599198342592e-02,-3.184231474984585e-02,4.138608980926258e-02,-5.682857466348838e-02,8.616495864137018e-02,-1.630937751098117e-01,9.190396821584312e-01,2.563128679782289e-01,-6.515852670404787e-02}, +{ -8.470191922722373e-04,2.132613882977542e-03,-2.958915695782967e-03,3.752430850548946e-03,-4.630296047056994e-03,5.696825515841654e-03,-7.093606173689081e-03,9.059586514168839e-03,-1.206589779502784e-02,1.721529452175173e-02,-2.785434350818472e-02,6.130169429624104e-02,9.937920523093553e-01,-3.750041947887125e-02}, +{ 2.803221417365553e-03,-7.055266386094987e-03,9.779848031839894e-03,-1.238295539960692e-02,1.524243172049360e-02,-1.868400015644869e-02,2.313420038641807e-02,-2.928397729162748e-02,3.842213010950961e-02,-5.331271093124704e-02,8.109757453418279e-02,-1.469272345163076e-01,4.352924518567259e-01,6.618742866247973e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==19 +const dfloat c_I[19][14] = { +{ 6.917397643364083e-01,3.989227674611404e-01,-1.373004526160718e-01,7.605569967439124e-02,-5.006286367371748e-02,3.610202693056750e-02,-2.752490112145937e-02,2.174892676150925e-02,-1.756746682680944e-02,1.433278050886307e-02,-1.164461253302826e-02,9.197053398597766e-03,-6.634975064949922e-03,2.636252764558719e-03}, +{ 2.782888870469095e-03,9.999710912874128e-01,-3.903280457656710e-03,1.826629306866218e-03,-1.138433007983472e-03,8.008147568500588e-04,-6.024391116213083e-04,4.722389028502563e-04,-3.795212241801623e-04,3.086125940186470e-04,-2.501790443327789e-04,1.973134597919015e-04,-1.422284404470897e-04,5.649210796249743e-05}, +{ -9.333154764509467e-02,3.925933061238785e-01,8.327175950573235e-01,-1.979001883208132e-01,1.079608979380133e-01,-7.201893273184454e-02,5.273828107380299e-02,-4.070254515910182e-02,3.239614513158807e-02,-2.617837448005791e-02,2.113417435212153e-02,-1.662412254706003e-02,1.196458549357676e-02,-4.749274286332576e-03}, +{ 6.221865089926090e-02,-1.982462003884569e-01,7.968801490029748e-01,4.485261132807228e-01,-1.707059662119222e-01,1.026191833104987e-01,-7.168945588626369e-02,5.392725101836882e-02,-4.226412686853026e-02,3.381856902047418e-02,-2.712868113277109e-02,2.125295539856603e-02,-1.526010446922765e-02,6.051663026305597e-03}, +{ -7.391929010966108e-03,2.128645811924339e-02,-4.575415434439689e-02,9.960124654734800e-01,5.098894591509477e-02,-2.435076441832533e-02,1.566115613003042e-02,-1.130932091504749e-02,8.658992918406990e-03,-6.829777112418532e-03,5.428832791485882e-03,-4.228628001956790e-03,3.026234219261618e-03,-1.198511763891935e-03}, +{ -2.642935987114722e-02,7.244669772160078e-02,-1.286684997757276e-01,3.139489364936354e-01,8.907310413331719e-01,-1.828535085967564e-01,1.000965959396766e-01,-6.761007103898574e-02,4.997886267344955e-02,-3.861477270001749e-02,3.030430304648858e-02,-2.341913139437057e-02,1.668499055632620e-02,-6.596084387344041e-03}, +{ 2.761467053706426e-02,-7.361178556583209e-02,1.194356320804049e-01,-2.134807229290628e-01,7.233379251497452e-01,5.418711094191545e-01,-1.949246159036578e-01,1.168531992530270e-01,-8.178576822474054e-02,6.132398047556902e-02,-4.727722291405799e-02,3.614611733394689e-02,-2.559810655521527e-02,1.009558784365467e-02}, +{ -8.373522429316835e-03,2.193686821845160e-02,-3.381526309204919e-02,5.296613023870978e-02,-1.029424977290758e-01,9.768718090916050e-01,1.312121219358971e-01,-6.069101541858311e-02,3.876132663782780e-02,-2.778620734757582e-02,2.088912449222661e-02,-1.573857738160483e-02,1.105644582327826e-02,-4.346743039790440e-03}, +{ -1.074893506497600e-02,2.783744598772503e-02,-4.155404332772693e-02,6.058031526443251e-02,-9.737028483625658e-02,2.209961946193694e-01,9.411185586217775e-01,-1.497500438464683e-01,8.011019973156862e-02,-5.344845682737830e-02,3.873083333319670e-02,-2.859195171425339e-02,1.986815155555072e-02,-7.777983496561102e-03}, +{ 1.611328124999968e-02,-4.139732081171956e-02,6.047960321705021e-02,-8.436165461443199e-02,1.232142737669564e-01,-2.099458793904555e-01,6.358976965826000e-01,6.358976965826016e-01,-2.099458793904562e-01,1.232142737669569e-01,-8.436165461443217e-02,6.047960321705043e-02,-4.139732081171976e-02,1.611328124999988e-02}, +{ -7.777983496561147e-03,1.986815155555082e-02,-2.859195171425346e-02,3.873083333319698e-02,-5.344845682737851e-02,8.011019973156876e-02,-1.497500438464685e-01,9.411185586217771e-01,2.209961946193700e-01,-9.737028483625644e-02,6.058031526443222e-02,-4.155404332772680e-02,2.783744598772486e-02,-1.074893506497576e-02}, +{ -4.346743039790784e-03,1.105644582327843e-02,-1.573857738160492e-02,2.088912449222681e-02,-2.778620734757613e-02,3.876132663782785e-02,-6.069101541858346e-02,1.312121219358978e-01,9.768718090916049e-01,-1.029424977290760e-01,5.296613023870975e-02,-3.381526309204913e-02,2.193686821845162e-02,-8.373522429316839e-03}, +{ 1.009558784365458e-02,-2.559810655521535e-02,3.614611733394706e-02,-4.727722291405827e-02,6.132398047556915e-02,-8.178576822474021e-02,1.168531992530268e-01,-1.949246159036575e-01,5.418711094191536e-01,7.233379251497448e-01,-2.134807229290615e-01,1.194356320804045e-01,-7.361178556583178e-02,2.761467053706409e-02}, +{ -6.596084387344058e-03,1.668499055632620e-02,-2.341913139437068e-02,3.030430304648874e-02,-3.861477270001731e-02,4.997886267344933e-02,-6.761007103898581e-02,1.000965959396767e-01,-1.828535085967571e-01,8.907310413331735e-01,3.139489364936344e-01,-1.286684997757276e-01,7.244669772160089e-02,-2.642935987114719e-02}, +{ -1.198511763891917e-03,3.026234219261556e-03,-4.228628001956720e-03,5.428832791485770e-03,-6.829777112418158e-03,8.658992918406935e-03,-1.130932091504748e-02,1.566115613003023e-02,-2.435076441832507e-02,5.098894591509411e-02,9.960124654734800e-01,-4.575415434439643e-02,2.128645811924331e-02,-7.391929010966132e-03}, +{ 6.051663026305602e-03,-1.526010446922782e-02,2.125295539856651e-02,-2.712868113277164e-02,3.381856902047437e-02,-4.226412686853059e-02,5.392725101836952e-02,-7.168945588626457e-02,1.026191833105000e-01,-1.707059662119241e-01,4.485261132807276e-01,7.968801490029718e-01,-1.982462003884578e-01,6.221865089926105e-02}, +{ -4.749274286332400e-03,1.196458549357683e-02,-1.662412254706011e-02,2.113417435212128e-02,-2.617837448005773e-02,3.239614513158817e-02,-4.070254515910179e-02,5.273828107380309e-02,-7.201893273184468e-02,1.079608979380135e-01,-1.979001883208125e-01,8.327175950573182e-01,3.925933061238831e-01,-9.333154764509494e-02}, +{ 5.649210796241722e-05,-1.422284404470582e-04,1.973134597920393e-04,-2.501790443329908e-04,3.086125940185055e-04,-3.795212241798796e-04,4.722389028499566e-04,-6.024391116208617e-04,8.008147568496767e-04,-1.138433007982909e-03,1.826629306865600e-03,-3.903280457655148e-03,9.999710912874127e-01,2.782888870467973e-03}, +{ 2.636252764559033e-03,-6.634975064949878e-03,9.197053398597763e-03,-1.164461253302842e-02,1.433278050886323e-02,-1.756746682680953e-02,2.174892676150911e-02,-2.752490112145932e-02,3.610202693056779e-02,-5.006286367371736e-02,7.605569967439034e-02,-1.373004526160709e-01,3.989227674611351e-01,6.917397643364133e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==20 +const dfloat c_I[20][14] = { +{ 7.179990095106994e-01,3.665603347662485e-01,-1.282235251222103e-01,7.124395791166123e-02,-4.694731674288893e-02,3.387298227750195e-02,-2.583284124364573e-02,2.041546444304583e-02,-1.649219592253961e-02,1.345647819070862e-02,-1.093319335435614e-02,8.635436398285970e-03,-6.229925108054646e-03,2.475333995543684e-03}, +{ 4.513292610114375e-02,9.933872010603817e-01,-5.507266340203763e-02,2.639283766749538e-02,-1.656453546142492e-02,1.168792673756760e-02,-8.806939610645817e-03,6.910199549762073e-03,-5.556839872398132e-03,4.520408054791173e-03,-3.665462800104856e-03,2.891398768887442e-03,-2.084397519630854e-03,8.279407262131525e-04}, +{ -1.140763027099567e-01,5.176374882101332e-01,7.339313616265097e-01,-2.083586614438632e-01,1.165846829597940e-01,-7.850292795737919e-02,5.775129025384237e-02,-4.468783378168692e-02,3.562527139604699e-02,-2.881755110287198e-02,2.328059730667022e-02,-1.832044772068952e-02,1.318877800725815e-02,-5.235745043807059e-03}, +{ 4.983624104755741e-02,-1.636642023289904e-01,9.104574903469282e-01,2.775196019052827e-01,-1.170274602643943e-01,7.212698856645762e-02,-5.092702260150545e-02,3.852565590658437e-02,-3.029454020476800e-02,2.429188154428410e-02,-1.951298856223310e-02,1.529987786498500e-02,-1.099111833870031e-02,4.359595118512073e-03}, +{ 1.767669948303014e-02,-5.177134977607039e-02,1.202392225314499e-01,9.798303051495031e-01,-9.647996871960042e-02,4.961069088588022e-02,-3.267596465706648e-02,2.386313132667837e-02,-1.838589374563533e-02,1.455730476279937e-02,-1.159915396159545e-02,9.048421784734513e-03,-6.481121140659504e-03,2.567676076551908e-03}, +{ -3.853190563746287e-02,1.067829596590382e-01,-1.971006781119946e-01,5.765677870385103e-01,6.913081328277713e-01,-2.134861632683249e-01,1.238764768595835e-01,-8.556956872911790e-02,6.397962033870451e-02,-4.975809306935769e-02,3.920669541191812e-02,-3.037365467542886e-02,2.166995926774717e-02,-8.571567911586243e-03}, +{ 1.804651916195949e-02,-4.847729216339858e-02,8.055222596178095e-02,-1.545151528922536e-01,9.345077765881573e-01,2.344280361138954e-01,-1.027156682257946e-01,6.461258416747652e-02,-4.617835696427310e-02,3.501403084946559e-02,-2.717075325142179e-02,2.085478972250633e-02,-1.480117788676732e-02,5.842438818667407e-03}, +{ 1.071169770387951e-02,-2.822274979647685e-02,4.422534076999120e-02,-7.208082064788943e-02,1.596508387907819e-01,9.669274593504242e-01,-1.196667300035419e-01,6.285050126600497e-02,-4.181485303238151e-02,3.055951727955391e-02,-2.321874925514969e-02,1.760079942845680e-02,-1.240588096949569e-02,4.883629115842526e-03}, +{ -2.056985423810129e-02,5.350562911195672e-02,-8.083953330323369e-02,1.209678217668118e-01,-2.071608670519244e-01,6.171242644642910e-01,6.542976309836613e-01,-2.119160749690372e-01,1.243293643508177e-01,-8.592041608268430e-02,6.336131978442129e-02,-4.722448767644927e-02,3.298258315901209e-02,-1.293738029954174e-02}, +{ 8.939967408995202e-03,-2.304776791811528e-02,3.398429725852963e-02,-4.829423737378228e-02,7.332007952546218e-02,-1.384918783865632e-01,9.521840669417125e-01,1.965277108423448e-01,-8.793355687406705e-02,5.542710926019958e-02,-3.911665513072309e-02,2.847625087438886e-02,-1.964433154219550e-02,7.668945113813735e-03}, +{ 7.668945113813818e-03,-1.964433154219546e-02,2.847625087438907e-02,-3.911665513072320e-02,5.542710926019932e-02,-8.793355687406690e-02,1.965277108423454e-01,9.521840669417125e-01,-1.384918783865642e-01,7.332007952546311e-02,-4.829423737378272e-02,3.398429725852970e-02,-2.304776791811547e-02,8.939967408995038e-03}, +{ -1.293738029954168e-02,3.298258315901216e-02,-4.722448767644948e-02,6.336131978442171e-02,-8.592041608268465e-02,1.243293643508177e-01,-2.119160749690372e-01,6.542976309836600e-01,6.171242644642920e-01,-2.071608670519237e-01,1.209678217668112e-01,-8.083953330323349e-02,5.350562911195655e-02,-2.056985423810116e-02}, +{ 4.883629115842312e-03,-1.240588096949546e-02,1.760079942845663e-02,-2.321874925514962e-02,3.055951727955379e-02,-4.181485303238126e-02,6.285050126600478e-02,-1.196667300035410e-01,9.669274593504249e-01,1.596508387907799e-01,-7.208082064788865e-02,4.422534076999091e-02,-2.822274979647663e-02,1.071169770387928e-02}, +{ 5.842438818667257e-03,-1.480117788676733e-02,2.085478972250661e-02,-2.717075325142213e-02,3.501403084946594e-02,-4.617835696427316e-02,6.461258416747659e-02,-1.027156682257951e-01,2.344280361138966e-01,9.345077765881561e-01,-1.545151528922532e-01,8.055222596178108e-02,-4.847729216339866e-02,1.804651916195928e-02}, +{ -8.571567911586205e-03,2.166995926774724e-02,-3.037365467542904e-02,3.920669541191846e-02,-4.975809306935782e-02,6.397962033870472e-02,-8.556956872911822e-02,1.238764768595839e-01,-2.134861632683260e-01,6.913081328277704e-01,5.765677870385121e-01,-1.971006781119953e-01,1.067829596590387e-01,-3.853190563746303e-02}, +{ 2.567676076552131e-03,-6.481121140659633e-03,9.048421784734730e-03,-1.159915396159594e-02,1.455730476279978e-02,-1.838589374563558e-02,2.386313132667831e-02,-3.267596465706660e-02,4.961069088588107e-02,-9.647996871960106e-02,9.798303051495028e-01,1.202392225314505e-01,-5.177134977607073e-02,1.767669948303030e-02}, +{ 4.359595118511986e-03,-1.099111833870050e-02,1.529987786498533e-02,-1.951298856223339e-02,2.429188154428408e-02,-3.029454020476796e-02,3.852565590658432e-02,-5.092702260150568e-02,7.212698856645829e-02,-1.170274602643951e-01,2.775196019052846e-01,9.104574903469276e-01,-1.636642023289913e-01,4.983624104755772e-02}, +{ -5.235745043807111e-03,1.318877800725822e-02,-1.832044772068939e-02,2.328059730666997e-02,-2.881755110287200e-02,3.562527139604692e-02,-4.468783378168666e-02,5.775129025384205e-02,-7.850292795737868e-02,1.165846829597933e-01,-2.083586614438616e-01,7.339313616265084e-01,5.176374882101328e-01,-1.140763027099562e-01}, +{ 8.279407262130820e-04,-2.084397519630726e-03,2.891398768887545e-03,-3.665462800105253e-03,4.520408054791517e-03,-5.556839872398460e-03,6.910199549762328e-03,-8.806939610646400e-03,1.168792673756854e-02,-1.656453546142582e-02,2.639283766749703e-02,-5.507266340204101e-02,9.933872010603818e-01,4.513292610114594e-02}, +{ 2.475333995543560e-03,-6.229925108054216e-03,8.635436398285439e-03,-1.093319335435555e-02,1.345647819070798e-02,-1.649219592253890e-02,2.041546444304457e-02,-2.583284124364407e-02,3.387298227750015e-02,-4.694731674288573e-02,7.124395791165614e-02,-1.282235251222018e-01,3.665603347662177e-01,7.179990095107248e-01} +}; +#endif +#if p_Nq==14 && p_cubNq==21 +const dfloat c_I[21][14] = { +{ 7.411766250893687e-01,3.377102812460584e-01,-1.197550872396830e-01,6.671234299896088e-02,-4.400286267609971e-02,3.176281647472701e-02,-2.422953981493548e-02,1.915124385074401e-02,-1.547239533904158e-02,1.262518560753275e-02,-1.025820888110719e-02,8.102527921852554e-03,-5.845557477709845e-03,2.322628239332641e-03}, +{ 8.819156219607621e-02,9.775548128448778e-01,-9.468724144972890e-02,4.627277507540428e-02,-2.921349550985199e-02,2.066706797146745e-02,-1.559449753439709e-02,1.224601806576181e-02,-9.852768383226459e-03,8.017817939264026e-03,-6.502875343988359e-03,5.130359775254947e-03,-3.698769518818534e-03,1.469233871904797e-03}, +{ -1.269047000100331e-01,6.277484161391736e-01,6.315181192082013e-01,-2.023395374942854e-01,1.155593472308325e-01,-7.842197783063976e-02,5.791614567213351e-02,-4.491481293561148e-02,3.585530367190632e-02,-2.902932207029977e-02,2.346530681799065e-02,-1.847267993651718e-02,1.330124854798254e-02,-5.280857010833755e-03}, +{ 2.949623847407380e-02,-1.000915472372417e-01,9.757904242953671e-01,1.325770568237254e-01,-6.015983610749137e-02,3.782358987171936e-02,-2.694170306907144e-02,2.047744799022740e-02,-1.614782811869126e-02,1.297135898900618e-02,-1.043156122352335e-02,8.185248756878016e-03,-5.882608374541104e-03,2.333718929562848e-03}, +{ 4.009231825718504e-02,-1.195631225719658e-01,3.043361545657016e-01,8.952701324051401e-01,-1.794775386902873e-01,9.738067015654972e-02,-6.535926457473969e-02,4.817208677644030e-02,-3.730907893449115e-02,2.963453870020125e-02,-2.366049424782165e-02,1.848087893663859e-02,-1.324698044137287e-02,5.249699662821641e-03}, +{ -3.834436915513335e-02,1.075051733466361e-01,-2.071782728391437e-01,7.890059426079203e-01,4.613020583606178e-01,-1.756839235770070e-01,1.063818003110459e-01,-7.479007883223657e-02,5.643782733044657e-02,-4.413070729019693e-02,3.488890882134679e-02,-2.708435648832875e-02,1.934584017917437e-02,-7.655842775141908e-03}, +{ -5.187580915736344e-04,1.404900987304398e-03,-2.396182661530778e-03,5.007684076784584e-03,9.999582078398392e-01,-4.972244434779837e-03,2.453231595537382e-03,-1.598710403254932e-03,1.161384719233270e-03,-8.885322798817891e-04,6.931819809901005e-04,-5.337605552861250e-04,3.795060519128182e-04,-1.499088252946439e-04}, +{ 2.474120013091060e-02,-6.558136913170082e-02,1.046108971506180e-01,-1.784721671280410e-01,4.748271342021518e-01,7.792316813815565e-01,-2.122863228390678e-01,1.209349163627521e-01,-8.291255715987356e-02,6.150821876001804e-02,-4.712885487860101e-02,3.590208718959809e-02,-2.537419398277102e-02,9.999329942450237e-03}, +{ -1.552282649904350e-02,4.056527030534700e-02,-6.208974557402371e-02,9.566651830441496e-02,-1.771974557832278e-01,9.021878239976668e-01,2.955263157408115e-01,-1.254812668506449e-01,7.824410962686712e-02,-5.548758348805809e-02,4.147383501534533e-02,-3.114524971629713e-02,2.184079735917005e-02,-8.580542438327640e-03}, +{ -7.256591419232268e-03,1.877672847045402e-02,-2.796258791661623e-02,4.056268020275181e-02,-6.444948423712510e-02,1.404292694828094e-01,9.738379002931762e-01,-1.086901478852664e-01,5.670313196879807e-02,-3.749927025880503e-02,2.705816668063424e-02,-1.992936287515879e-02,1.383202819558428e-02,-5.412460702004100e-03}, +{ 1.611328124999968e-02,-4.139732081171958e-02,6.047960321705030e-02,-8.436165461443203e-02,1.232142737669562e-01,-2.099458793904553e-01,6.358976965825990e-01,6.358976965826026e-01,-2.099458793904563e-01,1.232142737669568e-01,-8.436165461443204e-02,6.047960321705042e-02,-4.139732081171976e-02,1.611328124999987e-02}, +{ -5.412460702004166e-03,1.383202819558435e-02,-1.992936287515872e-02,2.705816668063443e-02,-3.749927025880521e-02,5.670313196879775e-02,-1.086901478852660e-01,9.738379002931765e-01,1.404292694828082e-01,-6.444948423712424e-02,4.056268020275128e-02,-2.796258791661588e-02,1.877672847045375e-02,-7.256591419232078e-03}, +{ -8.580542438327794e-03,2.184079735917020e-02,-3.114524971629714e-02,4.147383501534534e-02,-5.548758348805815e-02,7.824410962686704e-02,-1.254812668506450e-01,2.955263157408116e-01,9.021878239976671e-01,-1.771974557832276e-01,9.566651830441442e-02,-6.208974557402350e-02,4.056527030534690e-02,-1.552282649904346e-02}, +{ 9.999329942449987e-03,-2.537419398277098e-02,3.590208718959826e-02,-4.712885487860135e-02,6.150821876001841e-02,-8.291255715987349e-02,1.209349163627518e-01,-2.122863228390679e-01,7.792316813815563e-01,4.748271342021513e-01,-1.784721671280400e-01,1.046108971506178e-01,-6.558136913170079e-02,2.474120013091059e-02}, +{ -1.499088252943817e-04,3.795060519129361e-04,-5.337605552862164e-04,6.931819809900525e-04,-8.885322798821542e-04,1.161384719233764e-03,-1.598710403255763e-03,2.453231595538724e-03,-4.972244434781979e-03,9.999582078398394e-01,5.007684076786768e-03,-2.396182661531985e-03,1.404900987304936e-03,-5.187580915740197e-04}, +{ -7.655842775141961e-03,1.934584017917446e-02,-2.708435648832877e-02,3.488890882134671e-02,-4.413070729019677e-02,5.643782733044659e-02,-7.479007883223661e-02,1.063818003110461e-01,-1.756839235770074e-01,4.613020583606168e-01,7.890059426079215e-01,-2.071782728391443e-01,1.075051733466365e-01,-3.834436915513290e-02}, +{ 5.249699662821621e-03,-1.324698044137316e-02,1.848087893663907e-02,-2.366049424782204e-02,2.963453870020148e-02,-3.730907893449153e-02,4.817208677644056e-02,-6.535926457474006e-02,9.738067015655086e-02,-1.794775386902886e-01,8.952701324051376e-01,3.043361545657056e-01,-1.195631225719673e-01,4.009231825718565e-02}, +{ 2.333718929562797e-03,-5.882608374541192e-03,8.185248756878474e-03,-1.043156122352376e-02,1.297135898900618e-02,-1.614782811869146e-02,2.047744799022768e-02,-2.694170306907150e-02,3.782358987172020e-02,-6.015983610749263e-02,1.325770568237274e-01,9.757904242953667e-01,-1.000915472372432e-01,2.949623847407435e-02}, +{ -5.280857010833889e-03,1.330124854798271e-02,-1.847267993651734e-02,2.346530681799077e-02,-2.902932207030004e-02,3.585530367190624e-02,-4.491481293561102e-02,5.791614567213311e-02,-7.842197783063931e-02,1.155593472308318e-01,-2.023395374942837e-01,6.315181192081998e-01,6.277484161391733e-01,-1.269047000100323e-01}, +{ 1.469233871904757e-03,-3.698769518818263e-03,5.130359775254792e-03,-6.502875343988450e-03,8.017817939264232e-03,-9.852768383226703e-03,1.224601806576191e-02,-1.559449753439718e-02,2.066706797146812e-02,-2.921349550985242e-02,4.627277507540453e-02,-9.468724144972868e-02,9.775548128448778e-01,8.819156219607568e-02}, +{ 2.322628239332825e-03,-5.845557477710150e-03,8.102527921852990e-03,-1.025820888110770e-02,1.262518560753328e-02,-1.547239533904208e-02,1.915124385074438e-02,-2.422953981493610e-02,3.176281647472824e-02,-4.400286267610113e-02,6.671234299896270e-02,-1.197550872396868e-01,3.377102812460687e-01,7.411766250893609e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==15 +const dfloat c_I[15][15] = { +{ 4.881859845427823e-01,6.361235243572523e-01,-1.861004937052206e-01,1.003054119052305e-01,-6.540694608950155e-02,4.698902865720034e-02,-3.579285930572609e-02,2.832410886283120e-02,-2.298169644323044e-02,1.892578199787908e-02,-1.566157730163389e-02,1.285508566738804e-02,-1.022473443780898e-02,7.408026400078449e-03,-2.948645107520692e-03}, +{ -1.322360808835553e-01,7.319652989716930e-01,5.192459370023736e-01,-1.833659089950031e-01,1.067335117726706e-01,-7.301823891815057e-02,5.420614942380245e-02,-4.224819332313075e-02,3.395067187314910e-02,-2.778075300930433e-02,2.288975587950320e-02,-1.873273469969632e-02,1.487094391320920e-02,-1.076200598035704e-02,4.281646972796260e-03}, +{ 6.088795073375781e-02,-1.949274281285896e-01,8.188732608097289e-01,4.191977906070113e-01,-1.627069531672443e-01,9.830876836097768e-02,-6.889968379215661e-02,5.201819938264519e-02,-4.099752141020788e-02,3.312756675099122e-02,-2.706674824165622e-02,2.202641926110891e-02,-1.742140827443299e-02,1.258052141665220e-02,-5.000734308585691e-03}, +{ -3.266504684936049e-02,9.226728893108876e-02,-1.830564297755797e-01,8.842741679138718e-01,3.244897240944635e-01,-1.347688534650979e-01,8.342532068679177e-02,-5.932232237861636e-02,4.519055996523230e-02,-3.575375036357070e-02,2.881491835633504e-02,-2.323810894341337e-02,1.827315290824181e-02,-1.315088039010293e-02,5.220259309716438e-03}, +{ 1.804455804453842e-02,-4.846791955472239e-02,8.052300593936079e-02,-1.544346263576827e-01,9.346390656041395e-01,2.342005177143713e-01,-1.027030246953863e-01,6.472758879890328e-02,-4.644773364201541e-02,3.550621733318438e-02,-2.801080591795167e-02,2.228264160409116e-02,-1.737096789751482e-02,1.243939815406241e-02,-4.927915127378033e-03}, +{ -9.332269129041556e-03,2.443790081323212e-02,-3.762712750059902e-02,5.879530175393740e-02,-1.135430186384983e-01,9.708164269057906e-01,1.491852014919615e-01,-6.847957340944888e-02,4.376323684505173e-02,-3.155874488703044e-02,2.407294648886895e-02,-1.875932150239325e-02,1.444014208905877e-02,-1.026657398703263e-02,4.055472666142933e-03}, +{ 3.729852004701467e-03,-9.625661388394387e-03,1.423369868102326e-02,-2.035147582018620e-02,3.133054647175596e-02,-6.174800443634213e-02,9.926802666610692e-01,7.069526052817587e-02,-3.374452233925205e-02,2.179081073231874e-02,-1.571709178316317e-02,1.186274723088176e-02,-8.961123854787314e-03,6.305085639747124e-03,-2.480388327548266e-03}, +{ 2.775557561562891e-16,-1.290899306791331e-16,2.065133098044817e-16,-3.200116902957763e-16,2.873460730081031e-16,-5.632432032842435e-16,1.220670114767729e-15,1.000000000000000e+00,-1.173763126703481e-15,5.359816926531079e-16,-2.863667054911146e-16,3.441519855763892e-16,-2.468481678084240e-16,1.706369122540807e-16,-2.775557561562891e-16}, +{ -2.480388327548134e-03,6.305085639746744e-03,-8.961123854786871e-03,1.186274723088126e-02,-1.571709178316263e-02,2.179081073231781e-02,-3.374452233925038e-02,7.069526052817278e-02,9.926802666610697e-01,-6.174800443633972e-02,3.133054647175483e-02,-2.035147582018544e-02,1.423369868102270e-02,-9.625661388394050e-03,3.729852004701419e-03}, +{ 4.055472666142854e-03,-1.026657398703247e-02,1.444014208905867e-02,-1.875932150239321e-02,2.407294648886886e-02,-3.155874488703028e-02,4.376323684505119e-02,-6.847957340944798e-02,1.491852014919601e-01,9.708164269057907e-01,-1.135430186384972e-01,5.879530175393672e-02,-3.762712750059858e-02,2.443790081323185e-02,-9.332269129041323e-03}, +{ -4.927915127378252e-03,1.243939815406232e-02,-1.737096789751477e-02,2.228264160409128e-02,-2.801080591795187e-02,3.550621733318458e-02,-4.644773364201536e-02,6.472758879890315e-02,-1.027030246953865e-01,2.342005177143711e-01,9.346390656041399e-01,-1.544346263576828e-01,8.052300593936089e-02,-4.846791955472241e-02,1.804455804453857e-02}, +{ 5.220259309716514e-03,-1.315088039010299e-02,1.827315290824211e-02,-2.323810894341371e-02,2.881491835633510e-02,-3.575375036357095e-02,4.519055996523262e-02,-5.932232237861638e-02,8.342532068679251e-02,-1.347688534650988e-01,3.244897240944654e-01,8.842741679138696e-01,-1.830564297755798e-01,9.226728893108929e-02,-3.266504684936056e-02}, +{ -5.000734308585227e-03,1.258052141665221e-02,-1.742140827443361e-02,2.202641926110926e-02,-2.706674824165611e-02,3.312756675099146e-02,-4.099752141020802e-02,5.201819938264483e-02,-6.889968379215686e-02,9.830876836097847e-02,-1.627069531672449e-01,4.191977906070135e-01,8.188732608097281e-01,-1.949274281285910e-01,6.088795073375816e-02}, +{ 4.281646972796066e-03,-1.076200598035688e-02,1.487094391320887e-02,-1.873273469969569e-02,2.288975587950252e-02,-2.778075300930405e-02,3.395067187314883e-02,-4.224819332313076e-02,5.420614942380252e-02,-7.301823891815015e-02,1.067335117726700e-01,-1.833659089950017e-01,5.192459370023724e-01,7.319652989716923e-01,-1.322360808835543e-01}, +{ -2.948645107520742e-03,7.408026400078494e-03,-1.022473443780914e-02,1.285508566738827e-02,-1.566157730163403e-02,1.892578199787914e-02,-2.298169644323046e-02,2.832410886283131e-02,-3.579285930572665e-02,4.698902865720080e-02,-6.540694608950187e-02,1.003054119052306e-01,-1.861004937052224e-01,6.361235243572624e-01,4.881859845427742e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==16 +const dfloat c_I[16][15] = { +{ 5.370359328999557e-01,5.816801466712198e-01,-1.782698064449117e-01,9.679248090822878e-02,-6.327878812779673e-02,4.551487443159841e-02,-3.469278012178807e-02,2.746447140067438e-02,-2.228987977815190e-02,1.835918812614991e-02,-1.519447315258717e-02,1.247267319862265e-02,-9.921086866134774e-03,7.188249272446576e-03,-2.861202417525877e-03}, +{ -1.264878077561582e-01,8.371730959276247e-01,3.849978676749906e-01,-1.487283493885218e-01,8.826279779246356e-02,-6.085395885351119e-02,4.535555673886313e-02,-3.543156938529114e-02,2.851401088042173e-02,-2.335433144697917e-02,1.925502104379344e-02,-1.576496937657929e-02,1.251855902856675e-02,-9.061123884128214e-03,3.605201004445075e-03}, +{ 4.183638338316981e-02,-1.392995017767592e-01,9.442435244185290e-01,2.114602540210591e-01,-9.231514190478902e-02,5.745302955260785e-02,-4.078397290103167e-02,3.100218636754948e-02,-2.453399977844980e-02,1.987624622362721e-02,-1.626793277015202e-02,1.325385294862708e-02,-1.049077082319397e-02,7.579034780662272e-03,-3.013191741456214e-03}, +{ -8.590847726781023e-03,2.471802468456277e-02,-5.294066774020159e-02,9.945621922600334e-01,6.000830985943714e-02,-2.854674118186867e-02,1.835632908289104e-02,-1.327937978900613e-02,1.021195820399076e-02,-8.125972278977716e-03,6.573099953885880e-03,-5.313729806992964e-03,4.184868837552775e-03,-3.014481048454588e-03,1.197036689928974e-03}, +{ -7.087179754129069e-03,1.924447681701140e-02,-3.311322327256339e-02,7.142289245940797e-02,9.924560910478791e-01,-6.244535953231024e-02,3.174118049239769e-02,-2.092418963919875e-02,1.533373686188591e-02,-1.185983638271537e-02,9.423313675318014e-03,-7.530264054046927e-03,5.887086405615063e-03,-4.222635445943761e-03,1.673910321392294e-03}, +{ 1.426189084300467e-02,-3.761657776212994e-02,5.913417630767760e-02,-9.718483228688982e-02,2.221066278970681e-01,9.404856731083009e-01,-1.502779387051236e-01,8.056644930315862e-02,-5.415570791296399e-02,4.000905943081224e-02,-3.093603468569943e-02,2.430565937390293e-02,-1.880289994792124e-02,1.340597534455464e-02,-5.301520307751738e-03}, +{ -1.649954022548610e-02,4.279840452173395e-02,-6.417177830053168e-02,9.447468268446269e-02,-1.555158229027871e-01,3.877279603352992e-01,8.440891386549965e-01,-2.009419224909999e-01,1.125258462529074e-01,-7.676498260008445e-02,5.686520697055627e-02,-4.356316322752575e-02,3.319342268071364e-02,-2.346618695964419e-02,9.248734606389711e-03}, +{ 1.565770990885174e-02,-4.019347883789424e-02,5.859699643390204e-02,-8.141710121816520e-02,1.180589482611160e-01,-1.979092276248323e-01,5.555090230994286e-01,7.119716291987708e-01,-2.153929832568141e-01,1.249766886950023e-01,-8.606800178182854e-02,6.339647687270666e-02,-4.723595442613694e-02,3.298979495744021e-02,-1.294052028154691e-02}, +{ -1.294052028154702e-02,3.298979495744003e-02,-4.723595442613697e-02,6.339647687270665e-02,-8.606800178182863e-02,1.249766886950024e-01,-2.153929832568135e-01,7.119716291987694e-01,5.555090230994304e-01,-1.979092276248323e-01,1.180589482611157e-01,-8.141710121816491e-02,5.859699643390194e-02,-4.019347883789426e-02,1.565770990885118e-02}, +{ 9.248734606389808e-03,-2.346618695964431e-02,3.319342268071382e-02,-4.356316322752617e-02,5.686520697055708e-02,-7.676498260008535e-02,1.125258462529080e-01,-2.009419224910009e-01,8.440891386549940e-01,3.877279603353027e-01,-1.555158229027878e-01,9.447468268446298e-02,-6.417177830053174e-02,4.279840452173404e-02,-1.649954022548619e-02}, +{ -5.301520307752036e-03,1.340597534455484e-02,-1.880289994792145e-02,2.430565937390325e-02,-3.093603468569969e-02,4.000905943081243e-02,-5.415570791296450e-02,8.056644930315982e-02,-1.502779387051251e-01,9.404856731082999e-01,2.221066278970704e-01,-9.718483228689086e-02,5.913417630767831e-02,-3.761657776213036e-02,1.426189084300501e-02}, +{ 1.673910321392424e-03,-4.222635445943757e-03,5.887086405615180e-03,-7.530264054046890e-03,9.423313675317256e-03,-1.185983638271535e-02,1.533373686188590e-02,-2.092418963919781e-02,3.174118049239776e-02,-6.244535953230999e-02,9.924560910478787e-01,7.142289245940725e-02,-3.311322327256319e-02,1.924447681701146e-02,-7.087179754128915e-03}, +{ 1.197036689928861e-03,-3.014481048454518e-03,4.184868837552896e-03,-5.313729806993138e-03,6.573099953886143e-03,-8.125972278977964e-03,1.021195820399078e-02,-1.327937978900586e-02,1.835632908289109e-02,-2.854674118186896e-02,6.000830985943691e-02,9.945621922600334e-01,-5.294066774020156e-02,2.471802468456273e-02,-8.590847726780886e-03}, +{ -3.013191741456374e-03,7.579034780662413e-03,-1.049077082319427e-02,1.325385294862790e-02,-1.626793277015266e-02,1.987624622362781e-02,-2.453399977845059e-02,3.100218636755043e-02,-4.078397290103292e-02,5.745302955260974e-02,-9.231514190479193e-02,2.114602540210659e-01,9.442435244185267e-01,-1.392995017767631e-01,4.183638338317101e-02}, +{ 3.605201004445197e-03,-9.061123884128258e-03,1.251855902856669e-02,-1.576496937657923e-02,1.925502104379384e-02,-2.335433144697940e-02,2.851401088042168e-02,-3.543156938529118e-02,4.535555673886316e-02,-6.085395885351114e-02,8.826279779246352e-02,-1.487283493885217e-01,3.849978676749934e-01,8.371730959276211e-01,-1.264878077561577e-01}, +{ -2.861202417525854e-03,7.188249272446753e-03,-9.921086866134937e-03,1.247267319862296e-02,-1.519447315258734e-02,1.835918812615014e-02,-2.228987977815211e-02,2.746447140067459e-02,-3.469278012178851e-02,4.551487443159895e-02,-6.327878812779744e-02,9.679248090823016e-02,-1.782698064449154e-01,5.816801466712366e-01,5.370359328999414e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==17 +const dfloat c_I[17][15] = { +{ 5.798584174116663e-01,5.325673072832533e-01,-1.692489008779457e-01,9.244847325490439e-02,-6.056746263458917e-02,4.360817806880868e-02,-3.325761680298751e-02,2.633701255831514e-02,-2.137937726044848e-02,1.761174478332845e-02,-1.457728371149949e-02,1.196683117091877e-02,-9.519140395349667e-03,6.897199096097881e-03,-2.745381944473052e-03}, +{ -1.090961437668419e-01,9.116346388809671e-01,2.680285739812563e-01,-1.104382277237154e-01,6.654786471949704e-02,-4.617203567014783e-02,3.452471632941283e-02,-2.702167386737562e-02,2.177197831521500e-02,-1.784632578214425e-02,1.472165636771876e-02,-1.205764561276309e-02,9.576932474508205e-03,-6.932896790200853e-03,2.758588144613532e-03}, +{ 1.233118531955283e-02,-4.288315073292961e-02,9.962879648392249e-01,4.859899523326833e-02,-2.295857345326563e-02,1.461343509683328e-02,-1.047872731395328e-02,8.009246470861370e-03,-6.359250191445174e-03,5.162939889549585e-03,-4.231657485022434e-03,3.450900772612083e-03,-2.733164662494515e-03,1.975284595556646e-03,-7.854283783483938e-04}, +{ 2.084403186354455e-02,-6.118447661416434e-02,1.436644875486517e-01,9.721223642043623e-01,-1.106812504629072e-01,5.739328565350388e-02,-3.794911370714388e-02,2.781898177646252e-02,-2.155277623142556e-02,1.722898221451872e-02,-1.397790940447129e-02,1.132187283946008e-02,-8.927807945221159e-03,6.435653421158588e-03,-2.556325156328703e-03}, +{ -3.007189813450919e-02,8.261790184313550e-02,-1.478982159075017e-01,3.735686219307337e-01,8.527918678795995e-01,-1.975190641004735e-01,1.098009309856617e-01,-7.472539061919581e-02,5.563051962985117e-02,-4.341848920275429e-02,3.469259240776939e-02,-2.782292735929725e-02,2.180106584474242e-02,-1.565772038309211e-02,6.210205185330407e-03}, +{ 2.747068727902651e-02,-7.301686176671324e-02,1.174476832953927e-01,-2.049968802715723e-01,6.094320347929063e-01,6.613822847329984e-01,-2.124034411603437e-01,1.246498408776689e-01,-8.674650216527167e-02,6.523091828746359e-02,-5.095885092780714e-02,4.029053578256035e-02,-3.129015433332413e-02,2.235831859144678e-02,-8.849613014431428e-03}, +{ -1.913026638168012e-02,4.989450771142837e-02,-7.595580541621148e-02,1.156274299152620e-01,-2.072943225756698e-01,8.132525816653136e-01,4.309209283175293e-01,-1.686900520388863e-01,1.033428645573957e-01,-7.312741633666452e-02,5.521182972135215e-02,-4.276450430309489e-02,3.279832983802293e-02,-2.327113830102702e-02,9.185033626930225e-03}, +{ 9.049065804643874e-03,-2.332367473965338e-02,3.437298192392808e-02,-4.880770014401369e-02,7.402251768947431e-02,-1.396181726376202e-01,9.512282193140782e-01,1.988348952635815e-01,-8.905071921823079e-02,5.643102397550006e-02,-4.035049262347497e-02,3.031222963705818e-02,-2.283626066312005e-02,1.604414792425276e-02,-6.308061506403811e-03}, +{ -1.110223024625157e-16,-3.474526497736444e-17,1.389905421746642e-17,8.587224121615472e-18,5.085493062971073e-17,-1.370605556160261e-16,2.404198628617126e-16,1.000000000000000e+00,-2.200805862569026e-16,1.303685751724723e-16,-6.224657168044505e-17,1.968302408663235e-17,-5.218710828933558e-17,7.163687759451960e-17,-1.110223024625157e-16}, +{ -6.308061506403811e-03,1.604414792425252e-02,-2.283626066311983e-02,3.031222963705806e-02,-4.035049262347514e-02,5.643102397549993e-02,-8.905071921823011e-02,1.988348952635811e-01,9.512282193140783e-01,-1.396181726376200e-01,7.402251768947403e-02,-4.880770014401356e-02,3.437298192392809e-02,-2.332367473965342e-02,9.049065804643819e-03}, +{ 9.185033626930225e-03,-2.327113830102712e-02,3.279832983802314e-02,-4.276450430309501e-02,5.521182972135228e-02,-7.312741633666492e-02,1.033428645573961e-01,-1.686900520388865e-01,4.309209283175293e-01,8.132525816653130e-01,-2.072943225756692e-01,1.156274299152616e-01,-7.595580541621137e-02,4.989450771142839e-02,-1.913026638167985e-02}, +{ -8.849613014431316e-03,2.235831859144694e-02,-3.129015433332433e-02,4.029053578256044e-02,-5.095885092780732e-02,6.523091828746400e-02,-8.674650216527226e-02,1.246498408776693e-01,-2.124034411603442e-01,6.613822847329957e-01,6.094320347929099e-01,-2.049968802715730e-01,1.174476832953932e-01,-7.301686176671351e-02,2.747068727902652e-02}, +{ 6.210205185330518e-03,-1.565772038309231e-02,2.180106584474267e-02,-2.782292735929743e-02,3.469259240776950e-02,-4.341848920275439e-02,5.563051962985138e-02,-7.472539061919591e-02,1.098009309856621e-01,-1.975190641004741e-01,8.527918678795976e-01,3.735686219307354e-01,-1.478982159075020e-01,8.261790184313614e-02,-3.007189813450904e-02}, +{ -2.556325156328611e-03,6.435653421158597e-03,-8.927807945221189e-03,1.132187283946013e-02,-1.397790940447142e-02,1.722898221451862e-02,-2.155277623142522e-02,2.781898177646243e-02,-3.794911370714388e-02,5.739328565350373e-02,-1.106812504629073e-01,9.721223642043632e-01,1.436644875486507e-01,-6.118447661416412e-02,2.084403186354439e-02}, +{ -7.854283783483382e-04,1.975284595556413e-03,-2.733164662494181e-03,3.450900772611979e-03,-4.231657485022556e-03,5.162939889549467e-03,-6.359250191444748e-03,8.009246470860761e-03,-1.047872731395247e-02,1.461343509683255e-02,-2.295857345326456e-02,4.859899523326605e-02,9.962879648392253e-01,-4.288315073292803e-02,1.233118531955233e-02}, +{ 2.758588144613452e-03,-6.932896790200689e-03,9.576932474508193e-03,-1.205764561276294e-02,1.472165636771879e-02,-1.784632578214407e-02,2.177197831521473e-02,-2.702167386737576e-02,3.452471632941309e-02,-4.617203567014807e-02,6.654786471949721e-02,-1.104382277237155e-01,2.680285739812573e-01,9.116346388809655e-01,-1.090961437668413e-01}, +{ -2.745381944473158e-03,6.897199096098197e-03,-9.519140395350068e-03,1.196683117091910e-02,-1.457728371149983e-02,1.761174478332869e-02,-2.137937726044908e-02,2.633701255831574e-02,-3.325761680298808e-02,4.360817806880977e-02,-6.056746263459063e-02,9.244847325490640e-02,-1.692489008779503e-01,5.325673072832728e-01,5.798584174116503e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==18 +const dfloat c_I[18][15] = { +{ 6.174614412651024e-01,4.884649612273509e-01,-1.597994665778688e-01,8.772344633263666e-02,-5.757411530293898e-02,4.148769263568025e-02,-3.165496723459591e-02,2.507481024872802e-02,-2.035839853079678e-02,1.677269122661174e-02,-1.388392807658628e-02,1.139827239489367e-02,-9.067205776201888e-03,6.569886851065682e-03,-2.615120683080648e-03}, +{ -8.296645292807715e-02,9.602137176313014e-01,1.692709796649510e-01,-7.320679558513495e-02,4.465888107868853e-02,-3.114572480971589e-02,2.335164429289665e-02,-1.830560808322533e-02,1.476393282475411e-02,-1.210985305180154e-02,9.994025096031698e-03,-8.187995872672457e-03,6.504705651488167e-03,-4.709413025734897e-03,1.873957116250737e-03}, +{ -2.123121876733655e-02,7.748064829296569e-02,9.905966572752163e-01,-6.785030866286598e-02,3.391772187540246e-02,-2.197220657199572e-02,1.588388363436461e-02,-1.219505631955209e-02,9.709169359137321e-03,-7.896572929977393e-03,6.479819756105318e-03,-5.288442158436812e-03,4.190679010572467e-03,-3.029559568333782e-03,1.204785774734008e-03}, +{ 4.581729186845156e-02,-1.374284343156819e-01,3.610825210635729e-01,8.596521568759599e-01,-1.940131904774697e-01,1.067770221032550e-01,-7.211701926860210e-02,5.342133735970041e-02,-4.163657278989204e-02,3.340791170925624e-02,-2.716977398941183e-02,2.204237485162202e-02,-1.739937780397361e-02,1.254999357412398e-02,-4.986240760910719e-03}, +{ -3.986208962887001e-02,1.108974474079568e-01,-2.076234703667016e-01,6.571747392075667e-01,6.127388702624335e-01,-2.062138069176626e-01,1.217241107690859e-01,-8.482324005460908e-02,6.392348485954949e-02,-5.024973711373102e-02,4.033208245654765e-02,-3.243979011131405e-02,2.546558124255057e-02,-1.830911838131418e-02,7.264936368512158e-03}, +{ 2.204369385149931e-02,-5.907714994665510e-02,9.746170504408061e-02,-1.829443047140349e-01,8.892592467775779e-01,3.172104068562902e-01,-1.329273312973413e-01,8.276097179502317e-02,-5.906457507612684e-02,4.501628375270411e-02,-3.544942466730835e-02,2.816818584538464e-02,-2.194367748131252e-02,1.570757363374359e-02,-6.221604373524640e-03}, +{ -3.310718269860654e-03,8.685409925128177e-03,-1.344275597599405e-02,2.126557916113166e-02,-4.264937219710218e-02,9.966600102366652e-01,4.678251024769789e-02,-2.257985408024078e-02,1.465137831029373e-02,-1.064070863271021e-02,8.148678274285094e-03,-6.364968109242750e-03,4.906468715443860e-03,-3.491168549159650e-03,1.379510943664659e-03}, +{ -9.846863768302211e-03,2.549045241620235e-02,-3.800995844110197e-02,5.530412574926163e-02,-8.854536691966780e-02,1.985321220490363e-01,9.513152107994326e-01,-1.396155597223988e-01,7.438081281728563e-02,-4.981628127276268e-02,3.656463403181845e-02,-2.786613807253190e-02,2.116835393240215e-02,-1.493992371855006e-02,5.884380119876326e-03}, +{ 1.486868857938028e-02,-3.815101131782954e-02,5.555379780699730e-02,-7.700962179817872e-02,1.111521559631503e-01,-1.842688857928798e-01,4.916577736568356e-01,7.663114746987617e-01,-2.139085487468903e-01,1.224525292603760e-01,-8.387996875293949e-02,6.161758488166533e-02,-4.584190490828799e-02,3.199065325216265e-02,-1.254471678232327e-02}, +{ -1.254471678232327e-02,3.199065325216255e-02,-4.584190490828815e-02,6.161758488166542e-02,-8.387996875293974e-02,1.224525292603761e-01,-2.139085487468898e-01,7.663114746987600e-01,4.916577736568375e-01,-1.842688857928799e-01,1.111521559631502e-01,-7.700962179817861e-02,5.555379780699719e-02,-3.815101131782953e-02,1.486868857938006e-02}, +{ 5.884380119876409e-03,-1.493992371855002e-02,2.116835393240204e-02,-2.786613807253177e-02,3.656463403181823e-02,-4.981628127276283e-02,7.438081281728567e-02,-1.396155597223984e-01,9.513152107994334e-01,1.985321220490351e-01,-8.854536691966740e-02,5.530412574926119e-02,-3.800995844110155e-02,2.549045241620212e-02,-9.846863768302155e-03}, +{ 1.379510943664517e-03,-3.491168549159540e-03,4.906468715443866e-03,-6.364968109242850e-03,8.148678274285273e-03,-1.064070863271041e-02,1.465137831029372e-02,-2.257985408024077e-02,4.678251024769784e-02,9.966600102366651e-01,-4.264937219710194e-02,2.126557916113148e-02,-1.344275597599407e-02,8.685409925128201e-03,-3.310718269860487e-03}, +{ -6.221604373524578e-03,1.570757363374353e-02,-2.194367748131251e-02,2.816818584538479e-02,-3.544942466730820e-02,4.501628375270410e-02,-5.906457507612705e-02,8.276097179502301e-02,-1.329273312973411e-01,3.172104068562892e-01,8.892592467775791e-01,-1.829443047140348e-01,9.746170504408051e-02,-5.907714994665519e-02,2.204369385149922e-02}, +{ 7.264936368512477e-03,-1.830911838131428e-02,2.546558124255066e-02,-3.243979011131414e-02,4.033208245654775e-02,-5.024973711373125e-02,6.392348485954960e-02,-8.482324005460905e-02,1.217241107690861e-01,-2.062138069176624e-01,6.127388702624336e-01,6.571747392075653e-01,-2.076234703667011e-01,1.108974474079570e-01,-3.986208962887011e-02}, +{ -4.986240760910782e-03,1.254999357412378e-02,-1.739937780397360e-02,2.204237485162223e-02,-2.716977398941190e-02,3.340791170925590e-02,-4.163657278989162e-02,5.342133735970009e-02,-7.211701926860187e-02,1.067770221032546e-01,-1.940131904774690e-01,8.596521568759641e-01,3.610825210635679e-01,-1.374284343156811e-01,4.581729186845132e-02}, +{ 1.204785774734030e-03,-3.029559568333920e-03,4.190679010572684e-03,-5.288442158436933e-03,6.479819756105319e-03,-7.896572929977190e-03,9.709169359137246e-03,-1.219505631955243e-02,1.588388363436502e-02,-2.197220657199563e-02,3.391772187540254e-02,-6.785030866286633e-02,9.905966572752156e-01,7.748064829296672e-02,-2.123121876733676e-02}, +{ 1.873957116250699e-03,-4.709413025734840e-03,6.504705651487946e-03,-8.187995872672035e-03,9.994025096031374e-03,-1.210985305180140e-02,1.476393282475361e-02,-1.830560808322496e-02,2.335164429289618e-02,-3.114572480971504e-02,4.465888107868726e-02,-7.320679558513314e-02,1.692709796649475e-01,9.602137176313023e-01,-8.296645292807546e-02}, +{ -2.615120683080563e-03,6.569886851065711e-03,-9.067205776202042e-03,1.139827239489372e-02,-1.388392807658645e-02,1.677269122661187e-02,-2.035839853079677e-02,2.507481024872793e-02,-3.165496723459590e-02,4.148769263567990e-02,-5.757411530293866e-02,8.772344633263619e-02,-1.597994665778684e-01,4.884649612273456e-01,6.174614412651078e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==19 +const dfloat c_I[19][15] = { +{ 6.505642329583708e-01,4.489403231802611e-01,-1.503772991053357e-01,8.289760596452579e-02,-5.448863036660475e-02,3.929214715376061e-02,-2.999144963952465e-02,2.376268022221329e-02,-1.929599187801358e-02,1.589901596428981e-02,-1.316163675642170e-02,1.080580350514407e-02,-8.596169914267064e-03,6.228699590180097e-03,-2.479330878578049e-03}, +{ -5.066793765984190e-02,9.878912916040355e-01,8.768116220750118e-02,-3.937307585808288e-02,2.426234966631722e-02,-1.699390375902032e-02,1.277002033648237e-02,-1.002385109053291e-02,8.091280911325006e-03,-6.640422562153284e-03,5.482279304508511e-03,-4.492721851360576e-03,3.569706524664162e-03,-2.584727147441310e-03,1.028549373599140e-03}, +{ -5.391698117992243e-02,2.075857004706309e-01,9.438819311206642e-01,-1.436321126643426e-01,7.490970443137779e-02,-4.921517446270421e-02,3.581548532510723e-02,-2.759997281399545e-02,2.202391010362632e-02,-1.793874338648837e-02,1.473483446539257e-02,-1.203368216963374e-02,9.539887757220497e-03,-6.898409478027761e-03,2.743622481095057e-03}, +{ 6.082382226980856e-02,-1.867484136302406e-01,5.648834362438521e-01,6.997012586064896e-01,-2.121873478927800e-01,1.219695533610144e-01,-8.375975081910231e-02,6.257054926685408e-02,-4.900676159973320e-02,3.944263208465443e-02,-3.214234724529301e-02,2.611139565512558e-02,-2.062909727477814e-02,1.488706675720588e-02,-5.915995783077187e-03}, +{ -3.432647370097799e-02,9.678746369454912e-02,-1.906800782766363e-01,8.645727008307831e-01,3.556534431996081e-01,-1.450453237255332e-01,8.937251917104666e-02,-6.342256601570227e-02,4.826109833930398e-02,-3.815787340884638e-02,3.073950241546097e-02,-2.478339884770665e-02,1.948486099267438e-02,-1.402149664335870e-02,5.565621975335229e-03}, +{ 3.348700607799003e-03,-9.053737155177244e-03,1.535943898514482e-02,-3.152647343064689e-02,9.982077619239499e-01,3.381496008209510e-02,-1.638335527855969e-02,1.063336534212577e-02,-7.734688625134977e-03,5.957359586423884e-03,-4.721310842835668e-03,3.766688279528297e-03,-2.941736831652239e-03,2.108776383956859e-03,-8.357490270168629e-04}, +{ 1.656320680569461e-02,-4.372310456203640e-02,6.890309068352200e-02,-1.139463418354247e-01,2.666146019819387e-01,9.180629347157182e-01,-1.678822403129022e-01,9.113125049109935e-02,-6.153486196327682e-02,4.556322917857921e-02,-3.527621312866545e-02,2.773748432266199e-02,-2.146813516978436e-02,1.531040584624334e-02,-6.055307053367640e-03}, +{ -2.059965303727053e-02,5.357306542177481e-02,-8.090649160558601e-02,1.209885337543074e-01,-2.070041438541931e-01,6.152184178328353e-01,6.562497804852058e-01,-2.123091402489646e-01,1.248672319190850e-01,-8.685299327014896e-02,6.498235882240262e-02,-5.006695035224538e-02,3.827815077456001e-02,-2.711156350284409e-02,1.069339686108180e-02}, +{ 1.240911470965433e-02,-3.195362172879247e-02,4.697125768420400e-02,-6.635172659951762e-02,9.953044699118983e-02,-1.820574688217765e-01,8.938085204086377e-01,3.101807917338685e-01,-1.308321208340626e-01,8.155951724592544e-02,-5.789120633410304e-02,4.331823267832745e-02,-3.256155020047756e-02,2.284911090760982e-02,-8.979297840686906e-03}, +{ -5.551115123125783e-17,8.948838101929447e-17,-1.583223991710439e-16,8.810149052105539e-17,-1.223768342238510e-16,2.327926502025416e-16,-5.038029686822151e-16,9.999999999999999e-01,5.593714900696275e-16,-2.639302115722659e-16,1.208694184024435e-16,-5.548447954150051e-17,1.058441405115978e-16,-3.600808975464421e-17,-5.551115123125783e-17}, +{ -8.979297840686962e-03,2.284911090760938e-02,-3.256155020047723e-02,4.331823267832721e-02,-5.789120633410283e-02,8.155951724592503e-02,-1.308321208340613e-01,3.101807917338665e-01,8.938085204086385e-01,-1.820574688217759e-01,9.953044699118933e-02,-6.635172659951713e-02,4.697125768420383e-02,-3.195362172879245e-02,1.240911470965400e-02}, +{ 1.069339686108180e-02,-2.711156350284410e-02,3.827815077456007e-02,-5.006695035224586e-02,6.498235882240322e-02,-8.685299327014961e-02,1.248672319190856e-01,-2.123091402489652e-01,6.562497804852064e-01,6.152184178328347e-01,-2.070041438541928e-01,1.209885337543072e-01,-8.090649160558558e-02,5.357306542177476e-02,-2.059965303727058e-02}, +{ -6.055307053367667e-03,1.531040584624338e-02,-2.146813516978459e-02,2.773748432266199e-02,-3.527621312866551e-02,4.556322917857945e-02,-6.153486196327716e-02,9.113125049109980e-02,-1.678822403129027e-01,9.180629347157180e-01,2.666146019819395e-01,-1.139463418354251e-01,6.890309068352225e-02,-4.372310456203659e-02,1.656320680569488e-02}, +{ -8.357490270171066e-04,2.108776383956939e-03,-2.941736831652371e-03,3.766688279528521e-03,-4.721310842835999e-03,5.957359586424302e-03,-7.734688625135499e-03,1.063336534212632e-02,-1.638335527856070e-02,3.381496008209713e-02,9.982077619239497e-01,-3.152647343064872e-02,1.535943898514568e-02,-9.053737155177629e-03,3.348700607799360e-03}, +{ 5.565621975335167e-03,-1.402149664335861e-02,1.948486099267430e-02,-2.478339884770665e-02,3.073950241546121e-02,-3.815787340884604e-02,4.826109833930368e-02,-6.342256601570220e-02,8.937251917104655e-02,-1.450453237255326e-01,3.556534431996060e-01,8.645727008307832e-01,-1.906800782766353e-01,9.678746369454902e-02,-3.432647370097755e-02}, +{ -5.915995783077214e-03,1.488706675720600e-02,-2.062909727477824e-02,2.611139565512578e-02,-3.214234724529315e-02,3.944263208465423e-02,-4.900676159973302e-02,6.257054926685453e-02,-8.375975081910279e-02,1.219695533610147e-01,-2.121873478927811e-01,6.997012586064943e-01,5.648834362438484e-01,-1.867484136302408e-01,6.082382226980847e-02}, +{ 2.743622481095193e-03,-6.898409478027973e-03,9.539887757220700e-03,-1.203368216963382e-02,1.473483446539276e-02,-1.793874338648846e-02,2.202391010362647e-02,-2.759997281399562e-02,3.581548532510752e-02,-4.921517446270422e-02,7.490970443137818e-02,-1.436321126643432e-01,9.438819311206608e-01,2.075857004706348e-01,-5.391698117992297e-02}, +{ 1.028549373599162e-03,-2.584727147441529e-03,3.569706524664332e-03,-4.492721851360561e-03,5.482279304508498e-03,-6.640422562153414e-03,8.091280911325145e-03,-1.002385109053323e-02,1.277002033648268e-02,-1.699390375902073e-02,2.426234966631772e-02,-3.937307585808356e-02,8.768116220750348e-02,9.878912916040345e-01,-5.066793765984252e-02}, +{ -2.479330878577990e-03,6.228699590180145e-03,-8.596169914267038e-03,1.080580350514401e-02,-1.316163675642156e-02,1.589901596428950e-02,-1.929599187801334e-02,2.376268022221298e-02,-2.999144963952433e-02,3.929214715376055e-02,-5.448863036660481e-02,8.289760596452503e-02,-1.503772991053349e-01,4.489403231802556e-01,6.505642329583763e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==20 +const dfloat c_I[20][15] = { +{ 6.797921293444275e-01,4.135305014734023e-01,-1.412473610063035e-01,7.814182895702469e-02,-5.142849487368054e-02,3.710793513289767e-02,-2.833369119950426e-02,2.245373463443123e-02,-1.823545279812771e-02,1.502648398861162e-02,-1.244006849679674e-02,1.021380331126563e-02,-8.125441524774566e-03,5.887707108650293e-03,-2.343614051523599e-03}, +{ -1.431372561785716e-02,9.991831210317047e-01,2.135709889838371e-02,-9.881746394830049e-03,6.140525749500848e-03,-4.316588313676428e-03,3.249892477331304e-03,-2.553888850448382e-03,2.062981870514872e-03,-1.693869519387778e-03,1.398895595625475e-03,-1.146644159529679e-03,9.112005709159999e-04,-6.598314027916491e-04,2.625780645443415e-04}, +{ -8.245462423766456e-02,3.369687749188108e-01,8.707652481978473e-01,-1.869115269324692e-01,1.007390697007728e-01,-6.694902020394872e-02,4.899101425921525e-02,-3.787088067599194e-02,3.027778724958506e-02,-2.469245630380719e-02,2.029931824559785e-02,-1.658745666263350e-02,1.315480761441006e-02,-9.514464200659692e-03,3.784409030935819e-03}, +{ 6.424435590126963e-02,-2.022790705749188e-01,7.346612193780639e-01,5.251579255879681e-01,-1.890587588463096e-01,1.123273002639996e-01,-7.817378011480591e-02,5.880244256335199e-02,-4.624301309330191e-02,3.731408566430961e-02,-3.045927980148196e-02,2.477199771731400e-02,-1.958518467096485e-02,1.413975412000168e-02,-5.619994094495508e-03}, +{ -1.721905617556500e-02,4.925180440916979e-02,-1.028517338352581e-01,9.761267327222864e-01,1.328178943140485e-01,-6.099290523010308e-02,3.880915826622184e-02,-2.793932048524592e-02,2.142779483197309e-02,-1.702272971415506e-02,1.375511081682115e-02,-1.111198940500296e-02,8.747437909803339e-03,-6.299398303695911e-03,2.501199878701703e-03}, +{ -1.826504050937693e-02,4.984583885334892e-02,-8.721255485391509e-02,2.003154632529113e-01,9.501415046533372e-01,-1.403441972323447e-01,7.454547145279525e-02,-4.989268326979232e-02,3.683459959466354e-02,-2.861036202907506e-02,2.279200792845073e-02,-1.824371142085278e-02,1.427775277077057e-02,-1.024721642389567e-02,4.063127232974917e-03}, +{ 2.731887078217916e-02,-7.259255152377128e-02,1.166643908883555e-01,-2.031499297487248e-01,5.967022165496746e-01,6.735308750400537e-01,-2.133283478429396e-01,1.248689658204902e-01,-8.680734060272242e-02,6.524063541232247e-02,-5.094989660913327e-02,4.027535452492659e-02,-3.127447243694369e-02,2.234552984131195e-02,-8.844300095078889e-03}, +{ -1.500937059631442e-02,3.922628750403603e-02,-6.005791057358258e-02,9.262635528453950e-02,-1.721802046902349e-01,9.109312811090432e-01,2.800607675706975e-01,-1.200834500351859e-01,7.528641682400380e-02,-5.382024436813974e-02,4.085885036295306e-02,-3.175008322901644e-02,2.439820538636727e-02,-1.732990539125452e-02,6.843004842088142e-03}, +{ -3.393053578658763e-03,8.770220112541511e-03,-1.302378468630813e-02,1.878582562334071e-02,-2.948855939619521e-02,6.178990953540112e-02,9.943210521099313e-01,-5.491290101615559e-02,2.789479867997846e-02,-1.837736102079679e-02,1.338121634611488e-02,-1.015247273339091e-02,7.692273386156438e-03,-5.421219540272533e-03,2.134056178313578e-03}, +{ 1.400758089025761e-02,-3.592886003350133e-02,5.226940675377689e-02,-7.232460014463454e-02,1.040133836490510e-01,-1.709629606944822e-01,4.395899878092737e-01,8.069530284063408e-01,-2.090826726516433e-01,1.183316942679161e-01,-8.069757009499716e-02,5.914734877821395e-02,-4.395002191324049e-02,3.065033558626365e-02,-1.201608060859455e-02}, +{ -1.201608060859471e-02,3.065033558626348e-02,-4.395002191324039e-02,5.914734877821386e-02,-8.069757009499701e-02,1.183316942679158e-01,-2.090826726516426e-01,8.069530284063409e-01,4.395899878092730e-01,-1.709629606944815e-01,1.040133836490506e-01,-7.232460014463427e-02,5.226940675377663e-02,-3.592886003350120e-02,1.400758089025744e-02}, +{ 2.134056178313661e-03,-5.421219540272859e-03,7.692273386156679e-03,-1.015247273339147e-02,1.338121634611547e-02,-1.837736102079746e-02,2.789479867997962e-02,-5.491290101615690e-02,9.943210521099307e-01,6.178990953540284e-02,-2.948855939619607e-02,1.878582562334114e-02,-1.302378468630824e-02,8.770220112541664e-03,-3.393053578658805e-03}, +{ 6.843004842088156e-03,-1.732990539125465e-02,2.439820538636770e-02,-3.175008322901692e-02,4.085885036295339e-02,-5.382024436814045e-02,7.528641682400464e-02,-1.200834500351865e-01,2.800607675706989e-01,9.109312811090424e-01,-1.721802046902352e-01,9.262635528453947e-02,-6.005791057358232e-02,3.922628750403603e-02,-1.500937059631461e-02}, +{ -8.844300095078680e-03,2.234552984131211e-02,-3.127447243694387e-02,4.027535452492682e-02,-5.094989660913354e-02,6.524063541232270e-02,-8.680734060272281e-02,1.248689658204909e-01,-2.133283478429406e-01,6.735308750400564e-01,5.967022165496725e-01,-2.031499297487249e-01,1.166643908883555e-01,-7.259255152377138e-02,2.731887078217905e-02}, +{ 4.063127232975070e-03,-1.024721642389567e-02,1.427775277077070e-02,-1.824371142085296e-02,2.279200792845085e-02,-2.861036202907503e-02,3.683459959466369e-02,-4.989268326979251e-02,7.454547145279580e-02,-1.403441972323451e-01,9.501415046533361e-01,2.003154632529121e-01,-8.721255485391537e-02,4.984583885334930e-02,-1.826504050937698e-02}, +{ 2.501199878701694e-03,-6.299398303695788e-03,8.747437909803299e-03,-1.111198940500297e-02,1.375511081682092e-02,-1.702272971415505e-02,2.142779483197322e-02,-2.793932048524555e-02,3.880915826622167e-02,-6.099290523010243e-02,1.328178943140465e-01,9.761267327222867e-01,-1.028517338352570e-01,4.925180440916938e-02,-1.721905617556454e-02}, +{ -5.619994094495241e-03,1.413975412000170e-02,-1.958518467096526e-02,2.477199771731426e-02,-3.045927980148220e-02,3.731408566431013e-02,-4.624301309330220e-02,5.880244256335216e-02,-7.817378011480629e-02,1.123273002640002e-01,-1.890587588463106e-01,5.251579255879700e-01,7.346612193780638e-01,-2.022790705749202e-01,6.424435590126980e-02}, +{ 3.784409030935437e-03,-9.514464200659694e-03,1.315480761441001e-02,-1.658745666263328e-02,2.029931824559794e-02,-2.469245630380704e-02,3.027778724958448e-02,-3.787088067599184e-02,4.899101425921504e-02,-6.694902020394805e-02,1.007390697007723e-01,-1.869115269324677e-01,8.707652481978466e-01,3.369687749188097e-01,-8.245462423766396e-02}, +{ 2.625780645440796e-04,-6.598314027914897e-04,9.112005709158016e-04,-1.146644159529176e-03,1.398895595625030e-03,-1.693869519387603e-03,2.062981870514395e-03,-2.553888850448001e-03,3.249892477330943e-03,-4.316588313676045e-03,6.140525749500051e-03,-9.881746394828528e-03,2.135709889838029e-02,9.991831210317053e-01,-1.431372561785507e-02}, +{ -2.343614051523724e-03,5.887707108650258e-03,-8.125441524774402e-03,1.021380331126524e-02,-1.244006849679595e-02,1.502648398861051e-02,-1.823545279812669e-02,2.245373463442989e-02,-2.833369119950252e-02,3.710793513289572e-02,-5.142849487367749e-02,7.814182895701950e-02,-1.412473610062948e-01,4.135305014733686e-01,6.797921293444558e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==21 +const dfloat c_I[21][15] = { +{ 7.056818883193618e-01,3.817848289501514e-01,-1.325555183755903e-01,7.355673311628036e-02,-4.846420507648441e-02,3.498732341808994e-02,-2.672217621872157e-02,2.118032900879092e-02,-1.720319980968918e-02,1.417694252232629e-02,-1.173735388306768e-02,9.637182222745741e-03,-7.666895153786147e-03,5.555519196279588e-03,-2.211398236686889e-03}, +{ 2.443584389418599e-02,9.979321565822769e-01,-3.185191037486580e-02,1.510046436470392e-02,-9.449823723583287e-03,6.663442529456498e-03,-5.025008262465590e-03,3.952664930879222e-03,-3.194849324298812e-03,2.624292152703623e-03,-2.167893526551049e-03,1.777309668286642e-03,-1.412544447363280e-03,1.022946290488479e-03,-4.070907538534360e-04}, +{ -1.049997803501862e-01,4.587059231423729e-01,7.827554670308302e-01,-2.058042696797947e-01,1.138611384079762e-01,-7.639248832354861e-02,5.616193664215983e-02,-4.352880381537800e-02,3.485820656629823e-02,-2.845828741284049e-02,2.341194224067992e-02,-1.914017296056396e-02,1.518405599552233e-02,-1.098419995233647e-02,4.369332468808794e-03}, +{ 5.704966469047677e-02,-1.845668564971515e-01,8.616885760021746e-01,3.576506101607282e-01,-1.441977525482645e-01,8.792188255972617e-02,-6.185916061455161e-02,4.679874955952235e-02,-3.692906045581709e-02,2.986340776290059e-02,-2.441239042299221e-02,1.987320292640278e-02,-1.572187642644114e-02,1.135472740171365e-02,-4.513724098426927e-03}, +{ 5.520570332245822e-03,-1.603355545118280e-02,3.580652299170810e-02,9.979509674190323e-01,-3.357265383149198e-02,1.671588167183629e-02,-1.090411461026244e-02,7.941641420804769e-03,-6.130260724623138e-03,4.889347429832866e-03,-3.960912882660639e-03,3.205170392130863e-03,-2.525849263954879e-03,1.820110170806534e-03,-7.228650642215942e-04}, +{ -3.415837001147440e-02,9.414743447642465e-02,-1.704459777715627e-01,4.531273620066995e-01,7.954249972170484e-01,-2.097759788739884e-01,1.187310116813058e-01,-8.136109138111233e-02,6.078335667135981e-02,-4.753726427668938e-02,3.803222168096350e-02,-3.052637417550342e-02,2.393187070168692e-02,-1.719326347303124e-02,6.820065527873329e-03}, +{ 2.431576606761864e-02,-6.506491638642070e-02,1.068211489507184e-01,-1.976302877259501e-01,8.500077571685060e-01,3.785658491784543e-01,-1.529640337420361e-01,9.435586262979434e-02,-6.706422068343357e-02,5.099974207472630e-02,-4.010774618826957e-02,3.184305374328775e-02,-2.479355552735949e-02,1.774226202189965e-02,-7.026681581535785e-03}, +{ 1.413990906710553e-03,-3.714595834079057e-03,5.771983440242524e-03,-9.217960432252972e-03,1.905923276837156e-02,9.994155583138357e-01,-1.837489647328377e-02,9.157303083820451e-03,-6.003063079157531e-03,4.381050341008285e-03,-3.364158201362793e-03,2.632061566986976e-03,-2.030956266057013e-03,1.445924229314770e-03,-5.714743640976199e-04}, +{ -1.760957013485161e-02,4.569866871818559e-02,-6.860678879420212e-02,1.012785799710237e-01,-1.678129831638768e-01,4.299630127205192e-01,8.139765197748583e-01,-2.078087232223959e-01,1.175058236581196e-01,-8.046220012233953e-02,5.971705694965621e-02,-4.579735806951810e-02,3.491799664409113e-02,-2.469406525371557e-02,9.734030324445989e-03}, +{ 1.441919052619881e-02,-3.710184849197104e-02,5.443037498532624e-02,-7.657968208569718e-02,1.139108148711324e-01,-2.036958518522786e-01,8.328192567754507e-01,4.040606800365416e-01,-1.610464388610255e-01,9.896130345068500e-02,-6.980137809085714e-02,5.205585232007161e-02,-3.905554703949396e-02,2.737808898417208e-02,-1.075481552825477e-02}, +{ -5.551115123125783e-17,1.765707895083845e-16,-2.719952269093573e-16,3.087008963087683e-16,-4.603433153796705e-16,6.174300223885592e-16,-1.348837276158114e-15,9.999999999999999e-01,1.403013737387754e-15,-6.539820916714029e-16,4.731452114802517e-16,-2.984029380419228e-16,2.459517338720656e-16,-1.472289692833040e-16,1.665334536937735e-16}, +{ -1.075481552825466e-02,2.737808898417188e-02,-3.905554703949394e-02,5.205585232007159e-02,-6.980137809085699e-02,9.896130345068477e-02,-1.610464388610252e-01,4.040606800365418e-01,8.328192567754502e-01,-2.036958518522784e-01,1.139108148711321e-01,-7.657968208569689e-02,5.443037498532628e-02,-3.710184849197119e-02,1.441919052619858e-02}, +{ 9.734030324445961e-03,-2.469406525371559e-02,3.491799664409129e-02,-4.579735806951880e-02,5.971705694965702e-02,-8.046220012234018e-02,1.175058236581200e-01,-2.078087232223960e-01,8.139765197748580e-01,4.299630127205195e-01,-1.678129831638766e-01,1.012785799710234e-01,-6.860678879420197e-02,4.569866871818553e-02,-1.760957013485161e-02}, +{ -5.714743640975271e-04,1.445924229314907e-03,-2.030956266056995e-03,2.632061566986760e-03,-3.364158201362705e-03,4.381050341008292e-03,-6.003063079157547e-03,9.157303083820857e-03,-1.837489647328457e-02,9.994155583138359e-01,1.905923276837186e-02,-9.217960432253444e-03,5.771983440242974e-03,-3.714595834079320e-03,1.413990906710728e-03}, +{ -7.026681581535792e-03,1.774226202189964e-02,-2.479355552735955e-02,3.184305374328785e-02,-4.010774618826980e-02,5.099974207472653e-02,-6.706422068343366e-02,9.435586262979458e-02,-1.529640337420362e-01,3.785658491784534e-01,8.500077571685072e-01,-1.976302877259502e-01,1.068211489507183e-01,-6.506491638642092e-02,2.431576606761857e-02}, +{ 6.820065527873370e-03,-1.719326347303130e-02,2.393187070168701e-02,-3.052637417550345e-02,3.803222168096389e-02,-4.753726427668940e-02,6.078335667135985e-02,-8.136109138111255e-02,1.187310116813060e-01,-2.097759788739882e-01,7.954249972170472e-01,4.531273620067000e-01,-1.704459777715628e-01,9.414743447642503e-02,-3.415837001147448e-02}, +{ -7.228650642216155e-04,1.820110170806858e-03,-2.525849263955181e-03,3.205170392131211e-03,-3.960912882660803e-03,4.889347429833140e-03,-6.130260724623788e-03,7.941641420805511e-03,-1.090411461026355e-02,1.671588167183812e-02,-3.357265383149546e-02,9.979509674190316e-01,3.580652299171229e-02,-1.603355545118456e-02,5.520570332246360e-03}, +{ -4.513724098427242e-03,1.135472740171376e-02,-1.572187642644118e-02,1.987320292640303e-02,-2.441239042299240e-02,2.986340776290067e-02,-3.692906045581739e-02,4.679874955952307e-02,-6.185916061455209e-02,8.792188255972680e-02,-1.441977525482658e-01,3.576506101607309e-01,8.616885760021741e-01,-1.845668564971533e-01,5.704966469047693e-02}, +{ 4.369332468808759e-03,-1.098419995233647e-02,1.518405599552208e-02,-1.914017296056346e-02,2.341194224067971e-02,-2.845828741284037e-02,3.485820656629811e-02,-4.352880381537819e-02,5.616193664215979e-02,-7.639248832354795e-02,1.138611384079754e-01,-2.058042696797933e-01,7.827554670308294e-01,4.587059231423719e-01,-1.049997803501854e-01}, +{ -4.070907538534195e-04,1.022946290488392e-03,-1.412544447363263e-03,1.777309668286748e-03,-2.167893526550914e-03,2.624292152703545e-03,-3.194849324299030e-03,3.952664930879238e-03,-5.025008262465471e-03,6.663442529456409e-03,-9.449823723583231e-03,1.510046436470410e-02,-3.185191037486540e-02,9.979321565822765e-01,2.443584389418576e-02}, +{ -2.211398236687145e-03,5.555519196279772e-03,-7.666895153786282e-03,9.637182222745979e-03,-1.173735388306790e-02,1.417694252232640e-02,-1.720319980968942e-02,2.118032900879114e-02,-2.672217621872189e-02,3.498732341809080e-02,-4.846420507648588e-02,7.355673311628221e-02,-1.325555183755942e-01,3.817848289501631e-01,7.056818883193533e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==22 +const dfloat c_I[22][15] = { +{ 7.286911275626057e-01,3.532849452003655e-01,-1.243735481556019e-01,6.919791593213871e-02,-4.563585090947092e-02,3.296039708105017e-02,-2.518035507271723e-02,1.996127464946391e-02,-1.621462845169215e-02,1.336314288407032e-02,-1.106408609918641e-02,9.084658170857501e-03,-7.227476943593735e-03,5.237173629537579e-03,-2.084689477826968e-03}, +{ 6.431856758472156e-02,9.872879751231278e-01,-7.403583299071954e-02,3.581178432637599e-02,-2.254566485821642e-02,1.593999700356141e-02,-1.203755671987884e-02,9.476648635900493e-03,-7.663836717485178e-03,6.297405056615252e-03,-5.203454914748838e-03,4.266658628560354e-03,-3.391356974909726e-03,2.456132176936528e-03,-9.774653598408557e-04}, +{ -1.207751784334268e-01,5.687251116526875e-01,6.882964001213128e-01,-2.073279899839904e-01,1.171675538186854e-01,-7.924052688212629e-02,5.848648331124971e-02,-4.543293807484734e-02,3.643417265248605e-02,-2.977227364040224e-02,2.450809279927756e-02,-2.004469302449173e-02,1.590596418398846e-02,-1.150828269027227e-02,4.578104189869540e-03}, +{ 4.154545691165822e-02,-1.383960892970185e-01,9.452256786049107e-01,2.093344974361023e-01,-9.148547415489480e-02,5.695327999698109e-02,-4.043441813254065e-02,3.073860254324325e-02,-2.432642438464536e-02,1.970860646890810e-02,-1.613101297023517e-02,1.314245760497794e-02,-1.040267890658712e-02,7.515427052495443e-03,-2.987908773355629e-03}, +{ 2.828883923948076e-02,-8.350678138852095e-02,2.016221807648783e-01,9.489039970019498e-01,-1.408156202197912e-01,7.434679690720002e-02,-4.946685780819113e-02,3.637253155273618e-02,-2.822843280834221e-02,2.258964313596812e-02,-1.833981852065574e-02,1.486178567608315e-02,-1.172266299600465e-02,8.451799530654383e-03,-3.357400067444873e-03}, +{ -3.996525805495958e-02,1.113140836119917e-01,-2.093007527421436e-01,6.793752084809785e-01,5.894258774884591e-01,-2.027748762228211e-01,1.202239803680634e-01,-8.393059793410834e-02,6.331139128336667e-02,-4.979686001245837e-02,3.998294675019223e-02,-3.216645793393207e-02,2.525475200897550e-02,-1.815909489209944e-02,7.205657800495265e-03}, +{ 1.039846102854263e-02,-2.803027581502822e-02,4.709917054092300e-02,-9.364430543655669e-02,9.812663960817852e-01,1.167713464490457e-01,-5.433797754213005e-02,3.483422769781555e-02,-2.519232225602357e-02,1.934097566971238e-02,-1.529794360722513e-02,1.218959448221263e-02,-9.512487966522421e-03,6.815944447658668e-03,-2.700803774209530e-03}, +{ 1.815940054205312e-02,-4.796671812221942e-02,7.572989399805856e-02,-1.258248693412955e-01,2.998218712442666e-01,8.995619397613922e-01,-1.789795805524046e-01,9.801111606524457e-02,-6.639477170162855e-02,4.924153298292532e-02,-3.815961304610353e-02,3.002178841226870e-02,-2.324425648751093e-02,1.658036444974494e-02,-6.558098204791467e-03}, +{ -2.011242861847584e-02,5.241422073826402e-02,-7.961334203470040e-02,1.205867734874888e-01,-2.132851148598048e-01,7.648202892791852e-01,4.932837374939683e-01,-1.847824953142987e-01,1.120436411847688e-01,-7.893407729255149e-02,5.945540621305638e-02,-4.598777429358119e-02,3.524128481115711e-02,-2.499293883884828e-02,9.862818044372279e-03}, +{ 2.001037446267385e-03,-5.166072232263869e-03,7.647085846902785e-03,-1.095719171644262e-02,1.694723987669890e-02,-3.388009781282054e-02,9.979437792404954e-01,3.640472641305823e-02,-1.766981615815944e-02,1.147007839032876e-02,-8.293262415050432e-03,6.267850218775440e-03,-4.738375517925573e-03,3.335339706472583e-03,-1.312321286336803e-03}, +{ 1.316020869040524e-02,-3.374575743475652e-02,4.905655992869892e-02,-6.777918137888125e-02,9.719411443138223e-02,-1.586716344967954e-01,3.966411964068335e-01,8.380360575990802e-01,-2.025891038067318e-01,1.135404921960831e-01,-7.713990636227561e-02,5.643367880664305e-02,-4.189046763868279e-02,2.919805066266993e-02,-1.144430760367288e-02}, +{ -1.144430760367282e-02,2.919805066266959e-02,-4.189046763868248e-02,5.643367880664266e-02,-7.713990636227513e-02,1.135404921960823e-01,-2.025891038067303e-01,8.380360575990833e-01,3.966411964068285e-01,-1.586716344967936e-01,9.719411443138126e-02,-6.777918137888062e-02,4.905655992869851e-02,-3.374575743475634e-02,1.316020869040507e-02}, +{ -1.312321286336921e-03,3.335339706472595e-03,-4.738375517925787e-03,6.267850218775638e-03,-8.293262415050703e-03,1.147007839032910e-02,-1.766981615816000e-02,3.640472641305997e-02,9.979437792404949e-01,-3.388009781282161e-02,1.694723987669942e-02,-1.095719171644283e-02,7.647085846903091e-03,-5.166072232264101e-03,2.001037446267184e-03}, +{ 9.862818044372196e-03,-2.499293883884821e-02,3.524128481115719e-02,-4.598777429358125e-02,5.945540621305642e-02,-7.893407729255175e-02,1.120436411847689e-01,-1.847824953142987e-01,4.932837374939676e-01,7.648202892791852e-01,-2.132851148598042e-01,1.205867734874886e-01,-7.961334203470016e-02,5.241422073826396e-02,-2.011242861847573e-02}, +{ -6.558098204791425e-03,1.658036444974521e-02,-2.324425648751141e-02,3.002178841226909e-02,-3.815961304610421e-02,4.924153298292602e-02,-6.639477170162937e-02,9.801111606524594e-02,-1.789795805524067e-01,8.995619397613903e-01,2.998218712442710e-01,-1.258248693412977e-01,7.572989399805967e-02,-4.796671812222002e-02,1.815940054205344e-02}, +{ -2.700803774209084e-03,6.815944447658370e-03,-9.512487966522112e-03,1.218959448221235e-02,-1.529794360722445e-02,1.934097566971204e-02,-2.519232225602320e-02,3.483422769781468e-02,-5.433797754212848e-02,1.167713464490417e-01,9.812663960817867e-01,-9.364430543655479e-02,4.709917054092188e-02,-2.803027581502738e-02,1.039846102854206e-02}, +{ 7.205657800495306e-03,-1.815909489209937e-02,2.525475200897556e-02,-3.216645793393207e-02,3.998294675019197e-02,-4.979686001245802e-02,6.331139128336633e-02,-8.393059793410808e-02,1.202239803680632e-01,-2.027748762228203e-01,5.894258774884545e-01,6.793752084809817e-01,-2.093007527421434e-01,1.113140836119920e-01,-3.996525805495930e-02}, +{ -3.357400067445053e-03,8.451799530654582e-03,-1.172266299600497e-02,1.486178567608362e-02,-1.833981852065634e-02,2.258964313596866e-02,-2.822843280834297e-02,3.637253155273737e-02,-4.946685780819243e-02,7.434679690720186e-02,-1.408156202197943e-01,9.489039970019474e-01,2.016221807648844e-01,-8.350678138852341e-02,2.828883923948151e-02}, +{ -2.987908773355738e-03,7.515427052495530e-03,-1.040267890658719e-02,1.314245760497814e-02,-1.613101297023546e-02,1.970860646890835e-02,-2.432642438464544e-02,3.073860254324339e-02,-4.043441813254097e-02,5.695327999698151e-02,-9.148547415489552e-02,2.093344974361037e-01,9.452256786049110e-01,-1.383960892970201e-01,4.154545691165876e-02}, +{ 4.578104189869703e-03,-1.150828269027216e-02,1.590596418398849e-02,-2.004469302449160e-02,2.450809279927768e-02,-2.977227364040230e-02,3.643417265248587e-02,-4.543293807484728e-02,5.848648331124930e-02,-7.924052688212567e-02,1.171675538186850e-01,-2.073279899839894e-01,6.882964001213183e-01,5.687251116526795e-01,-1.207751784334253e-01}, +{ -9.774653598408206e-04,2.456132176936631e-03,-3.391356974909693e-03,4.266658628560454e-03,-5.203454914748715e-03,6.297405056615080e-03,-7.663836717485301e-03,9.476648635900531e-03,-1.203755671987896e-02,1.593999700356168e-02,-2.254566485821672e-02,3.581178432637636e-02,-7.403583299072100e-02,9.872879751231279e-01,6.431856758472251e-02}, +{ -2.084689477827029e-03,5.237173629538027e-03,-7.227476943594172e-03,9.084658170857848e-03,-1.106408609918700e-02,1.336314288407087e-02,-1.621462845169278e-02,1.996127464946464e-02,-2.518035507271840e-02,3.296039708105183e-02,-4.563585090947266e-02,6.919791593214157e-02,-1.243735481556080e-01,3.532849452003841e-01,7.286911275625912e-01} +}; +#endif +#if p_Nq==15 && p_cubNq==23 +const dfloat c_I[23][15] = { +{ 7.492087905976532e-01,3.276527443728439e-01,-1.167275348000351e-01,6.509229581929016e-02,-4.296395231906461e-02,3.104289776348751e-02,-2.372063564591353e-02,1.880658936187986e-02,-1.527797075254464e-02,1.259192142575231e-02,-1.042595468977165e-02,8.560919217394531e-03,-6.810925157455726e-03,4.935382905537136e-03,-1.964568099053069e-03}, +{ 1.044008076332797e-01,9.697696586536351e-01,-1.070837483973530e-01,5.267718720849041e-02,-3.333455199415662e-02,2.362191645373946e-02,-1.786063150343365e-02,1.407114342595467e-02,-1.138471735218519e-02,9.357753553437376e-03,-7.733799094004341e-03,6.342360559361638e-03,-5.041702605248359e-03,3.651568811761696e-03,-1.453245353278817e-03}, +{ -1.297280478179825e-01,6.650472490286914e-01,5.932390874109227e-01,-1.971068876693479e-01,1.133643924343332e-01,-7.718935211861011e-02,5.716612905518634e-02,-4.449390626715070e-02,3.572454840064473e-02,-2.921571198369111e-02,2.406288303969628e-02,-1.968772121032944e-02,1.562641456711357e-02,-1.130760771284265e-02,4.498530843366173e-03}, +{ 2.046257227352725e-02,-7.034733862420602e-02,9.891908010112401e-01,8.554960254227151e-02,-3.971242829913690e-02,2.514657520370497e-02,-1.798917412142038e-02,1.373202425378470e-02,-1.089455384057222e-02,8.840602274449157e-03,-7.243518482478333e-03,5.905733612883313e-03,-4.676744141162504e-03,3.379637789044872e-03,-1.343791451929718e-03}, +{ 4.699167531632048e-02,-1.411351552912898e-01,3.735372584092657e-01,8.513005471622964e-01,-1.966080610487420e-01,1.085184144480570e-01,-7.337199914311864e-02,5.438022573698943e-02,-4.239710357977285e-02,3.402475579519530e-02,-2.767495520981826e-02,2.245410940354784e-02,-1.772534748989642e-02,1.278551792211935e-02,-5.079882431153229e-03}, +{ -3.531500191815224e-02,9.945881077746344e-02,-1.950523425289316e-01,8.507391827339328e-01,3.764820648083721e-01,-1.516184575757588e-01,9.313025959565550e-02,-6.599894241861311e-02,5.018456187351970e-02,-3.966107033517901e-02,3.194142196096099e-02,-2.574768866336792e-02,2.024060927901336e-02,-1.456434542447555e-02,5.780937835560308e-03}, +{ -8.161368321304086e-03,2.217143033389782e-02,-3.820701125029759e-02,8.286148113972262e-02,9.900280832397295e-01,-7.096011801409882e-02,3.623025119120344e-02,-2.391919393259361e-02,1.754123112355795e-02,-1.357278730341509e-02,1.078707638061083e-02,-8.621448524965577e-03,6.740847074567003e-03,-4.835295178845809e-03,1.916822042231456e-03}, +{ 2.718906309179809e-02,-7.223224165059348e-02,1.160108705077729e-01,-2.016591740471934e-01,5.871197984642820e-01,6.825313428843559e-01,-2.138958082987256e-01,1.249565879127426e-01,-8.679926947887022e-02,6.520749780078999e-02,-5.091159882454019e-02,4.023900320119218e-02,-3.124332660015415e-02,2.232208922644734e-02,-8.834834189303857e-03}, +{ -1.052371871073047e-02,2.754729781291711e-02,-4.236834695600129e-02,6.603470474636236e-02,-1.265553846581940e-01,9.618902203800661e-01,1.730268514440109e-01,-7.847975538347494e-02,4.997984531335777e-02,-3.598405838903759e-02,2.742440744830320e-02,-2.135980303333159e-02,1.643667195316314e-02,-1.168397590284777e-02,4.615043935437053e-03}, +{ -1.284216672483218e-02,3.327133827953172e-02,-4.972236915812493e-02,7.268625793051997e-02,-1.176560426090678e-01,2.746178816130565e-01,9.139331135977822e-01,-1.707975501204372e-01,9.298060060509686e-02,-6.276284866836546e-02,4.624621783080159e-02,-3.532152500758300e-02,2.686603558594490e-02,-1.897447953391504e-02,7.475536379591946e-03}, +{ 1.553535185790733e-02,-3.994978272917389e-02,5.851463324608640e-02,-8.206173568248051e-02,1.212571106222454e-01,-2.130918391817587e-01,7.737827916798220e-01,4.823860341876921e-01,-1.822242769606211e-01,1.105599586779191e-01,-7.755699342386022e-02,5.767375022635014e-02,-4.320052820228233e-02,3.025735151038044e-02,-1.188182582822628e-02}, +{ 0.000000000000000e+00,9.672606421277146e-17,-1.339547974751930e-16,1.804670498634458e-16,-2.299011409607738e-16,3.243602182387829e-16,-7.448518353443570e-16,1.000000000000000e+00,7.794520763265302e-16,-3.434770664917697e-16,2.283376387385551e-16,-1.592162859883039e-16,1.001788231470033e-16,-6.242781026685637e-17,0.000000000000000e+00}, +{ -1.188182582822647e-02,3.025735151038034e-02,-4.320052820228242e-02,5.767375022635022e-02,-7.755699342385999e-02,1.105599586779190e-01,-1.822242769606205e-01,4.823860341876917e-01,7.737827916798220e-01,-2.130918391817583e-01,1.212571106222450e-01,-8.206173568248022e-02,5.851463324608614e-02,-3.994978272917387e-02,1.553535185790736e-02}, +{ 7.475536379591752e-03,-1.897447953391490e-02,2.686603558594480e-02,-3.532152500758269e-02,4.624621783080125e-02,-6.276284866836535e-02,9.298060060509647e-02,-1.707975501204358e-01,9.139331135977840e-01,2.746178816130527e-01,-1.176560426090664e-01,7.268625793051907e-02,-4.972236915812431e-02,3.327133827953137e-02,-1.284216672483199e-02}, +{ 4.615043935436984e-03,-1.168397590284782e-02,1.643667195316328e-02,-2.135980303333167e-02,2.742440744830333e-02,-3.598405838903795e-02,4.997984531335777e-02,-7.847975538347486e-02,1.730268514440111e-01,9.618902203800656e-01,-1.265553846581938e-01,6.603470474636236e-02,-4.236834695600108e-02,2.754729781291708e-02,-1.052371871073044e-02}, +{ -8.834834189303982e-03,2.232208922644736e-02,-3.124332660015434e-02,4.023900320119239e-02,-5.091159882454042e-02,6.520749780079033e-02,-8.679926947887044e-02,1.249565879127433e-01,-2.138958082987263e-01,6.825313428843549e-01,5.871197984642836e-01,-2.016591740471942e-01,1.160108705077732e-01,-7.223224165059375e-02,2.718906309179830e-02}, +{ 1.916822042231656e-03,-4.835295178845718e-03,6.740847074566930e-03,-8.621448524965448e-03,1.078707638061074e-02,-1.357278730341484e-02,1.754123112355763e-02,-2.391919393259355e-02,3.623025119120299e-02,-7.096011801409766e-02,9.900280832397298e-01,8.286148113972081e-02,-3.820701125029687e-02,2.217143033389753e-02,-8.161368321303894e-03}, +{ 5.780937835560405e-03,-1.456434542447551e-02,2.024060927901336e-02,-2.574768866336769e-02,3.194142196096067e-02,-3.966107033517899e-02,5.018456187351956e-02,-6.599894241861244e-02,9.313025959565502e-02,-1.516184575757580e-01,3.764820648083685e-01,8.507391827339342e-01,-1.950523425289306e-01,9.945881077746307e-02,-3.531500191815180e-02}, +{ -5.079882431153142e-03,1.278551792211947e-02,-1.772534748989651e-02,2.245410940354809e-02,-2.767495520981848e-02,3.402475579519537e-02,-4.239710357977291e-02,5.438022573698983e-02,-7.337199914311944e-02,1.085184144480579e-01,-1.966080610487430e-01,8.513005471622955e-01,3.735372584092683e-01,-1.411351552912913e-01,4.699167531632037e-02}, +{ -1.343791451929914e-03,3.379637789044800e-03,-4.676744141162481e-03,5.905733612883482e-03,-7.243518482478449e-03,8.840602274449369e-03,-1.089455384057255e-02,1.373202425378477e-02,-1.798917412142056e-02,2.514657520370564e-02,-3.971242829913770e-02,8.554960254227316e-02,9.891908010112401e-01,-7.034733862420753e-02,2.046257227352773e-02}, +{ 4.498530843366516e-03,-1.130760771284273e-02,1.562641456711344e-02,-1.968772121032921e-02,2.406288303969638e-02,-2.921571198369100e-02,3.572454840064424e-02,-4.449390626715063e-02,5.716612905518630e-02,-7.718935211860965e-02,1.133643924343327e-01,-1.971068876693465e-01,5.932390874109212e-01,6.650472490286908e-01,-1.297280478179818e-01}, +{ -1.453245353278745e-03,3.651568811761593e-03,-5.041702605248152e-03,6.342360559361381e-03,-7.733799094004099e-03,9.357753553437154e-03,-1.138471735218516e-02,1.407114342595453e-02,-1.786063150343354e-02,2.362191645373912e-02,-3.333455199415591e-02,5.267718720848916e-02,-1.070837483973514e-01,9.697696586536373e-01,1.044008076332768e-01}, +{ -1.964568099052877e-03,4.935382905537098e-03,-6.810925157455738e-03,8.560919217394413e-03,-1.042595468977137e-02,1.259192142575187e-02,-1.527797075254440e-02,1.880658936187917e-02,-2.372063564591261e-02,3.104289776348679e-02,-4.296395231906315e-02,6.509229581928785e-02,-1.167275348000318e-01,3.276527443728323e-01,7.492087905976625e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==16 +const dfloat c_I[16][16] = { +{ 4.847288479365953e-01,6.399088122020133e-01,-1.865529079970987e-01,1.005040789844866e-01,-6.554050492719954e-02,4.710679758911252e-02,-3.591964364674759e-02,2.847989795013992e-02,-2.318845219349548e-02,1.921231932182564e-02,-1.607010765519343e-02,1.345358220659599e-02,-1.113655722504579e-02,8.909845492005236e-03,-6.478530484995882e-03,2.582522447001789e-03}, +{ -1.321349895387028e-01,7.238393795431604e-01,5.286347061502360e-01,-1.853581737476892e-01,1.077587391070185e-01,-7.371385889527107e-02,5.476207836576643e-02,-4.275521894799867e-02,3.447102038410178e-02,-2.837302794725314e-02,2.362515067785058e-02,-1.971593276436553e-02,1.628455740003514e-02,-1.300946316211975e-02,9.451192386585212e-03,-3.766159011353901e-03}, +{ 6.161359096355441e-02,-1.967628345146665e-01,8.077488391923482e-01,4.342203742622231e-01,-1.669130188053374e-01,1.006609311771154e-01,-7.054328752453409e-02,5.332486648306786e-02,-4.214917336026292e-02,3.424689529243834e-02,-2.826685943334871e-02,2.344695824241732e-02,-1.928559730495089e-02,1.536440407592210e-02,-1.114369898950272e-02,4.437610243516688e-03}, +{ -3.377831999127441e-02,9.529685733532310e-02,-1.881843754337753e-01,8.715809596722109e-01,3.448039674436141e-01,-1.415655835723092e-01,8.743232906923373e-02,-6.218398138177417e-02,4.747487561619243e-02,-3.774694283543484e-02,3.071372621799425e-02,-2.523113920105834e-02,2.061715527481892e-02,-1.635446821524508e-02,1.183156379923425e-02,-4.706623797750388e-03}, +{ 1.936938850910537e-02,-5.198707059609670e-02,8.617013856653323e-02,-1.641305666497057e-01,9.220045725024667e-01,2.591367559695368e-01,-1.121952745528430e-01,7.053896633318985e-02,-5.066292191316283e-02,3.888304682261592e-02,-3.094254929155747e-02,2.505039959949357e-02,-2.027144130332296e-02,1.597925253238056e-02,-1.151752325368737e-02,4.574826725054004e-03}, +{ -1.076289521958749e-02,2.816848127130815e-02,-4.330514677479241e-02,6.744158212667158e-02,-1.290239825459740e-01,9.600103193632746e-01,1.777830335189578e-01,-8.050908527843310e-02,5.134027526559101e-02,-3.710885268716065e-02,2.851594803040243e-02,-2.258511897428881e-02,1.801957151427912e-02,-1.407698467480743e-02,1.009370657891731e-02,-4.000851514358104e-03}, +{ 5.186593332727009e-03,-1.337882558645389e-02,1.975992456486414e-02,-2.818901510748601e-02,4.320291633995479e-02,-8.410594541492937e-02,9.855470928103287e-01,1.017482314575800e-01,-4.790145078511995e-02,3.089514124995044e-02,-2.240039567810954e-02,1.715110574376877e-02,-1.340131170862933e-02,1.033505045307084e-02,-7.356430577989290e-03,2.907318906472701e-03}, +{ -1.414896898324021e-03,3.616742755748241e-03,-5.214994354634140e-03,7.093906307083106e-03,-9.876611700879652e-03,1.511187728578508e-02,-3.014002342535515e-02,9.983907131456422e-01,3.210574558704599e-02,-1.563461531928628e-02,1.016991636532232e-02,-7.365506151620428e-03,5.574190328613071e-03,-4.218246814565374e-03,2.971163039513207e-03,-1.169360150088343e-03}, +{ -1.169360150089002e-03,2.971163039513178e-03,-4.218246814565283e-03,5.574190328613032e-03,-7.365506151620393e-03,1.016991636532217e-02,-1.563461531928615e-02,3.210574558704616e-02,9.983907131456423e-01,-3.014002342535536e-02,1.511187728578504e-02,-9.876611700879586e-03,7.093906307083201e-03,-5.214994354634279e-03,3.616742755748264e-03,-1.414896898323722e-03}, +{ 2.907318906472566e-03,-7.356430577988960e-03,1.033505045307051e-02,-1.340131170862881e-02,1.715110574376829e-02,-2.240039567810901e-02,3.089514124994956e-02,-4.790145078511858e-02,1.017482314575770e-01,9.855470928103295e-01,-8.410594541492727e-02,4.320291633995369e-02,-2.818901510748552e-02,1.975992456486398e-02,-1.337882558645360e-02,5.186593332726756e-03}, +{ -4.000851514357848e-03,1.009370657891735e-02,-1.407698467480735e-02,1.801957151427899e-02,-2.258511897428873e-02,2.851594803040262e-02,-3.710885268716050e-02,5.134027526559118e-02,-8.050908527843352e-02,1.777830335189577e-01,9.600103193632750e-01,-1.290239825459745e-01,6.744158212667226e-02,-4.330514677479320e-02,2.816848127130820e-02,-1.076289521958738e-02}, +{ 4.574826725054087e-03,-1.151752325368755e-02,1.597925253238050e-02,-2.027144130332282e-02,2.505039959949371e-02,-3.094254929155829e-02,3.888304682261633e-02,-5.066292191316296e-02,7.053896633319066e-02,-1.121952745528442e-01,2.591367559695396e-01,9.220045725024658e-01,-1.641305666497085e-01,8.617013856653535e-02,-5.198707059609720e-02,1.936938850910518e-02}, +{ -4.706623797750657e-03,1.183156379923430e-02,-1.635446821524478e-02,2.061715527481878e-02,-2.523113920105816e-02,3.071372621799405e-02,-3.774694283543495e-02,4.747487561619252e-02,-6.218398138177421e-02,8.743232906923418e-02,-1.415655835723094e-01,3.448039674436161e-01,8.715809596722108e-01,-1.881843754337783e-01,9.529685733532327e-02,-3.377831999127358e-02}, +{ 4.437610243516597e-03,-1.114369898950252e-02,1.536440407592159e-02,-1.928559730495058e-02,2.344695824241718e-02,-2.826685943334791e-02,3.424689529243755e-02,-4.214917336026280e-02,5.332486648306733e-02,-7.054328752453298e-02,1.006609311771143e-01,-1.669130188053362e-01,4.342203742622254e-01,8.077488391923454e-01,-1.967628345146644e-01,6.161359096355219e-02}, +{ -3.766159011353853e-03,9.451192386585110e-03,-1.300946316211925e-02,1.628455740003459e-02,-1.971593276436494e-02,2.362515067785029e-02,-2.837302794725306e-02,3.447102038410144e-02,-4.275521894799785e-02,5.476207836576529e-02,-7.371385889526945e-02,1.077587391070169e-01,-1.853581737476883e-01,5.286347061502322e-01,7.238393795431574e-01,-1.321349895386965e-01}, +{ 2.582522447001856e-03,-6.478530484996050e-03,8.909845492005315e-03,-1.113655722504592e-02,1.345358220659646e-02,-1.607010765519407e-02,1.921231932182636e-02,-2.318845219349618e-02,2.847989795014095e-02,-3.591964364674913e-02,4.710679758911462e-02,-6.554050492720272e-02,1.005040789844924e-01,-1.865529079971099e-01,6.399088122020303e-01,4.847288479365857e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==17 +const dfloat c_I[17][16] = { +{ 5.310179651539473e-01,5.884823047483347e-01,-1.793847948263858e-01,9.732164084031147e-02,-6.362108595724622e-02,4.577976819603007e-02,-3.492972494947259e-02,2.770554332857440e-02,-2.256350657595074e-02,1.869763014686431e-02,-1.564139383462259e-02,1.309572661657666e-02,-1.084094025003870e-02,8.673660364852866e-03,-6.306936264939025e-03,2.514143263163866e-03}, +{ -1.279643740930983e-01,8.252760195271482e-01,4.016052578050425e-01,-1.536075508159081e-01,9.097174773003083e-02,-6.269562416475474e-02,4.675394028868754e-02,-3.658362653399487e-02,2.953633333506260e-02,-2.433380644796893e-02,2.027471838678179e-02,-1.692737746974469e-02,1.398560334933961e-02,-1.117514119019901e-02,8.119567579283201e-03,-3.235687285707646e-03}, +{ 4.506847775101169e-02,-1.492565638184737e-01,9.323133249205192e-01,2.362193400138698e-01,-1.018419901991917e-01,6.319363895274982e-02,-4.483303014671681e-02,3.411303384957216e-02,-2.707010731090392e-02,2.205080070300123e-02,-1.823148475814174e-02,1.514049433573555e-02,-1.246336839301134e-02,9.934557924711587e-03,-7.207736866162504e-03,2.870613041430792e-03}, +{ -1.229554036123046e-02,3.528793100807145e-02,-7.476331401308023e-02,9.884767900729198e-01,8.944352429381019e-02,-4.196240826588817e-02,2.688770122719809e-02,-1.944302869602169e-02,1.497995718888949e-02,-1.197709268493100e-02,9.780881755744663e-03,-8.054536715507038e-03,6.592445566051840e-03,-5.235056128684646e-03,3.789678543292092e-03,-1.507932790634366e-03}, +{ -3.582234564908163e-03,9.712345260434048e-03,-1.662921035517750e-02,3.524923230686762e-02,9.980461370946738e-01,-3.297846445881476e-02,1.652159743212417e-02,-1.085112819950894e-02,7.953873646228102e-03,-6.174506294946993e-03,4.948273372891798e-03,-4.024346742651508e-03,3.266432998359502e-03,-2.579816169174187e-03,1.861588783323390e-03,-7.397741097204303e-04}, +{ 1.139891269024138e-02,-3.003351311013561e-02,4.707080384154152e-02,-7.679234987353881e-02,1.709490109412116e-01,9.626287483056380e-01,-1.258109160332848e-01,6.647574244712899e-02,-4.453582436867106e-02,3.295929578191454e-02,-2.566796309725854e-02,2.049754424751948e-02,-1.644016709043045e-02,1.288579309629135e-02,-9.257248914505986e-03,3.672131136338345e-03}, +{ -1.455857691406389e-02,3.773300702719277e-02,-5.645426657445377e-02,8.274433532555128e-02,-1.348356252830776e-01,3.233027662258359e-01,8.858877501707700e-01,-1.860605036384320e-01,1.027438196202610e-01,-6.994074440908218e-02,5.206092659087520e-02,-4.046019944613783e-02,3.190210201316630e-02,-2.473958412712701e-02,1.766481728184915e-02,-6.990023863127201e-03}, +{ 1.476942609043272e-02,-3.788738811311267e-02,5.513904599848125e-02,-7.636531428674283e-02,1.100722191402703e-01,-1.820572953998172e-01,4.820557925418194e-01,7.741419307270462e-01,-2.134701215720292e-01,1.222953984338138e-01,-8.426281753752038e-02,6.278183258882586e-02,-4.827648106171891e-02,3.687498906184589e-02,-2.610691455479239e-02,1.029569794319810e-02}, +{ -1.309204101562523e-02,3.335544248995533e-02,-4.768643702639035e-02,6.383014874598046e-02,-8.627598963282529e-02,1.243076624552211e-01,-2.105115081265012e-01,6.360727221101858e-01,6.360727221101846e-01,-2.105115081265015e-01,1.243076624552212e-01,-8.627598963282555e-02,6.383014874598093e-02,-4.768643702639101e-02,3.335544248995542e-02,-1.309204101562488e-02}, +{ 1.029569794319841e-02,-2.610691455479224e-02,3.687498906184516e-02,-4.827648106171816e-02,6.278183258882560e-02,-8.426281753752032e-02,1.222953984338132e-01,-2.134701215720287e-01,7.741419307270474e-01,4.820557925418171e-01,-1.820572953998161e-01,1.100722191402702e-01,-7.636531428674333e-02,5.513904599848186e-02,-3.788738811311252e-02,1.476942609043243e-02}, +{ -6.990023863127463e-03,1.766481728184918e-02,-2.473958412712662e-02,3.190210201316591e-02,-4.046019944613767e-02,5.206092659087541e-02,-6.994074440908217e-02,1.027438196202607e-01,-1.860605036384323e-01,8.858877501707716e-01,3.233027662258343e-01,-1.348356252830778e-01,8.274433532555198e-02,-5.645426657445449e-02,3.773300702719274e-02,-1.455857691406343e-02}, +{ 3.672131136338547e-03,-9.257248914506086e-03,1.288579309629150e-02,-1.644016709043087e-02,2.049754424751972e-02,-2.566796309725844e-02,3.295929578191466e-02,-4.453582436867114e-02,6.647574244712950e-02,-1.258109160332856e-01,9.626287483056372e-01,1.709490109412133e-01,-7.679234987354011e-02,4.707080384154258e-02,-3.003351311013579e-02,1.139891269024114e-02}, +{ -7.397741097206366e-04,1.861588783323291e-03,-2.579816169174125e-03,3.266432998359644e-03,-4.024346742651718e-03,4.948273372891815e-03,-6.174506294946816e-03,7.953873646228240e-03,-1.085112819950874e-02,1.652159743212352e-02,-3.297846445881419e-02,9.980461370946738e-01,3.524923230686709e-02,-1.662921035517736e-02,9.712345260433941e-03,-3.582234564907915e-03}, +{ -1.507932790634355e-03,3.789678543292251e-03,-5.235056128684861e-03,6.592445566052207e-03,-8.054536715507278e-03,9.780881755745026e-03,-1.197709268493140e-02,1.497995718888973e-02,-1.944302869602253e-02,2.688770122719916e-02,-4.196240826588926e-02,8.944352429381353e-02,9.884767900729193e-01,-7.476331401308367e-02,3.528793100807286e-02,-1.229554036123061e-02}, +{ 2.870613041431068e-03,-7.207736866162188e-03,9.934557924710959e-03,-1.246336839301088e-02,1.514049433573481e-02,-1.823148475814058e-02,2.205080070299996e-02,-2.707010731090287e-02,3.411303384957124e-02,-4.483303014671521e-02,6.319363895274736e-02,-1.018419901991876e-01,2.362193400138608e-01,9.323133249205233e-01,-1.492565638184687e-01,4.506847775100872e-02}, +{ -3.235687285707584e-03,8.119567579282901e-03,-1.117514119019844e-02,1.398560334933898e-02,-1.692737746974422e-02,2.027471838678171e-02,-2.433380644796863e-02,2.953633333506197e-02,-3.658362653399415e-02,4.675394028868624e-02,-6.269562416475263e-02,9.097174773002822e-02,-1.536075508159054e-01,4.016052578050349e-01,8.252760195271472e-01,-1.279643740930911e-01}, +{ 2.514143263164185e-03,-6.306936264939297e-03,8.673660364853088e-03,-1.084094025003931e-02,1.309572661657740e-02,-1.564139383462312e-02,1.869763014686470e-02,-2.256350657595139e-02,2.770554332857527e-02,-3.492972494947365e-02,4.577976819603204e-02,-6.362108595724970e-02,9.732164084031746e-02,-1.793847948263972e-01,5.884823047483547e-01,5.310179651539347e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==18 +const dfloat c_I[18][16] = { +{ 5.719361722450037e-01,5.417460539150799e-01,-1.710654613414195e-01,9.334810578273552e-02,-6.114870920578082e-02,4.404309107539303e-02,-3.362239665319338e-02,2.667711739329659e-02,-2.173043597548834e-02,1.800980100434648e-02,-1.506745067709031e-02,1.261604928379799e-02,-1.044434492749155e-02,8.356613905820596e-03,-6.076514350664888e-03,2.422308525655195e-03}, +{ -1.132645566027733e-01,8.992953658759253e-01,2.894202521406307e-01,-1.179434241448028e-01,7.089309094459777e-02,-4.915646039521996e-02,3.677271029072859e-02,-2.882647332022599e-02,2.330052429895699e-02,-1.921122393425667e-02,1.601513815700075e-02,-1.337600628031996e-02,1.105425288582523e-02,-8.834365059464184e-03,6.419473279256759e-03,-2.558298135859256e-03}, +{ 1.859254613980712e-02,-6.408763915194113e-02,9.911983119727880e-01,7.664936678246750e-02,-3.573751181735841e-02,2.266765263104096e-02,-1.624013911674284e-02,1.242300216724361e-02,-9.890071052811384e-03,8.073158179135604e-03,-6.684290464859535e-03,5.556434487149966e-03,-4.577009703620533e-03,3.649950621608291e-03,-2.648813451706414e-03,1.055051777799278e-03}, +{ 1.520001904181417e-02,-4.443665489167049e-02,1.023326661421940e-01,9.850075467936653e-01,-8.477287209044021e-02,4.335015063971254e-02,-2.854797698033874e-02,2.091238767643359e-02,-1.622973311685763e-02,1.303501181767200e-02,-1.067635460168728e-02,8.809538808363986e-03,-7.220160245180101e-03,5.738613651847820e-03,-4.156382433908383e-03,1.654199788379381e-03}, +{ -2.641717271627844e-02,7.240305598373566e-02,-1.285511157972760e-01,3.134970151045369e-01,8.910312355151949e-01,-1.828001590858802e-01,1.002063455667024e-01,-6.791756497345093e-02,5.056140304894838e-02,-3.960244252787325e-02,3.191583501343025e-02,-2.605213932925502e-02,2.119735184883349e-02,-1.676805519547626e-02,1.211098401530137e-02,-4.814576471193344e-03}, +{ 2.635314397161517e-02,-6.993263461638216e-02,1.119452008099522e-01,-1.928977681354371e-01,5.385245126526712e-01,7.263675668478888e-01,-2.151700293683944e-01,1.245793077110370e-01,-8.639079676295279e-02,6.506664818268740e-02,-5.119528411839137e-02,4.114721938937191e-02,-3.314009448979329e-02,2.604405205384006e-02,-1.873894618235249e-02,7.437902054639935e-03}, +{ -2.039983015554016e-02,5.314147655237204e-02,-8.063102638534950e-02,1.218592472755316e-01,-2.143943186180346e-01,7.452988379770581e-01,5.169178036633424e-01,-1.903060442193443e-01,1.151399933482085e-01,-8.130310861750034e-02,6.168546146969952e-02,-4.848171133812777e-02,3.849427520888203e-02,-2.998092401777934e-02,2.146007435396269e-02,-8.500206497380829e-03}, +{ 1.200886981000533e-02,-3.092256318279907e-02,4.545677852951923e-02,-6.422825086982631e-02,9.643920139249547e-02,-1.770567599909343e-01,9.033947642445888e-01,2.937526576979563e-01,-1.252146687650396e-01,7.850510726872467e-02,-5.618563851032306e-02,4.271282752129769e-02,-3.323237643552807e-02,2.556264580926976e-02,-1.816896624901812e-02,7.176371729611268e-03}, +{ -3.633042940042188e-03,9.283128204513007e-03,-1.337183775934540e-02,1.815448522613186e-02,-2.518457129191431e-02,3.824071657184552e-02,-7.454047979562164e-02,9.889829645511656e-01,8.795702038942751e-02,-4.167994897377668e-02,2.688240653398864e-02,-1.939203887719637e-02,1.464381910554335e-02,-1.106772009119218e-02,7.790298508039748e-03,-3.065199361566571e-03}, +{ -3.065199361566376e-03,7.790298508039776e-03,-1.106772009119210e-02,1.464381910554323e-02,-1.939203887719623e-02,2.688240653398831e-02,-4.167994897377628e-02,8.795702038942715e-02,9.889829645511659e-01,-7.454047979562169e-02,3.824071657184547e-02,-2.518457129191427e-02,1.815448522613187e-02,-1.337183775934560e-02,9.283128204513049e-03,-3.633042940042078e-03}, +{ 7.176371729611422e-03,-1.816896624901825e-02,2.556264580926937e-02,-3.323237643552764e-02,4.271282752129764e-02,-5.618563851032340e-02,7.850510726872491e-02,-1.252146687650399e-01,2.937526576979589e-01,9.033947642445871e-01,-1.770567599909347e-01,9.643920139249582e-02,-6.422825086982702e-02,4.545677852951990e-02,-3.092256318279923e-02,1.200886981000512e-02}, +{ -8.500206497381289e-03,2.146007435396277e-02,-2.998092401777907e-02,3.849427520888178e-02,-4.848171133812774e-02,6.168546146969997e-02,-8.130310861750065e-02,1.151399933482091e-01,-1.903060442193455e-01,5.169178036633452e-01,7.452988379770565e-01,-2.143943186180357e-01,1.218592472755330e-01,-8.063102638535097e-02,5.314147655237220e-02,-2.039983015553961e-02}, +{ 7.437902054640084e-03,-1.873894618235251e-02,2.604405205383997e-02,-3.314009448979310e-02,4.114721938937201e-02,-5.119528411839160e-02,6.506664818268730e-02,-8.639079676295268e-02,1.245793077110371e-01,-2.151700293683943e-01,7.263675668478892e-01,5.385245126526712e-01,-1.928977681354387e-01,1.119452008099540e-01,-6.993263461638231e-02,2.635314397161439e-02}, +{ -4.814576471193613e-03,1.211098401530105e-02,-1.676805519547578e-02,2.119735184883329e-02,-2.605213932925459e-02,3.191583501342996e-02,-3.960244252787300e-02,5.056140304894797e-02,-6.791756497345053e-02,1.002063455667014e-01,-1.828001590858779e-01,8.910312355151960e-01,3.134970151045341e-01,-1.285511157972762e-01,7.240305598373498e-02,-2.641717271627749e-02}, +{ 1.654199788379137e-03,-4.156382433908057e-03,5.738613651847102e-03,-7.220160245179277e-03,8.809538808363349e-03,-1.067635460168668e-02,1.303501181767124e-02,-1.622973311685660e-02,2.091238767643184e-02,-2.854797698033608e-02,4.335015063970908e-02,-8.477287209043438e-02,9.850075467936661e-01,1.023326661421873e-01,-4.443665489166686e-02,1.520001904181280e-02}, +{ 1.055051777799364e-03,-2.648813451706211e-03,3.649950621608089e-03,-4.577009703620300e-03,5.556434487149499e-03,-6.684290464858694e-03,8.073158179134645e-03,-9.890071052810655e-03,1.242300216724265e-02,-1.624013911674157e-02,2.266765263103961e-02,-3.573751181735541e-02,7.664936678246065e-02,9.911983119727900e-01,-6.408763915193634e-02,1.859254613980479e-02}, +{ -2.558298135859356e-03,6.419473279256514e-03,-8.834365059463573e-03,1.105425288582430e-02,-1.337600628031908e-02,1.601513815700011e-02,-1.921122393425593e-02,2.330052429895596e-02,-2.882647332022461e-02,3.677271029072663e-02,-4.915646039521748e-02,7.089309094459408e-02,-1.179434241447975e-01,2.894202521406170e-01,8.992953658759277e-01,-1.132645566027649e-01}, +{ 2.422308525655389e-03,-6.076514350665219e-03,8.356613905820917e-03,-1.044434492749215e-02,1.261604928379852e-02,-1.506745067709051e-02,1.800980100434680e-02,-2.173043597548896e-02,2.667711739329726e-02,-3.362239665319416e-02,4.404309107539389e-02,-6.114870920578260e-02,9.334810578273921e-02,-1.710654613414262e-01,5.417460539150737e-01,5.719361722450141e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==19 +const dfloat c_I[19][16] = { +{ 6.081556613069434e-01,4.994608976490126e-01,-1.622691047545900e-01,8.897965835489335e-02,-5.838805670512500e-02,4.208897901344461e-02,-3.214501714851888e-02,2.551182905424107e-02,-2.078486499371815e-02,1.722816639763771e-02,-1.441469887424414e-02,1.207019227084143e-02,-9.992851065514206e-03,7.995583432006466e-03,-5.814083755453131e-03,2.317709818143019e-03}, +{ -9.044242451928121e-02,9.498018478605152e-01,1.932085319615818e-01,-8.262070578489598e-02,5.026425744722665e-02,-3.502931303249197e-02,2.627351414577571e-02,-2.062786824718033e-02,1.668989888948347e-02,-1.376977031683788e-02,1.148411579858376e-02,-9.594658451889941e-03,7.930978627834637e-03,-6.339215583477457e-03,4.606774999566228e-03,-1.835963794512744e-03}, +{ -1.242860228050546e-02,4.476984523826506e-02,9.966721949124592e-01,-4.181610082572820e-02,2.063544003132210e-02,-1.331799428053194e-02,9.618072729313299e-03,-7.389836439505830e-03,5.898949681343221e-03,-4.823685490626018e-03,3.998579958129895e-03,-3.326610828722411e-03,2.741775547173865e-03,-2.187252054538593e-03,1.587666643984741e-03,-6.324425418328748e-04}, +{ 4.014602435631817e-02,-1.197206440623396e-01,3.047742011957253e-01,8.950250718127409e-01,-1.796484903041882e-01,9.756856413495478e-02,-6.562270337058179e-02,4.856995683408274e-02,-3.791802016975179e-02,3.056721121036820e-02,-2.509750796169786e-02,2.074349586304095e-02,-1.702023636679830e-02,1.353778749998203e-02,-9.809491758215926e-03,3.904781086360306e-03}, +{ -3.885021825622684e-02,1.077168304285917e-01,-1.992062518731171e-01,5.892270282002646e-01,6.796070087364939e-01,-2.129779850863016e-01,1.241002035783746e-01,-8.612563142004112e-02,6.490011906754274e-02,-5.119827474311475e-02,4.144896967448513e-02,-3.393568039175645e-02,2.766731023675426e-02,-2.191465713810702e-02,1.584032135308664e-02,-6.299092366928992e-03}, +{ 2.511887654375803e-02,-6.716642822567410e-02,1.100377477589089e-01,-2.023630139227568e-01,8.320765933119225e-01,4.044787528627441e-01,-1.608633603284180e-01,9.894596619820543e-02,-7.037713296281836e-02,5.372645555500911e-02,-4.261633136100425e-02,3.442932981110380e-02,-2.782300605247833e-02,2.191261203448159e-02,-1.578608287941772e-02,6.269021656433957e-03}, +{ -8.490128768036030e-03,2.223642467437556e-02,-3.425633981485939e-02,5.360880441338627e-02,-1.040562758324422e-01,9.763072996026425e-01,1.330334476017129e-01,-6.160824719435797e-02,3.954580847465704e-02,-2.867076361415900e-02,2.206910368789190e-02,-1.749717745731206e-02,1.396930486446618e-02,-1.091739217379337e-02,7.830019401033197e-03,-3.103887865205538e-03}, +{ -5.016026030923235e-03,1.296819126020199e-02,-1.927098904263657e-02,2.784237323127267e-02,-4.388681703371488e-02,9.334257170930160e-02,9.876694698289264e-01,-7.840801457172152e-02,4.038468388553918e-02,-2.681285227840886e-02,1.971176873131536e-02,-1.521051681818650e-02,1.194102860497233e-02,-9.235324020285941e-03,6.584300139117056e-03,-2.603847594769137e-03}, +{ 1.234021949183061e-02,-3.162958027342587e-02,4.593135620145737e-02,-6.334154836574075e-02,9.053449748784226e-02,-1.468174983976117e-01,3.581578513119990e-01,8.640680660678896e-01,-1.949213878084726e-01,1.085486965231376e-01,-7.397468541936721e-02,5.480878413483727e-02,-4.201076816178594e-02,3.202842332009276e-02,-2.265187102710363e-02,8.929444914421502e-03}, +{ -1.309204101562494e-02,3.335544248995538e-02,-4.768643702639028e-02,6.383014874598028e-02,-8.627598963282519e-02,1.243076624552210e-01,-2.105115081265010e-01,6.360727221101845e-01,6.360727221101856e-01,-2.105115081265016e-01,1.243076624552212e-01,-8.627598963282562e-02,6.383014874598103e-02,-4.768643702639103e-02,3.335544248995540e-02,-1.309204101562473e-02}, +{ 8.929444914421493e-03,-2.265187102710355e-02,3.202842332009209e-02,-4.201076816178523e-02,5.480878413483691e-02,-7.397468541936708e-02,1.085486965231373e-01,-1.949213878084717e-01,8.640680660678891e-01,3.581578513119983e-01,-1.468174983976111e-01,9.053449748784223e-02,-6.334154836574124e-02,4.593135620145789e-02,-3.162958027342586e-02,1.234021949183020e-02}, +{ -2.603847594769395e-03,6.584300139116938e-03,-9.235324020285336e-03,1.194102860497184e-02,-1.521051681818647e-02,1.971176873131522e-02,-2.681285227840864e-02,4.038468388553905e-02,-7.840801457172136e-02,9.876694698289273e-01,9.334257170930046e-02,-4.388681703371444e-02,2.784237323127257e-02,-1.927098904263655e-02,1.296819126020171e-02,-5.016026030923143e-03}, +{ -3.103887865205691e-03,7.830019401033268e-03,-1.091739217379334e-02,1.396930486446615e-02,-1.749717745731212e-02,2.206910368789219e-02,-2.867076361415932e-02,3.954580847465745e-02,-6.160824719435884e-02,1.330334476017144e-01,9.763072996026424e-01,-1.040562758324437e-01,5.360880441338733e-02,-3.425633981486010e-02,2.223642467437584e-02,-8.490128768036037e-03}, +{ 6.269021656433761e-03,-1.578608287941793e-02,2.191261203448136e-02,-2.782300605247802e-02,3.442932981110376e-02,-4.261633136100419e-02,5.372645555500946e-02,-7.037713296281865e-02,9.894596619820575e-02,-1.608633603284196e-01,4.044787528627488e-01,8.320765933119206e-01,-2.023630139227597e-01,1.100377477589113e-01,-6.716642822567459e-02,2.511887654375767e-02}, +{ -6.299092366929118e-03,1.584032135308661e-02,-2.191465713810663e-02,2.766731023675402e-02,-3.393568039175607e-02,4.144896967448500e-02,-5.119827474311433e-02,6.490011906754192e-02,-8.612563142004044e-02,1.241002035783741e-01,-2.129779850863003e-01,6.796070087364932e-01,5.892270282002652e-01,-1.992062518731187e-01,1.077168304285913e-01,-3.885021825622577e-02}, +{ 3.904781086359915e-03,-9.809491758215530e-03,1.353778749998141e-02,-1.702023636679785e-02,2.074349586304063e-02,-2.509750796169725e-02,3.056721121036729e-02,-3.791802016975079e-02,4.856995683408154e-02,-6.562270337057979e-02,9.756856413495210e-02,-1.796484903041845e-01,8.950250718127422e-01,3.047742011957202e-01,-1.197206440623359e-01,4.014602435631629e-02}, +{ -6.324425418327631e-04,1.587666643984989e-03,-2.187252054539182e-03,2.741775547174332e-03,-3.326610828722547e-03,3.998579958130469e-03,-4.823685490626943e-03,5.898949681344362e-03,-7.389836439506865e-03,9.618072729314205e-03,-1.331799428053304e-02,2.063544003132521e-02,-4.181610082573575e-02,9.966721949124584e-01,4.476984523827229e-02,-1.242860228050711e-02}, +{ -1.835963794512681e-03,4.606774999566016e-03,-6.339215583477069e-03,7.930978627834212e-03,-9.594658451889636e-03,1.148411579858309e-02,-1.376977031683679e-02,1.668989888948263e-02,-2.062786824717960e-02,2.627351414577416e-02,-3.502931303248994e-02,5.026425744722442e-02,-8.262070578489297e-02,1.932085319615745e-01,9.498018478605136e-01,-9.044242451927395e-02}, +{ 2.317709818143113e-03,-5.814083755453245e-03,7.995583432006608e-03,-9.992851065514490e-03,1.207019227084175e-02,-1.441469887424474e-02,1.722816639763824e-02,-2.078486499371847e-02,2.551182905424138e-02,-3.214501714851952e-02,4.208897901344581e-02,-5.838805670512698e-02,8.897965835489646e-02,-1.622691047545957e-01,4.994608976490062e-01,6.081556613069536e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==20 +const dfloat c_I[20][16] = { +{ 6.402826266121562e-01,4.612848220947905e-01,-1.534142468557552e-01,8.447132444549908e-02,-5.551150953514922e-02,4.004328482212099e-02,-3.059434926351888e-02,2.428677695345433e-02,-1.978976448465253e-02,1.640500899465068e-02,-1.372693342685569e-02,1.149485695400538e-02,-9.516860481506241e-03,7.614903386660276e-03,-5.537343473306902e-03,2.207403257407140e-03}, +{ -6.167943123473397e-02,9.809166217662282e-01,1.123678881656010e-01,-4.989913312350908e-02,3.066170832604618e-02,-2.145916924003029e-02,1.613113956951269e-02,-1.268149449183644e-02,1.026908343438399e-02,-8.477083594219441e-03,7.072678353480466e-03,-5.910607922164971e-03,4.886636103585782e-03,-3.906361291212281e-03,2.839003181744651e-03,-1.131478002876558e-03}, +{ -4.364553298225900e-02,1.650411656498468e-01,9.625733586701595e-01,-1.230306517309817e-01,6.337784020836412e-02,-4.148164017773392e-02,3.015575783977312e-02,-2.325486404473342e-02,1.860525852867559e-02,-1.523639991693974e-02,1.264286349038268e-02,-1.052551284303113e-02,8.679221416370689e-03,-6.926045100363857e-03,5.028374272473386e-03,-2.003193280003022e-03}, +{ 5.721726365109382e-02,-1.743120053444088e-01,5.017942993734058e-01,7.545440678956141e-01,-2.119416297655598e-01,1.203293919010674e-01,-8.229283321291364e-02,6.142223649195848e-02,-4.818659177671274e-02,3.896528729255466e-02,-3.205852480578269e-02,2.653392093209609e-02,-2.179205717789800e-02,1.734412064925419e-02,-1.257222134252316e-02,5.005275238754164e-03}, +{ -3.765499330906666e-02,1.056967317953345e-01,-2.046511214647516e-01,8.075044394897416e-01,4.371314746161700e-01,-1.693713973038422e-01,1.031358868909773e-01,-7.290316738385690e-02,5.547395722957260e-02,-4.401833104052993e-02,3.577006213683054e-02,-2.935935227934340e-02,2.397641564065093e-02,-1.901192290428056e-02,1.375099425755289e-02,-5.469676371159098e-03}, +{ 1.025037076208672e-02,-2.763121639527441e-02,4.643257826334039e-02,-9.236914283083197e-02,9.818523917284312e-01,1.147842175652441e-01,-5.349256988482947e-02,3.434519185104344e-02,-2.490617663612772e-02,1.921759617646460e-02,-1.534334961427188e-02,1.244803863705476e-02,-1.008737297652012e-02,7.958673363207157e-03,-5.739463692053979e-03,2.280233683037305e-03}, +{ 1.063301435478577e-02,-2.800859197600810e-02,4.386544542659430e-02,-7.143432693411979e-02,1.579971842229037e-01,9.675621468360475e-01,-1.188146432200093e-01,6.251838477810326e-02,-4.182310696315370e-02,3.092920760489380e-02,-2.407687754109674e-02,1.922193904636199e-02,-1.541447513323541e-02,1.208057266357767e-02,-8.678239821969046e-03,3.442366656324229e-03}, +{ -1.886973577247993e-02,4.899322101836566e-02,-7.365800510849610e-02,1.090943060828344e-01,-1.823153170602850e-01,4.854426791914085e-01,7.712356922100045e-01,-2.135947493828090e-01,1.224735470424937e-01,-8.459156988723893e-02,6.343718420705059e-02,-4.951599947402197e-02,3.914714388491990e-02,-3.040825760925048e-02,2.173286585558369e-02,-8.603005198079728e-03}, +{ 1.531214417319927e-02,-3.937630884703473e-02,5.768023266773105e-02,-8.092293702330894e-02,1.197236316110141e-01,-2.112874521074985e-01,7.912550086322166e-01,4.603027000227607e-01,-1.768688561127421e-01,1.080437857614294e-01,-7.644508799222728e-02,5.775558401500118e-02,-4.477230757872818e-02,3.436364060227477e-02,-2.439432892459778e-02,9.630551100510415e-03}, +{ -5.320811543821022e-03,1.359152832069628e-02,-1.956224074671192e-02,2.651868344173693e-02,-3.668366937603098e-02,5.537240868898338e-02,-1.060840649793193e-01,9.753858363412451e-01,1.360075745299787e-01,-6.294287374050031e-02,4.030819367856798e-02,-2.898106279774556e-02,2.184556600655249e-02,-1.649369770808986e-02,1.160296550490474e-02,-4.564335620446672e-03}, +{ -4.564335620447215e-03,1.160296550490490e-02,-1.649369770808978e-02,2.184556600655264e-02,-2.898106279774594e-02,4.030819367856841e-02,-6.294287374050095e-02,1.360075745299809e-01,9.753858363412450e-01,-1.060840649793210e-01,5.537240868898424e-02,-3.668366937603155e-02,2.651868344173743e-02,-1.956224074671252e-02,1.359152832069654e-02,-5.320811543821033e-03}, +{ 9.630551100510797e-03,-2.439432892459776e-02,3.436364060227428e-02,-4.477230757872764e-02,5.775558401500077e-02,-7.644508799222711e-02,1.080437857614290e-01,-1.768688561127409e-01,4.603027000227594e-01,7.912550086322166e-01,-2.112874521074976e-01,1.197236316110138e-01,-8.092293702330947e-02,5.768023266773188e-02,-3.937630884703466e-02,1.531214417319877e-02}, +{ -8.603005198080008e-03,2.173286585558372e-02,-3.040825760925008e-02,3.914714388491940e-02,-4.951599947402182e-02,6.343718420705102e-02,-8.459156988723902e-02,1.224735470424939e-01,-2.135947493828096e-01,7.712356922100077e-01,4.854426791914059e-01,-1.823153170602853e-01,1.090943060828355e-01,-7.365800510849715e-02,4.899322101836548e-02,-1.886973577247945e-02}, +{ 3.442366656324372e-03,-8.678239821968891e-03,1.208057266357738e-02,-1.541447513323508e-02,1.922193904636181e-02,-2.407687754109638e-02,3.092920760489309e-02,-4.182310696315297e-02,6.251838477810208e-02,-1.188146432200073e-01,9.675621468360495e-01,1.579971842228994e-01,-7.143432693411873e-02,4.386544542659398e-02,-2.800859197600756e-02,1.063301435478537e-02}, +{ 2.280233683037643e-03,-5.739463692053982e-03,7.958673363207066e-03,-1.008737297652000e-02,1.244803863705487e-02,-1.534334961427204e-02,1.921759617646471e-02,-2.490617663612785e-02,3.434519185104384e-02,-5.349256988483023e-02,1.147842175652457e-01,9.818523917284309e-01,-9.236914283083374e-02,4.643257826334170e-02,-2.763121639527494e-02,1.025037076208659e-02}, +{ -5.469676371159443e-03,1.375099425755273e-02,-1.901192290428005e-02,2.397641564065046e-02,-2.935935227934330e-02,3.577006213683075e-02,-4.401833104052977e-02,5.547395722957245e-02,-7.290316738385687e-02,1.031358868909773e-01,-1.693713973038418e-01,4.371314746161701e-01,8.075044394897422e-01,-2.046511214647538e-01,1.056967317953343e-01,-3.765499330906548e-02}, +{ 5.005275238753895e-03,-1.257222134252285e-02,1.734412064925337e-02,-2.179205717789713e-02,2.653392093209560e-02,-3.205852480578207e-02,3.896528729255387e-02,-4.818659177671222e-02,6.142223649195773e-02,-8.229283321291228e-02,1.203293919010655e-01,-2.119416297655571e-01,7.545440678956123e-01,5.017942993734058e-01,-1.743120053444063e-01,5.721726365109171e-02}, +{ -2.003193280002952e-03,5.028374272473379e-03,-6.926045100363710e-03,8.679221416370496e-03,-1.052551284303118e-02,1.264286349038298e-02,-1.523639991693999e-02,1.860525852867583e-02,-2.325486404473361e-02,3.015575783977299e-02,-4.148164017773373e-02,6.337784020836493e-02,-1.230306517309842e-01,9.625733586701579e-01,1.650411656498492e-01,-4.364553298225820e-02}, +{ -1.131478002876232e-03,2.839003181743926e-03,-3.906361291211329e-03,4.886636103584711e-03,-5.910607922163841e-03,7.072678353479290e-03,-8.477083594218266e-03,1.026908343438244e-02,-1.268149449183453e-02,1.613113956951008e-02,-2.145916924002732e-02,3.066170832604258e-02,-4.989913312350318e-02,1.123678881655863e-01,9.809166217662293e-01,-6.167943123472387e-02}, +{ 2.207403257407092e-03,-5.537343473306857e-03,7.614903386660106e-03,-9.516860481506005e-03,1.149485695400510e-02,-1.372693342685525e-02,1.640500899464983e-02,-1.978976448465170e-02,2.428677695345344e-02,-3.059434926351772e-02,4.004328482212002e-02,-5.551150953514809e-02,8.447132444549756e-02,-1.534142468557525e-01,4.612848220947535e-01,6.402826266121913e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==21 +const dfloat c_I[21][16] = { +{ 6.688511383811974e-01,4.268407826978621e-01,-1.447550254051758e-01,7.998462129144727e-02,-5.262966604108281e-02,3.798722592305463e-02,-2.903303322107179e-02,2.305196502904011e-02,-1.878602458204267e-02,1.557430596494885e-02,-1.303263148043540e-02,1.091391726847876e-02,-9.036154668775125e-03,7.230410282020015e-03,-5.257813003544717e-03,2.095981564079440e-03}, +{ -2.881719946639506e-02,9.964782149880074e-01,4.547129239151352e-02,-2.081317153914359e-02,1.289649820735250e-02,-9.058448258950252e-03,6.822295386030790e-03,-5.369372965036073e-03,4.351064099446050e-03,-3.593503740193791e-03,2.999154582608396e-03,-2.506958811237519e-03,2.072976833126511e-03,-1.657307470645703e-03,1.204548035507212e-03,-4.800822719902847e-04}, +{ -7.199596946673467e-02,2.872536937351125e-01,9.015322848396017e-01,-1.734155552291688e-01,9.238085742400116e-02,-6.116582308118211e-02,4.471187164503906e-02,-3.458728898737800e-02,2.772511306777452e-02,-2.273358829261446e-02,1.888009328226777e-02,-1.572752038528411e-02,1.297405541618648e-02,-1.035615085038389e-02,7.519879121569871e-03,-2.995952238806866e-03}, +{ 6.427422626793193e-02,-2.003518524952655e-01,6.734504312242796e-01,5.931671574701510e-01,-2.016169601790804e-01,1.184223466954059e-01,-8.208809142614473e-02,6.169664789937594e-02,-4.860026312485037e-02,3.940227360641278e-02,-3.247443248235039e-02,2.691007453394036e-02,-2.211890179342099e-02,1.761365787586076e-02,-1.277163973695105e-02,5.085325664705149e-03}, +{ -2.488597239210113e-02,7.077553384524342e-02,-1.443395987517609e-01,9.441427690755994e-01,2.134800945182739e-01,-9.418101491210289e-02,5.928589630085915e-02,-4.252456271396499e-02,3.261648560158974e-02,-2.600646152787728e-02,2.119960066026402e-02,-1.743674988195422e-02,1.425991970503255e-02,-1.131774395621490e-02,8.190380798725578e-03,-3.258576369611276e-03}, +{ -1.014912815900764e-02,2.759343502558557e-02,-4.767958374719062e-02,1.044681258437635e-01,9.846591628057423e-01,-8.609993229125439e-02,4.434098854649834e-02,-2.939279119164422e-02,2.164159445880931e-02,-1.684315638721286e-02,1.351972471306382e-02,-1.100683565452106e-02,8.940072859413388e-03,-7.063990780283103e-03,5.098690482409044e-03,-2.026376524171369e-03}, +{ 2.473261611101960e-02,-6.554379009150764e-02,1.044971847607550e-01,-1.781353985864836e-01,4.731248463133593e-01,7.806606024869922e-01,-2.123371602749433e-01,1.212394976290990e-01,-8.361077104426902e-02,6.279179105029907e-02,-4.932131057826110e-02,3.959836321612534e-02,-3.187035351672451e-02,2.503503165796408e-02,-1.800829052338575e-02,7.147141389961393e-03}, +{ -1.901469628886976e-02,4.959238459021469e-02,-7.549771078231667e-02,1.149604358155183e-01,-2.063570461059820e-01,8.185153442765981e-01,4.238264956526381e-01,-1.668213007335220e-01,1.025093468513794e-01,-7.287121555680251e-02,5.548821711786320e-02,-4.370546302402773e-02,3.474908456363808e-02,-2.708694202578853e-02,1.939798881170930e-02,-7.684923162249811e-03}, +{ 3.419420544325443e-03,-8.823918999848850e-03,1.304652958537721e-02,-1.865298850241649e-02,2.872559511753351e-02,-5.673494707684131e-02,9.939053783278348e-01,6.419603496760358e-02,-3.078247258889117e-02,1.996622289751886e-02,-1.451487015450342e-02,1.112990056196780e-02,-8.704264379175880e-03,6.716324946351589e-03,-4.782096057764433e-03,1.890150810928813e-03}, +{ 9.700207974769651e-03,-2.484670722696036e-02,3.601990541829325e-02,-4.950822497116079e-02,7.030578889317361e-02,-1.123382280490532e-01,2.590170448674918e-01,9.223832916749857e-01,-1.652657860134953e-01,8.971461481654375e-02,-6.055985582485289e-02,4.465547535130737e-02,-3.413548928074297e-02,2.598293570470017e-02,-1.836004989250813e-02,7.235076557508565e-03}, +{ -1.309204101562483e-02,3.335544248995531e-02,-4.768643702639017e-02,6.383014874598028e-02,-8.627598963282525e-02,1.243076624552209e-01,-2.105115081265009e-01,6.360727221101835e-01,6.360727221101867e-01,-2.105115081265018e-01,1.243076624552213e-01,-8.627598963282559e-02,6.383014874598104e-02,-4.768643702639107e-02,3.335544248995545e-02,-1.309204101562484e-02}, +{ 7.235076557508787e-03,-1.836004989250788e-02,2.598293570469952e-02,-3.413548928074213e-02,4.465547535130683e-02,-6.055985582485256e-02,8.971461481654279e-02,-1.652657860134938e-01,9.223832916749866e-01,2.590170448674892e-01,-1.123382280490518e-01,7.030578889317307e-02,-4.950822497116085e-02,3.601990541829348e-02,-2.484670722696016e-02,9.700207974769179e-03}, +{ 1.890150810928765e-03,-4.782096057764421e-03,6.716324946351666e-03,-8.704264379175874e-03,1.112990056196755e-02,-1.451487015450298e-02,1.996622289751862e-02,-3.078247258889108e-02,6.419603496760372e-02,9.939053783278349e-01,-5.673494707684171e-02,2.872559511753379e-02,-1.865298850241691e-02,1.304652958537759e-02,-8.823918999848947e-03,3.419420544325390e-03}, +{ -7.684923162250459e-03,1.939798881170924e-02,-2.708694202578820e-02,3.474908456363779e-02,-4.370546302402754e-02,5.548821711786356e-02,-7.287121555680287e-02,1.025093468513803e-01,-1.668213007335234e-01,4.238264956526402e-01,8.185153442765974e-01,-2.063570461059837e-01,1.149604358155199e-01,-7.549771078231786e-02,4.959238459021487e-02,-1.901469628886948e-02}, +{ 7.147141389961306e-03,-1.800829052338583e-02,2.503503165796370e-02,-3.187035351672417e-02,3.959836321612539e-02,-4.932131057826138e-02,6.279179105029910e-02,-8.361077104426852e-02,1.212394976290988e-01,-2.123371602749435e-01,7.806606024869931e-01,4.731248463133593e-01,-1.781353985864851e-01,1.044971847607564e-01,-6.554379009150771e-02,2.473261611101909e-02}, +{ -2.026376524171538e-03,5.098690482408939e-03,-7.063990780282988e-03,8.940072859413525e-03,-1.100683565452098e-02,1.351972471306377e-02,-1.684315638721255e-02,2.164159445880904e-02,-2.939279119164391e-02,4.434098854649741e-02,-8.609993229125266e-02,9.846591628057419e-01,1.044681258437625e-01,-4.767958374719033e-02,2.759343502558504e-02,-1.014912815900733e-02}, +{ -3.258576369610921e-03,8.190380798725255e-03,-1.131774395621437e-02,1.425991970503230e-02,-1.743674988195383e-02,2.119960066026344e-02,-2.600646152787669e-02,3.261648560158886e-02,-4.252456271396420e-02,5.928589630085859e-02,-9.418101491210157e-02,2.134800945182712e-01,9.441427690756012e-01,-1.443395987517612e-01,7.077553384524274e-02,-2.488597239210069e-02}, +{ 5.085325664705251e-03,-1.277163973695079e-02,1.761365787586014e-02,-2.211890179342044e-02,2.691007453394000e-02,-3.247443248234985e-02,3.940227360641203e-02,-4.860026312484979e-02,6.169664789937532e-02,-8.208809142614321e-02,1.184223466954042e-01,-2.016169601790782e-01,5.931671574701494e-01,6.734504312242795e-01,-2.003518524952626e-01,6.427422626792929e-02}, +{ -2.995952238806943e-03,7.519879121569742e-03,-1.035615085038369e-02,1.297405541618635e-02,-1.572752038528381e-02,1.888009328226783e-02,-2.273358829261484e-02,2.772511306777477e-02,-3.458728898737794e-02,4.471187164503869e-02,-6.116582308118157e-02,9.238085742400133e-02,-1.734155552291707e-01,9.015322848395988e-01,2.872536937351147e-01,-7.199596946673276e-02}, +{ -4.800822719902079e-04,1.204548035506913e-03,-1.657307470645362e-03,2.072976833125974e-03,-2.506958811236839e-03,2.999154582607634e-03,-3.593503740192907e-03,4.351064099445003e-03,-5.369372965034812e-03,6.822295386029190e-03,-9.058448258948276e-03,1.289649820734981e-02,-2.081317153913853e-02,4.547129239150278e-02,9.964782149880065e-01,-2.881719946638690e-02}, +{ 2.095981564079544e-03,-5.257813003544953e-03,7.230410282020252e-03,-9.036154668775408e-03,1.091391726847932e-02,-1.303263148043596e-02,1.557430596494967e-02,-1.878602458204397e-02,2.305196502904144e-02,-2.903303322107299e-02,3.798722592305633e-02,-5.262966604108584e-02,7.998462129145215e-02,-1.447550254051851e-01,4.268407826978741e-01,6.688511383811915e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==22 +const dfloat c_I[22][16] = { +{ 6.943255179812828e-01,3.957533841478140e-01,-1.364397916962193e-01,7.561920378201410e-02,-4.981186254059637e-02,3.597207818982361e-02,-2.750076459956206e-02,2.183915179718010e-02,-1.779965245705362e-02,1.475768831479537e-02,-1.234993451070023e-02,1.034258921932644e-02,-8.563345361871477e-03,6.852202877213193e-03,-4.982839132058426e-03,1.986373988611395e-03}, +{ 6.657488889035450e-03,9.998368659797613e-01,-9.212938372144937e-03,4.322707940577860e-03,-2.697524465603457e-03,1.900590728831272e-03,-1.433756474811552e-03,1.129508695650425e-03,-9.158624007726240e-04,7.567158667712228e-04,-6.317391481517802e-04,5.281690413104048e-04,-4.367977510863995e-04,3.492441544641235e-04,-2.538482077417134e-04,1.011755239102822e-04}, +{ -9.559039998078921e-02,4.047691072032844e-01,8.239060437875639e-01,-1.997903302973012e-01,1.093284598033050e-01,-7.308641426978546e-02,5.367641082213451e-02,-4.163226057118454e-02,3.342743440085002e-02,-2.743906030740577e-02,2.280483870575101e-02,-1.900667371521372e-02,1.568467098093073e-02,-1.252276190883752e-02,9.094385266952379e-03,-3.623449920254687e-03}, +{ 6.150199064521336e-02,-1.964841988625081e-01,8.095715389593204e-01,4.317850769849923e-01,-1.662443931340344e-01,1.002950502078461e-01,-7.029792879006495e-02,5.314381880410177e-02,-4.200815795617133e-02,3.413340824841796e-02,-2.817379296407857e-02,2.337010448221860e-02,-1.922257695987478e-02,1.531429894121417e-02,-1.110740183761477e-02,4.423163231022178e-03}, +{ -5.106188786221500e-03,1.472543422570473e-02,-3.185885601004462e-02,9.981322087418559e-01,3.439442702701138e-02,-1.657028020880316e-02,1.070395899936731e-02,-7.769530626344687e-03,5.998707451690422e-03,-4.802462953732659e-03,3.925188570208298e-03,-3.234241211206135e-03,2.648177363199402e-03,-2.103450890401688e-03,1.522925346101544e-03,-6.060170383844661e-04}, +{ -2.804986655761517e-02,7.695511853488982e-02,-1.371065484266510e-01,3.393204811849951e-01,8.751487977715180e-01,-1.897510982825653e-01,1.046850791210380e-01,-7.112282918783902e-02,5.301152318078773e-02,-4.155078040169486e-02,3.350092994990694e-02,-2.735404399979543e-02,2.226100324573283e-02,-1.761168650444988e-02,1.272125640810141e-02,-5.057336036358930e-03}, +{ 2.734569759724919e-02,-7.293259454327694e-02,1.185406515787997e-01,-2.130593917877273e-01,7.487309246347382e-01,5.123097440421569e-01,-1.888876577652033e-01,1.141483687433331e-01,-8.057278600760472e-02,6.125693374231533e-02,-4.846878565345562e-02,3.909509366369845e-02,-3.156056910719671e-02,2.483960446476002e-02,-1.788776185905767e-02,7.102528256471263e-03}, +{ -6.722453794645353e-03,1.761629493634718e-02,-2.718089283305856e-02,4.269180054710847e-02,-8.378980183592147e-02,9.856017416922183e-01,1.014473640874634e-01,-4.771665525288803e-02,3.077219260300910e-02,-2.235856474008097e-02,1.723133602334439e-02,-1.367181824409907e-02,1.092041059430040e-02,-8.537143839630260e-03,6.123941570706045e-03,-2.427751514173682e-03}, +{ -1.224692553245672e-02,3.171960259198028e-02,-4.736721583221448e-02,6.914505177869454e-02,-1.116062861843181e-01,2.580715512141045e-01,9.227887160643693e-01,-1.648936597667333e-01,8.950683882213509e-02,-6.054282426715018e-02,4.492079103076867e-02,-3.484626232755265e-02,2.744428403423553e-02,-2.126766959468800e-02,1.517970725163777e-02,-6.005699282812215e-03}, +{ 1.626685174579516e-02,-4.178828634545396e-02,6.104604599927068e-02,-8.517863035861332e-02,1.246220386856511e-01,-2.137615403134721e-01,6.754653603649942e-01,5.954397801218400e-01,-2.051662399892095e-01,1.223629933993766e-01,-8.569901242841092e-02,6.439764729595668e-02,-4.976344810001698e-02,3.812218249036491e-02,-2.703387666397142e-02,1.066813409589902e-02}, +{ -6.623361872362167e-03,1.691456905430110e-02,-2.432938529903050e-02,3.294066872737502e-02,-4.546370959309916e-02,6.830235043294429e-02,-1.291003497308703e-01,9.602764361168280e-01,1.773136235894965e-01,-8.037685564037458e-02,5.116176940495294e-02,-3.668242379908723e-02,2.760904171145460e-02,-2.082717878457078e-02,1.464457685444120e-02,-5.759771172398963e-03}, +{ -5.759771172399331e-03,1.464457685444161e-02,-2.082717878457118e-02,2.760904171145532e-02,-3.668242379908829e-02,5.116176940495429e-02,-8.037685564037689e-02,1.773136235895031e-01,9.602764361168259e-01,-1.291003497308739e-01,6.830235043294643e-02,-4.546370959310054e-02,3.294066872737608e-02,-2.432938529903155e-02,1.691456905430156e-02,-6.623361872362298e-03}, +{ 1.066813409589941e-02,-2.703387666397137e-02,3.812218249036423e-02,-4.976344810001611e-02,6.439764729595611e-02,-8.569901242841058e-02,1.223629933993763e-01,-2.051662399892094e-01,5.954397801218431e-01,6.754653603649904e-01,-2.137615403134712e-01,1.246220386856510e-01,-8.517863035861380e-02,6.104604599927144e-02,-4.178828634545396e-02,1.626685174579450e-02}, +{ -6.005699282812488e-03,1.517970725163773e-02,-2.126766959468767e-02,2.744428403423528e-02,-3.484626232755254e-02,4.492079103076886e-02,-6.054282426715028e-02,8.950683882213514e-02,-1.648936597667340e-01,9.227887160643703e-01,2.580715512141039e-01,-1.116062861843183e-01,6.914505177869532e-02,-4.736721583221522e-02,3.171960259198020e-02,-1.224692553245633e-02}, +{ -2.427751514173604e-03,6.123941570705818e-03,-8.537143839630184e-03,1.092041059430027e-02,-1.367181824409871e-02,1.723133602334409e-02,-2.235856474008047e-02,3.077219260300906e-02,-4.771665525288794e-02,1.014473640874613e-01,9.856017416922194e-01,-8.378980183592037e-02,4.269180054710803e-02,-2.718089283305850e-02,1.761629493634691e-02,-6.722453794644942e-03}, +{ 7.102528256471546e-03,-1.788776185905775e-02,2.483960446475963e-02,-3.156056910719625e-02,3.909509366369836e-02,-4.846878565345569e-02,6.125693374231499e-02,-8.057278600760402e-02,1.141483687433330e-01,-1.888876577652029e-01,5.123097440421540e-01,7.487309246347412e-01,-2.130593917877294e-01,1.185406515788016e-01,-7.293259454327707e-02,2.734569759724859e-02}, +{ -5.057336036359036e-03,1.272125640810137e-02,-1.761168650444953e-02,2.226100324573275e-02,-2.735404399979538e-02,3.350092994990703e-02,-4.155078040169494e-02,5.301152318078783e-02,-7.112282918783906e-02,1.046850791210378e-01,-1.897510982825645e-01,8.751487977715156e-01,3.393204811849979e-01,-1.371065484266528e-01,7.695511853488983e-02,-2.804986655761459e-02}, +{ -6.060170383843507e-04,1.522925346101266e-03,-2.103450890401530e-03,2.648177363199192e-03,-3.234241211205751e-03,3.925188570208105e-03,-4.802462953732250e-03,5.998707451689732e-03,-7.769530626344452e-03,1.070395899936722e-02,-1.657028020880216e-02,3.439442702700872e-02,9.981322087418562e-01,-3.185885601004253e-02,1.472543422570385e-02,-5.106188786221141e-03}, +{ 4.423163231021751e-03,-1.110740183761461e-02,1.531429894121356e-02,-1.922257695987427e-02,2.337010448221841e-02,-2.817379296407788e-02,3.413340824841721e-02,-4.200815795617099e-02,5.314381880410123e-02,-7.029792879006340e-02,1.002950502078439e-01,-1.662443931340319e-01,4.317850769849887e-01,8.095715389593223e-01,-1.964841988625053e-01,6.150199064521097e-02}, +{ -3.623449920254859e-03,9.094385266952117e-03,-1.252276190883702e-02,1.568467098093014e-02,-1.900667371521307e-02,2.280483870575066e-02,-2.743906030740539e-02,3.342743440084965e-02,-4.163226057118422e-02,5.367641082213360e-02,-7.308641426978407e-02,1.093284598033039e-01,-1.997903302973008e-01,8.239060437875655e-01,4.047691072032783e-01,-9.559039998078453e-02}, +{ 1.011755239108668e-04,-2.538482077424359e-04,3.492441544650030e-04,-4.367977510872747e-04,5.281690413111056e-04,-6.317391481521218e-04,7.567158667717301e-04,-9.158624007736661e-04,1.129508695651921e-03,-1.433756474813475e-03,1.900590728833665e-03,-2.697524465607076e-03,4.322707940583927e-03,-9.212938372157984e-03,9.998368659797611e-01,6.657488889044931e-03}, +{ 1.986373988611281e-03,-4.982839132058745e-03,6.852202877213867e-03,-8.563345361872556e-03,1.034258921932724e-02,-1.234993451070114e-02,1.475768831479641e-02,-1.779965245705491e-02,2.183915179718154e-02,-2.750076459956383e-02,3.597207818982611e-02,-4.981186254059959e-02,7.561920378201992e-02,-1.364397916962304e-01,3.957533841478337e-01,6.943255179812711e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==23 +const dfloat c_I[23][16] = { +{ 7.171064607330614e-01,3.676676634013520e-01,-1.285489321638687e-01,7.143388210101614e-02,-4.709994247876938e-02,3.402908328683553e-02,-2.602185124290640e-02,2.066784264544429e-02,-1.684665139036845e-02,1.396848334290738e-02,-1.169002960105356e-02,9.790261594582007e-03,-8.106217714118086e-03,6.486516834258078e-03,-4.716959026204380e-03,1.880389677832206e-03}, +{ 4.358181024167281e-02,9.938053746018018e-01,-5.343900996508854e-02,2.559502183908481e-02,-1.606930593894688e-02,1.135217059668587e-02,-8.575941027767759e-03,6.761798996914897e-03,-5.485762427235879e-03,4.534156832984990e-03,-3.786254902387257e-03,3.166071851467281e-03,-2.618670066062641e-03,2.093941611254000e-03,-1.522055245029264e-03,6.066530006516485e-04}, +{ -1.134660539499219e-01,5.133585122873875e-01,7.376222703043769e-01,-2.083396137476370e-01,1.165350998597925e-01,-7.853917724064788e-02,5.791248527009416e-02,-4.502064687757675e-02,3.619963552332237e-02,-2.974261878496563e-02,2.473525005137303e-02,-2.062477952576879e-02,1.702520889146468e-02,-1.359584017283800e-02,9.874887894653937e-03,-3.934619783109232e-03}, +{ 5.045072198951633e-02,-1.654863811009153e-01,9.072125322071350e-01,2.833250224499517e-01,-1.191468379536432e-01,7.344975525904111e-02,-5.196048976591360e-02,3.947544936894588e-02,-3.129635200367630e-02,2.547814164549479e-02,-2.105668660676839e-02,1.748185355184918e-02,-1.438799178049954e-02,1.146723295626936e-02,-8.319104462421857e-03,3.313134245634640e-03}, +{ 1.678444083239050e-02,-4.912331721131147e-02,1.137277578655917e-01,9.817900536586568e-01,-9.232302009059404e-02,4.740368280210337e-02,-3.126029101594443e-02,2.291442666229452e-02,-1.779018065905776e-02,1.429165227281049e-02,-1.170741957817756e-02,9.661328450681307e-03,-7.918836391298459e-03,6.294218003325972e-03,-4.558923061808230e-03,1.814427460337367e-03}, +{ -3.832668433900334e-02,1.061588102659730e-01,-1.956154983473417e-01,5.676712806431121e-01,6.994960221425418e-01,-2.139425670138107e-01,1.241184001427081e-01,-8.598667399475519e-02,6.473584322860131e-02,-5.104078293741794e-02,4.130704560021554e-02,-3.381164799915569e-02,2.756191257677279e-02,-2.182897056024008e-02,1.577744992558337e-02,-6.273939333783255e-03}, +{ 1.862754909962250e-02,-5.001571202005535e-02,8.300540404267803e-02,-1.587083973829499e-01,9.293526990561680e-01,2.448609275396632e-01,-1.068264279279686e-01,6.730370407360407e-02,-4.838487600606965e-02,3.715400564955072e-02,-2.957601163297664e-02,2.394899803050025e-02,-1.938277429251609e-02,1.528007402562093e-02,-1.101412675851983e-02,4.374964503648183e-03}, +{ 1.005567086448947e-02,-2.648291098339063e-02,4.145371652511495e-02,-6.741678740853729e-02,1.484035791934354e-01,9.710307059375377e-01,-1.134108531127476e-01,5.948721255028968e-02,-3.975131300600707e-02,2.938098278842651e-02,-2.286445374910795e-02,1.825041296341523e-02,-1.463354502997106e-02,1.146763051439080e-02,-8.237545145345561e-03,3.267497098007033e-03}, +{ -2.054313521289938e-02,5.341226921024759e-02,-8.061098601420290e-02,1.203984050850219e-01,-2.054587413191058e-01,6.036501058872885e-01,6.674505683481459e-01,-2.133751044719464e-01,1.254202504983729e-01,-8.749728447090871e-02,6.595979759313618e-02,-5.164355188596923e-02,4.090715640112005e-02,-3.181298185113755e-02,2.275216470903195e-02,-9.008932506194981e-03}, +{ 9.513554958432136e-03,-2.451448864111993e-02,3.610493540618405e-02,-5.121057773095083e-02,7.752017708831128e-02,-1.455850300877521e-01,9.455402478720552e-01,2.116813350183291e-01,-9.430177970261530e-02,5.984553972819408e-02,-4.306716262942004e-02,3.283867133701316e-02,-2.559562149429973e-02,1.970967222325804e-02,-1.401739601200892e-02,5.537922666389842e-03}, +{ 7.177677813427113e-03,-1.837570922236218e-02,2.660244847805475e-02,-3.646708278876028e-02,5.152116998042525e-02,-8.138090696365197e-02,1.800579711187916e-01,9.591564821926539e-01,-1.305566984230215e-01,6.925373466257501e-02,-4.635982916277925e-02,3.404370143206398e-02,-2.596309304790529e-02,1.973541851818545e-02,-1.393495218005620e-02,5.489667592359843e-03}, +{ -1.309204101562516e-02,3.335544248995536e-02,-4.768643702639026e-02,6.383014874598041e-02,-8.627598963282530e-02,1.243076624552211e-01,-2.105115081265011e-01,6.360727221101845e-01,6.360727221101858e-01,-2.105115081265016e-01,1.243076624552214e-01,-8.627598963282565e-02,6.383014874598103e-02,-4.768643702639113e-02,3.335544248995543e-02,-1.309204101562484e-02}, +{ 5.489667592359783e-03,-1.393495218005600e-02,1.973541851818507e-02,-2.596309304790462e-02,3.404370143206353e-02,-4.635982916277902e-02,6.925373466257434e-02,-1.305566984230201e-01,9.591564821926546e-01,1.800579711187897e-01,-8.138090696365134e-02,5.152116998042480e-02,-3.646708278876024e-02,2.660244847805479e-02,-1.837570922236192e-02,7.177677813426743e-03}, +{ 5.537922666390225e-03,-1.401739601200902e-02,1.970967222325799e-02,-2.559562149429955e-02,3.283867133701330e-02,-4.306716262942059e-02,5.984553972819485e-02,-9.430177970261697e-02,2.116813350183339e-01,9.455402478720526e-01,-1.455850300877538e-01,7.752017708831262e-02,-5.121057773095231e-02,3.610493540618533e-02,-2.451448864112044e-02,9.513554958431949e-03}, +{ -9.008932506195597e-03,2.275216470903199e-02,-3.181298185113712e-02,4.090715640111960e-02,-5.164355188596900e-02,6.595979759313624e-02,-8.749728447090876e-02,1.254202504983734e-01,-2.133751044719475e-01,6.674505683481485e-01,6.036501058872870e-01,-2.054587413191066e-01,1.203984050850232e-01,-8.061098601420416e-02,5.341226921024761e-02,-2.054313521289895e-02}, +{ 3.267497098007161e-03,-8.237545145345717e-03,1.146763051439096e-02,-1.463354502997110e-02,1.825041296341530e-02,-2.286445374910791e-02,2.938098278842662e-02,-3.975131300600694e-02,5.948721255028955e-02,-1.134108531127475e-01,9.710307059375376e-01,1.484035791934354e-01,-6.741678740853778e-02,4.145371652511538e-02,-2.648291098339065e-02,1.005567086448968e-02}, +{ 4.374964503648458e-03,-1.101412675851995e-02,1.528007402562095e-02,-1.938277429251596e-02,2.394899803050020e-02,-2.957601163297711e-02,3.715400564955116e-02,-4.838487600607024e-02,6.730370407360529e-02,-1.068264279279701e-01,2.448609275396668e-01,9.293526990561667e-01,-1.587083973829531e-01,8.300540404268045e-02,-5.001571202005611e-02,1.862754909962253e-02}, +{ -6.273939333783410e-03,1.577744992558330e-02,-2.182897056023973e-02,2.756191257677270e-02,-3.381164799915538e-02,4.130704560021520e-02,-5.104078293741756e-02,6.473584322860092e-02,-8.598667399475486e-02,1.241184001427073e-01,-2.139425670138095e-01,6.994960221425404e-01,5.676712806431143e-01,-1.956154983473438e-01,1.061588102659726e-01,-3.832668433900260e-02}, +{ 1.814427460337199e-03,-4.558923061808095e-03,6.294218003325750e-03,-7.918836391298242e-03,9.661328450681168e-03,-1.170741957817733e-02,1.429165227281028e-02,-1.779018065905767e-02,2.291442666229404e-02,-3.126029101594365e-02,4.740368280210264e-02,-9.232302009059273e-02,9.817900536586553e-01,1.137277578655919e-01,-4.912331721131071e-02,1.678444083238999e-02}, +{ 3.313134245634439e-03,-8.319104462421718e-03,1.146723295626914e-02,-1.438799178049950e-02,1.748185355184886e-02,-2.105668660676726e-02,2.547814164549383e-02,-3.129635200367589e-02,3.947544936894497e-02,-5.196048976591221e-02,7.344975525903985e-02,-1.191468379536408e-01,2.833250224499476e-01,9.072125322071365e-01,-1.654863811009123e-01,5.045072198951446e-02}, +{ -3.934619783109387e-03,9.874887894653780e-03,-1.359584017283755e-02,1.702520889146407e-02,-2.062477952576839e-02,2.473525005137302e-02,-2.974261878496548e-02,3.619963552332180e-02,-4.502064687757598e-02,5.791248527009300e-02,-7.853917724064617e-02,1.165350998597912e-01,-2.083396137476367e-01,7.376222703043709e-01,5.133585122873894e-01,-1.134660539499175e-01}, +{ 6.066530006518071e-04,-1.522055245029526e-03,2.093941611254492e-03,-2.618670066063366e-03,3.166071851468248e-03,-3.786254902388417e-03,4.534156832986042e-03,-5.485762427236610e-03,6.761798996915813e-03,-8.575941027769447e-03,1.135217059668790e-02,-1.606930593894931e-02,2.559502183908862e-02,-5.343900996509743e-02,9.938053746018026e-01,4.358181024167857e-02}, +{ 1.880389677832463e-03,-4.716959026204273e-03,6.486516834257821e-03,-8.106217714118036e-03,9.790261594582090e-03,-1.169002960105344e-02,1.396848334290710e-02,-1.684665139036862e-02,2.066784264544433e-02,-2.602185124290596e-02,3.402908328683540e-02,-4.709994247876941e-02,7.143388210101660e-02,-1.285489321638698e-01,3.676676634013387e-01,7.171064607330750e-01} +}; +#endif +#if p_Nq==16 && p_cubNq==24 +const dfloat c_I[24][16] = { +{ 7.375384363465141e-01,3.422576726898569e-01,-1.211194080558259e-01,6.746060490682110e-02,-4.451749904499277e-02,3.217612696841748e-02,-2.461031894585268e-02,1.954934406967057e-02,-1.593632481165661e-02,1.321445380425344e-02,-1.105944156220780e-02,9.262415434913534e-03,-7.669319503865971e-03,6.136996335640103e-03,-4.462824926094383e-03,1.779086294408938e-03}, +{ 8.106905875960656e-02,9.806865831625162e-01,-8.884257675310389e-02,4.329589310171251e-02,-2.732500247396921e-02,1.934862864844948e-02,-1.463494994552660e-02,1.154761648231954e-02,-9.372863088452008e-03,7.749425118699816e-03,-6.472587227350564e-03,5.413216063675212e-03,-4.477767604215548e-03,3.580767387992749e-03,-2.602916972804009e-03,1.037475340449511e-03}, +{ -1.253276936393787e-01,6.106535439050736e-01,6.484362551568768e-01,-2.042280397280827e-01,1.163342685451086e-01,-7.894890631981658e-02,5.841611642312242e-02,-4.550238522140488e-02,3.663246704195711e-02,-3.012298896980647e-02,2.506567321875873e-02,-2.090846132433002e-02,1.726403962962503e-02,-1.378903621657741e-02,1.001627869768299e-02,-3.991131198808555e-03}, +{ 3.326916388277511e-02,-1.122594953168348e-01,9.680638161701632e-01,1.545623821401753e-01,-6.941994080233069e-02,4.356064550998672e-02,-3.105804442904654e-02,2.369535971522577e-02,-1.883384125275592e-02,1.535784724153797e-02,-1.270680414739611e-02,1.055763167345488e-02,-8.693756005418505e-03,6.931334266598712e-03,-5.029494862734371e-03,2.003196216599183e-03}, +{ 3.676466984207360e-02,-1.092956740535585e-01,2.736687711524424e-01,9.128226307361723e-01,-1.697379587718587e-01,9.145480220870589e-02,-6.133326226026924e-02,4.533012330963198e-02,-3.535944798101701e-02,2.848981196971962e-02,-2.338377886660029e-02,1.932254580955220e-02,-1.585180805281689e-02,1.260710639199688e-02,-9.134553597280558e-03,3.636022163106117e-03}, +{ -3.921842692590208e-02,1.097279425938523e-01,-2.098430074475580e-01,7.585251077269803e-01,4.994243902400015e-01,-1.851362594123724e-01,1.115983513579206e-01,-7.854418110974327e-02,5.962782296682478e-02,-4.724826517094141e-02,3.836016122918978e-02,-3.146631037940776e-02,2.568667734810127e-02,-2.036268801101877e-02,1.472569247187871e-02,-5.857007477805549e-03}, +{ 2.835285709049586e-03,-7.666818577996872e-03,1.301395765057013e-02,-2.677115954683013e-02,9.987220416746568e-01,2.841823288066434e-02,-1.381212863360199e-02,8.983443061479770e-03,-6.554207387611375e-03,5.074515751527238e-03,-4.060057315428374e-03,3.298439254871095e-03,-2.675339619631080e-03,2.112005815541312e-03,-1.523610197086286e-03,6.053994798257660e-04}, +{ 2.316376345477656e-02,-6.132530528901187e-02,9.748324718984080e-02,-1.648907930586524e-01,4.232103776444556e-01,8.186326498238654e-01,-2.067044552241463e-01,1.167248831489850e-01,-8.015019841343024e-02,6.005872563848427e-02,-4.711256621362476e-02,3.779366998911165e-02,-3.040151902016591e-02,2.387304887765556e-02,-1.716904169217016e-02,6.813513144026753e-03}, +{ -1.734174121302126e-02,4.527131588913776e-02,-6.910010132071691e-02,1.058454680051013e-01,-1.931149777273486e-01,8.670195751065511e-01,3.533676635852918e-01,-1.453918894582052e-01,9.033125115584102e-02,-6.452652518971888e-02,4.926393048418089e-02,-3.886457157449686e-02,3.093111096722391e-02,-2.412589376454702e-02,1.728365603074693e-02,-6.848270976019946e-03}, +{ -4.559694336169453e-03,1.178719321124522e-02,-1.751108656965397e-02,2.528485110738144e-02,-3.980271264101447e-02,8.428310395454248e-02,9.898084480753125e-01,-7.194205914422609e-02,3.692686069468452e-02,-2.448826199932193e-02,1.799249038488545e-02,-1.387931698647655e-02,1.089381139332480e-02,-8.424373826791003e-03,6.005723602895765e-03,-2.374976920618752e-03}, +{ 1.581655120214282e-02,-4.059792860456446e-02,5.917758266691765e-02,-8.221483020553158e-02,1.192423335138701e-01,-2.001897851131696e-01,5.673680201052265e-01,7.014189005546492e-01,-2.153642297256825e-01,1.256330979479203e-01,-8.719134552429111e-02,6.520734784928187e-02,-5.024988022075436e-02,3.843145529681880e-02,-2.722820754576138e-02,1.074091780292779e-02}, +{ -7.643742346302346e-03,1.951638285018502e-02,-2.805681603684581e-02,3.794908704842614e-02,-5.227838750135043e-02,7.823846923181500e-02,-1.462866209934272e-01,9.449847478280655e-01,2.129539363285929e-01,-9.478350550846466e-02,6.001791075375375e-02,-4.293004036350266e-02,3.226984521808085e-02,-2.432524354021218e-02,1.709740320298590e-02,-6.723426171799368e-03}, +{ -6.723426171799729e-03,1.709740320298623e-02,-2.432524354021231e-02,3.226984521808120e-02,-4.293004036350307e-02,6.001791075375456e-02,-9.478350550846613e-02,2.129539363285973e-01,9.449847478280637e-01,-1.462866209934296e-01,7.823846923181586e-02,-5.227838750135146e-02,3.794908704842727e-02,-2.805681603684675e-02,1.951638285018543e-02,-7.643742346302651e-03}, +{ 1.074091780292812e-02,-2.722820754576127e-02,3.843145529681808e-02,-5.024988022075356e-02,6.520734784928148e-02,-8.719134552429109e-02,1.256330979479198e-01,-2.153642297256818e-01,7.014189005546511e-01,5.673680201052235e-01,-2.001897851131686e-01,1.192423335138702e-01,-8.221483020553220e-02,5.917758266691821e-02,-4.059792860456436e-02,1.581655120214249e-02}, +{ -2.374976920618760e-03,6.005723602895891e-03,-8.424373826791067e-03,1.089381139332463e-02,-1.387931698647647e-02,1.799249038488565e-02,-2.448826199932208e-02,3.692686069468503e-02,-7.194205914422742e-02,9.898084480753128e-01,8.428310395454376e-02,-3.980271264101549e-02,2.528485110738218e-02,-1.751108656965433e-02,1.178719321124529e-02,-4.559694336169466e-03}, +{ -6.848270976020268e-03,1.728365603074684e-02,-2.412589376454650e-02,3.093111096722355e-02,-3.886457157449681e-02,4.926393048418153e-02,-6.452652518971888e-02,9.033125115584136e-02,-1.453918894582062e-01,3.533676635852924e-01,8.670195751065508e-01,-1.931149777273495e-01,1.058454680051025e-01,-6.910010132071780e-02,4.527131588913786e-02,-1.734174121302084e-02}, +{ 6.813513144027023e-03,-1.716904169217029e-02,2.387304887765529e-02,-3.040151902016568e-02,3.779366998911177e-02,-4.711256621362485e-02,6.005872563848414e-02,-8.015019841342985e-02,1.167248831489847e-01,-2.067044552241460e-01,8.186326498238661e-01,4.232103776444548e-01,-1.648907930586535e-01,9.748324718984196e-02,-6.132530528901181e-02,2.316376345477631e-02}, +{ 6.053994798257915e-04,-1.523610197086402e-03,2.112005815541400e-03,-2.675339619631174e-03,3.298439254871430e-03,-4.060057315428990e-03,5.074515751527594e-03,-6.554207387611521e-03,8.983443061480542e-03,-1.381212863360313e-02,2.841823288066669e-02,9.987220416746563e-01,-2.677115954683206e-02,1.301395765057118e-02,-7.666818577997401e-03,2.835285709049667e-03}, +{ -5.857007477805688e-03,1.472569247187852e-02,-2.036268801101837e-02,2.568667734810112e-02,-3.146631037940778e-02,3.836016122918984e-02,-4.724826517094131e-02,5.962782296682455e-02,-7.854418110974370e-02,1.115983513579209e-01,-1.851362594123719e-01,4.994243902400035e-01,7.585251077269796e-01,-2.098430074475604e-01,1.097279425938521e-01,-3.921842692590092e-02}, +{ 3.636022163106316e-03,-9.134553597280409e-03,1.260710639199650e-02,-1.585180805281674e-02,1.932254580955186e-02,-2.338377886659959e-02,2.848981196971913e-02,-3.535944798101678e-02,4.533012330963123e-02,-6.133326226026795e-02,9.145480220870421e-02,-1.697379587718566e-01,9.128226307361706e-01,2.736687711524429e-01,-1.092956740535570e-01,3.676466984207236e-02}, +{ 2.003196216599043e-03,-5.029494862734117e-03,6.931334266598176e-03,-8.693756005418177e-03,1.055763167345473e-02,-1.270680414739542e-02,1.535784724153713e-02,-1.883384125275540e-02,2.369535971522522e-02,-3.105804442904532e-02,4.356064550998508e-02,-6.941994080232820e-02,1.545623821401700e-01,9.680638161701652e-01,-1.122594953168312e-01,3.326916388277308e-02}, +{ -3.991131198808395e-03,1.001627869768269e-02,-1.378903621657686e-02,1.726403962962450e-02,-2.090846132432931e-02,2.506567321875812e-02,-3.012298896980593e-02,3.663246704195628e-02,-4.550238522140389e-02,5.841611642312141e-02,-7.894890631981515e-02,1.163342685451070e-01,-2.042280397280815e-01,6.484362551568633e-01,6.106535439050819e-01,-1.253276936393740e-01}, +{ 1.037475340449956e-03,-2.602916972804862e-03,3.580767387993934e-03,-4.477767604216831e-03,5.413216063676759e-03,-6.472587227352299e-03,7.749425118701613e-03,-9.372863088453790e-03,1.154761648232183e-02,-1.463494994552996e-02,1.934862864845381e-02,-2.732500247397534e-02,4.329589310172288e-02,-8.884257675312499e-02,9.806865831625110e-01,8.106905875962633e-02}, +{ 1.779086294409038e-03,-4.462824926094480e-03,6.136996335640246e-03,-7.669319503866093e-03,9.262415434913745e-03,-1.105944156220832e-02,1.321445380425371e-02,-1.593632481165664e-02,1.954934406967071e-02,-2.461031894585295e-02,3.217612696841788e-02,-4.451749904499364e-02,6.746060490682286e-02,-1.211194080558285e-01,3.422576726898506e-01,7.375384363465217e-01} +}; +#endif diff --git a/okl/mesh/cubatureGeometricFactorsHex3D.okl b/okl/mesh/cubatureGeometricFactorsHex3D.okl new file mode 100644 index 000000000..64336f2ad --- /dev/null +++ b/okl/mesh/cubatureGeometricFactorsHex3D.okl @@ -0,0 +1,259 @@ +@kernel void cubatureGeometricFactorsHex3D(const dlong Nelements, + @restrict const dfloat *D, + @restrict const dfloat *x, + @restrict const dfloat *y, + @restrict const dfloat *z, + @restrict const dfloat* cubInterpT, + @restrict const dfloat* cubW, + @restrict dfloat* cubvgeo) +{ + for(dlong element = 0; element < Nelements; ++element; @outer(0)) { + @shared dfloat s_cubInterpT[p_Nq][p_cubNq]; + @shared dfloat s_cubw[p_cubNq]; + @shared dfloat s_D[p_Nq][p_Nq]; + + @shared dfloat s_x[p_Nq][p_Nq]; + @shared dfloat s_y[p_Nq][p_Nq]; + @shared dfloat s_z[p_Nq][p_Nq]; + + @shared dfloat s_cubxre[p_Nq][p_cubNq]; + @shared dfloat s_cubxse[p_Nq][p_cubNq]; + @shared dfloat s_cubxte[p_Nq][p_cubNq]; + + @exclusive dfloat r_x[p_Nq], r_y[p_Nq], r_z[p_Nq]; + + @shared dfloat s_xre[p_Nq][p_Nq]; + @shared dfloat s_xse[p_Nq][p_Nq]; + @shared dfloat s_xte[p_Nq][p_Nq]; + + @shared dfloat s_yre[p_Nq][p_Nq]; + @shared dfloat s_yse[p_Nq][p_Nq]; + @shared dfloat s_yte[p_Nq][p_Nq]; + + @shared dfloat s_zre[p_Nq][p_Nq]; + @shared dfloat s_zse[p_Nq][p_Nq]; + @shared dfloat s_zte[p_Nq][p_Nq]; + + @shared dfloat s_cubyre[p_Nq][p_cubNq]; + @shared dfloat s_cubyse[p_Nq][p_cubNq]; + @shared dfloat s_cubyte[p_Nq][p_cubNq]; + + @shared dfloat s_cubzre[p_Nq][p_cubNq]; + @shared dfloat s_cubzse[p_Nq][p_cubNq]; + @shared dfloat s_cubzte[p_Nq][p_cubNq]; + + // TODO: reduce register pressure + @exclusive dfloat r_cubxre[p_cubNq]; + @exclusive dfloat r_cubxse[p_cubNq]; + @exclusive dfloat r_cubxte[p_cubNq]; + + @exclusive dfloat r_cubyre[p_cubNq]; + @exclusive dfloat r_cubyse[p_cubNq]; + @exclusive dfloat r_cubyte[p_cubNq]; + + @exclusive dfloat r_cubzre[p_cubNq]; + @exclusive dfloat r_cubzse[p_cubNq]; + @exclusive dfloat r_cubzte[p_cubNq]; + + for(int j = 0; j < p_cubNq; ++j; @inner(1)) { + for(int i = 0; i < p_cubNq; ++i; @inner(0)) { + const int id = i + j * p_cubNq; + + if (j == 0) s_cubw[i] = cubW[i]; + if (id < p_Nq * p_cubNq) s_cubInterpT[j][i] = cubInterpT[id]; + if (i < p_Nq && j < p_Nq) s_D[j][i] = D[j*p_Nq+i]; + + for(int k = 0; k < p_cubNq; ++k) { + + r_cubxre[k] = 0; + r_cubxse[k] = 0; + r_cubxte[k] = 0; + + r_cubyre[k] = 0; + r_cubyse[k] = 0; + r_cubyte[k] = 0; + + r_cubzre[k] = 0; + r_cubzse[k] = 0; + r_cubzte[k] = 0; + } + } + } + @barrier("local"); + + for(int k = 0 ; k < p_Nq; ++k){ + for(int j=0;j p_Nfaces - @exclusive dfloat r_xre[p_Nq]; - @exclusive dfloat r_xse[p_Nq]; - @exclusive dfloat r_xte[p_Nq]; - @exclusive dfloat r_yre[p_Nq]; - @exclusive dfloat r_yse[p_Nq]; - @exclusive dfloat r_yte[p_Nq]; - @exclusive dfloat r_zre[p_Nq]; - @exclusive dfloat r_zse[p_Nq]; - @exclusive dfloat r_zte[p_Nq]; -#else - @exclusive dfloat r_xre[p_Nfaces]; - @exclusive dfloat r_xse[p_Nfaces]; - @exclusive dfloat r_xte[p_Nfaces]; - @exclusive dfloat r_yre[p_Nfaces]; - @exclusive dfloat r_yse[p_Nfaces]; - @exclusive dfloat r_yte[p_Nfaces]; - @exclusive dfloat r_zre[p_Nfaces]; - @exclusive dfloat r_zse[p_Nfaces]; - @exclusive dfloat r_zte[p_Nfaces]; -#endif - -#define r_f_xr r_xre -#define r_f_xs r_xse -#define r_f_xt r_xte -#define r_f_yr r_yre -#define r_f_ys r_yse -#define r_f_yt r_yte -#define r_f_zr r_zre -#define r_f_zs r_zse -#define r_f_zt r_zte - - #pragma unroll p_Nq - for(int k = 0 ; k < p_Nq; ++k){ - @barrier("local"); - for(int j=0;j tol) { + + const dfloat invVolNMag = 1.0 / volNMag; + volNx *= invVolNMag; + volNy *= invVolNMag; + volNz *= invVolNMag; + + const dfloat t1x = sgeo[sid * p_Nsgeo + p_T1XID]; + const dfloat t1y = sgeo[sid * p_Nsgeo + p_T1YID]; + const dfloat t1z = sgeo[sid * p_Nsgeo + p_T1ZID]; + + const dfloat t2x = sgeo[sid * p_Nsgeo + p_T2XID]; + const dfloat t2y = sgeo[sid * p_Nsgeo + p_T2YID]; + const dfloat t2z = sgeo[sid * p_Nsgeo + p_T2ZID]; + + dfloat NCrossT1x, NCrossT1y, NCrossT1z; + crossProduct(nx, ny, nz, t1x, t1y, t1z, &NCrossT1x, &NCrossT1y, &NCrossT1z); + + dfloat volNCrossT1x, volNCrossT1y, volNCrossT1z; + crossProduct(volNx, volNy, volNz, t1x, t1y, t1z, &volNCrossT1x, &volNCrossT1y, &volNCrossT1z); + + dfloat T2CrossNx, T2CrossNy, T2CrossNz; + crossProduct(t2x, t2y, t2z, nx, ny, nz, &T2CrossNx, &T2CrossNy, &T2CrossNz); + + dfloat T2CrossVolNx, T2CrossVolNy, T2CrossVolNz; + crossProduct(t2x, t2y, t2z, volNx, volNy, volNz, &T2CrossVolNx, &T2CrossVolNy, &T2CrossVolNz); + + const dfloat dot1 = dot(NCrossT1x, NCrossT1y, NCrossT1z, volNCrossT1x, volNCrossT1y, volNCrossT1z); + const dfloat dot2 = dot(T2CrossNx, T2CrossNy, T2CrossNz, T2CrossVolNx, T2CrossVolNy, T2CrossVolNz); + + const dfloat tolN = 1e-7; + + if(dot1 < tolN || (1.0 - dot1) < tolN) { + mask[idM + 0 * offset] = 0.0; + mask[idM + 1 * offset] = 0.0; + mask[idM + 2 * offset] = 1.0; + + } else if(dot2 < tolN || (1.0 - dot2) < tolN) { + mask[idM + 0 * offset] = 0.0; + mask[idM + 1 * offset] = 0.0; + mask[idM + 2 * offset] = 1.0; + + sgeo[sid * p_Nsgeo + p_T1XID] = t2x; + sgeo[sid * p_Nsgeo + p_T1YID] = t2y; + sgeo[sid * p_Nsgeo + p_T1ZID] = t2z; + + sgeo[sid * p_Nsgeo + p_T2XID] = t1x; + sgeo[sid * p_Nsgeo + p_T2YID] = t1y; + sgeo[sid * p_Nsgeo + p_T2ZID] = t1z; + + } else { + // corner + mask[idM + 0 * offset] = 0.0; + mask[idM + 1 * offset] = 0.0; + mask[idM + 2 * offset] = 0.0; + } + } + + } + } + @barrier("global"); + } + } + +} diff --git a/okl/nrs/gradientVolumeHex3D.okl b/okl/nrs/gradientVolumeHex3D.okl new file mode 100644 index 000000000..bbbeb8c9b --- /dev/null +++ b/okl/nrs/gradientVolumeHex3D.okl @@ -0,0 +1,102 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + + +@kernel void gradientVolumeHex3D(const dlong Nelements, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + const dlong offset, + @restrict const dfloat* P, + @restrict dfloat* gradP) +{ + for(dlong e = 0; e < Nelements; e++; @outer(0)) { // for all elements + @shared dfloat s_P[p_Nq][p_Nq]; + @exclusive dfloat s_Ploc[p_Nq]; + @shared dfloat s_D[p_Nq][p_Nq]; + +#ifdef smXX + #pragma unroll p_Nq +#endif + for(int k = 0; k < p_Nq; ++k){ + for(int j = 0; j < p_Nq; ++j; @inner(1)){ + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + s_P[j][i] = P[id]; + + if (k == 0) + s_D[j][i] = D[j * p_Nq + i]; + if(k == 0){ + #pragma unroll p_Nq + for(int l = 0 ; l < p_Nq; ++l){ + const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + s_Ploc[l] = P[other_id]; + } + } + } + } + + + // Make sure all node data is loaded into @shared + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdx = vgeo[gid + p_RXID * p_Np]; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat drdz = vgeo[gid + p_RZID * p_Np]; + const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; + const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + // compute 'r' and 's' derivatives of (q_m) at node n + dfloat dpdr = 0.f, dpds = 0.f, dpdt = 0.f; + +#pragma unroll p_Nq + for(int n = 0; n < p_Nq; ++n) { + const dfloat Dr = s_D[i][n]; + const dfloat Ds = s_D[j][n]; + const dfloat Dt = s_D[k][n]; + + dpdr += Dr * s_P[j][n]; + dpds += Ds * s_P[n][i]; + dpdt += Dt * s_Ploc[n]; + } + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + gradP[id + 0 * offset] = JW * (drdx * dpdr + dsdx * dpds + dtdx * dpdt); + gradP[id + 1 * offset] = JW * (drdy * dpdr + dsdy * dpds + dtdy * dpdt); + gradP[id + 2 * offset] = JW * (drdz * dpdr + dsdz * dpds + dtdz * dpdt); + } + } + @barrier("local"); + } // k-loop + } +} diff --git a/okl/nrs/initializeZeroNormalMask.okl b/okl/nrs/initializeZeroNormalMask.okl new file mode 100644 index 000000000..f89c0a29d --- /dev/null +++ b/okl/nrs/initializeZeroNormalMask.okl @@ -0,0 +1,17 @@ +@kernel void initializeZeroNormalMask(const dlong N, + const dlong offset, + @restrict const dlong* mapB, + @restrict dfloat * mask) +{ + for(dlong id=0;id 0) { + if(bcType == 7 || bcType == 8) { + // handled elsewhere... + }else if(bcType == 4) { + UH[idM + 0 * offset] = 0.0; + }else if(bcType == 5) { + UH[idM + 1 * offset] = 0.0; + }else if(bcType == 6) { + UH[idM + 2 * offset] = 0.0; + } + } + } + @barrier("global"); + } + + for(int f = 0; f < p_Nfaces; f++) { + for(int m = 0; m < p_Nfp; ++m; @inner(0)) { + struct bcData bc; + const int n = m + f * p_Nfp; + const int sk = e * p_Nfp * p_Nfaces + n; + const dlong sid = e * p_Nfaces * p_Nfp + n; + const dlong idM = vmapM[sk]; + const dlong bcType = EToB[f + p_Nfaces * e]; + + if(bcType == 2) { + bc.id = EToBM[f + p_Nfaces * e]; + bc.idM = idM; + bc.time = time; + bc.x = x[idM]; + bc.y = y[idM]; + bc.z = z[idM]; + + bc.nx = sgeo[sid * p_Nsgeo + p_NXID]; + bc.ny = sgeo[sid * p_Nsgeo + p_NYID]; + bc.nz = sgeo[sid * p_Nsgeo + p_NZID]; + + bc.t1x = sgeo[sid * p_Nsgeo + p_T1XID]; + bc.t1y = sgeo[sid * p_Nsgeo + p_T1YID]; + bc.t1z = sgeo[sid * p_Nsgeo + p_T1ZID]; + + bc.t2x = sgeo[sid * p_Nsgeo + p_T2XID]; + bc.t2y = sgeo[sid * p_Nsgeo + p_T2YID]; + bc.t2z = sgeo[sid * p_Nsgeo + p_T2ZID]; + + bc.u = U[idM + 0 * offset]; + bc.v = U[idM + 1 * offset]; + bc.w = U[idM + 2 * offset]; + + bc.wrk = W; + bc.fieldOffset = offset; + + velocityDirichletConditions(&bc); + UH[idM + 0 * offset] = bc.u; + UH[idM + 1 * offset] = bc.v; + UH[idM + 2 * offset] = bc.w; + } + } + @barrier("global"); + } + + for(int f = 0; f < p_Nfaces; f++) { + for(int m = 0; m < p_Nfp; ++m; @inner(0)) { + const int n = m + f * p_Nfp; + const int sk = e * p_Nfp * p_Nfaces + n; + const dlong idM = vmapM[sk]; + const dlong bcType = EToB[f + p_Nfaces * e]; + + if(bcType > 0) { + if(bcType == 1) { + UH[idM + 0 * offset] = 0.0; + UH[idM + 1 * offset] = 0.0; + UH[idM + 2 * offset] = 0.0; + } + } + } + @barrier("global"); + } + } +} diff --git a/okl/nrs/velocityBCHex3D.okl b/okl/nrs/velocityNeumannBCHex3D.okl similarity index 72% rename from okl/nrs/velocityBCHex3D.okl rename to okl/nrs/velocityNeumannBCHex3D.okl index b3bed8f4a..b6e763886 100644 --- a/okl/nrs/velocityBCHex3D.okl +++ b/okl/nrs/velocityNeumannBCHex3D.okl @@ -24,40 +24,53 @@ */ -#define surfaceTerms(sk,face,i, j) \ - { \ - const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ - struct bcData bc; \ - bc.idM = vmapM[sk]; bc.time = time; \ - bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ - bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ - bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ - bc.x = x[bc.idM]; \ - bc.y = y[bc.idM]; \ - bc.z = z[bc.idM]; \ - bc.fieldOffset = offset; \ - bc.id = EToBM[face + p_Nfaces * e]; \ - bc.u = U[bc.idM + 0 * offset]; \ - bc.v = U[bc.idM + 1 * offset]; \ - bc.w = U[bc.idM + 2 * offset]; \ - bc.wrk = W; \ - dfloat TRx = 0.f; \ - dfloat TRy = 0.f; \ - dfloat TRz = 0.f; \ - const dlong bcType = EToB[face + p_Nfaces * e]; \ - if(bcType == 3) { \ - bc.p = 0.f; \ - pressureDirichletConditions(&bc); \ - TRx = -bc.p*bc.nx; \ - TRy = -bc.p*bc.ny; \ - TRz = -bc.p*bc.nz; \ - } \ - s_ndU[j][i] = WsJ*TRx; \ - s_ndV[j][i] = WsJ*TRy; \ - s_ndW[j][i] = WsJ*TRz; \ +#define surfaceTerms(sk,face,i, j) \ + { \ + const dfloat WsJ = sgeo[sk * p_Nsgeo + p_WSJID]; \ + struct bcData bc; \ + bc.idM = vmapM[sk]; bc.time = time; \ + bc.nx = sgeo[sk * p_Nsgeo + p_NXID]; \ + bc.ny = sgeo[sk * p_Nsgeo + p_NYID]; \ + bc.nz = sgeo[sk * p_Nsgeo + p_NZID]; \ + bc.t1x = sgeo[sk * p_Nsgeo + p_T1XID]; \ + bc.t1y = sgeo[sk * p_Nsgeo + p_T1YID]; \ + bc.t1z = sgeo[sk * p_Nsgeo + p_T1ZID]; \ + bc.t2x = sgeo[sk * p_Nsgeo + p_T2XID]; \ + bc.t2y = sgeo[sk * p_Nsgeo + p_T2YID]; \ + bc.t2z = sgeo[sk * p_Nsgeo + p_T2ZID]; \ + bc.x = x[bc.idM]; \ + bc.y = y[bc.idM]; \ + bc.z = z[bc.idM]; \ + bc.fieldOffset = offset; \ + bc.id = EToBM[face + p_Nfaces * e]; \ + bc.u = U[bc.idM + 0 * offset]; \ + bc.v = U[bc.idM + 1 * offset]; \ + bc.w = U[bc.idM + 2 * offset]; \ + bc.wrk = W; \ + dfloat TRx = 0; \ + dfloat TRy = 0; \ + dfloat TRz = 0; \ + const dlong bcType = EToB[face + p_Nfaces * e]; \ + if(bcType == 3) { \ + bc.p = 0; \ + pressureDirichletConditions(&bc); \ + TRx = -bc.p*bc.nx; \ + TRy = -bc.p*bc.ny; \ + TRz = -bc.p*bc.nz; \ + } \ + if(bcType == 8) { \ + velocityNeumannConditions(&bc); \ + TRx = bc.nx * bc.trn + bc.t1x * bc.tr1 + bc.t2x * bc.tr2; \ + TRy = bc.ny * bc.trn + bc.t1y * bc.tr1 + bc.t2y * bc.tr2; \ + TRz = bc.nz * bc.trn + bc.t1z * bc.tr1 + bc.t2z * bc.tr2; \ + } \ + s_ndU[j][i] = WsJ*TRx; \ + s_ndV[j][i] = WsJ*TRy; \ + s_ndW[j][i] = WsJ*TRz; \ } //RHS contributions for continuous solver + @kernel void velocityNeumannBCHex3D(const dlong Nelements, const dlong offset, @restrict const dfloat* sgeo, @@ -253,80 +266,4 @@ } } -@kernel void velocityDirichletBCHex3D(const dlong Nelements, - const dlong offset, - const dfloat time, - @restrict const dfloat* sgeo, - @restrict const dfloat* x, - @restrict const dfloat* y, - @restrict const dfloat* z, - @restrict const dlong* vmapM, - @restrict const int* EToBM, - @restrict const int* EToB, - @restrict const dfloat* W, - @restrict const dfloat* U, - @restrict dfloat* UH) -{ - for(dlong e = 0; e < Nelements; e++; @outer(0)) { - for(int f = 0; f < p_Nfaces; f++) { - for(int m = 0; m < p_Nfp; ++m; @inner(0)) { - struct bcData bc; - const int n = m + f * p_Nfp; - const int sk = e * p_Nfp * p_Nfaces + n; - const dlong sid = e * p_Nfaces * p_Nfp + n; - const dlong idM = vmapM[sk]; - const dlong bcType = EToB[f + p_Nfaces * e]; - - if(bcType == 2) { - bc.id = EToBM[f + p_Nfaces * e]; - bc.idM = idM; - bc.time = time; - bc.x = x[idM]; - bc.y = y[idM]; - bc.z = z[idM]; - - bc.nx = sgeo[sid * p_Nsgeo + p_NXID]; - bc.ny = sgeo[sid * p_Nsgeo + p_NYID]; - bc.nz = sgeo[sid * p_Nsgeo + p_NZID]; - - bc.u = U[idM + 0 * offset]; - bc.v = U[idM + 1 * offset]; - bc.w = U[idM + 2 * offset]; - - bc.wrk = W; - bc.fieldOffset = offset; - - velocityDirichletConditions(&bc); - UH[idM + 0 * offset] = bc.u; - UH[idM + 1 * offset] = bc.v; - UH[idM + 2 * offset] = bc.w; - } - } - @barrier("global"); - } - - for(int f = 0; f < p_Nfaces; f++) { - for(int m = 0; m < p_Nfp; ++m; @inner(0)) { - const int n = m + f * p_Nfp; - const int sk = e * p_Nfp * p_Nfaces + n; - const dlong idM = vmapM[sk]; - const dlong bcType = EToB[f + p_Nfaces * e]; - - if(bcType > 0) { - if(bcType == 1) { - UH[idM + 0 * offset] = 0.0; - UH[idM + 1 * offset] = 0.0; - UH[idM + 2 * offset] = 0.0; - }else if(bcType == 4) { - UH[idM + 0 * offset] = 0.0; - }else if(bcType == 5) { - UH[idM + 1 * offset] = 0.0; - }else if(bcType == 6) { - UH[idM + 2 * offset] = 0.0; - } - } - } - @barrier("global"); - } - } -} +#undef surfaceTerms diff --git a/okl/nrs/velocityRhsHex3D.okl b/okl/nrs/velocityRhsHex3D.okl index ffb4f4f13..962194818 100644 --- a/okl/nrs/velocityRhsHex3D.okl +++ b/okl/nrs/velocityRhsHex3D.okl @@ -24,7 +24,7 @@ */ -@kernel void velocityRhsTOMBOHex3D(const dlong N, +@kernel void velocityRhsHex3D(const dlong N, const dlong fieldOffset, @restrict const dfloat* BF, @restrict const dfloat* GP, diff --git a/okl/nrs/wDivergenceVolumeHex3D.okl b/okl/nrs/wDivergenceVolumeHex3D.okl new file mode 100644 index 000000000..2e2f942d6 --- /dev/null +++ b/okl/nrs/wDivergenceVolumeHex3D.okl @@ -0,0 +1,114 @@ +/* + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ +@kernel void wDivergenceVolumeHex3D(const dlong Nelements, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + const dlong offset, + @restrict const dfloat* U, + @restrict dfloat* divU) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + + @exclusive dfloat r_div[p_Nq]; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const int id = i + j * p_Nq; + s_D[j][i] = D[id]; + +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) + r_div[k] = 0.; + } + } + + @barrier("local"); + +#ifdef smXX +#pragma unroll p_Nq +#endif + for(int k = 0; k < p_Nq; ++k) { + //fetch slice + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdx = vgeo[gid + p_RXID * p_Np]; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat drdz = vgeo[gid + p_RZID * p_Np]; + const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; + const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat Un = U[id + 0 * offset]; + const dfloat Vn = U[id + 1 * offset]; + const dfloat Wn = U[id + 2 * offset]; + + //store covarient field + s_U[j][i] = JW * (drdx * Un + drdy * Vn + drdz * Wn); + s_V[j][i] = JW * (dsdx * Un + dsdy * Vn + dsdz * Wn); + s_W[j][i] = JW * (dtdx * Un + dtdy * Vn + dtdz * Wn); + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[n][i]; + const dfloat Ds = s_D[n][j]; + const dfloat Dt = s_D[k][n]; + r_div[k] += Dr * s_U[j][n]; + r_div[k] += Ds * s_V[n][i]; + r_div[n] += Dt * s_W[j][i]; + } + } + } + + @barrier("local"); + } //k loop + + //write out + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + divU[id] = r_div[k]; + } + } + } + } +} diff --git a/okl/nrs/wGradientVolumeHex3D.okl b/okl/nrs/wGradientVolumeHex3D.okl new file mode 100644 index 000000000..83f24c656 --- /dev/null +++ b/okl/nrs/wGradientVolumeHex3D.okl @@ -0,0 +1,147 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + + +@kernel void wGradientVolumeHex3D(const dlong Nelements, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + const dlong offset, + @restrict const dfloat* P, + @restrict dfloat* gradP) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_DrU[p_Nq][p_Nq]; + @shared dfloat s_DrV[p_Nq][p_Nq]; + @shared dfloat s_DrW[p_Nq][p_Nq]; + + @shared dfloat s_DsU[p_Nq][p_Nq]; + @shared dfloat s_DsV[p_Nq][p_Nq]; + @shared dfloat s_DsW[p_Nq][p_Nq]; + + @shared dfloat s_DtU[p_Nq][p_Nq]; + @shared dfloat s_DtV[p_Nq][p_Nq]; + @shared dfloat s_DtW[p_Nq][p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + + @exclusive dfloat r_gradU[p_Nq]; + @exclusive dfloat r_gradV[p_Nq]; + @exclusive dfloat r_gradW[p_Nq]; + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const int id = i + j * p_Nq; + s_D[j][i] = D[id]; + +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + r_gradU[k] = 0.f; + r_gradV[k] = 0.f; + r_gradW[k] = 0.f; + } + } + } + + @barrier("local"); + +#ifdef smXX +#pragma unroll p_Nq +#endif + for(int k = 0; k < p_Nq; ++k) { + //fetch slice + for(int j = 0; j < p_Nq; ++j; @inner(1)) + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdx = vgeo[gid + p_RXID * p_Np]; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat drdz = vgeo[gid + p_RZID * p_Np]; + const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; + const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat Pn = P[id + 0 * offset]; + + //store covarient field + s_DrU[j][i] = JW * drdx * Pn; + s_DsU[j][i] = JW * dsdx * Pn; + s_DtU[j][i] = JW * dtdx * Pn; + + s_DrV[j][i] = JW * drdy * Pn; + s_DsV[j][i] = JW * dsdy * Pn; + s_DtV[j][i] = JW * dtdy * Pn; + + s_DrW[j][i] = JW * drdz * Pn; + s_DsW[j][i] = JW * dsdz * Pn; + s_DtW[j][i] = JW * dtdz * Pn; + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[n][i]; + const dfloat Ds = s_D[n][j]; + const dfloat Dt = s_D[k][n]; + + r_gradU[k] += Dr * s_DrU[j][n]; + r_gradU[k] += Ds * s_DsU[n][i]; + r_gradU[n] += Dt * s_DtU[j][i]; + + r_gradV[k] += Dr * s_DrV[j][n]; + r_gradV[k] += Ds * s_DsV[n][i]; + r_gradV[n] += Dt * s_DtV[j][i]; + + r_gradW[k] += Dr * s_DrW[j][n]; + r_gradW[k] += Ds * s_DsW[n][i]; + r_gradW[n] += Dt * s_DtW[j][i]; + } + } + } + + @barrier("local"); + } //k loop + + //write out + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { +#pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k) { + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + gradP[id + 0 * offset] = r_gradU[k]; + gradP[id + 1 * offset] = r_gradV[k]; + gradP[id + 2 * offset] = r_gradW[k]; + } + } + } + } +} diff --git a/okl/parAlmond/vectorDotStar1.okl b/okl/parAlmond/vectorDotStar1.okl new file mode 100644 index 000000000..17d730632 --- /dev/null +++ b/okl/parAlmond/vectorDotStar1.okl @@ -0,0 +1,37 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +// b = b.*a + +// c = alpha*(a.*b) + beta *c +@kernel void vectorDotStar1(const dlong N, + @restrict const dfloat * a, + @restrict dfloat * b){ + + for(dlong i=0;i 0) itau = 1 / tau; - - dfloat sigd = p_sigd_min; - dfloat f_beta_str = 1.0; - if (xk > 0) { - const dfloat xk3 = xk * xk * tau * tau; - sigd = p_sigd_max; - f_beta_str = (1.0 + p_fb_c1st * xk3) / (1.0 + p_fb_c2st * xk3); - } - - // compute source term for k - const dfloat Y_k = rho * p_betainf_str * f_beta_str * itau; - const dfloat kSrc = mu_t * stMag2; - const dfloat kDiag = Y_k; - - // compute rource term for omega - const dfloat x_w = abs(OiOjSk) * (tau * tau * tau * p_ibetainf_str3); - const dfloat f_b = (1.0 + p_fb_c1 * x_w) / (1.0 + p_fb_c2 * x_w); - dfloat tauSrc = rho * (p_beta0 * f_b - sigd * xk * tau); - dfloat tauDiag = rho * tau * p_alp_inf * stMag2 + - 8.0 * rho * p_alpinf_str * kk * xtq * p_sigma_tau; - - // apply correction - const dfloat S_tau = 8.0 * mue * xtq; - if (tau <= p_tiny) - tauSrc -= S_tau; - else - tauDiag += S_tau * itau; - - SRC[id + 0 * offset] = kSrc; - SRC[id + 1 * offset] = tauSrc; - SRCDIAG[id + 0 * offset] = kDiag; - SRCDIAG[id + 1 * offset] = tauDiag; - } - } - @barrier("local"); - } - } -} - -@kernel void SijOijHex3D(const dlong Nelements, - const dlong offset, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - @restrict const dfloat* U, - @restrict dfloat* SO) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_U[p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq]; - @exclusive dfloat s_Uloc[p_Nq]; - @exclusive dfloat s_Vloc[p_Nq]; - @exclusive dfloat s_Wloc[p_Nq]; - - @shared dfloat s_D[p_Nq][p_Nq]; - - #pragma unroll p_Nq - for(int k = 0; k < p_Nq; ++k){ - for(int j = 0; j < p_Nq; ++j; @inner(1)){ - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dlong id = i + j * p_Nq; - if(k == 0) s_D[0][id] = D[id]; - - id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat Un = U[id + 0 * offset]; - const dfloat Vn = U[id + 1 * offset]; - const dfloat Wn = U[id + 2 * offset]; - - s_U[j][i] = Un; - s_V[j][i] = Vn; - s_W[j][i] = Wn; - if(k == 0){ - #pragma unroll p_Nq - for(int l = 0 ; l < p_Nq; ++l){ - const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; - const dfloat locUn = U[other_id + 0 * offset]; - const dfloat locVn = U[other_id + 1 * offset]; - const dfloat locWn = U[other_id + 2 * offset]; - s_Uloc[l] = locUn; - s_Vloc[l] = locVn; - s_Wloc[l] = locWn; - } - } - } - } - @barrier("local"); - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat drdx = vgeo[gid + p_RXID * p_Np]; - const dfloat drdy = vgeo[gid + p_RYID * p_Np]; - const dfloat drdz = vgeo[gid + p_RZID * p_Np]; - const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; - const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; - const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; - const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; - const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; - const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - dfloat dudr = 0, duds = 0, dudt = 0; - dfloat dvdr = 0, dvds = 0, dvdt = 0; - dfloat dwdr = 0, dwds = 0, dwdt = 0; - -#pragma unroll p_Nq - for (int n = 0; n < p_Nq; n++) { - const dfloat Dr = s_D[i][n]; - const dfloat Ds = s_D[j][n]; - const dfloat Dt = s_D[k][n]; - dudr += Dr * s_U[j][n]; - duds += Ds * s_U[n][i]; - dudt += Dt * s_Uloc[n]; - - dvdr += Dr * s_V[j][n]; - dvds += Ds * s_V[n][i]; - dvdt += Dt * s_Vloc[n]; - - dwdr += Dr * s_W[j][n]; - dwds += Ds * s_W[n][i]; - dwdt += Dt * s_Wloc[n]; - } - - const dfloat dudx = drdx * dudr + dsdx * duds + dtdx * dudt; - const dfloat dudy = drdy * dudr + dsdy * duds + dtdy * dudt; - const dfloat dudz = drdz * dudr + dsdz * duds + dtdz * dudt; - - const dfloat dvdx = drdx * dvdr + dsdx * dvds + dtdx * dvdt; - const dfloat dvdy = drdy * dvdr + dsdy * dvds + dtdy * dvdt; - const dfloat dvdz = drdz * dvdr + dsdz * dvds + dtdz * dvdt; - - const dfloat dwdx = drdx * dwdr + dsdx * dwds + dtdx * dwdt; - const dfloat dwdy = drdy * dwdr + dsdy * dwds + dtdy * dwdt; - const dfloat dwdz = drdz * dwdr + dsdz * dwds + dtdz * dwdt; - - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - SO[id + 0 * offset] = JW * 2 * dudx; - SO[id + 1 * offset] = JW * 2 * dvdy; - SO[id + 2 * offset] = JW * 2 * dwdz; - SO[id + 3 * offset] = JW * (dudy + dvdx); - SO[id + 4 * offset] = JW * (dvdz + dwdy); - SO[id + 5 * offset] = JW * (dudz + dwdx); - - SO[id + 6 * offset] = JW * (dwdy - dvdz); - SO[id + 7 * offset] = JW * (dudz - dwdx); - SO[id + 8 * offset] = JW * (dvdx - dudy); - } - } - } - } -} - -@kernel void SijOijMag2(const dlong N, - const dlong offset, - @restrict const dfloat* SOIJ, - @restrict dfloat* OIOJSK, - @restrict dfloat* MAGSIJ) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - const dfloat s1 = SOIJ[n + 0 * offset]; - const dfloat s2 = SOIJ[n + 1 * offset]; - const dfloat s3 = SOIJ[n + 2 * offset]; - const dfloat s4 = SOIJ[n + 3 * offset]; - const dfloat s5 = SOIJ[n + 4 * offset]; - const dfloat s6 = SOIJ[n + 5 * offset]; - - const dfloat o1 = SOIJ[n + 6 * offset]; - const dfloat o2 = SOIJ[n + 7 * offset]; - const dfloat o3 = SOIJ[n + 8 * offset]; - - const dfloat magSij = s1 * s1 + s2 * s2 + s3 * s3 + 2 * (s4 * s4 + s5 * s5 + s6 * s6); - const dfloat OiOjSk = s1 * (o2 * o2 + o3 * o3) + s2 * (o1 * o1 + o3 * o3) + s3 * - (o1 * o1 + o2 * o2) - + 2 * (o1 * o2 * s4 + o2 * o3 * s5 - o1 * o3 * s6); - - MAGSIJ[n] = 0.5 * magSij; - OIOJSK[n] = OiOjSk; - } -} - -@kernel void limit(const dlong N, - @restrict dfloat* K, - @restrict dfloat* TAU) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - dfloat k = K[n]; - dfloat tau = TAU[n]; - if(k < 0) k = 0.01 * abs(k); - if(tau < 0) tau = 0.01 * abs(tau); - - K[n] = k; - TAU[n] = tau; - } -} - -@kernel void mue(const dlong N, - const dlong offset, - const dfloat rho, - const dfloat mueLam, - @restrict const dfloat* K, - @restrict const dfloat* TAU, - @restrict dfloat* MUET, - @restrict dfloat* MUE, - @restrict dfloat* DIFF) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - const dfloat k = K[n]; - const dfloat tau = TAU[n]; - const dfloat mut = rho * p_alpinf_str * k * tau; - - MUET[n] = mut; - - MUE[n] = mueLam + mut; - DIFF[n + 0 * offset] = mueLam + p_sigma_k * mut; - DIFF[n + 1 * offset] = mueLam + p_sigma_tau * mut; - } -} diff --git a/okl/plugins/RANSktauComputeHex3D.okl b/okl/plugins/RANSktauComputeHex3D.okl new file mode 100644 index 000000000..3a324e13c --- /dev/null +++ b/okl/plugins/RANSktauComputeHex3D.okl @@ -0,0 +1,157 @@ +@kernel void RANSktauComputeHex3D(const dlong Nelements, + const dlong offset, + const dfloat rho, + const dfloat mue, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + @restrict const dfloat* K, + @restrict const dfloat* TAU, + @restrict const dfloat* STMAG2, + @restrict const dfloat* OIOJSK, + @restrict dfloat* SRCDIAG, + @restrict dfloat* SRC) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_k[p_Nq][p_Nq]; + @shared dfloat s_tau[p_Nq][p_Nq]; + @shared dfloat s_tauSqrt[p_Nq][p_Nq]; + @exclusive dfloat s_kloc[p_Nq]; + @exclusive dfloat s_tauloc[p_Nq]; + @exclusive dfloat s_tauSqrtloc[p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k){ + for(int j = 0; j < p_Nq; ++j; @inner(1)){ + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dlong id = i + j * p_Nq; + if(k == 0) s_D[0][id] = D[id]; + + id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat kn = K[id]; + const dfloat taun = TAU[id]; + + s_k[j][i] = kn; + s_tau[j][i] = taun; + s_tauSqrt[j][i] = sqrt(taun); + if(k == 0){ + #pragma unroll p_Nq + for(int l = 0 ; l < p_Nq; ++l){ + const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + const dfloat lockn = K[other_id]; + const dfloat loctaun = TAU[other_id]; + s_kloc[l] = lockn; + s_tauloc[l] = loctaun; + s_tauSqrtloc[l] = sqrt(loctaun); + } + } + } + } + + @barrier("local"); + + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdx = vgeo[gid + p_RXID * p_Np]; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat drdz = vgeo[gid + p_RZID * p_Np]; + const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; + const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + dfloat dkdr = 0, dkds = 0, dkdt = 0; + dfloat dtaudr = 0, dtauds = 0, dtaudt = 0; + dfloat dtauSqrtdr = 0, dtauSqrtds = 0, dtauSqrtdt = 0; + +#pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[i][n]; + const dfloat Ds = s_D[j][n]; + const dfloat Dt = s_D[k][n]; + + dkdr += Dr * s_k[j][n]; + dkds += Ds * s_k[n][i]; + dkdt += Dt * s_kloc[n]; + + dtaudr += Dr * s_tau[j][n]; + dtauds += Ds * s_tau[n][i]; + dtaudt += Dt * s_tauloc[n]; + + dtauSqrtdr += Dr * s_tauSqrt[j][n]; + dtauSqrtds += Ds * s_tauSqrt[n][i]; + dtauSqrtdt += Dt * s_tauSqrtloc[n]; + } + + const dfloat dkdx = drdx * dkdr + dsdx * dkds + dtdx * dkdt; + const dfloat dkdy = drdy * dkdr + dsdy * dkds + dtdy * dkdt; + const dfloat dkdz = drdz * dkdr + dsdz * dkds + dtdz * dkdt; + + const dfloat dtaudx = drdx * dtaudr + dsdx * dtauds + dtdx * dtaudt; + const dfloat dtaudy = drdy * dtaudr + dsdy * dtauds + dtdy * dtaudt; + const dfloat dtaudz = drdz * dtaudr + dsdz * dtauds + dtdz * dtaudt; + + const dfloat dtauSqrtdx = drdx * dtauSqrtdr + dsdx * dtauSqrtds + dtdx * dtauSqrtdt; + const dfloat dtauSqrtdy = drdy * dtauSqrtdr + dsdy * dtauSqrtds + dtdy * dtauSqrtdt; + const dfloat dtauSqrtdz = drdz * dtauSqrtdr + dsdz * dtauSqrtds + dtdz * dtauSqrtdt; + + const dfloat xk = -(dkdx * dtaudx + dkdy * dtaudy + dkdz * dtaudz); + const dfloat xt = dtaudx * dtaudx + dtaudy * dtaudy + dtaudz * dtaudz; + const dfloat xtq = dtauSqrtdx * dtauSqrtdx + dtauSqrtdy * dtauSqrtdy + dtauSqrtdz * + dtauSqrtdz; + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat stMag2 = STMAG2[id]; + const dfloat OiOjSk = OIOJSK[id]; + + const dfloat kk = K[id]; + const dfloat tau = TAU[id]; + + const dfloat mu_t = rho * p_alpinf_str * kk * tau; + dfloat itau = 0; + if(tau > 0) itau = 1 / tau; + + dfloat sigd = p_sigd_min; + dfloat f_beta_str = 1.0; + if (xk > 0) { + const dfloat xk3 = xk * xk * tau * tau; + sigd = p_sigd_max; + f_beta_str = (1.0 + p_fb_c1st * xk3) / (1.0 + p_fb_c2st * xk3); + } + + // compute source term for k + const dfloat Y_k = rho * p_betainf_str * f_beta_str * itau; + const dfloat kSrc = mu_t * stMag2; + const dfloat kDiag = Y_k; + + // compute rource term for omega + const dfloat x_w = abs(OiOjSk) * (tau * tau * tau * p_ibetainf_str3); + const dfloat f_b = (p_pope) ? (1.0 + p_fb_c1 * x_w) / (1.0 + p_fb_c2 * x_w) : 1.0; + dfloat tauSrc = rho * (p_beta0 * f_b - sigd * xk * tau); + dfloat tauDiag = rho * tau * p_alp_inf * stMag2 + + 8.0 * rho * p_alpinf_str * kk * xtq * p_sigma_tau; + + // apply correction + const dfloat S_tau = 8.0 * mue * xtq; + if (tau <= p_tiny) + tauSrc -= S_tau; + else + tauDiag += S_tau * itau; + + SRC[id + 0 * offset] = kSrc; + SRC[id + 1 * offset] = tauSrc; + SRCDIAG[id + 0 * offset] = kDiag; + SRCDIAG[id + 1 * offset] = tauDiag; + } + } + @barrier("local"); + } + } +} diff --git a/okl/plugins/SijOijHex3D.okl b/okl/plugins/SijOijHex3D.okl new file mode 100644 index 000000000..31843f1ca --- /dev/null +++ b/okl/plugins/SijOijHex3D.okl @@ -0,0 +1,118 @@ + + + + +@kernel void SijOijHex3D(const dlong Nelements, + const dlong offset, + @restrict const dfloat* vgeo, + @restrict const dfloat* D, + @restrict const dfloat* U, + @restrict dfloat* SO) +{ + for(dlong e = 0; e < Nelements; ++e; @outer(0)) { + @shared dfloat s_U[p_Nq][p_Nq]; + @shared dfloat s_V[p_Nq][p_Nq]; + @shared dfloat s_W[p_Nq][p_Nq]; + @exclusive dfloat s_Uloc[p_Nq]; + @exclusive dfloat s_Vloc[p_Nq]; + @exclusive dfloat s_Wloc[p_Nq]; + + @shared dfloat s_D[p_Nq][p_Nq]; + + #pragma unroll p_Nq + for(int k = 0; k < p_Nq; ++k){ + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)){ + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + dlong id = i + j * p_Nq; + if(k == 0) s_D[0][id] = D[id]; + + id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + const dfloat Un = U[id + 0 * offset]; + const dfloat Vn = U[id + 1 * offset]; + const dfloat Wn = U[id + 2 * offset]; + + s_U[j][i] = Un; + s_V[j][i] = Vn; + s_W[j][i] = Wn; + if(k == 0){ + #pragma unroll p_Nq + for(int l = 0 ; l < p_Nq; ++l){ + const dlong other_id = e * p_Np + l * p_Nq * p_Nq + j * p_Nq + i; + const dfloat locUn = U[other_id + 0 * offset]; + const dfloat locVn = U[other_id + 1 * offset]; + const dfloat locWn = U[other_id + 2 * offset]; + s_Uloc[l] = locUn; + s_Vloc[l] = locVn; + s_Wloc[l] = locWn; + } + } + } + } + @barrier("local"); + for(int j = 0; j < p_Nq; ++j; @inner(1)) { + for(int i = 0; i < p_Nq; ++i; @inner(0)) { + const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; + const dfloat drdx = vgeo[gid + p_RXID * p_Np]; + const dfloat drdy = vgeo[gid + p_RYID * p_Np]; + const dfloat drdz = vgeo[gid + p_RZID * p_Np]; + const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; + const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; + const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; + const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; + const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; + const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; + const dfloat JW = vgeo[gid + p_JWID * p_Np]; + + dfloat dudr = 0, duds = 0, dudt = 0; + dfloat dvdr = 0, dvds = 0, dvdt = 0; + dfloat dwdr = 0, dwds = 0, dwdt = 0; + +#pragma unroll p_Nq + for (int n = 0; n < p_Nq; n++) { + const dfloat Dr = s_D[i][n]; + const dfloat Ds = s_D[j][n]; + const dfloat Dt = s_D[k][n]; + dudr += Dr * s_U[j][n]; + duds += Ds * s_U[n][i]; + dudt += Dt * s_Uloc[n]; + + dvdr += Dr * s_V[j][n]; + dvds += Ds * s_V[n][i]; + dvdt += Dt * s_Vloc[n]; + + dwdr += Dr * s_W[j][n]; + dwds += Ds * s_W[n][i]; + dwdt += Dt * s_Wloc[n]; + } + + const dfloat dudx = drdx * dudr + dsdx * duds + dtdx * dudt; + const dfloat dudy = drdy * dudr + dsdy * duds + dtdy * dudt; + const dfloat dudz = drdz * dudr + dsdz * duds + dtdz * dudt; + + const dfloat dvdx = drdx * dvdr + dsdx * dvds + dtdx * dvdt; + const dfloat dvdy = drdy * dvdr + dsdy * dvds + dtdy * dvdt; + const dfloat dvdz = drdz * dvdr + dsdz * dvds + dtdz * dvdt; + + const dfloat dwdx = drdx * dwdr + dsdx * dwds + dtdx * dwdt; + const dfloat dwdy = drdy * dwdr + dsdy * dwds + dtdy * dwdt; + const dfloat dwdz = drdz * dwdr + dsdz * dwds + dtdz * dwdt; + + const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; + + SO[id + 0 * offset] = JW * 2 * dudx; + SO[id + 1 * offset] = JW * 2 * dvdy; + SO[id + 2 * offset] = JW * 2 * dwdz; + SO[id + 3 * offset] = JW * (dudy + dvdx); + SO[id + 4 * offset] = JW * (dvdz + dwdy); + SO[id + 5 * offset] = JW * (dudz + dwdx); + + SO[id + 6 * offset] = JW * (dwdy - dvdz); + SO[id + 7 * offset] = JW * (dudz - dwdx); + SO[id + 8 * offset] = JW * (dvdx - dudy); + } + } + } + } +} diff --git a/okl/plugins/SijOijMag2.okl b/okl/plugins/SijOijMag2.okl new file mode 100644 index 000000000..06c3acef4 --- /dev/null +++ b/okl/plugins/SijOijMag2.okl @@ -0,0 +1,32 @@ + + + + +@kernel void SijOijMag2(const dlong N, + const dlong offset, + @restrict const dfloat* SOIJ, + @restrict dfloat* OIOJSK, + @restrict dfloat* MAGSIJ) +{ + for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) + if(n < N) { + const dfloat s1 = SOIJ[n + 0 * offset]; + const dfloat s2 = SOIJ[n + 1 * offset]; + const dfloat s3 = SOIJ[n + 2 * offset]; + const dfloat s4 = SOIJ[n + 3 * offset]; + const dfloat s5 = SOIJ[n + 4 * offset]; + const dfloat s6 = SOIJ[n + 5 * offset]; + + const dfloat o1 = SOIJ[n + 6 * offset]; + const dfloat o2 = SOIJ[n + 7 * offset]; + const dfloat o3 = SOIJ[n + 8 * offset]; + + const dfloat magSij = s1 * s1 + s2 * s2 + s3 * s3 + 2 * (s4 * s4 + s5 * s5 + s6 * s6); + const dfloat OiOjSk = s1 * (o2 * o2 + o3 * o3) + s2 * (o1 * o1 + o3 * o3) + s3 * + (o1 * o1 + o2 * o2) + + 2 * (o1 * o2 * s4 + o2 * o3 * s5 - o1 * o3 * s6); + + MAGSIJ[n] = 0.5 * magSij; + OIOJSK[n] = OiOjSk; + } +} diff --git a/okl/plugins/avg.okl b/okl/plugins/avg.okl deleted file mode 100644 index 16167eb0b..000000000 --- a/okl/plugins/avg.okl +++ /dev/null @@ -1,55 +0,0 @@ -@kernel void EX(const dlong N, - const dlong offset, - const dlong Nfields, - const dfloat a, - const dfloat b, - @restrict const dfloat* X, - @restrict dfloat* OUT) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - for(dlong fld = 0; fld < Nfields; ++fld) { - const dlong id = n + fld * offset; - const dfloat xn = X[id]; - OUT[id] = a * OUT[id] + b * xn; - } - } -} - -@kernel void EXX(const dlong N, - const dlong offset, - const dlong Nfields, - const dfloat a, - const dfloat b, - @restrict const dfloat* X, - @restrict dfloat* OUT) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - for(dlong fld = 0; fld < Nfields; ++fld) { - const dlong id = n + fld * offset; - const dfloat xn = X[id]; - OUT[id] = a * OUT[id] + b * xn * xn; - } - } -} - -@kernel void EXY(const dlong N, - const dlong offset, - const dlong Nfields, - const dfloat a, - const dfloat b, - @restrict const dfloat* X, - @restrict const dfloat* Y, - @restrict dfloat* OUT) -{ - for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) - if(n < N) { - for(dlong fld = 0; fld < Nfields; ++fld) { - const dlong id = n + fld * offset; - const dfloat xn = X[id]; - const dfloat yn = Y[id]; - OUT[id] = a * OUT[id] + b * xn * yn; - } - } -} diff --git a/okl/plugins/getBCFlux.okl b/okl/plugins/getBCFlux.okl new file mode 100644 index 000000000..a4106cf93 --- /dev/null +++ b/okl/plugins/getBCFlux.okl @@ -0,0 +1,39 @@ + + + +@kernel void getBCFlux(const dlong Nelements, + const dlong bcid, + const dlong offset, + @restrict const dfloat* U, + @restrict const dlong* vmapM, + @restrict const int* EToB, + @restrict const dfloat* sgeo, + @restrict dfloat* Area, + @restrict dfloat* Flux) +{ + for(dlong e = 0; e < Nelements; e++; @outer(0)) + for(int f = 0; f < p_Nfaces; f++) { + @barrier("global"); + for(int m = 0; m < p_Nfp; ++m; @inner(0)) { + const int n = m + f * p_Nfp; + const int id = EToB[f + p_Nfaces * e]; + const dlong sk = e * p_Nfp * p_Nfaces + n; + const int idM = vmapM[sk]; + + const dfloat sWJ = sgeo[sk * p_Nsgeo + p_WSJID]; + const dfloat nx = sgeo[sk * p_Nsgeo + p_NXID]; + const dfloat ny = sgeo[sk * p_Nsgeo + p_NYID]; + const dfloat nz = sgeo[sk * p_Nsgeo + p_NZID]; + const dfloat un = U[idM + 0 * offset] * nx + + U[idM + 1 * offset] * ny + + U[idM + 2 * offset] * nz; + + Area[sk] = 0; + Flux[sk] = 0; + if(id == bcid) { + Area[sk] = sWJ; + Flux[sk] = sWJ * un; + } + } + } +} diff --git a/okl/plugins/limit.okl b/okl/plugins/limit.okl new file mode 100644 index 000000000..fce562b2c --- /dev/null +++ b/okl/plugins/limit.okl @@ -0,0 +1,19 @@ + + + + +@kernel void limit(const dlong N, + @restrict dfloat* K, + @restrict dfloat* TAU) +{ + for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) + if(n < N) { + dfloat k = K[n]; + dfloat tau = TAU[n]; + if(k < 0) k = 0.01 * abs(k); + if(tau < 0) tau = 0.01 * abs(tau); + + K[n] = k; + TAU[n] = tau; + } +} diff --git a/okl/plugins/mue.okl b/okl/plugins/mue.okl new file mode 100644 index 000000000..5d1162c1b --- /dev/null +++ b/okl/plugins/mue.okl @@ -0,0 +1,27 @@ + + + + +@kernel void mue(const dlong N, + const dlong offset, + const dfloat rho, + const dfloat mueLam, + @restrict const dfloat* K, + @restrict const dfloat* TAU, + @restrict dfloat* MUET, + @restrict dfloat* MUE, + @restrict dfloat* DIFF) +{ + for(dlong n = 0; n < N; ++n; @tile(p_blockSize,@outer,@inner)) + if(n < N) { + const dfloat k = K[n]; + const dfloat tau = TAU[n]; + const dfloat mut = rho * p_alpinf_str * k * tau; + + MUET[n] = mut; + + MUE[n] = mueLam + mut; + DIFF[n + 0 * offset] = mueLam + p_sigma_k * mut; + DIFF[n + 1 * offset] = mueLam + p_sigma_tau * mut; + } +} diff --git a/src/mesh/meshParallelGatherScatter.cpp b/okl/plugins/p0thHelper.okl similarity index 68% rename from src/mesh/meshParallelGatherScatter.cpp rename to okl/plugins/p0thHelper.okl index 468613504..cbff65fbd 100644 --- a/src/mesh/meshParallelGatherScatter.cpp +++ b/okl/plugins/p0thHelper.okl @@ -24,8 +24,19 @@ */ -#include -#include -#include - -#include "mesh.h" \ No newline at end of file +@kernel void p0thHelper(const dlong N, + const dfloat dd, + @restrict const dfloat* rhoCp, + @restrict const dfloat* rho, + @restrict const dfloat* massMatrix, + @restrict dfloat * w1, + @restrict dfloat * w2) +{ + for(dlong n=0;n 512 + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 512) { + s_w1[t] += s_w1[t + 512]; + s_w2[t] += s_w2[t + 512]; + } + @barrier("local"); +#endif +#if p_blockSize > 256 + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 256) { + s_w1[t] += s_w1[t + 256]; + s_w2[t] += s_w2[t + 256]; + } + @barrier("local"); +#endif + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 128) { + s_w1[t] += s_w1[t + 128]; + s_w2[t] += s_w2[t + 128]; + } + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 64) { + s_w1[t] += s_w1[t + 64]; + s_w2[t] += s_w2[t + 64]; + } + @barrier("local"); + + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 32) { + s_w1[t] += s_w1[t + 32]; + s_w2[t] += s_w2[t + 32]; + } + @barrier("local"); + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 16) { + s_w1[t] += s_w1[t + 16]; + s_w2[t] += s_w2[t + 16]; + } + @barrier("local"); + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 8) { + s_w1[t] += s_w1[t + 8]; + s_w2[t] += s_w2[t + 8]; + } + @barrier("local"); + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 4) { + s_w1[t] += s_w1[t + 4]; + s_w2[t] += s_w2[t + 4]; + } + @barrier("local"); + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 2) { + s_w1[t] += s_w1[t + 2]; + s_w2[t] += s_w2[t + 2]; + } + @barrier("local"); + for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 1) { + w1[b] = s_w1[0] + s_w1[1]; + w2[b] = s_w2[0] + s_w2[1]; + } + } +} diff --git a/okl/plugins/lowMach.okl b/okl/plugins/surfaceFlux.okl similarity index 65% rename from okl/plugins/lowMach.okl rename to okl/plugins/surfaceFlux.okl index cd8029a9e..5c2926915 100644 --- a/okl/plugins/lowMach.okl +++ b/okl/plugins/surfaceFlux.okl @@ -24,112 +24,6 @@ */ -@kernel void qtlHex3D(const dlong Nelements, - @restrict const dfloat* vgeo, - @restrict const dfloat* D, - const dlong offset, - @restrict const dfloat* gradQ, - @restrict const dfloat* Q, - @restrict const dfloat* DIFF, - @restrict const dfloat* RHOCP, - @restrict const dfloat* SRC, - @restrict dfloat* QTL) -{ - for(dlong e = 0; e < Nelements; ++e; @outer(0)) { - @shared dfloat s_U[p_Nq][p_Nq][p_Nq]; - @shared dfloat s_V[p_Nq][p_Nq][p_Nq]; - @shared dfloat s_W[p_Nq][p_Nq][p_Nq]; - - @shared dfloat s_D[p_Nq][p_Nq]; - - for(int k = 0; k < p_Nq; ++k; @inner(2)) - for(int j = 0; j < p_Nq; ++j; @inner(1)) - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - dlong id = i + j * p_Nq; - if(k == 0) - s_D[0][id] = D[id]; - - id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - - const dfloat Un = gradQ[id + 0 * offset]; - const dfloat Vn = gradQ[id + 1 * offset]; - const dfloat Wn = gradQ[id + 2 * offset]; - const dfloat cond = DIFF[id]; - - s_U[k][j][i] = cond * Un; - s_V[k][j][i] = cond * Vn; - s_W[k][j][i] = cond * Wn; - } - - @barrier("local"); - - for(int k = 0; k < p_Nq; ++k; @inner(2)) { - for(int j = 0; j < p_Nq; ++j; @inner(1)) { - for(int i = 0; i < p_Nq; ++i; @inner(0)) { - const dlong id = e * p_Np + k * p_Nq * p_Nq + j * p_Nq + i; - const dlong gid = e * p_Np * p_Nvgeo + k * p_Nq * p_Nq + j * p_Nq + i; - const dfloat drdx = vgeo[gid + p_RXID * p_Np]; - const dfloat drdy = vgeo[gid + p_RYID * p_Np]; - const dfloat drdz = vgeo[gid + p_RZID * p_Np]; - const dfloat dsdx = vgeo[gid + p_SXID * p_Np]; - const dfloat dsdy = vgeo[gid + p_SYID * p_Np]; - const dfloat dsdz = vgeo[gid + p_SZID * p_Np]; - const dfloat dtdx = vgeo[gid + p_TXID * p_Np]; - const dfloat dtdy = vgeo[gid + p_TYID * p_Np]; - const dfloat dtdz = vgeo[gid + p_TZID * p_Np]; - const dfloat JW = vgeo[gid + p_JWID * p_Np]; - - const dfloat iRhoCpT = 1 / (RHOCP[id] * Q[id]); - const dfloat qvol = SRC[id]; - - dfloat dudr = 0, duds = 0, dudt = 0; - dfloat dvdr = 0, dvds = 0, dvdt = 0; - dfloat dwdr = 0, dwds = 0, dwdt = 0; - dfloat div = 0; -#pragma unroll p_Nq - for (int n = 0; n < p_Nq; n++) { - const dfloat Dr = s_D[i][n]; - const dfloat Ds = s_D[j][n]; - const dfloat Dt = s_D[k][n]; - dudr += Dr * s_U[k][j][n]; - duds += Ds * s_U[k][n][i]; - dudt += Dt * s_U[n][j][i]; - - dvdr += Dr * s_V[k][j][n]; - dvds += Ds * s_V[k][n][i]; - dvdt += Dt * s_V[n][j][i]; - - dwdr += Dr * s_W[k][j][n]; - dwds += Ds * s_W[k][n][i]; - dwdt += Dt * s_W[n][j][i]; - } - - div = (drdx * dudr + dsdx * duds + dtdx * dudt); - div += (drdy * dvdr + dsdy * dvds + dtdy * dvdt); - div += (drdz * dwdr + dsdz * dwds + dtdz * dwdt); - - QTL[id] += JW * iRhoCpT * (div + qvol); - } - } - } - } -} -@kernel void p0thHelper(const dlong N, - const dfloat dd, - @restrict const dfloat* rhoCp, - @restrict const dfloat* rho, - @restrict const dfloat* massMatrix, - @restrict dfloat * w1, - @restrict dfloat * w2) -{ - for(dlong n=0;n 512 - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 512) { - s_w1[t] += s_w1[t + 512]; - s_w2[t] += s_w2[t + 512]; - } - @barrier("local"); -#endif -#if p_blockSize > 256 - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 256) { - s_w1[t] += s_w1[t + 256]; - s_w2[t] += s_w2[t + 256]; - } - @barrier("local"); -#endif - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 128) { - s_w1[t] += s_w1[t + 128]; - s_w2[t] += s_w2[t + 128]; - } - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 64) { - s_w1[t] += s_w1[t + 64]; - s_w2[t] += s_w2[t + 64]; - } - @barrier("local"); - - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 32) { - s_w1[t] += s_w1[t + 32]; - s_w2[t] += s_w2[t + 32]; - } - @barrier("local"); - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 16) { - s_w1[t] += s_w1[t + 16]; - s_w2[t] += s_w2[t + 16]; - } - @barrier("local"); - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 8) { - s_w1[t] += s_w1[t + 8]; - s_w2[t] += s_w2[t + 8]; - } - @barrier("local"); - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 4) { - s_w1[t] += s_w1[t + 4]; - s_w2[t] += s_w2[t + 4]; - } - @barrier("local"); - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 2) { - s_w1[t] += s_w1[t + 2]; - s_w2[t] += s_w2[t + 2]; - } - @barrier("local"); - for(int t = 0; t < p_blockSize; ++t; @inner(0)) if(t < 1) { - w1[b] = s_w1[0] + s_w1[1]; - w2[b] = s_w2[0] + s_w2[1]; - } - } -} diff --git a/scripts/nrs-format b/scripts/nrs-format new file mode 100755 index 000000000..d8cfc758e --- /dev/null +++ b/scripts/nrs-format @@ -0,0 +1,4 @@ +#/bin/bash +find . -regex '.*\.\(cpp\)' -exec clang-format -style=file -i {} \; +find . -regex '.*\.\(hpp\)' -exec clang-format -style=file -i {} \; +find . -regex '.*\.\(okl\)' -exec clang-format -style=file -i {} \; diff --git a/scripts/nrsbmpi b/scripts/nrsbmpi index 482090ead..50e5e5c91 100755 --- a/scripts/nrsbmpi +++ b/scripts/nrsbmpi @@ -5,21 +5,12 @@ mv $1.log.$2 $1.log1.$2 2>/dev/null ulimit -s unlimited 2>/dev/null export NEKRS_HOME=${NEKRS_HOME:="`dirname "$0"`/../"} -: ${NEKRS_CI:=0} -if [ $NEKRS_CI -eq 1 ]; then - if [ $# -eq 0 ] || [ $# -ne 3 ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]; then - echo "usage: ${0##*/} <#tasks> " - exit 1 - fi - mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 --cimode $3 >$1.log.$2 2>&1 & -else - if [ $# -eq 0 ] || [ $# -ne 2 ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]; then - echo "usage: ${0##*/} <#tasks>" - exit 1 - fi - nohup mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 >$1.log.$2 &1 & +if [ $# -eq 0 ] || [ $# -lt 2 ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]; then + echo "usage: ${0##*/} casename #tasks [args]" + exit 1 fi +nohup mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 ${@:3} >$1.log.$2 &1 & ln -sf $1.log.$2 logfile echo "started job in background, redirecting output to ./logfile ..." diff --git a/scripts/nrsmpi b/scripts/nrsmpi index 316e73984..dc20256cb 100755 --- a/scripts/nrsmpi +++ b/scripts/nrsmpi @@ -2,18 +2,9 @@ ulimit -s unlimited 2>/dev/null export NEKRS_HOME=${NEKRS_HOME:="`dirname "$0"`/../"} -: ${NEKRS_CI:=0} -if [ $NEKRS_CI -eq 1 ]; then - if [ $# -eq 0 ] || [ $# -ne 3 ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]; then - echo "usage: ${0##*/} <#tasks> " - exit 1 - fi - mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 --cimode $3 -else - if [ $# -eq 0 ] || [ $# -ne 2 ] || [ "$1" == "-h" ] || [ "$1" == "-help" ]; then - echo "usage: ${0##*/} <#tasks>" - exit 1 - fi - mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 +if [ $# -eq 0 ] || [ $# -lt 2 ] || [ "$1" == "-h" ] || [ "$1" == "--help" ]; then + echo "usage: ${0##*/} casename #tasks [args]" + exit 1 fi +mpirun -np $2 $NEKRS_HOME/bin/nekrs --setup $1 ${@:3} diff --git a/scripts/nrsqsub_crusher b/scripts/nrsqsub_crusher new file mode 100755 index 000000000..c6e1e8f3c --- /dev/null +++ b/scripts/nrsqsub_crusher @@ -0,0 +1,119 @@ +#!/bin/bash + +: ${PROJ_ID:=""} +: ${QUEUE:=""} +: ${NEKRS_HOME:="$HOME/.local/nekrs"} + +export NVME_HOME="/mnt/bb/$USER/" + +if [ $# -ne 3 ]; then + echo "usage: [PROJ_ID] [QUEUE] $0 " + exit 0 +fi + +if [ -z "$PROJ_ID" ]; then + echo "ERROR: PROJ_ID is empty" + exit 1 +fi + +if [ -z "$QUEUE" ]; then + echo "ERROR: QUEUE is empty" + exit 1 +fi + +bin=${NEKRS_HOME}/bin/nekrs +case=$1 +nodes=$2 +gpu_per_node=8 +cores_per_socket=8 +let nn=$nodes*$gpu_per_node +let ntasks=nn +time=$3 +backend=HIP + + +if [ ! -f $bin ]; then + echo "Cannot find" $bin + exit 1 +fi + +if [ ! -f $case.par ]; then + echo "Cannot find" $case.par + exit 1 +fi + +if [ ! -f $case.udf ]; then + echo "Cannot find" $case.udf + exit 1 +fi + +if [ ! -f $case.oudf ]; then + echo "Cannot find" $case.oudf + exit 1 +fi + +if [ ! -f $case.re2 ]; then + echo "Cannot find" $case.re2 + exit 1 +fi + + +# romio setup +export ROMIO_HINTS="$(pwd)/.romio_hint" +if [ ! -f "$ROMIO_HINTS" ]; then + echo "romio_no_indep_rw true" >$ROMIO_HINTS + echo "romio_cb_write enable" >>$ROMIO_HINTS + echo "romio_ds_write enable" >>$ROMIO_HINTS + echo "romio_cb_read enable" >>$ROMIO_HINTS + echo "romio_ds_read enable" >>$ROMIO_HINTS + echo "cb_buffer_size 16777216" >>$ROMIO_HINTS + echo "cb_config_list *:1" >>$ROMIO_HINTS +fi + + +# sbatch +SFILE=s.bin +echo "#!/bin/bash" > $SFILE +echo "#SBATCH -A $PROJ_ID" >>$SFILE +echo "#SBATCH -J nekRS_$case" >>$SFILE +echo "#SBATCH -o %x-%j.out" >>$SFILE +echo "#SBATCH -t $time" >>$SFILE +echo "#SBATCH -N $nodes" >>$SFILE +echo "#SBATCH -p $QUEUE" >>$SFILE +echo "#SBATCH --exclusive" >>$SFILE +echo "#SBATCH --ntasks-per-node=$gpu_per_node" >>$SFILE +echo "#SBATCH --gpus-per-task=1" >>$SFILE +echo "#SBATCH --gpu-bind=closest" >>$SFILE +echo "#SBATCH --cpus-per-task=$cores_per_socket" >>$SFILE + +echo "module load PrgEnv-gnu" >> $SFILE +echo "module load craype-accel-amd-gfx90a" >> $SFILE +echo "module load cray-mpich" >> $SFILE +echo "module load rocm" >> $SFILE +echo "module unload cray-libsci" >> $SFILE +echo "module list" >> $SFILE + +echo "rocm-smi" >>$SFILE +echo "rocm-smi --showpids" >>$SFILE + +echo "export MPICH_GPU_SUPPORT_ENABLED=1" >>$SFILE + +## These must be set before compiling so the executable picks up GTL +echo "export PE_MPICH_GTL_DIR_amd_gfx90a=\"-L${CRAY_MPICH_ROOTDIR}/gtl/lib\"" >> $SFILE +echo "export PE_MPICH_GTL_LIBS_amd_gfx90a=\"-lmpi_gtl_hsa\"" >> $SFILE + +echo "ulimit -s unlimited " >>$SFILE +echo "export NEKRS_HOME=$NEKRS_HOME" >>$SFILE +echo "export NEKRS_GPU_MPI=1 " >>$SFILE + +echo "export NVME_HOME=$NVME_HOME" >>$SFILE +echo "export ROMIO_HINTS=$ROMIO_HINTS" >>$SFILE + +echo "# actual run" >>$SFILE +echo "date" >>$SFILE +echo "srun -n $ntasks $bin --backend $backend --device-id 0 --setup $case" >>$SFILE + +sbatch $SFILE + +# clean-up +rm -rf $SFILE $ROMIO_HINTS diff --git a/scripts/nrsqsub_perlmutter b/scripts/nrsqsub_perlmutter new file mode 100755 index 000000000..24991874b --- /dev/null +++ b/scripts/nrsqsub_perlmutter @@ -0,0 +1,116 @@ +#!/bin/bash + +: ${PROJ_ID:=""} +: ${QUEUE:=""} +: ${NEKRS_HOME:="$HOME/.local/nekrs"} + +if [ $# -ne 3 ]; then + echo "usage: [PROJ_ID] [QUEUE] $0 " + exit 0 +fi + +if [ -z "$PROJ_ID" ]; then + echo "ERROR: PROJ_ID is empty" + exit 1 +fi + +if [ -z "$QUEUE" ]; then + echo "ERROR: QUEUE is empty" + exit 1 +fi + +bin=${NEKRS_HOME}/bin/nekrs +case=$1 +nodes=$2 +gpu_per_node=4 +cores_per_socket=16 +let nn=$nodes*$gpu_per_node +let ntasks=nn +time=$3 +backend=CUDA + + +if [ ! -f $bin ]; then + echo "Cannot find" $bin + exit 1 +fi + +if [ ! -f $case.par ]; then + echo "Cannot find" $case.par + exit 1 +fi + +if [ ! -f $case.udf ]; then + echo "Cannot find" $case.udf + exit 1 +fi + +if [ ! -f $case.oudf ]; then + echo "Cannot find" $case.oudf + exit 1 +fi + +if [ ! -f $case.re2 ]; then + echo "Cannot find" $case.re2 + exit 1 +fi + + +# romio setup +export ROMIO_HINTS="$(pwd)/.romio_hint" +if [ ! -f "$ROMIO_HINTS" ]; then + echo "romio_no_indep_rw true" >$ROMIO_HINTS + echo "romio_cb_write enable" >>$ROMIO_HINTS + echo "romio_ds_write enable" >>$ROMIO_HINTS + echo "romio_cb_read enable" >>$ROMIO_HINTS + echo "romio_ds_read enable" >>$ROMIO_HINTS + echo "cb_buffer_size 16777216" >>$ROMIO_HINTS + echo "cb_config_list *:1" >>$ROMIO_HINTS +fi + + +# sbatch +SFILE=s.bin +echo "#!/bin/bash" > $SFILE +echo "#SBATCH -A $PROJ_ID" >>$SFILE +echo "#SBATCH -J nekRS_$case" >>$SFILE +echo "#SBATCH -o %x-%j.out" >>$SFILE +echo "#SBATCH -t $time" >>$SFILE +echo "#SBATCH -N $nodes" >>$SFILE +echo "#SBATCH -q $QUEUE" >>$SFILE +echo "#SBATCH -C gpu" >>$SFILE +echo "#SBATCH --exclusive" >>$SFILE +echo "#SBATCH --ntasks-per-node=$gpu_per_node" >>$SFILE +echo "#SBATCH --cpus-per-task=$cores_per_socket" >>$SFILE +echo "#SBATCH --gpu-bind=none" >> $SFILE +echo "#SBATCH --gpus-per-node=4" >> $SFILE + +echo "module load PrgEnv-gnu" >>$SFILE +echo "module load cudatoolkit" >>$SFILE +echo "module load cpe-cuda" >>$SFILE +echo "module load cmake" >>$SFILE +echo "module unload cray-libsci" >>$SFILE +echo "module list" >>$SFILE +echo "nvidia-smi" >>$SFILE + +echo "export SLURM_CPU_BIND=\"cores\"" >> $SFILE +echo "export CRAY_ACCEL_TARGET=nvidia80" >>$SFILE +echo "export MPICH_GPU_SUPPORT_ENABLED=1" >>$SFILE + +echo "ulimit -s unlimited " >>$SFILE +echo "export NEKRS_HOME=$NEKRS_HOME" >>$SFILE +echo "export NEKRS_GPU_MPI=1 " >>$SFILE + +echo "export ROMIO_HINTS=$ROMIO_HINTS" >>$SFILE + +echo "# Workaround for https://github.com/Nek5000/Nek5000/issues/759" >> $SFILE +echo "export FI_OFI_RXM_RX_SIZE=32768" >> $SFILE + + +echo "date" >>$SFILE +echo "srun $bin --backend $backend --setup $case" >>$SFILE + +sbatch $SFILE + +# clean-up +rm -rf $SFILE $ROMIO_HINTS diff --git a/scripts/uncrustify.cfg b/scripts/uncrustify.cfg deleted file mode 100644 index 98ab33a61..000000000 --- a/scripts/uncrustify.cfg +++ /dev/null @@ -1,129 +0,0 @@ -indent_align_string=false -indent_braces=false -indent_braces_no_func=false -indent_brace_parent=false -indent_namespace=false -indent_extern=false -indent_class=true -indent_class_colon=true -indent_else_if=false -indent_func_call_param=false -indent_func_def_param=false -indent_func_proto_param=false -indent_func_class_param=false -indent_func_ctor_var_param=false -indent_template_param=false -indent_func_param_double=false -indent_relative_single_line_comments=false -indent_col1_comment=false -indent_access_spec_body=false -indent_paren_nl=false -indent_comma_paren=false -indent_bool_paren=false -indent_square_nl=false -indent_preserve_sql=false -indent_align_assign=true -sp_balance_nested_parens=false -align_keep_tabs=false -align_with_tabs=false -align_on_tabstop=false -#align_number_left=false -align_func_params=false -align_same_func_call_params=false -align_var_def_colon=false -align_var_def_attribute=false -align_var_def_inline=false -align_right_cmt_mix=false -align_on_operator=false -align_mix_var_proto=false -align_single_line_func=false -align_single_line_brace=false -align_nl_cont=false -align_left_shift=true -nl_after_func_body=2 -nl_collapse_empty_body=false -nl_assign_leave_one_liners=false -nl_class_leave_one_liners=true -nl_enum_leave_one_liners=true -nl_getset_leave_one_liners=true -nl_func_leave_one_liners=true -nl_if_leave_one_liners=false -nl_multi_line_cond=false -nl_multi_line_define=false -nl_before_case=false -nl_after_case=false -nl_after_return=false -nl_after_semicolon=true -nl_after_brace_open=false -nl_after_brace_open_cmt=false -nl_after_vbrace_open=false -nl_after_brace_close=false -nl_define_macro=false -nl_squeeze_ifdef=false -nl_ds_struct_enum_cmt=false -nl_ds_struct_enum_close_brace=false -nl_create_if_one_liner=false -nl_create_for_one_liner=false -nl_create_while_one_liner=false -ls_for_split_full=false -ls_func_split_full=true -nl_after_multiline_comment=false -eat_blanks_after_open_brace=true -eat_blanks_before_close_brace=true -mod_pawn_semicolon=false -mod_full_paren_if_bool=false -mod_remove_extra_semicolon=true -mod_sort_import=false -mod_sort_using=false -mod_sort_include=true -mod_move_case_break=false -mod_remove_empty_return=true -cmt_indent_multi=true -cmt_c_group=false -cmt_c_nl_start=false -cmt_c_nl_end=false -cmt_cpp_group=false -cmt_cpp_nl_start=false -cmt_cpp_nl_end=false -cmt_cpp_to_c=false -cmt_star_cont=false -cmt_multi_check_last=true -cmt_insert_before_preproc=false -pp_indent_at_level=false -pp_region_indent_code=false -pp_if_indent_code=false -pp_define_at_level=false -input_tab_size=1 -code_width=120 -nl_max=2 -mod_add_long_ifdef_else_comment=10 -indent_with_tabs=0 -indent_columns = 2 -sp_arith=add -sp_assign=add -sp_enum_assign=add -sp_bool=add -sp_compare=add -sp_before_ptr_star=remove -sp_after_ptr_star=add -nl_end_of_file=add -nl_struct_brace=add -nl_if_brace=remove -nl_brace_else=remove -nl_elseif_brace=remove -nl_else_brace=remove -nl_for_brace=remove -nl_while_brace=remove -nl_switch_brace=remove -nl_namespace_brace=add -nl_class_brace=add -nl_fdef_brace=add -mod_full_brace_for=remove -mod_full_brace_if=remove -mod_full_brace_if_chain=true -mod_full_brace_while=remove -mod_paren_on_return=remove -pp_indent=remove -mod_sort_include=false - -#nl_func_def_start = add diff --git a/scripts/uncrustify.sh b/scripts/uncrustify.sh deleted file mode 100755 index 240c645f6..000000000 --- a/scripts/uncrustify.sh +++ /dev/null @@ -1,9 +0,0 @@ -#/bin/bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -CMD="uncrustify --no-backup --replace -c $DIR/uncrustify.cfg" -if [ "$#" -ne 0 ]; then - $CMD $1 - exit 0 -fi -find . -type f -regextype posix-extended -regex '.*.(cpp|hpp|c|h|okl)' -not -path "*/libP/*" -exec $CMD {} \; diff --git a/scripts/update3rdParty b/scripts/update3rdParty deleted file mode 100644 index 85be69a69..000000000 --- a/scripts/update3rdParty +++ /dev/null @@ -1,11 +0,0 @@ -git subtree split --prefix=3rd_party/nek5000 --annotate='(split)' --rejoin --branch nek5000-subtree-update -git subtree pull --prefix 3rd_party/nek5000 https://github.com/Nek5000/nek5000.git master --squash - -git subtree split --prefix=3rd_party/nek5000_parRSB --annotate='(split)' --rejoin --branch nek5000_parRSB-subtree-update -git subtree pull --prefix 3rd_party/nek5000_parRSB https://github.com/Nek5000/parRSB.git master --squash - -git subtree split --prefix=3rd_party/nek5000_gslib --annotate='(split)' --rejoin --branch nek5000_gslib-subtree-update -git subtree pull --prefix 3rd_party/nek5000_gslib https://github.com/Nek5000/gslib.git master --squash - -git subtree split --prefix=3rd_party/occa --annotate='(split)' --rejoin --branch occa-subtree-update -git subtree pull --prefix 3rd_party/occa https://github.com/Nek5000/occa.git main --squash diff --git a/scripts/updateNek b/scripts/updateNek new file mode 100755 index 000000000..f5352c656 --- /dev/null +++ b/scripts/updateNek @@ -0,0 +1,11 @@ +#!/bin/bash + +git rm -rf 3rd_party/nek5000 3rd_party/nek5000_parRSB 3rd_party/nek5000_gslib +git commit -m 'remove nek' +git subtree add --prefix 3rd_party/nek5000 https://github.com/Nek5000/nek5000.git master --squash +git subtree add --prefix 3rd_party/nek5000_parRSB https://github.com/Nek5000/parRSB.git master --squash +git subtree add --prefix 3rd_party/nek5000_gslib https://github.com/Nek5000/gslib.git master --squash +rm -rf 3rd_party/nek5000/tools 3rd_party/nek5000/run 3rd_party/nek5000/examples 3rd_party/nek5000/short_tests +git reset HEAD~4 --soft +git add -u +git commit -m 'import latest nek' diff --git a/src/.clang-tidy b/src/.clang-tidy new file mode 100644 index 000000000..b4606f0a1 --- /dev/null +++ b/src/.clang-tidy @@ -0,0 +1,10 @@ +--- +Checks: '-*,bugprone*,cppcoreguidelines*,-cppcoreguidelines-pro-bounds*,-cppcoreguidelines-owning-memory,-cppcoreguidelines-special-member-functions,-cppcoreguidelines-avoid-magic-numbers,misc-unconventional-assign-operator,modernize*,-modernize-use-trailing-return-type,mpi*,performance*,readability*,-readability-implicit-bool-conversion,-readability-magic-numbers' +WarningsAsErrors: '' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +CheckOptions: + - key: readability-braces-around-statements.ShortStatementLines + value: '2' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' diff --git a/src/bdry/applyDirichlet.cpp b/src/bdry/applyDirichlet.cpp new file mode 100644 index 000000000..af7ad0d6f --- /dev/null +++ b/src/bdry/applyDirichlet.cpp @@ -0,0 +1,262 @@ +#include "nrs.hpp" +#include "bcMap.hpp" + +void createZeroNormalMask(nrs_t *nrs, occa::memory &o_EToB, occa::memory& o_EToBV, occa::memory &o_mask) +{ + auto mesh = nrs->meshV; + + platform->linAlg->fill(3 * nrs->fieldOffset, 0.0, o_mask); + + nrs->initializeZeroNormalMaskKernel(mesh->Nlocal, nrs->fieldOffset, o_EToBV, o_mask); + + // normal + count (4 fields) + auto o_avgNormal = platform->o_mempool.slice0; + + nrs->averageNormalBcTypeKernel(mesh->Nelements, + nrs->fieldOffset, + ZERO_NORMAL, + mesh->o_sgeo, + mesh->o_vmapM, + o_EToB, + o_avgNormal); + + oogs::startFinish(o_avgNormal, 4, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); + + nrs->fixZeroNormalMaskKernel(mesh->Nelements, + nrs->fieldOffset, + mesh->o_sgeo, + mesh->o_vmapM, + o_EToB, + o_avgNormal, + o_mask); + + oogs::startFinish(o_mask, 3, nrs->fieldOffset, ogsDfloat, ogsMin, nrs->gsh); +} + + +void applyZeroNormalMask(nrs_t *nrs, + dlong Nelements, + occa::memory &o_elementList, + occa::memory &o_EToB, + occa::memory &o_mask, + occa::memory &o_x) +{ + if (Nelements == 0) + return; + + auto *mesh = nrs->meshV; + + nrs->applyZeroNormalMaskKernel(Nelements, + nrs->fieldOffset, + o_elementList, + mesh->o_sgeo, + o_mask, + mesh->o_vmapM, + o_EToB, + o_x); +} + +void applyZeroNormalMask(nrs_t *nrs, occa::memory &o_EToB, occa::memory &o_mask, occa::memory &o_x) +{ + auto *mesh = nrs->meshV; + nrs->applyZeroNormalMaskKernel(mesh->Nelements, + nrs->fieldOffset, + mesh->o_elementList, + mesh->o_sgeo, + o_mask, + mesh->o_vmapM, + o_EToB, + o_x); +} + +void applyDirichlet(nrs_t *nrs, double time) +{ + if (nrs->Nscalar) { + cds_t *cds = nrs->cds; + for (int is = 0; is < cds->NSfields; is++) { + if (!cds->compute[is]) + continue; + mesh_t *mesh = cds->mesh[0]; + ; + oogs_t *gsh = cds->gshT; + if (is) { + mesh = cds->meshV; + gsh = cds->gsh; + } + + platform->linAlg->fill(cds->fieldOffset[is], + -1.0 * std::numeric_limits::max(), + platform->o_mempool.slice2); + for (int sweep = 0; sweep < 2; sweep++) { + cds->dirichletBCKernel(mesh->Nelements, + cds->fieldOffset[is], + is, + time, + mesh->o_sgeo, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + cds->o_EToB[is], + *(cds->o_usrwrk), + platform->o_mempool.slice2); + + if (sweep == 0) + oogs::startFinish(platform->o_mempool.slice2, 1, cds->fieldOffset[is], ogsDfloat, ogsMax, gsh); + if (sweep == 1) + oogs::startFinish(platform->o_mempool.slice2, 1, cds->fieldOffset[is], ogsDfloat, ogsMin, gsh); + } + occa::memory o_Si = + cds->o_S.slice(cds->fieldOffsetScan[is] * sizeof(dfloat), cds->fieldOffset[is] * sizeof(dfloat)); + if (cds->solver[is]->Nmasked) + cds->maskCopyKernel(cds->solver[is]->Nmasked, + 0, + cds->solver[is]->o_maskIds, + platform->o_mempool.slice2, + o_Si); + } + } + + if (nrs->flow) { + mesh_t *mesh = nrs->meshV; + + if (bcMap::unalignedBoundary(mesh->cht, "velocity")) { + applyZeroNormalMask(nrs, nrs->uvwSolver->o_EToB, nrs->o_zeroNormalMaskVelocity, nrs->o_U); + } + + platform->linAlg->fill((1 + nrs->NVfields) * nrs->fieldOffset, + -1.0 * std::numeric_limits::max(), + platform->o_mempool.slice6); + for (int sweep = 0; sweep < 2; sweep++) { + nrs->pressureDirichletBCKernel(mesh->Nelements, + time, + nrs->fieldOffset, + mesh->o_sgeo, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + nrs->o_EToB, + nrs->o_usrwrk, + nrs->o_U, + platform->o_mempool.slice6); + + nrs->velocityDirichletBCKernel(mesh->Nelements, + nrs->fieldOffset, + time, + mesh->o_sgeo, + nrs->o_zeroNormalMaskVelocity, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + nrs->o_EToB, + nrs->o_usrwrk, + nrs->o_U, + platform->o_mempool.slice7); + + if (sweep == 0) + oogs::startFinish(platform->o_mempool.slice6, + 1 + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsMax, + nrs->gsh); + if (sweep == 1) + oogs::startFinish(platform->o_mempool.slice6, + 1 + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsMin, + nrs->gsh); + } + + if (nrs->pSolver->Nmasked) + nrs->maskCopyKernel(nrs->pSolver->Nmasked, + 0, + nrs->pSolver->o_maskIds, + platform->o_mempool.slice6, + nrs->o_P); + + if (nrs->uvwSolver) { + if (nrs->uvwSolver->Nmasked) + nrs->maskCopyKernel(nrs->uvwSolver->Nmasked, + 0 * nrs->fieldOffset, + nrs->uvwSolver->o_maskIds, + platform->o_mempool.slice7, + nrs->o_U); + } + else { + if (nrs->uSolver->Nmasked) + nrs->maskCopyKernel(nrs->uSolver->Nmasked, + 0 * nrs->fieldOffset, + nrs->uSolver->o_maskIds, + platform->o_mempool.slice7, + nrs->o_U); + if (nrs->vSolver->Nmasked) + nrs->maskCopyKernel(nrs->vSolver->Nmasked, + 1 * nrs->fieldOffset, + nrs->vSolver->o_maskIds, + platform->o_mempool.slice7, + nrs->o_U); + if (nrs->wSolver->Nmasked) + nrs->maskCopyKernel(nrs->wSolver->Nmasked, + 2 * nrs->fieldOffset, + nrs->wSolver->o_maskIds, + platform->o_mempool.slice7, + nrs->o_U); + } + } + + if (platform->options.compareArgs("MESH SOLVER", "ELASTICITY")) { + mesh_t *mesh = nrs->meshV; + if (bcMap::unalignedBoundary(mesh->cht, "mesh")) { + applyZeroNormalMask(nrs, nrs->meshSolver->o_EToB, nrs->o_zeroNormalMaskMeshVelocity, mesh->o_U); + } + platform->linAlg->fill(nrs->NVfields * nrs->fieldOffset, + -1.0 * std::numeric_limits::max(), + platform->o_mempool.slice3); + for (int sweep = 0; sweep < 2; sweep++) { + nrs->meshV->velocityDirichletKernel(mesh->Nelements, + nrs->fieldOffset, + time, + bcMap::useDerivedMeshBoundaryConditions(), + mesh->o_sgeo, + nrs->o_zeroNormalMaskMeshVelocity, + mesh->o_x, + mesh->o_y, + mesh->o_z, + mesh->o_vmapM, + mesh->o_EToB, + nrs->o_EToBMeshVelocity, + nrs->o_usrwrk, + nrs->o_U, + platform->o_mempool.slice3); + + if (sweep == 0) + oogs::startFinish(platform->o_mempool.slice3, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsMax, + nrs->gsh); + if (sweep == 1) + oogs::startFinish(platform->o_mempool.slice3, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsMin, + nrs->gsh); + } + + if (nrs->meshSolver->Nmasked) + nrs->maskCopyKernel(nrs->meshSolver->Nmasked, + 0 * nrs->fieldOffset, + nrs->meshSolver->o_maskIds, + platform->o_mempool.slice3, + mesh->o_U); + } +} diff --git a/src/core/bcData.h b/src/bdry/bcData.h similarity index 65% rename from src/core/bcData.h rename to src/bdry/bcData.h index 772b0a3dd..87281c540 100644 --- a/src/core/bcData.h +++ b/src/bdry/bcData.h @@ -10,11 +10,19 @@ struct bcData dfloat x, y, z; dfloat nx, ny, nz; + // tangential directions + dfloat t1x, t1y, t1z; + dfloat t2x, t2y, t2z; + + dfloat trn, tr1, tr2; + dfloat u, v, w; dfloat p; int scalarId; dfloat s, flux; + dfloat meshu, meshv, meshw; + @globalPtr const dfloat* wrk; }; diff --git a/src/bdry/bcMap.cpp b/src/bdry/bcMap.cpp new file mode 100644 index 000000000..1d0621c5d --- /dev/null +++ b/src/bdry/bcMap.cpp @@ -0,0 +1,698 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nrs.hpp" +#include "platform.hpp" +#include "udf.hpp" + +#include +#include "alignment.hpp" + +namespace { +boundaryAlignment_t computeAlignment(mesh_t *mesh, dlong element, dlong face) +{ + const dfloat alignmentTol = 1e-3; + dfloat nxDiff = 0.0; + dfloat nyDiff = 0.0; + dfloat nzDiff = 0.0; + + for (int fp = 0; fp < mesh->Nfp; ++fp) { + const dlong sid = mesh->Nsgeo * (mesh->Nfaces * mesh->Nfp * element + mesh->Nfp * face + fp); + const dfloat nx = mesh->sgeo[sid + NXID]; + const dfloat ny = mesh->sgeo[sid + NYID]; + const dfloat nz = mesh->sgeo[sid + NZID]; + nxDiff += std::abs(std::abs(nx) - 1.0); + nyDiff += std::abs(std::abs(ny) - 1.0); + nzDiff += std::abs(std::abs(nz) - 1.0); + } + + nxDiff /= mesh->Nfp; + nyDiff /= mesh->Nfp; + nzDiff /= mesh->Nfp; + + if (nxDiff < alignmentTol) + return boundaryAlignment_t::X; + if (nyDiff < alignmentTol) + return boundaryAlignment_t::Y; + if (nzDiff < alignmentTol) + return boundaryAlignment_t::Z; + + return boundaryAlignment_t::UNALIGNED; +} +} // namespace + +static bool meshConditionsDerived = false; +static std::set fields; +// stores for every (field, boundaryID) pair a bcID +static std::map, int> bToBc; +static int nbid[] = {0, 0}; +static bool importFromNek = true; + +static std::map vBcTextToID = { + {"periodic", 0}, + {"zerovalue", 1}, + {"fixedvalue", 2}, + {"codedFixedvalue", 2}, + {"zerogradient", 3}, + {"zeroxvalue/zerogradient", 4}, + {"zeroyvalue/zerogradient", 5}, + {"zerozvalue/zerogradient", 6}, + {"zeronvalue/zerogradient", 7}, + {"zeronvalue/fixedgradient", 8}, + {"zeronvalue/codedFixedgradient", 8} +}; + +static std::map vBcIDToText = {{0, "periodic"}, + {1, "zeroValue"}, + {2, "codedFixedValue"}, + {3, "zeroGradient"}, + {4, "zeroXValue/zeroGradient"}, + {5, "zeroYValue/zeroGradient"}, + {6, "zeroZValue/zeroGradient"}, + {7, "zeroNValue/zeroGradient"}, + {8, "zeroNValue/codedFixedGradient"}}; + +static std::map sBcTextToID = { + {"periodic", 0}, + {"fixedvalue", 1}, + {"codedFixedvalue", 1}, + {"zerogradient", 2}, + {"fixedgradient", 3}, + {"codedFixedgradient", 3} +}; + +static std::map sBcIDToText = { + {0, "periodic" }, + {1, "codedFixedValue" }, + {2, "zeroGradient" }, + {3, "codedFixedGradient"} +}; + +static void v_setup(std::string s); +static void s_setup(std::string s); + +static void v_setup(std::string field, std::vector slist) +{ + for (int bid = 0; bid < slist.size(); bid++) { + std::string key = slist[bid]; + if (key.compare("p") == 0) key = "periodic"; + if (key.compare("w") == 0) key = "zerovalue"; + if (key.compare("wall") == 0) key = "zerovalue"; + if (key.compare("inlet") == 0) key = "fixedvalue"; + if (key.compare("v") == 0) key = "fixedvalue"; + if (key.compare("mv") == 0) key = "fixedvalue"; + if (key.compare("fixedvalue+moving") == 0) key = "fixedvalue"; + if (key.compare("outlet") == 0) key = "zerogradient"; + if (key.compare("outflow") == 0) key = "zerogradient"; + if (key.compare("o") == 0) key = "zerogradient"; + if (key.compare("slipx") == 0) key = "zeroxvalue/zerogradient"; + if (key.compare("slipy") == 0) key = "zeroyvalue/zerogradient"; + if (key.compare("slipz") == 0) key = "zerozvalue/zerogradient"; + if (key.compare("symx") == 0) key = "zeroxvalue/zerogradient"; + if (key.compare("symy") == 0) key = "zeroyvalue/zerogradient"; + if (key.compare("symz") == 0) key = "zerozvalue/zerogradient"; + if (key.compare("sym") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("shl") == 0) + key = "zeronvalue/fixedgradient"; + + if (vBcTextToID.find(key) == vBcTextToID.end()) { + std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; + ABORT(1); + } + + bToBc[make_pair(field, bid)] = vBcTextToID.at(key); + } +} + +static void s_setup(std::string field, std::vector slist) +{ + for (int bid = 0; bid < slist.size(); bid++) { + std::string key = slist[bid]; + if (key.compare("p") == 0) key = "periodic"; + if (key.compare("t") == 0) key = "fixedvalue"; + if (key.compare("inlet") == 0) key = "fixedvalue"; + if (key.compare("flux") == 0) key = "fixedgradient"; + if (key.compare("f") == 0) key = "fixedgradient"; + if (key.compare("zeroflux") == 0) key = "zerogradient"; + if (key.compare("i") == 0) key = "zerogradient"; + if (key.compare("insulated") == 0) key = "zerogradient"; + if (key.compare("outflow") == 0) key = "zerogradient"; + if (key.compare("outlet") == 0) key = "zerogradient"; + if (key.compare("o") == 0) key = "zerogradient"; + + if (sBcTextToID.find(key) == sBcTextToID.end()) { + std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; + ABORT(1); + } + + bToBc[make_pair(field, bid)] = sBcTextToID.at(key); + } +} + +namespace bcMap +{ +bool useNekBCs() { return importFromNek; } + +void setup(std::vector slist, std::string field) +{ + if (slist.size() == 0) + return; + + importFromNek = false; + + if (slist[0].compare("none") == 0) + return; + + fields.insert(field); + + if (field.compare(0, 8, "scalar00") == 0) /* tmesh */ + nbid[1] = slist.size(); + else + nbid[0] = slist.size(); + + if (field.compare("velocity") == 0) + v_setup(field, slist); + else if (field.compare("mesh") == 0) + v_setup(field, slist); + else if (field.compare(0, 6, "scalar") == 0) + s_setup(field, slist); +} + +void deriveMeshBoundaryConditions(std::vector velocityBCs) +{ + if (velocityBCs.size() == 0 || velocityBCs[0].compare("none") == 0) return; + + meshConditionsDerived = true; + + const std::string field = "mesh"; + + fields.insert(field); + + for (int bid = 0; bid < velocityBCs.size(); bid++) { + std::string key = velocityBCs[bid]; + if (key.compare("p") == 0) key = "periodic"; + if (key.compare("w") == 0) key = "zerovalue"; + if (key.compare("wall") == 0) key = "zerovalue"; + if (key.compare("inlet") == 0) key = "zerovalue"; + if (key.compare("v") == 0) key = "zerovalue"; + if (key.compare("mv") == 0) key = "fixedvalue"; + if (key.compare("fixedvalue+moving") == 0) key = "fixedvalue"; + + // all other bounds map to SYM + if (key.compare("outlet") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("outflow") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("o") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("slipx") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("slipy") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("slipz") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("symx") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("symy") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("symz") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("sym") == 0) key = "zeronvalue/zerogradient"; + if (key.compare("shl") == 0) + key = "zeronvalue/zerogradient"; + + if (vBcTextToID.find(key) == vBcTextToID.end()) { + std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; + ABORT(1); + } + + bToBc[make_pair(field, bid)] = vBcTextToID.at(key); + } +} + +// return boundary type id for a given boundary id +int id(int bid, std::string field) +{ + if (bid < 1) + return NO_OP; + + try { + return bToBc.at({field, bid - 1}); + } + catch (const std::out_of_range &oor) { + return NO_OP; + } +} + +int type(int bid, std::string field) +{ + if (bid < 1) + return NO_OP; + + // printf("%d %s\n", bid, field.c_str()); + + try { + int bcType; + if (field.compare("x-velocity") == 0) { + const int bcID = bToBc.at({"velocity", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = DIRICHLET; + if (bcID == 5) + bcType = NEUMANN; + if (bcID == 6) + bcType = NEUMANN; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("y-velocity") == 0) { + const int bcID = bToBc.at({"velocity", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = NEUMANN; + if (bcID == 5) + bcType = DIRICHLET; + if (bcID == 6) + bcType = NEUMANN; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("z-velocity") == 0) { + const int bcID = bToBc.at({"velocity", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = NEUMANN; + if (bcID == 5) + bcType = NEUMANN; + if (bcID == 6) + bcType = DIRICHLET; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("x-mesh") == 0) { + const int bcID = bToBc.at({"mesh", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = DIRICHLET; + if (bcID == 5) + bcType = NEUMANN; + if (bcID == 6) + bcType = NEUMANN; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("y-mesh") == 0) { + const int bcID = bToBc.at({"mesh", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = NEUMANN; + if (bcID == 5) + bcType = DIRICHLET; + if (bcID == 6) + bcType = NEUMANN; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("z-mesh") == 0) { + const int bcID = bToBc.at({"mesh", bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = DIRICHLET; + if (bcID == 3) + bcType = NEUMANN; + if (bcID == 4) + bcType = NEUMANN; + if (bcID == 5) + bcType = NEUMANN; + if (bcID == 6) + bcType = DIRICHLET; + if (bcID == 7) + bcType = ZERO_NORMAL; + if (bcID == 8) + bcType = ZERO_NORMAL; + } + else if (field.compare("pressure") == 0) { + const int bcID = bToBc.at({"velocity", bid - 1}); + if (bcID == 1) + bcType = NEUMANN; + if (bcID == 2) + bcType = NEUMANN; + if (bcID == 3) + bcType = DIRICHLET; + if (bcID == 4) + bcType = NEUMANN; + if (bcID == 5) + bcType = NEUMANN; + if (bcID == 6) + bcType = NEUMANN; + if (bcID == 7) + bcType = NEUMANN; + if (bcID == 8) + bcType = NEUMANN; + } + else if (field.compare(0, 6, "scalar") == 0) { + const int bcID = bToBc.at({field, bid - 1}); + if (bcID == 1) + bcType = DIRICHLET; + if (bcID == 2) + bcType = NEUMANN; + if (bcID == 3) + bcType = NEUMANN; + } + return bcType; + } + catch (const std::out_of_range &oor) { + return NO_OP; + } +} + +std::string text(int bid, std::string field) +{ + if (bid < 1) return std::string(); + + const int bcID = bToBc.at({field, bid - 1}); + + if (field.compare("velocity") == 0 && bcID == 2) + oudfFindDirichlet(field); + if (field.compare("mesh") == 0 && bcID == 2) + oudfFindDirichlet(field); + if (field.compare("pressure") == 0 && bcID == 3) + oudfFindDirichlet(field); + if (field.compare(0, 6, "scalar") == 0 && bcID == 1) + oudfFindDirichlet(field); + + if (field.compare("velocity") == 0 && bcID == 8) + oudfFindNeumann(field); + if (field.compare("mesh") == 0 && bcID == 8) + oudfFindNeumann(field); + if (field.compare(0, 6, "scalar") == 0 && bcID == 3) + oudfFindNeumann(field); + + if (field.compare("velocity") == 0 || field.compare("mesh") == 0) + return vBcIDToText.at(bcID); + else if (field.compare(0, 6, "scalar") == 0) + return sBcIDToText.at(bcID); + + std::cout << __func__ << "(): Unexpected error occured!" << std::endl; + ABORT(1); + return 0; +} + +int size(int isTmesh) +{ + return isTmesh ? nbid[1] : nbid[0]; +} + +bool useDerivedMeshBoundaryConditions() +{ + if (importFromNek) { + return true; + } + else { + return meshConditionsDerived; + } +} + + +void check(mesh_t* mesh) +{ + + int nid = nbid[0]; + if(mesh->cht) nid = nbid[1]; + + int err = 0; + int found = 0; + + for (int id = 1; id <= nid; id++) { + found = 0; + for (int f = 0; f < mesh->Nelements * mesh->Nfaces; f++) { + if (mesh->EToB[f] == id) { + found = 1; + break; + } + } + MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + err += (found ? 0 : 1); + if (err && platform->comm.mpiRank == 0) + printf("Cannot find boundary ID %d in mesh!\n", id); + } + if (err) EXIT_AND_FINALIZE(EXIT_FAILURE); + + found = 0; + for (int f = 0; f < mesh->Nelements * mesh->Nfaces; f++) { + if (mesh->EToB[f] < -1 || mesh->EToB[f] == 0 || mesh->EToB[f] > nid) + found = 1; + } + MPI_Allreduce(MPI_IN_PLACE, &found, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + if (found) { + if (platform->comm.mpiRank == 0) printf("WARNING: Mesh has unmapped boundary IDs!\n"); + } + + +} + +void setBcMap(std::string field, int* map, int nIDs) +{ + if (field.compare(0, 8, "scalar00") == 0) + nbid[1] = nIDs; + else + nbid[0] = nIDs; + + fields.insert(field); + for (int i = 0; i < nIDs; i++) + bToBc[make_pair(field, i)] = map[i]; +} + +void checkBoundaryAlignment(mesh_t *mesh) +{ + int nid = nbid[0]; + if (mesh->cht) + nid = nbid[1]; + bool bail = false; + for (auto &&field : fields) { + if (field != std::string("velocity") && field != std::string("mesh")) + continue; + + std::map expectedAlignmentInvalidBIDs; + std::map> actualAlignmentsInvalidBIDs; + + for (int e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + int bid = mesh->EToB[e * mesh->Nfaces + f]; + int bc = id(bid, field); + if (bc == 4 || bc == 5 || bc == 6) { + auto expectedAlignment = boundaryAlignment_t::UNALIGNED; + switch (bc) { + case 4: + expectedAlignment = boundaryAlignment_t::X; + break; + case 5: + expectedAlignment = boundaryAlignment_t::Y; + break; + case 6: + expectedAlignment = boundaryAlignment_t::Z; + break; + } + + auto alignment = computeAlignment(mesh, e, f); + if (alignment != expectedAlignment) { + expectedAlignmentInvalidBIDs[bid] = expectedAlignment; + actualAlignmentsInvalidBIDs[bid].insert(alignment); + } + } + } + } + + int err = expectedAlignmentInvalidBIDs.size(); + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + if (err > 0) { + bail = true; + + std::vector valid(nid, 1); + for (int bid = 1; bid <= nid; bid++) { + valid[bid - 1] = expectedAlignmentInvalidBIDs.count(bid) == 0; + } + + constexpr int invalidAlignment = -1; + constexpr int nAlignments = 4; + std::vector expectedAlignments(nid, invalidAlignment); + std::vector encounteredAlignments(nid * nAlignments, invalidAlignment); + for (auto &&bidAndAlignments : actualAlignmentsInvalidBIDs) { + const auto bid = bidAndAlignments.first; + const auto &alignments = bidAndAlignments.second; + encounteredAlignments[(bid - 1) * nAlignments + 0] = (alignments.count(boundaryAlignment_t::X)); + encounteredAlignments[(bid - 1) * nAlignments + 1] = (alignments.count(boundaryAlignment_t::Y)); + encounteredAlignments[(bid - 1) * nAlignments + 2] = (alignments.count(boundaryAlignment_t::Z)); + encounteredAlignments[(bid - 1) * nAlignments + 3] = + (alignments.count(boundaryAlignment_t::UNALIGNED)); + expectedAlignments[(bid - 1)] = static_cast(expectedAlignmentInvalidBIDs[bid]); + } + MPI_Allreduce(MPI_IN_PLACE, valid.data(), nid, MPI_INT, MPI_MIN, platform->comm.mpiComm); + MPI_Allreduce(MPI_IN_PLACE, + encounteredAlignments.data(), + nid * nAlignments, + MPI_INT, + MPI_MAX, + platform->comm.mpiComm); + MPI_Allreduce(MPI_IN_PLACE, expectedAlignments.data(), nid, MPI_INT, MPI_MAX, platform->comm.mpiComm); + + if (platform->comm.mpiRank == 0) { + std::cout << "Encountered incorrectly aligned boundaries in field \"" << field << "\":\n"; + for (int bid = 1; bid <= nid; bid++) { + if (valid[bid - 1] == 0) { + std::cout << "\tBoundary ID " << bid << ":\n"; + std::cout << "\t\texpected alignment : " + << to_string(static_cast(expectedAlignments[bid - 1])) << "\n"; + std::cout << "\t\tencountered alignments:\n"; + if (encounteredAlignments[(bid - 1) * nAlignments + 0]) + std::cout << "\t\t\tX\n"; + if (encounteredAlignments[(bid - 1) * nAlignments + 1]) + std::cout << "\t\t\tY\n"; + if (encounteredAlignments[(bid - 1) * nAlignments + 2]) + std::cout << "\t\t\tZ\n"; + if (encounteredAlignments[(bid - 1) * nAlignments + 3]) + std::cout << "\t\t\tUNALIGNED\n"; + } + } + } + + fflush(stdout); + MPI_Barrier(platform->comm.mpiComm); + } + } + + if (bail) { + ABORT(1); + } +} + +void remapUnalignedBoundaries(mesh_t *mesh) +{ + for (auto &&field : fields) { + if (field != std::string("velocity") && field != std::string("mesh")) + continue; + + std::map remapBID; + std::map alignmentBID; + + int nid = nbid[0]; + if (mesh->cht) + nid = nbid[1]; + + for (int bid = 1; bid <= nid; ++bid) { + int bcType = id(bid, field); + remapBID[bid] = (bcType == 7); + } + + for (int e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + int bid = mesh->EToB[f + e * mesh->Nfaces]; + int bc = id(bid, field); + auto alignment = computeAlignment(mesh, e, f); + if (alignmentBID.count(bid) == 0) { + alignmentBID[bid] = alignment; + } + + auto previousAlignment = alignmentBID[bid]; + remapBID[bid] &= (alignment != boundaryAlignment_t::UNALIGNED) && (alignment == previousAlignment); + } + } + + // if a single unaligned boundary with SYM/SHL is present, no remapping may occur. + int unalignedBoundaryPresent = 0; + for (int bid = 1; bid <= nid; ++bid) { + int canRemap = remapBID[bid]; + int bc = id(bid, field); + bool unalignedBoundaryType = bc == 7 || bc == 8; + if (!canRemap && unalignedBoundaryType) { + unalignedBoundaryPresent++; + } + } + + MPI_Allreduce(MPI_IN_PLACE, &unalignedBoundaryPresent, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + if (unalignedBoundaryPresent > 0) { + return; + } + + for (int bid = 1; bid <= nid; ++bid) { + int canRemap = remapBID[bid]; + MPI_Allreduce(MPI_IN_PLACE, &canRemap, 1, MPI_INT, MPI_MIN, platform->comm.mpiComm); + if (canRemap) { + if(platform->comm.mpiRank == 0 && platform->options.compareArgs("VERBOSE","TRUE")){ + std::cout << "Remapping bid " << bid << " to an aligned type!\n"; + } + + auto alignmentType = alignmentBID[bid]; + + int newBcType = 0; + switch (alignmentType) { + case boundaryAlignment_t::X: + newBcType = 4; + break; + case boundaryAlignment_t::Y: + newBcType = 5; + break; + case boundaryAlignment_t::Z: + newBcType = 6; + break; + default: + break; + } + + bToBc.at({field, bid - 1}) = newBcType; + } + } + } +} + +bool unalignedBoundary(bool cht, std::string field) +{ + int nid = nbid[0]; + if (cht) + nid = nbid[1]; + + for (int bid = 1; bid <= nid; bid++) { + int bcType = id(bid, field); + if (bcType == 7) + return true; + if (bcType == 8) + return true; + } + + return false; +} + +} // namespace diff --git a/src/core/bcMap.hpp b/src/bdry/bcMap.hpp similarity index 61% rename from src/core/bcMap.hpp rename to src/bdry/bcMap.hpp index dcb6e6412..9dbd4aef4 100644 --- a/src/core/bcMap.hpp +++ b/src/bdry/bcMap.hpp @@ -7,6 +7,7 @@ namespace bcMap { +bool useNekBCs(); void setup(std::vector slist, std::string field); int id(int bid, std::string field); int type(int bid, std::string field); @@ -14,6 +15,11 @@ std::string text(int bid, std::string field); int size(int isTmesh); void check(mesh_t* mesh); void setBcMap(std::string field, int* map, int nbid); +void checkBoundaryAlignment(mesh_t *mesh); +void remapUnalignedBoundaries(mesh_t *mesh); +bool unalignedBoundary(bool cht, std::string field); +void deriveMeshBoundaryConditions(std::vector velocityBCs); +bool useDerivedMeshBoundaryConditions(); } #endif diff --git a/src/bdry/bdry.hpp b/src/bdry/bdry.hpp new file mode 100644 index 000000000..c24450c41 --- /dev/null +++ b/src/bdry/bdry.hpp @@ -0,0 +1,16 @@ +#if !defined(BDRY_HPP) +#define BDRY_HPP + +class nrs_t; +void applyDirichlet(nrs_t *nrs, double time); +void createEToBV(const mesh_t* mesh, const int* EToB, occa::memory& o_EToBV); +void createZeroNormalMask(nrs_t *nrs, occa::memory &o_EToB, occa::memory& o_EToBV, occa::memory &o_mask); +void applyZeroNormalMask(nrs_t *nrs, occa::memory &o_EToB, occa::memory &o_mask, occa::memory &o_x); +void applyZeroNormalMask(nrs_t *nrs, + dlong Nelements, + occa::memory &o_elementList, + occa::memory &o_EToB, + occa::memory &o_mask, + occa::memory &o_x); + +#endif diff --git a/src/bdry/createEToBV.cpp b/src/bdry/createEToBV.cpp new file mode 100644 index 000000000..603137419 --- /dev/null +++ b/src/bdry/createEToBV.cpp @@ -0,0 +1,34 @@ +#include +#include "elliptic.h" + +// pre: EToB allocated, capacity mesh->Nfaces * mesh->Nelements dlong words, +// o_EToBV allocated, capacity mesh->Nlocal dlong words +void createEToBV(const mesh_t* mesh, const int* EToB, occa::memory& o_EToBV) +{ + const int largeNumber = 1 << 20; + + std::vector EToBV(mesh->Nlocal, largeNumber); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + int bc = EToB[f + e * mesh->Nfaces]; + if (bc > 0) { + for (int n = 0; n < mesh->Nfp; n++) { + int fid = mesh->faceNodes[n + f * mesh->Nfp]; + EToBV[fid + e * mesh->Np] = std::min(bc, EToBV[fid + e * mesh->Np]); + } + } + } + } + ogsGatherScatter(EToBV.data(), + ogsInt, + ogsMin, + mesh->ogs); + + for (dlong n = 0; n < mesh->Nlocal; n++) { + if (EToBV[n] == largeNumber) { + EToBV[n] = 0; + } + } + + o_EToBV.copyFrom(EToBV.data(), EToBV.size() * sizeof(int)); +} \ No newline at end of file diff --git a/src/bench/advsub/README.md b/src/bench/advsub/README.md new file mode 100644 index 000000000..28a9c0ddc --- /dev/null +++ b/src/bench/advsub/README.md @@ -0,0 +1,22 @@ +This benchmark applies the subcycling operator. + +# Usage + +``` +Usage: ./nekrs-bench-advsub --p-order --elements --backend + [--no-cubature] [--ext-order ] [--c-order] [--iterations ] +``` + +# Examples + +### NVIDIA A100 +``` +> mpirun -np 1 nekrs-bench-advsub --p-order 7 --elements 4096 --backend CUDA +MPItasks=1 OMPthreads=1 NRepetitions=12768 N=7 cubN=10 Nelements=4096 elapsed time=0.00061705 wordSize=64 GDOF/s=2.27685 GB/s=614.419 GFLOPS/s=3799.65 +``` + +### AMD EPYC 7742 64-Core Processor +``` +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-advsub --p-order 7 --elements 1024 --backend CPU +MPItasks=64 OMPthreads=2 NRepetitions=7396 N=7 cubN=10 Nelements=1024 elapsed time=0.00132257 wordSize=64 GDOF/s=0.265569 GB/s=71.7457 GFLOPS/s=443.187 +``` diff --git a/src/bench/advsub/benchmarkAdvsub.cpp b/src/bench/advsub/benchmarkAdvsub.cpp new file mode 100644 index 000000000..5dd470631 --- /dev/null +++ b/src/bench/advsub/benchmarkAdvsub.cpp @@ -0,0 +1,374 @@ +#include "benchmarkAdvsub.hpp" +#include +#include +#include +#include "nrs.hpp" + +#include "randomVector.hpp" +#include "kernelBenchmarker.hpp" +#include "omp.h" + +namespace{ + +// for a given Nq, return the largest cubNq +const std::map maximumCubaturePoints = { + {2,3}, + {3,5}, + {4,6}, + {5,8}, + {6,9}, + {7,11}, + {8,12}, + {9,14}, + {10,15}, + {11,17}, + {12,18}, + {13,20}, + {14,21}, +}; +struct CallParameters{ + int Nfields; + int Nelements; + int Nq; + int cubNq; + int nEXT; + bool dealias; + bool isScalar; +}; +} + +namespace std +{ + template<> struct less + { + bool operator() (const CallParameters& lhs, const CallParameters& rhs) const + { + auto tier = [](const CallParameters& v) + { + return std::tie(v.Nfields, v.Nelements, v.Nq, v.cubNq, v.nEXT, v.dealias, v.isScalar); + }; + return tier(lhs) < tier(rhs); + } + }; +} + +namespace{ +std::map cachedResults; +} + +template +occa::kernel +benchmarkAdvsub(int Nfields, int Nelements, int Nq, int cubNq, int nEXT, bool dealias, bool isScalar, int verbosity, T NtestsOrTargetTime, bool requiresBenchmark) +{ + if(platform->options.compareArgs("BUILD ONLY", "TRUE")){ + Nelements = 1; + } + + CallParameters params{ + Nfields, + Nelements, + Nq, + cubNq, + nEXT, + dealias, + isScalar, + }; + + if(cachedResults.count(params) > 0){ + return cachedResults.at(params); + } + + if(Nq > 14){ + if(platform->comm.mpiRank == 0){ + std::cout << "Error: maximum Nq of 14 has been exceed with Nq=" << Nq << ".\n"; + } + ABORT(1); + } + + const auto largestCubNq = maximumCubaturePoints.at(Nq); + if(cubNq > largestCubNq){ + if(platform->comm.mpiRank == 0){ + std::cout << "Error: maximum cubNq for Nq = " << Nq << " is " << largestCubNq << ".\n"; + std::cout << "cubNq as specified is " << cubNq << ".\n"; + } + ABORT(1); + } + + if (!dealias || cubNq < Nq) { + cubNq = Nq; + } + + static constexpr int NVFields = 3; + const int N = Nq-1; + const int cubN = cubNq - 1; + const int Np = Nq * Nq * Nq; + const int cubNp = cubNq * cubNq * cubNq; + int fieldOffset = Np * Nelements; + const int pageW = ALIGN_SIZE / sizeof(dfloat); + if (fieldOffset % pageW) fieldOffset = (fieldOffset / pageW + 1) * pageW; + int cubatureOffset = std::max(fieldOffset, Nelements * cubNp); + if (cubatureOffset % pageW) + cubatureOffset = (cubatureOffset / pageW + 1) * pageW; + + occa::properties props = platform->kernelInfo + meshKernelProperties(N); + props["defines"].asObject(); + props["includes"].asArray(); + props["header"].asArray(); + props["flags"].asObject(); + props["include_paths"].asArray(); + + props["defines/p_cubNq"] = cubNq; + props["defines/p_cubNp"] = cubNp; + props["defines/p_nEXT"] = nEXT; + props["defines/p_NVfields"] = NVFields; + props["defines/p_MovingMesh"] = platform->options.compareArgs("MOVING MESH", "TRUE"); + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + + std::string diffDataFile = installDir + "/okl/mesh/constantDifferentiationMatrices.h"; + std::string interpDataFile = installDir + "/okl/mesh/constantInterpolationMatrices.h"; + std::string diffInterpDataFile = installDir + "/okl/mesh/constantDifferentiationInterpolationMatrices.h"; + + props["includes"] += diffDataFile.c_str(); + props["includes"] += interpDataFile.c_str(); + props["includes"] += diffInterpDataFile.c_str(); + + std::string fileName = + installDir + "/okl/bench/advsub/readCubDMatrix.okl"; + auto readCubDMatrixKernel = platform->device.buildKernel(fileName, props, true); + + fileName = + installDir + "/okl/bench/advsub/readIMatrix.okl"; + auto readIMatrixKernel = platform->device.buildKernel(fileName, props, true); + + std::string kernelName; + if(dealias){ + kernelName = "subCycleStrongCubatureVolumeHex3D"; + } else { + kernelName = "subCycleStrongVolumeHex3D"; + } + + const std::string ext = (platform->device.mode() == "Serial") ? ".c" : ".okl"; + fileName = + installDir + "/okl/nrs/" + kernelName + ext; + + if(isScalar){ + fileName = + installDir + "/okl/cds/" + kernelName + ext; + } + + // currently lacking a native implementation of the non-dealiased kernel + if(!dealias) { + fileName = installDir + "/okl/nrs/" + kernelName + ".okl"; + if(isScalar){ + fileName = installDir + "/okl/cds/" + kernelName + ".okl"; + } + } + + std::vector kernelVariants = {0}; + if(!platform->serial && dealias && !isScalar){ + // TODO: reduce number of kernel variants + constexpr int Nkernels = 14; + for(int i = 1; i < Nkernels; ++i){ + + // v12 requires cubNq <=13 + if(i == 11 && cubNq > 13) continue; + + kernelVariants.push_back(i); + } + } else if(!platform->serial && dealias && isScalar){ + kernelVariants.push_back(8); + } + + if(kernelVariants.size() == 1 && !requiresBenchmark){ + auto newProps = props; + if(!platform->serial && dealias) newProps["defines/p_knl"] = kernelVariants.back(); + return platform->device.buildKernel(fileName, newProps, true); + } + + occa::kernel referenceKernel; + { + auto newProps = props; + if(!platform->serial && dealias) newProps["defines/p_knl"] = kernelVariants.front(); + referenceKernel = platform->device.buildKernel(fileName, newProps, true); + } + + const int wordSize = sizeof(dfloat); + + auto invLMM = randomVector(fieldOffset * nEXT); + auto cubD = randomVector(cubNq * cubNq); + auto NU = randomVector(NVFields * fieldOffset); + auto conv = randomVector(NVFields * cubatureOffset * nEXT); + auto cubInterpT = randomVector(Nq * cubNq); + auto Ud = randomVector(NVFields * fieldOffset); + auto BdivW = randomVector(fieldOffset * nEXT); + + // elementList[e] = e + std::vector elementList(Nelements); + std::iota(elementList.begin(), elementList.end(), 0); + auto o_elementList = platform->device.malloc(Nelements * sizeof(dlong), elementList.data()); + + auto o_invLMM = platform->device.malloc(nEXT * fieldOffset * wordSize, invLMM.data()); + auto o_cubD = platform->device.malloc(cubNq * cubNq * wordSize, cubD.data()); + auto o_NU = platform->device.malloc(NVFields * fieldOffset * wordSize, NU.data()); + auto o_conv = platform->device.malloc(NVFields * cubatureOffset * nEXT * wordSize, conv.data()); + auto o_cubInterpT = platform->device.malloc(Nq * cubNq * wordSize, cubInterpT.data()); + auto o_Ud = platform->device.malloc(NVFields * fieldOffset * wordSize, Ud.data()); + auto o_BdivW = platform->device.malloc(nEXT * fieldOffset * wordSize, BdivW.data()); + + // popular cubD, cubInterpT with correct data + readCubDMatrixKernel(o_cubD); + readIMatrixKernel(o_cubInterpT); + + auto kernelRunner = [&](occa::kernel & subcyclingKernel){ + const auto c0 = 0.1; + const auto c1 = 0.2; + const auto c2 = 0.3; + if(!dealias) { + subcyclingKernel(Nelements, o_elementList, o_cubD, fieldOffset, + 0, o_invLMM, o_BdivW, c0, c1, c2, o_conv, o_Ud, o_NU); + } else { + subcyclingKernel(Nelements, o_elementList, o_cubD, o_cubInterpT, fieldOffset, + cubatureOffset, 0, o_invLMM, o_BdivW, c0, c1, c2, o_conv, o_Ud, o_NU); + } + }; + + auto advSubKernelBuilder = [&](int kernelVariant){ + auto newProps = props; + if(!platform->serial && dealias) newProps["defines/p_knl"] = kernelVariant; + auto kernel = platform->device.buildKernel(fileName, newProps, true); + if(platform->options.compareArgs("BUILD ONLY", "TRUE")) return kernel; + + // perform correctness check + std::vector referenceResults(3*fieldOffset); + std::vector results(3*fieldOffset); + + kernelRunner(referenceKernel); + o_NU.copyTo(referenceResults.data(), referenceResults.size() * sizeof(dfloat)); + + kernelRunner(kernel); + o_NU.copyTo(results.data(), results.size() * sizeof(dfloat)); + + double err = 0.0; + for(auto i = 0; i < results.size(); ++i){ + err = std::max(err, std::abs(results[i] - referenceResults[i])); + } + + if(platform->comm.mpiRank == 0 && verbosity > 1){ + std::cout << "Error in kernel " << kernelVariant << " is " << err << " compared to reference implementation.\n"; + } + + return kernel; + }; + + + auto printPerformanceInfo = [&](int kernelVariant, double elapsed, int Ntests, bool skipPrint) { + const dfloat GDOFPerSecond = NVFields * ( Nelements * (N * N * N) / elapsed) / 1.e9; + + size_t bytesPerElem = 2 * NVFields * Np; // Ud, NU + bytesPerElem += Np; // inv mass matrix + bytesPerElem += NVFields * cubNp * nEXT; // U(r,s,t) + + size_t otherBytes = cubNq * cubNq; // D + if(cubNq > Nq){ + otherBytes += Nq * cubNq; // interpolator + } + otherBytes *= wordSize; + bytesPerElem *= wordSize; + const double bw = ( (Nelements * bytesPerElem + otherBytes) / elapsed) / 1.e9; + + double flopCount = 0.0; // per elem basis + if(dealias){ + flopCount += 6. * cubNp * nEXT; // extrapolate U(r,s,t) to current time + flopCount += 6. * cubNp * cubNq * Nfields; // apply Dcub + flopCount += 3. * Np * Nfields; // compute NU + flopCount += 4. * Nq * (cubNp + cubNq * cubNq * Nq + cubNq * Nq * Nq) * Nfields; // interpolation + } else { + flopCount = Nq * Nq * Nq * (6. * Nq + 6. * nEXT + 8.) * Nfields; + } + const double gflops = ( flopCount * Nelements / elapsed) / 1.e9; + const int Nthreads = omp_get_max_threads(); + + if(platform->comm.mpiRank == 0 && !skipPrint){ + + if(verbosity > 0){ + std::cout << "advSub:"; + } + + if(verbosity > 1){ + std::cout << " MPItasks=" << platform->comm.mpiCommSize << " OMPthreads=" << Nthreads << " NRepetitions=" << Ntests; + } + if(verbosity > 0){ + std::cout << " N=" << N; + if(dealias){ + std::cout << " cubN=" << cubN; + } + + if(verbosity > 1){ + std::cout << " nEXT=" << nEXT; + } + + std::cout << " Nfields=" << Nfields; + if(verbosity > 1){ + std::cout << " Nelements=" << Nelements; + std::cout << " elapsed time=" << elapsed; + } + std::cout << " wordSize=" << 8 * wordSize << " GDOF/s=" << GDOFPerSecond + << " GB/s=" << bw << " GFLOPS/s=" << gflops << " kernelVer=" << kernelVariant << "\n"; + } + } + }; + + auto printCallBack = [&](int kernelVariant, double elapsed, int Ntests) { + printPerformanceInfo(kernelVariant, elapsed, Ntests, verbosity < 2); + }; + + auto kernelAndTime = + benchmarkKernel(advSubKernelBuilder, kernelRunner, printCallBack, kernelVariants, NtestsOrTargetTime); + + if(kernelAndTime.first.properties().has("defines/p_knl") && platform->options.compareArgs("BUILD ONLY","FALSE")){ + int bestKernelVariant = static_cast(kernelAndTime.first.properties()["defines/p_knl"]); + + // print only the fastest kernel + if(verbosity == 1){ + printPerformanceInfo(bestKernelVariant, kernelAndTime.second, 0, false); + } + } + + free(o_elementList); + free(o_invLMM); + free(o_cubD); + free(o_NU); + free(o_conv); + free(o_cubInterpT); + free(o_Ud); + + cachedResults[params] = kernelAndTime.first; + + return kernelAndTime.first; +} + +template +occa::kernel benchmarkAdvsub(int Nfields, + int Nelements, + int Nq, + int cubNq, + int nEXT, + bool dealias, + bool isScalar, + int verbosity, + int Ntests, + bool requiresBenchmark); + +template +occa::kernel benchmarkAdvsub(int Nfields, + int Nelements, + int Nq, + int cubNq, + int nEXT, + bool dealias, + bool isScalar, + int verbosity, + double targetTime, + bool requiresBenchmark); diff --git a/src/bench/advsub/benchmarkAdvsub.hpp b/src/bench/advsub/benchmarkAdvsub.hpp new file mode 100644 index 000000000..27ad36533 --- /dev/null +++ b/src/bench/advsub/benchmarkAdvsub.hpp @@ -0,0 +1,13 @@ +#include "occa.hpp" + +template +occa::kernel benchmarkAdvsub(int Nfields, + int Nelements, + int Nq, + int cubNq, + int nEXT, + bool dealias, + bool isScalar, + int verbosity, + T NtestsOrTargetTime, + bool requiresBenchmark); \ No newline at end of file diff --git a/src/bench/advsub/main.cpp b/src/bench/advsub/main.cpp new file mode 100644 index 000000000..25eac56d0 --- /dev/null +++ b/src/bench/advsub/main.cpp @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include "omp.h" +#include +#include "mpi.h" + +#include "nrssys.hpp" +#include "setupAide.hpp" +#include "platform.hpp" +#include "configReader.hpp" + +#include "benchmarkAdvsub.hpp" + +namespace { + +int Nfields = 3; +int Nelements; +int Np; +int cubNp; +dlong fieldOffset; +dlong cubatureOffset; +bool dealias; + +} // namespace + +int main(int argc, char** argv) +{ + int rank = 0, size = 1; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + configRead(MPI_COMM_WORLD); + std::string installDir(getenv("NEKRS_HOME")); + setupAide options; + + int err = 0; + int cmdCheck = 0; + + std::string threadModel; + int N; + int cubN = -1; + int okl = 1; + int Ntests = -1; + int nEXT = 2; + dealias = true; + static constexpr size_t wordSize = 8; + + while(1) { + static struct option long_options[] = + { + {"p-order", required_argument, 0, 'p'}, + {"ext-order", required_argument, 0, 'x'}, + {"c-order", required_argument, 0, 'c'}, + {"block-dim", required_argument, 0, 'n'}, + {"no-cubature", no_argument, 0, 'd'}, + {"elements", required_argument, 0, 'e'}, + {"backend", required_argument, 0, 'b'}, + {"arch", required_argument, 0, 'a'}, + {"help", required_argument, 0, 'h'}, + {"iterations", required_argument, 0, 'i'}, + {0, 0, 0, 0} + }; + int option_index = 0; + int c = getopt_long (argc, argv, "", long_options, &option_index); + + if (c == -1) + break; + + switch(c) { + case 'n': + Nfields = atoi(optarg); + break; + case 'p': + N = atoi(optarg); + cmdCheck++; + break; + case 'c': + cubN = atoi(optarg); + break; + case 'd': + dealias = false; + break; + case 'x': + nEXT = atoi(optarg); + if(nEXT <= 0 || nEXT > 3){ + if(rank == 0){ + printf("Error, 0 < nEXT <= 3!\n"); + } + exit(1); + } + break; + case 'e': + Nelements = atoi(optarg); + cmdCheck++; + break; + case 'b': + options.setArgs("THREAD MODEL", std::string(optarg)); + cmdCheck++; + break; + case 'i': + Ntests = atoi(optarg); + break; + case 'h': + err = 1; + break; + default: + err = 1; + } + } + + if(err || cmdCheck != 3) { + if(rank == 0) + printf("Usage: ./nekrs-bench-advsub --p-order --elements --backend \n" + " [--block-dim ] [--c-order ] [--no-cubature] [--ext-order ] [--iterations ]\n"); + exit(1); + } + + if(cubN < 0) { + if(dealias) cubN = round((3./2) * (N+1) - 1) - 1; + else cubN = N; + } + if(cubN < N){ + if(rank == 0) + printf("Error: cubature order (%d) must be larger than or equal to the quadrature order (%d)!\n", + cubN, + N); + exit(1); + } + + if (Nfields != 1 && Nfields != 3){ + printf("Error: Nfields (%d) must be 1 or 3!\n", + Nfields); + exit(1); + } + Nelements = std::max(1, Nelements/size); + const int Nq = N + 1; + Np = Nq * Nq * Nq; + const int cubNq = cubN + 1; + cubNp = cubNq * cubNq * cubNq; + fieldOffset = Np * Nelements; + const int pageW = ALIGN_SIZE / sizeof(dfloat); + if (fieldOffset % pageW) fieldOffset = (fieldOffset / pageW + 1) * pageW; + cubatureOffset = std::max(fieldOffset, Nelements * cubNp); + if (cubatureOffset % pageW) + cubatureOffset = (cubatureOffset / pageW + 1) * pageW; + + platform = platform_t::getInstance(options, MPI_COMM_WORLD, MPI_COMM_WORLD); + platform->options.setArgs("BUILD ONLY", "FALSE"); + const int Nthreads = omp_get_max_threads(); + + bool isScalar = Nfields == 1; + + if(Ntests != -1){ + benchmarkAdvsub(Nfields, Nelements, Nq, cubNq, nEXT, dealias, isScalar, 2, Ntests, true); + } else { + benchmarkAdvsub(Nfields, Nelements, Nq, cubNq, nEXT, dealias, isScalar, 2, 10.0, true); + } + + MPI_Finalize(); + exit(0); +} diff --git a/src/bench/axHelm/README.md b/src/bench/axHelm/README.md new file mode 100644 index 000000000..ae2bbdf78 --- /dev/null +++ b/src/bench/axHelm/README.md @@ -0,0 +1,63 @@ +This benchmark computes the Helmholtz matrix-vector product +``` +AU = lambda0*[A]u + lambda1*[B]u +``` +or in BK mode +``` +AU = [A]u +``` +on deformed hexhedral spectral elements where A is the Laplace operator. + +# Usage + +``` +Usage: ./nekrs-bench-axhelm --p-order --elements --backend + [--block-dim ] [--bk-mode] [--fp32] [--iterations ] +``` + +# Examples + +### Nvidia A100 +``` + +> mpirun -np 1 nekrs-bench-axhelm --p-order 9 --elements 4096 --bk-mode --backend CUDA +MPItasks=1 OMPthreads=2 NRepetitions=33120 Ndim=1 N=9 Ng=9 Nelements=4096 elapsed time=0.000250596 bkMode=1 wordSize=64 GDOF/s=11.9155 GB/s=1046.08 GFLOPS/s=2206.58 + +> mpirun -np 1 nekrs-bench-axhelm --p-order 9 --elements 4096 --bk-mode --fp32 --backend CUDA +MPItasks=1 OMPthreads=2 NRepetitions=33654 Ndim=1 N=9 Ng=9 Nelements=4096 elapsed time=0.000140483 bkMode=1 wordSize=32 GDOF/s=21.2551 GB/s=933.008 GFLOPS/s=3936.13 + +> mpirun -np 1 nekrs-bench-axhelm --p-order 7 --elements 4096 --bk-mode --backend CUDA +MPItasks=1 OMPthreads=1 NRepetitions=78036 Ndim=1 N=7 Nelements=4096 elapsed time=0.00010919 bkMode=1 wordSize=64 GDOF/s=12.8668 GB/s=1229.21 GFLOPS/s=2131.91 + +> mpirun -np 1 nekrs-bench-axhelm --p-order 7 --elements 4096 --bk-mode --fp32 --backend CUDA +MPItasks=1 OMPthreads=1 NRepetitions=131933 Ndim=1 N=7 Nelements=4096 elapsed time=5.85106e-05 bkMode=1 wordSize=32 GDOF/s=24.0115 GB/s=1146.95 GFLOPS/s=3978.49 + +> mpirun -np 1 nekrs-bench-axhelm --p-order 5 --elements 4096 --bk-mode --backend CUDA +MPItasks=1 OMPthreads=1 NRepetitions=127495 Ndim=1 N=5 Ng=5 Nelements=4096 elapsed time=5.98018e-05 bkMode=1 wordSize=64 GDOF/s=8.56161 GB/s=946.846 GFLOPS/s=1287.12 + +> mpirun -np 1 nekrs-bench-axhelm --p-order 5 --elements 4096 --bk-mode --fp32 --backend CUDA +MPItasks=1 OMPthreads=1 NRepetitions=200223 Ndim=1 N=5 Ng=5 Nelements=4096 elapsed time=3.01184e-05 bkMode=1 wordSize=32 GDOF/s=16.9996 GB/s=940.008 GFLOPS/s=2555.65 + +``` + +### AMD EPYC 7742 64-Core +``` + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 9 --elements 1024 --bk-mode --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=20507 Ndim=1 N=9 Ng=9 Nelements=1024 elapsed time=0.000353903 bkMode=1 wordSize=64 GDOF/s=2.10932 GB/s=185.181 GFLOPS/s=390.615 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 9 --elements 1024 --bk-mode --fp32 --backend CPU +MPItasks=64 OMPthreads=2 NRepetitions=24709 Ndim=1 N=9 Ng=9 Nelements=1024 elapsed time=0.000336351 bkMode=1 wordSize=32 GDOF/s=2.2194 GB/s=97.422 GFLOPS/s=410.999 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 7 --elements 1024 --bk-mode --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=103160 Ndim=1 N=7 Ng=7 Nelements=1024 elapsed time=7.878e-05 bkMode=1 wordSize=64 GDOF/s=4.45839 GB/s=425.926 GFLOPS/s=738.715 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 7 --elements 1024 --bk-mode --fp32 --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=213085 Ndim=1 N=7 Ng=7 Nelements=1024 elapsed time=2.89539e-05 bkMode=1 wordSize=32 GDOF/s=12.1307 GB/s=579.446 GFLOPS/s=2009.95 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 5 --elements 1024 --bk-mode --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=153752 Ndim=1 N=5 Ng=5 Nelements=1024 elapsed time=4.67946e-05 bkMode=1 wordSize=64 GDOF/s=2.73536 GB/s=302.509 GFLOPS/s=411.223 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-axhelm --p-order 5 --elements 1024 --bk-mode --fp32 --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=132865 Ndim=1 N=5 Ng=5 Nelements=1024 elapsed time=4.77546e-05 bkMode=1 wordSize=32 GDOF/s=2.68037 GB/s=148.214 GFLOPS/s=402.956 +``` diff --git a/src/bench/axHelm/benchmarkAx.cpp b/src/bench/axHelm/benchmarkAx.cpp new file mode 100644 index 000000000..66251a498 --- /dev/null +++ b/src/bench/axHelm/benchmarkAx.cpp @@ -0,0 +1,367 @@ +#include "benchmarkAx.hpp" +#include +#include +#include +#include "nrs.hpp" + +#include "kernelBenchmarker.hpp" +#include "randomVector.hpp" +#include "omp.h" +#include +#include + +namespace{ +struct CallParameters{ + int Nelements; + int Nq; + int Ng; + bool constCoeff; + bool poisson; + bool computeGeom; + int wordSize; + int Ndim; + bool stressForm; + std::string suffix; +}; +} + +namespace std +{ + template<> struct less + { + bool operator() (const CallParameters& lhs, const CallParameters& rhs) const + { + auto tier = [](const CallParameters &v) { + return std::tie(v.Nelements, + v.Nq, + v.Ng, + v.constCoeff, + v.poisson, + v.computeGeom, + v.wordSize, + v.Ndim, + v.stressForm, + v.suffix); + }; + return tier(lhs) < tier(rhs); + } + }; +} + +namespace{ +std::map cachedResults; +} + +template +occa::kernel benchmarkAx(int Nelements, + int Nq, + int Ng, + bool constCoeff, + bool poisson, + bool computeGeom, + int wordSize, + int Ndim, + bool stressForm, + int verbosity, + T NtestsOrTargetTime, + bool requiresBenchmark, + std::string suffix) +{ + if(platform->options.compareArgs("BUILD ONLY", "TRUE")){ + Nelements = 1; + } + + CallParameters params{Nelements, Nq, Ng, constCoeff, poisson, computeGeom, wordSize, Ndim, stressForm, suffix}; + + if(cachedResults.count(params) > 0){ + return cachedResults.at(params); + } + + const auto N = Nq-1; + + const auto Np = Nq * Nq * Nq; + const auto Nq_g = Ng + 1; + const int Np_g = Nq_g * Nq_g * Nq_g; + + occa::properties props = platform->kernelInfo + meshKernelProperties(N); + if (wordSize == 4) + props["defines/dfloat"] = "float"; + if (Ng != N) { + props["defines/p_Nq_g"] = Nq_g; + props["defines/p_Np_g"] = Np_g; + } + if (poisson) + props["defines/p_poisson"] = 1; + + std::string kernelName = "elliptic"; + if (Ndim > 1){ + kernelName += stressForm ? "Stress" : "Block"; + } + kernelName += "PartialAx"; + if (!constCoeff) + kernelName += "Coeff"; + if (Ng != N) { + if (computeGeom) { + if (Ng == 1) { + kernelName += "Trilinear"; + } + else { + printf("Unsupported g-order=%d\n", Ng); + exit(1); + } + } + else { + printf("for now g-order != p-order requires --computeGeom!\n"); + exit(1); + kernelName += "Ngeom"; + } + } + kernelName += "Hex3D"; + if (Ndim > 1 && !stressForm) + kernelName += "_N" + std::to_string(Ndim); + + auto benchmarkAxWithPrecision = [&](auto sampleWord) { + using FPType = decltype(sampleWord); + const auto wordSize = sizeof(FPType); + constexpr int p_Nggeo{7}; + + int Nkernels = 1; + if (kernelName == "ellipticPartialAxHex3D") + Nkernels = 7; + std::vector kernelVariants; + if (platform->serial) { + kernelVariants.push_back(0); + } + else { + for (int knl = 0; knl < Nkernels; ++knl) { + +#if 0 + // v3 requires Nq^3 < 1024 (max threads/thread block on CUDA/HIP) + if (knl == 3 && Np > 1024) + continue; +#else + // disable v3 for now, since correctness check is off + if (knl == 3) + continue; +#endif + kernelVariants.push_back(knl); + } + } + + const std::string installDir(getenv("NEKRS_HOME")); + + // only a single choice, no need to run benchmark + if (kernelVariants.size() == 1 && !requiresBenchmark) { + + auto newProps = props; + if (kernelName == "ellipticPartialAxHex3D" && !platform->serial) { + newProps["defines/p_knl"] = kernelVariants.back(); + } + + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + return std::make_pair(platform->device.buildKernel(fileName, newProps, suffix, true), -1.0); + } + + auto DrV = randomVector(Nq * Nq); + auto ggeo = randomVector(Np_g * Nelements * p_Nggeo); + auto q = randomVector((Ndim * Np) * Nelements); + auto Aq = randomVector((Ndim * Np) * Nelements); + auto exyz = randomVector((3 * Np_g) * Nelements); + auto gllwz = randomVector(2 * Nq_g); + auto lambda = randomVector(2 * Np * Nelements); + + // elementList[e] = e + std::vector elementList(Nelements); + std::iota(elementList.begin(), elementList.end(), 0); + auto o_elementList = platform->device.malloc(Nelements * sizeof(dlong), elementList.data()); + + auto o_D = platform->device.malloc(Nq * Nq * wordSize, DrV.data()); + auto o_S = o_D; + auto o_ggeo = platform->device.malloc(Np_g * Nelements * p_Nggeo * wordSize, ggeo.data()); + auto o_q = platform->device.malloc((Ndim * Np) * Nelements * wordSize, q.data()); + auto o_Aq = platform->device.malloc((Ndim * Np) * Nelements * wordSize, Aq.data()); + auto o_exyz = platform->device.malloc((3 * Np_g) * Nelements * wordSize, exyz.data()); + auto o_gllwz = platform->device.malloc(2 * Nq_g * wordSize, gllwz.data()); + + auto o_lambda = platform->device.malloc(2 * Np * Nelements * wordSize, lambda.data()); + + occa::kernel referenceKernel; + { + auto newProps = props; + if (!platform->serial) + newProps["defines/p_knl"] = kernelVariants.front(); + + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + referenceKernel = platform->device.buildKernel(fileName, newProps, suffix, true); + } + + auto kernelRunner = [&](occa::kernel &kernel) { + const int loffset = 0; + const int offset = Nelements * Np; + if (computeGeom) { + kernel(Nelements, offset, loffset, o_elementList, o_exyz, o_gllwz, o_D, o_S, o_lambda, o_q, o_Aq); + } + else { + kernel(Nelements, offset, loffset, o_elementList, o_ggeo, o_D, o_S, o_lambda, o_q, o_Aq); + } + }; + + auto axKernelBuilder = [&](int kernelVariant) { + auto newProps = props; + if (!platform->serial) + newProps["defines/p_knl"] = kernelVariant; + + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + auto kernel = platform->device.buildKernel(fileName, newProps, suffix, true); + + if(platform->options.compareArgs("BUILD ONLY", "TRUE")) return kernel; + + std::vector refResults((Ndim * Np) * Nelements); + std::vector results((Ndim * Np) * Nelements); + + kernelRunner(referenceKernel); + o_Aq.copyTo(refResults.data(), refResults.size() * sizeof(FPType)); + + kernelRunner(kernel); + o_Aq.copyTo(results.data(), results.size() * sizeof(FPType)); + + FPType err = 0.0; + for (int i = 0; i < refResults.size(); ++i) { + err = std::max(err, std::abs(refResults[i] - results[i])); + } + + if (platform->comm.mpiRank == 0 && verbosity > 1) { + std::cout << "Error in kernel compared to reference implementation " << kernelVariant << ": " << err + << std::endl; + } + + return kernel; + }; + + auto printPerformanceInfo = [&](int kernelVariant, double elapsed, int Ntests, bool skipPrint) { + const bool BKmode = constCoeff && poisson; + + // print statistics + const dfloat GDOFPerSecond = (Nelements * Ndim * (N * N * N) / elapsed) / 1.e9; + + size_t bytesMoved = Ndim * 2 * Np * wordSize; // x, Ax + bytesMoved += 6 * Np_g * wordSize; // geo + if (!constCoeff) + bytesMoved += 3 * Np * wordSize; // lambda1, lambda2, Jw + const double bw = (Nelements * bytesMoved / elapsed) / 1.e9; + + double flopCount = Np * 12 * Nq + 15 * Np; + if (!constCoeff) + flopCount += 5 * Np; + const double gflops = Ndim * (flopCount * Nelements / elapsed) / 1.e9; + const int Nthreads = omp_get_max_threads(); + + if (platform->comm.mpiRank == 0 && !skipPrint) { + if (verbosity > 0) { + std::cout << "Ax:"; + } + if (verbosity > 1) { + std::cout << " MPItasks=" << platform->comm.mpiCommSize << " OMPthreads=" << Nthreads + << " NRepetitions=" << Ntests; + } + if (verbosity > 0) { + if (Ndim > 1) + std::cout << " Ndim=" << Ndim; + + std::cout << " N=" << N; + if (Ng != N) + std::cout << " Ng=" << Ng; + + if (verbosity > 1) + std::cout << " Nelements=" << Nelements; + + if (verbosity > 1) + std::cout << " elapsed time=" << elapsed; + + std::cout << " wordSize=" << 8 * wordSize << " GDOF/s=" << GDOFPerSecond << " GB/s=" << bw + << " GFLOPS/s=" << gflops << " bkMode=" << BKmode << " kernelVer=" << kernelVariant + << "\n"; + } + } + }; + + auto printCallBack = [&](int kernelVariant, double elapsed, int Ntests) { + printPerformanceInfo(kernelVariant, elapsed, Ntests, verbosity < 2); + }; + + auto kernelAndTime = + benchmarkKernel(axKernelBuilder, kernelRunner, printCallBack, kernelVariants, NtestsOrTargetTime); + + if (kernelAndTime.first.properties().has("defines/p_knl") && platform->options.compareArgs("BUILD ONLY","FALSE")) { + int bestKernelVariant = static_cast(kernelAndTime.first.properties()["defines/p_knl"]); + + // print only the fastest kernel + if (verbosity == 1) { + printPerformanceInfo(bestKernelVariant, kernelAndTime.second, 0, false); + } + } + + free(o_D); + free(o_S); + free(o_ggeo); + free(o_q); + free(o_Aq); + free(o_exyz); + free(o_gllwz); + free(o_lambda); + free(o_elementList); + + return kernelAndTime; + }; + + occa::kernel kernel; + + if (wordSize == sizeof(float)) { + float p = 0.0; + auto kernelAndTime = benchmarkAxWithPrecision(p); + kernel = kernelAndTime.first; + } + else { + double p = 0.0; + auto kernelAndTime = benchmarkAxWithPrecision(p); + kernel = kernelAndTime.first; + } + + cachedResults[params] = kernel; + + return kernel; +} + +template occa::kernel benchmarkAx(int Nelements, + int Nq, + int Ng, + bool constCoeff, + bool poisson, + bool computeGeom, + int wordSize, + int Ndim, + bool stressForm, + int verbosity, + int Ntests, + bool requiresBenchmark, + std::string suffix); + +template occa::kernel benchmarkAx(int Nelements, + int Nq, + int Ng, + bool constCoeff, + bool poisson, + bool computeGeom, + int wordSize, + int Ndim, + bool stressForm, + int verbosity, + double targetTime, + bool requiresBenchmark, + std::string suffix); diff --git a/src/bench/axHelm/benchmarkAx.hpp b/src/bench/axHelm/benchmarkAx.hpp new file mode 100644 index 000000000..0fef570f3 --- /dev/null +++ b/src/bench/axHelm/benchmarkAx.hpp @@ -0,0 +1,16 @@ +#include "occa.hpp" + +template +occa::kernel benchmarkAx(int Nelements, + int Nq, + int Ng, + bool constCoeff, + bool poisson, + bool computeGeom, + int wordSize, + int Ndim, + bool stressForm, + int verbosity, + T NtestsOrTargetTime, + bool requiresBenchmark, + std::string suffix); \ No newline at end of file diff --git a/src/bench/axHelm/main.cpp b/src/bench/axHelm/main.cpp new file mode 100644 index 000000000..456134b98 --- /dev/null +++ b/src/bench/axHelm/main.cpp @@ -0,0 +1,163 @@ +#include +#include +#include +#include +#include "omp.h" +#include +#include "mpi.h" + +#include "nrssys.hpp" +#include "setupAide.hpp" +#include "platform.hpp" +#include "configReader.hpp" + +#include "benchmarkAx.hpp" + +int main(int argc, char** argv) +{ + int rank = 0, size = 1; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + configRead(MPI_COMM_WORLD); + std::string installDir(getenv("NEKRS_HOME")); + setupAide options; + + int err = 0; + int cmdCheck = 0; + + std::string threadModel; + int N; + int Nelements; + int Ng = -1; + int Ndim = 1; + int okl = 1; + int BKmode = 0; + int Ntests = -1; + size_t wordSize = 8; + int computeGeom = 0; + + while(1) { + static struct option long_options[] = + { + {"p-order", required_argument, 0, 'p'}, + {"g-order", required_argument, 0, 'g'}, + {"computeGeom", no_argument, 0, 'c'}, + {"block-dim", required_argument, 0, 'd'}, + {"elements", required_argument, 0, 'e'}, + {"backend", required_argument, 0, 'b'}, + {"arch", required_argument, 0, 'a'}, + {"bk-mode", no_argument, 0, 'm'}, + {"fp32", no_argument, 0, 'f'}, + {"help", required_argument, 0, 'h'}, + {"iterations", required_argument, 0, 'i'}, + {0, 0, 0, 0} + }; + int option_index = 0; + int c = getopt_long (argc, argv, "", long_options, &option_index); + + if (c == -1) + break; + + switch(c) { + case 'p': + N = atoi(optarg); + cmdCheck++; + break; + case 'g': + Ng = atoi(optarg); + break; + case 'c': + computeGeom = 1; + break; + case 'd': + Ndim = atoi(optarg); + break; + case 'e': + Nelements = atoi(optarg); + cmdCheck++; + break; + case 'b': + options.setArgs("THREAD MODEL", std::string(optarg)); + cmdCheck++; + break; + case 'm': + BKmode = 1; + break; + case 'f': + wordSize = 4;; + break; + case 'i': + Ntests = atoi(optarg); + break; + case 'h': + err = 1; + break; + default: + err = 1; + } + } + + if(err || cmdCheck != 3) { + if(rank == 0) + printf("Usage: ./nekrs-axhelm --p-order --elements --backend \n" + " [--block-dim ]\n" + " [--g-order ] [--computeGeom]\n" + " [--bk-mode] [--fp32] [--iterations ]\n"); + exit(1); + } + + if(Ng < 0) Ng = N; + Nelements = std::max(1, Nelements/size); + constexpr int p_Nggeo {7}; + const int Nq = N + 1; + const int Np = Nq * Nq * Nq; + const int Nq_g = Ng + 1; + const int Np_g = Nq_g * Nq_g * Nq_g; + + // BKmode <-> both constant coeff AND poisson + bool poisson = false; + bool constCoeff = false; + if(BKmode){ + poisson = true; + constCoeff = true; + } + + platform = platform_t::getInstance(options, MPI_COMM_WORLD, MPI_COMM_WORLD); + platform->options.setArgs("BUILD ONLY", "FALSE"); + const int verbosity = 2; + if (Ntests != -1) { + benchmarkAx(Nelements, + Nq, + Ng, + poisson, + constCoeff, + computeGeom, + wordSize, + Ndim, + false, // no stress formulation + verbosity, + Ntests, + true, + ""); + } + else { + const double targetTime = 10.0; + benchmarkAx(Nelements, + Nq, + Ng, + poisson, + constCoeff, + computeGeom, + wordSize, + Ndim, + false, // no stress formulation + verbosity, + targetTime, + true, + ""); + } + MPI_Finalize(); + exit(0); +} diff --git a/src/bench/core/kernelBenchmarker.cpp b/src/bench/core/kernelBenchmarker.cpp new file mode 100644 index 000000000..3a26e4daf --- /dev/null +++ b/src/bench/core/kernelBenchmarker.cpp @@ -0,0 +1,90 @@ +#include "kernelBenchmarker.hpp" +#include +#include "nrs.hpp" + +namespace { +double run(int Nsamples, std::function kernelRunner, occa::kernel &kernel) +{ + platform->device.finish(); + MPI_Barrier(platform->comm.mpiComm); + const double start = MPI_Wtime(); + + for (int test = 0; test < Nsamples; ++test) { + kernelRunner(kernel); + } + + platform->device.finish(); + return (MPI_Wtime() - start) / Nsamples; +} +} // namespace +std::pair +benchmarkKernel(std::function kernelBuilder, + std::function kernelRunner, + std::function printCallback, + const std::vector &kernelVariants, + int Ntests) +{ + occa::kernel fastestKernel; + double fastestTime = std::numeric_limits::max(); + for (auto &&kernelVariant : kernelVariants) { + + auto candidateKernel = kernelBuilder(kernelVariant); + + if(platform->options.compareArgs("BUILD ONLY", "FALSE")){ + // warmup + double elapsed = run(10, kernelRunner, candidateKernel); + + double candidateKernelTiming = run(Ntests, kernelRunner, candidateKernel); + MPI_Allreduce(MPI_IN_PLACE, &candidateKernelTiming, 1, MPI_DOUBLE, MPI_MAX, platform->comm.mpiComm); + + if (candidateKernelTiming < fastestTime) { + fastestTime = candidateKernelTiming; + fastestKernel = candidateKernel; + } + + printCallback(kernelVariant, candidateKernelTiming, Ntests); + } else { + fastestKernel = candidateKernel; + } + } + + return std::make_pair(fastestKernel, fastestTime); +} + +std::pair +benchmarkKernel(std::function kernelBuilder, + std::function kernelRunner, + std::function printCallback, + const std::vector &kernelVariants, + double targetTime) +{ + occa::kernel fastestKernel; + double fastestTime = std::numeric_limits::max(); + for (auto &&kernelVariant : kernelVariants) { + + auto candidateKernel = kernelBuilder(kernelVariant); + if(platform->options.compareArgs("BUILD ONLY", "FALSE")){ + + // warmup + double elapsed = run(10, kernelRunner, candidateKernel); + + // evaluation + int Ntests = static_cast(targetTime / elapsed); + MPI_Allreduce(MPI_IN_PLACE, &Ntests, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + + double candidateKernelTiming = run(Ntests, kernelRunner, candidateKernel); + MPI_Allreduce(MPI_IN_PLACE, &candidateKernelTiming, 1, MPI_DOUBLE, MPI_MAX, platform->comm.mpiComm); + + if (candidateKernelTiming < fastestTime) { + fastestTime = candidateKernelTiming; + fastestKernel = candidateKernel; + } + + printCallback(kernelVariant, candidateKernelTiming, Ntests); + } else { + fastestKernel = candidateKernel; + } + } + + return std::make_pair(fastestKernel, fastestTime); +} diff --git a/src/bench/core/kernelBenchmarker.hpp b/src/bench/core/kernelBenchmarker.hpp new file mode 100644 index 000000000..db45cef41 --- /dev/null +++ b/src/bench/core/kernelBenchmarker.hpp @@ -0,0 +1,17 @@ +#include "occa.hpp" +#include +#include + +std::pair +benchmarkKernel(std::function kernelBuilder, + std::function kernelRunner, + std::function printCallback, + const std::vector &kernelVariants, + int Ntests); + +std::pair +benchmarkKernel(std::function kernelBuilder, + std::function kernelRunner, + std::function printCallback, + const std::vector &kernelVariants, + double targetTime); diff --git a/src/bench/fdm/README.md b/src/bench/fdm/README.md new file mode 100644 index 000000000..0014d5bd0 --- /dev/null +++ b/src/bench/fdm/README.md @@ -0,0 +1,31 @@ +This benchmark applies the fast diagonalization method (FDM) +``` +Su = (S_x \cross S_y \cross S_z) \Lambda^{-1} (S_x^T \cross S_y^T \cross S_z^T)u +``` + +# Usage + +``` +Usage: ./nekrs-bench-fdm --p-order --elements --backend + [--fp32] [--iterations ] +``` + +# Examples + +### NVIDIA A100 +``` +> mpirun -np 1 nekrs-bench-fdm --p-order 9 --elements 4096 --backend CUDA --fp32 +MPItasks=1 OMPthreads=1 NRepetitions=76285 N=9 Nelements=4096 elapsed time=9.7327e-05 wordSize=32 GDOF/s=30.6799 GB/s=555.521 GFLOPS/s=5092.28 + +> mpirun -np 1 nekrs-bench-fdm --p-order 7 --elements 4096 --backend CUDA --fp32 +MPItasks=1 OMPthreads=1 NRepetitions=98526 N=7 Nelements=4096 elapsed time=7.08805e-05 wordSize=32 GDOF/s=19.8211 GB/s=399.427 GFLOPS/s=2869.96 +``` + +### AMD EPYC 7742 64-Core +``` +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-fdm --p-order 9 --elements 1024 --fp32 --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=71000 N=9 Nelements=1024 elapsed time=0.000119283 wordSize=32 GDOF/s=6.25821 GB/s=113.317 GFLOPS/s=1038.74 + +> OCCA_CXXFLAGS='-O3 -march=native -mtune=native' mpirun -np 64 --bind-to core --map-by ppr:64:socket nekrs-bench-fdm --p-order 7 --elements 1024 --fp32 --backend CPU +MPItasks=64 OMPthreads=1 NRepetitions=179174 N=7 Nelements=1024 elapsed time=3.69751e-05 wordSize=32 GDOF/s=9.49915 GB/s=191.423 GFLOPS/s=1375.41 +``` diff --git a/src/bench/fdm/benchmarkFDM.cpp b/src/bench/fdm/benchmarkFDM.cpp new file mode 100644 index 000000000..7d7d2e1b9 --- /dev/null +++ b/src/bench/fdm/benchmarkFDM.cpp @@ -0,0 +1,318 @@ +#include "benchmarkFDM.hpp" +#include +#include +#include +#include "nrs.hpp" + +#include "randomVector.hpp" +#include "kernelBenchmarker.hpp" +#include "omp.h" +#include +#include + +namespace{ +struct CallParameters{ + int Nelements; + int Nq_e; + int wordSize; + bool useRAS; + bool overlap; + std::string suffix; +}; +} + +namespace std +{ + template<> struct less + { + bool operator() (const CallParameters& lhs, const CallParameters& rhs) const + { + auto tier = [](const CallParameters &v) { + return std::tie(v.Nelements, v.Nq_e, v.wordSize, v.useRAS, v.overlap, v.suffix); + }; + return tier(lhs) < tier(rhs); + } + }; +} + +namespace{ +std::map cachedResults; +} + +template +occa::kernel benchmarkFDM(int Nelements, + int Nq_e, + int wordSize, + bool useRAS, + bool overlap, + int verbosity, + T NtestsOrTargetTime, + bool requiresBenchmark, + std::string suffix) +{ + if(platform->options.compareArgs("BUILD ONLY", "TRUE")){ + Nelements = 1; + } + + CallParameters params{Nelements, Nq_e, wordSize, useRAS, overlap, suffix}; + + if(cachedResults.count(params) > 0){ + return cachedResults.at(params); + } + + const auto Nq = Nq_e - 2; + const auto N_e = Nq_e - 1; + const auto N = Nq - 1; + const auto Np_e = Nq_e * Nq_e * Nq_e; + + occa::properties props = platform->kernelInfo + meshKernelProperties(N); // regular, non-extended mesh + if (wordSize == 4) + props["defines/pfloat"] = "float"; + else + props["defines/pfloat"] = "dfloat"; + + props["defines/p_Nq_e"] = Nq_e; + props["defines/p_Np_e"] = Np_e; + props["defines/p_overlap"] = overlap; + + if (useRAS) { + props["defines/p_restrict"] = 1; + } + else { + props["defines/p_restrict"] = 0; + } + + auto benchmarkFDMWithPrecision = [&](auto sampleWord) { + using FPType = decltype(sampleWord); + const auto wordSize = sizeof(FPType); + + constexpr int Nkernels = 5; + std::vector kernelVariants; + if (platform->serial) { + kernelVariants.push_back(0); + } + else { + for (int knl = 0; knl < Nkernels; ++knl) { + kernelVariants.push_back(knl); + } + } + + const std::string installDir(getenv("NEKRS_HOME")); + // only a single choice, no need to run benchmark + if (kernelVariants.size() == 1 && !requiresBenchmark) { + auto newProps = props; + if (!platform->serial) + newProps["defines/p_knl"] = kernelVariants.back(); + + const std::string kernelName = "fusedFDM"; + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + return std::make_pair(platform->device.buildKernel(fileName, newProps, suffix, true), -1.0); + } + + auto Sx = randomVector(Nelements * Nq_e * Nq_e); + auto Sy = randomVector(Nelements * Nq_e * Nq_e); + auto Sz = randomVector(Nelements * Nq_e * Nq_e); + auto invL = randomVector(Nelements * Np_e); + auto Su = randomVector(Nelements * Np_e); + auto u = randomVector(Nelements * Np_e); + auto invDegree = randomVector(Nelements * Np_e); + + // elementList[e] = e + std::vector elementList(Nelements); + std::iota(elementList.begin(), elementList.end(), 0); + auto o_elementList = platform->device.malloc(Nelements * sizeof(int), elementList.data()); + + auto o_Sx = platform->device.malloc(Nelements * Nq_e * Nq_e * wordSize, Sx.data()); + auto o_Sy = platform->device.malloc(Nelements * Nq_e * Nq_e * wordSize, Sy.data()); + auto o_Sz = platform->device.malloc(Nelements * Nq_e * Nq_e * wordSize, Sz.data()); + auto o_invL = platform->device.malloc(Nelements * Np_e * wordSize, invL.data()); + auto o_Su = platform->device.malloc(Nelements * Np_e * wordSize, Su.data()); + auto o_u = platform->device.malloc(Nelements * Np_e * wordSize, u.data()); + auto o_invDegree = platform->device.malloc(Nelements * Np_e * sizeof(dfloat), invDegree.data()); + + occa::kernel referenceKernel; + { + auto newProps = props; + if (!platform->serial) + newProps["defines/p_knl"] = kernelVariants.front(); + + const std::string kernelName = "fusedFDM"; + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + referenceKernel = platform->device.buildKernel(fileName, newProps, suffix, true); + } + + auto kernelRunner = [&](occa::kernel &kernel) { + if (useRAS) { + if (!overlap) { + kernel(Nelements, o_Su, o_Sx, o_Sy, o_Sz, o_invL, o_invDegree, o_u); + } + else { + kernel(Nelements, o_elementList, o_Su, o_Sx, o_Sy, o_Sz, o_invL, o_invDegree, o_u); + } + } + else { + if (!overlap) { + kernel(Nelements, o_Su, o_Sx, o_Sy, o_Sz, o_invL, o_u); + } + else { + kernel(Nelements, o_elementList, o_Su, o_Sx, o_Sy, o_Sz, o_invL, o_u); + } + } + }; + + auto fdmKernelBuilder = [&](int kernelVariant) { + auto newProps = props; + if (!platform->serial) + newProps["defines/p_knl"] = kernelVariant; + + const std::string kernelName = "fusedFDM"; + const std::string ext = platform->serial ? ".c" : ".okl"; + const std::string fileName = installDir + "/okl/elliptic/" + kernelName + ext; + + auto kernel = platform->device.buildKernel(fileName, newProps, suffix, true); + if(platform->options.compareArgs("BUILD ONLY", "TRUE")) return kernel; + + auto dumpResult = [&]() { + std::vector result; + if (useRAS) { + const auto Nq = Nq_e - 2; + const auto Np = Nq * Nq * Nq; + result.resize(Nelements * Np); + } + else { + result.resize(Nelements * Np_e); + } + + o_Su.copyTo(result.data(), result.size() * sizeof(FPType)); + return result; + }; + + auto resetFields = [&]() { + o_Su.copyFrom(Su.data(), Nelements * Np_e * wordSize); + o_u.copyFrom(u.data(), Nelements * Np_e * wordSize); + }; + + resetFields(); + kernelRunner(referenceKernel); + auto referenceResult = dumpResult(); + + resetFields(); + kernelRunner(kernel); + auto result = dumpResult(); + + FPType err = 0.0; + for (int i = 0; i < result.size(); ++i) { + err = std::max(err, std::abs(result[i] - referenceResult[i])); + } + + if (platform->comm.mpiRank == 0 && verbosity > 1) { + std::cout << "Error in kernel compared to reference implementation " << kernelVariant << ": " << err + << std::endl; + } + + return kernel; + }; + + auto printPerformanceInfo = [&](int kernelVariant, double elapsed, int Ntests, bool skipPrint) { + // print statistics + const double GDOFPerSecond = (Nelements * (N_e * N_e * N_e) / elapsed) / 1.e9; + + size_t bytesPerElem = (3 * Np_e + 3 * Nq_e * Nq_e) * wordSize; + const double bw = (Nelements * bytesPerElem / elapsed) / 1.e9; + + double flopsPerElem = 12 * Nq_e * Np_e + Np_e; + const double gflops = (Nelements * flopsPerElem / elapsed) / 1.e9; + const int Nthreads = omp_get_max_threads(); + + if (platform->comm.mpiRank == 0 && !skipPrint) { + if (verbosity > 0) { + std::cout << "fdm:"; + } + if (verbosity > 1) { + std::cout << "MPItasks=" << platform->comm.mpiCommSize << " OMPthreads=" << Nthreads + << " NRepetitions=" << Ntests; + } + if (verbosity > 0) { + std::cout << " N=" << N_e; + + if (verbosity > 1) + std::cout << " Nelements=" << Nelements; + + if (verbosity > 1) + std::cout << " elapsed time=" << elapsed; + + std::cout << " wordSize=" << 8 * wordSize << " GDOF/s=" << GDOFPerSecond << " GB/s=" << bw + << " GFLOPS/s=" << gflops << " kernelVer=" << kernelVariant << "\n"; + } + } + }; + + auto printCallBack = [&](int kernelVariant, double elapsed, int Ntests) { + printPerformanceInfo(kernelVariant, elapsed, Ntests, verbosity < 2); + }; + + auto kernelAndTime = + benchmarkKernel(fdmKernelBuilder, kernelRunner, printCallBack, kernelVariants, NtestsOrTargetTime); + + if (kernelAndTime.first.properties().has("defines/p_knl") && platform->options.compareArgs("BUILD ONLY","FALSE")) { + int bestKernelVariant = static_cast(kernelAndTime.first.properties()["defines/p_knl"]); + + // print only the fastest kernel + if (verbosity == 1) { + printPerformanceInfo(bestKernelVariant, kernelAndTime.second, 0, false); + } + } + + free(o_Sx); + free(o_Sy); + free(o_Sz); + free(o_invL); + free(o_Su); + free(o_u); + free(o_invDegree); + free(o_elementList); + + return kernelAndTime; + }; + + occa::kernel kernel; + + if (wordSize == sizeof(float)) { + float p = 0.0; + auto kernelAndTime = benchmarkFDMWithPrecision(p); + kernel = kernelAndTime.first; + } + else { + double p = 0.0; + auto kernelAndTime = benchmarkFDMWithPrecision(p); + kernel = kernelAndTime.first; + } + + cachedResults[params] = kernel; + + return kernel; +} + +template occa::kernel benchmarkFDM(int Nelements, + int Nq_e, + int wordSize, + bool useRAS, + bool overlap, + int verbosity, + int Ntests, + bool requiresBenchmark, + std::string suffix); + +template occa::kernel benchmarkFDM(int Nelements, + int Nq_e, + int wordSize, + bool useRAS, + bool overlap, + int verbosity, + double targetTime, + bool requiresBenchmark, + std::string suffix); diff --git a/src/bench/fdm/benchmarkFDM.hpp b/src/bench/fdm/benchmarkFDM.hpp new file mode 100644 index 000000000..fc4749b0d --- /dev/null +++ b/src/bench/fdm/benchmarkFDM.hpp @@ -0,0 +1,12 @@ +#include "occa.hpp" + +template +occa::kernel benchmarkFDM(int Nelements, + int Nq_e, + int wordSize, + bool useRAS, + bool overlap, + int verbosity, + T NtestsOrTargetTime, + bool requiresBenchmark, + std::string suffix); \ No newline at end of file diff --git a/src/bench/fdm/main.cpp b/src/bench/fdm/main.cpp new file mode 100644 index 000000000..a782e436d --- /dev/null +++ b/src/bench/fdm/main.cpp @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include "omp.h" +#include +#include "mpi.h" +#include +#include + +#include "nrssys.hpp" +#include "setupAide.hpp" +#include "platform.hpp" +#include "configReader.hpp" + +#include "benchmarkFDM.hpp" + +int main(int argc, char** argv) +{ + int rank = 0, size = 1; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + configRead(MPI_COMM_WORLD); + std::string installDir(getenv("NEKRS_HOME")); + setupAide options; + + int err = 0; + int cmdCheck = 0; + + int wordSize = 8; + int Nelements; + + int N; + int okl = 1; + int Ntests = -1; + + while(1) { + static struct option long_options[] = + { + {"p-order", required_argument, 0, 'p'}, + {"elements", required_argument, 0, 'e'}, + {"backend", required_argument, 0, 'b'}, + {"arch", required_argument, 0, 'a'}, + {"fp32", no_argument, 0, 'f'}, + {"help", required_argument, 0, 'h'}, + {"iterations", required_argument, 0, 'i'}, + {0, 0, 0, 0} + }; + int option_index = 0; + int c = getopt_long (argc, argv, "", long_options, &option_index); + + if (c == -1) + break; + + switch(c) { + case 'p': + N = atoi(optarg); + cmdCheck++; + break; + case 'e': + Nelements = atoi(optarg); + cmdCheck++; + break; + case 'b': + options.setArgs("THREAD MODEL", std::string(optarg)); + cmdCheck++; + break; + case 'f': + wordSize = 4;; + break; + case 'i': + Ntests = atoi(optarg); + break; + case 'h': + err = 1; + break; + default: + err = 1; + } + } + + if(err || cmdCheck != 3) { + if(rank == 0) + printf("Usage: ./nekrs-fdm --p-order --elements --backend \n" + " [--fp32] [--iterations ]\n"); + exit(1); + } + + if(N <= 2){ + if(rank == 0){ + printf("Error: N > 2!\n"); + } + exit(1); + } + + Nelements = std::max(1, Nelements/size); + const int Nq = N + 1; + const int Np = Nq * Nq * Nq; + + platform = platform_t::getInstance(options, MPI_COMM_WORLD, MPI_COMM_WORLD); + platform->options.setArgs("BUILD ONLY", "FALSE"); + + const int verbosity = 2; + if (Ntests != -1) { + benchmarkFDM(Nelements, Nq, wordSize, false, false, verbosity, Ntests, true, ""); + } + else { + const double targetTime = 10.0; + benchmarkFDM(Nelements, Nq, wordSize, false, false, verbosity, targetTime, true, ""); + } + + MPI_Finalize(); + exit(0); +} diff --git a/src/cds/cds.cpp b/src/cds/cds.cpp deleted file mode 100644 index 7d1f78a57..000000000 --- a/src/cds/cds.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include "nrs.hpp" -#include "linAlg.hpp" - -occa::memory cdsSolve(const int is, cds_t* cds, dfloat time, int stage) -{ - - mesh_t* mesh; - oogs_t* gsh; - if(is) { - mesh = cds->meshV; - gsh = cds->gsh; - } else { - mesh = cds->mesh[0]; - gsh = cds->gshT; - } - elliptic_t* solver = cds->solver[is]; - - platform->o_mempool.slice0.copyFrom(cds->o_S, cds->fieldOffset[is] * sizeof(dfloat), 0, cds->fieldOffsetScan[is] * sizeof(dfloat)); - - //enforce Dirichlet BCs - platform->linAlg->fill(cds->fieldOffset[is], -1.0*std::numeric_limits::max(), platform->o_mempool.slice2); - for (int sweep = 0; sweep < 2; sweep++) { - cds->dirichletBCKernel(mesh->Nelements, - cds->fieldOffset[is], - is, - time, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - cds->o_EToB[is], - *(cds->o_usrwrk), - platform->o_mempool.slice2); - - //take care of Neumann-Dirichlet shared edges across elements - if(sweep == 0) oogs::startFinish(platform->o_mempool.slice2, 1, cds->fieldOffset[is], ogsDfloat, ogsMax, gsh); - if(sweep == 1) oogs::startFinish(platform->o_mempool.slice2, 1, cds->fieldOffset[is], ogsDfloat, ogsMin, gsh); - } - if (solver->Nmasked) cds->maskCopyKernel(solver->Nmasked, 0, solver->o_maskIds, platform->o_mempool.slice2, platform->o_mempool.slice0); - - //build RHS - platform->o_mempool.slice1.copyFrom(cds->o_BF, cds->fieldOffset[is] * sizeof(dfloat), 0, cds->fieldOffsetScan[is] * sizeof(dfloat)); - cds->helmholtzRhsBCKernel(mesh->Nelements, - mesh->o_sgeo, - mesh->o_vmapM, - mesh->o_EToB, - is, - time, - cds->fieldOffset[is], - mesh->o_x, - mesh->o_y, - mesh->o_z, - platform->o_mempool.slice0, - cds->o_EToB[is], - *(cds->o_usrwrk), - platform->o_mempool.slice1); - - ellipticSolve(solver, platform->o_mempool.slice1, platform->o_mempool.slice0); - - return platform->o_mempool.slice0; -} - - diff --git a/src/cds/cds.hpp b/src/cds/cds.hpp index 80c285b26..8ecd2c524 100644 --- a/src/cds/cds.hpp +++ b/src/cds/cds.hpp @@ -14,6 +14,7 @@ struct cds_t { + static constexpr double targetBenchmark {0.1}; int dim, elementType; mesh_t* mesh[NSCALAR_MAX]; @@ -31,6 +32,7 @@ struct cds_t oogs_t *gsh, *gshT; dlong vFieldOffset; + dlong vCubatureOffset; dfloat idt; dfloat *dt; int tstep; @@ -42,10 +44,7 @@ struct cds_t int compute[NSCALAR_MAX]; - dfloat* U, * S; - dfloat* rkNS; - // dfloat *rhsS; - dfloat* rkS; + dfloat *U, *S; // filter int filterNc; @@ -70,12 +69,11 @@ struct cds_t occa::memory* o_usrwrk; int Nsubsteps; - dfloat sdt; + dfloat* Ue; occa::memory o_Ue; - int var_coeff; - dfloat* prop, * ellipticCoeff; + dfloat* prop; occa::memory o_prop, o_ellipticCoeff; occa::memory o_rho, o_diff; @@ -106,8 +104,8 @@ struct cds_t occa::kernel advectionSurfaceKernel; occa::kernel advectionCubatureVolumeKernel; occa::kernel advectionCubatureSurfaceKernel; - occa::kernel advectionStrongVolumeKernel; - occa::kernel advectionStrongCubatureVolumeKernel; + occa::kernel strongAdvectionVolumeKernel; + occa::kernel strongAdvectionCubatureVolumeKernel; occa::kernel advectMeshVelocityKernel; occa::kernel helmholtzRhsIpdgBCKernel; diff --git a/src/cds/cdsSolve.cpp b/src/cds/cdsSolve.cpp new file mode 100644 index 000000000..0760a680b --- /dev/null +++ b/src/cds/cdsSolve.cpp @@ -0,0 +1,40 @@ +#include +#include "nrs.hpp" +#include "linAlg.hpp" + +occa::memory cdsSolve(const int is, cds_t* cds, dfloat time, int stage) +{ + platform->timer.tic("scalar rhs", 1); + mesh_t* mesh = cds->mesh[0]; + oogs_t* gsh = cds->gshT; + if(is) { + mesh = cds->meshV; + gsh = cds->gsh; + } + + occa::memory o_Si = cds->o_S.slice(cds->fieldOffsetScan[is] * sizeof(dfloat), cds->fieldOffset[is] * sizeof(dfloat)); + + platform->o_mempool.slice1.copyFrom(cds->o_BF, cds->fieldOffset[is] * sizeof(dfloat), 0, cds->fieldOffsetScan[is] * sizeof(dfloat)); + cds->helmholtzRhsBCKernel(mesh->Nelements, + mesh->o_sgeo, + mesh->o_vmapM, + mesh->o_EToB, + is, + time, + cds->fieldOffset[is], + mesh->o_x, + mesh->o_y, + mesh->o_z, + o_Si, + cds->o_EToB[is], + *(cds->o_usrwrk), + platform->o_mempool.slice1); + + platform->timer.toc("scalar rhs"); + platform->o_mempool.slice0.copyFrom(o_Si, mesh->Nlocal * sizeof(dfloat)); + ellipticSolve(cds->solver[is], platform->o_mempool.slice1, platform->o_mempool.slice0); + + return platform->o_mempool.slice0; +} + + diff --git a/src/cds/registerCdsKernels.cpp b/src/cds/registerCdsKernels.cpp new file mode 100644 index 000000000..6d44c24a2 --- /dev/null +++ b/src/cds/registerCdsKernels.cpp @@ -0,0 +1,184 @@ +#include +#include +#include "re2Reader.hpp" +#include "benchmarkAdvsub.hpp" + +void registerCdsKernels(occa::properties kernelInfoBC) { + const bool serial = platform->serial; + const std::string extension = serial ? ".c" : ".okl"; + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + occa::properties kernelInfo = platform->kernelInfo; + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + kernelInfo["include_paths"].asArray(); + + int N, cubN; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); + const int Nq = N + 1; + const int cubNq = cubN + 1; + const int Np = Nq * Nq * Nq; + const int cubNp = cubNq * cubNq * cubNq; + constexpr int Nfaces{6}; + + constexpr int NVfields{3}; + kernelInfo["defines/p_NVfields"] = NVfields; + + int Nsubsteps = 0; + int nBDF = 0; + int nEXT = 0; + platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); + + if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { + nBDF = 1; + } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { + nBDF = 2; + } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { + nBDF = 3; + } + nEXT = 3; + if (Nsubsteps) + nEXT = nBDF; + + + std::string fileName, kernelName; + const std::string suffix = "Hex3D"; + const std::string oklpath = installDir + "/okl/"; + const std::string section = "cds-"; + occa::properties meshProps = kernelInfo; + meshProps += meshKernelProperties(N); + { + kernelName = "relativeMassHighestMode"; + fileName = oklpath + "cds/regularization/" + kernelName + ".okl"; + platform->kernels.add(kernelName, fileName, meshProps); + + kernelName = "computeMaxVisc"; + fileName = oklpath + "cds/regularization/" + kernelName + ".okl"; + platform->kernels.add(kernelName, fileName, meshProps); + + kernelName = "interpolateP1"; + fileName = oklpath + "cds/regularization/" + kernelName + ".okl"; + platform->kernels.add(kernelName, fileName, meshProps); + + { + occa::properties prop = meshProps; + prop["defines/p_cubNq"] = cubNq; + prop["defines/p_cubNp"] = cubNp; + + kernelName = "strongAdvectionVolume" + suffix; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "strongAdvectionCubatureVolume" + suffix; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + + kernelName = "advectMeshVelocityHex3D"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "maskCopy"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + { + occa::properties prop = kernelInfo; + const int movingMesh = + platform->options.compareArgs("MOVING MESH", "TRUE"); + prop["defines/p_MovingMesh"] = movingMesh; + prop["defines/p_nEXT"] = nEXT; + prop["defines/p_nBDF"] = nBDF; + if (Nsubsteps) + prop["defines/p_SUBCYCLING"] = 1; + else + prop["defines/p_SUBCYCLING"] = 0; + + kernelName = "sumMakef"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + + kernelName = "helmholtzBC" + suffix; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + kernelName = "dirichletBC"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + kernelName = "setEllipticCoeff"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfo); + + kernelName = "filterRT" + suffix; + fileName = oklpath + "cds/regularization/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "nStagesSum3"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, platform->kernelInfo); + + { + occa::properties prop = meshProps; + const int movingMesh = + platform->options.compareArgs("MOVING MESH", "TRUE"); + prop["defines/p_MovingMesh"] = movingMesh; + prop["defines/p_nEXT"] = nEXT; + prop["defines/p_nBDF"] = nBDF; + prop["defines/p_cubNq"] = cubNq; + prop["defines/p_cubNp"] = cubNp; + + occa::properties subCycleStrongCubatureProps = prop; + + int nelgt, nelgv; + const std::string meshFile = platform->options.getArgs("MESH FILE"); + re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); + const int NelemBenchmark = nelgv/platform->comm.mpiCommSize; + bool verbose = platform->options.compareArgs("VERBOSE", "TRUE"); + const int verbosity = verbose ? 2 : 1; + + int Nsubsteps; + platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); + + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE") && Nsubsteps) { + auto subCycleKernel = + benchmarkAdvsub(1, NelemBenchmark, Nq, cubNq, nEXT, true, true, verbosity, cds_t::targetBenchmark, false); + + kernelName = "subCycleStrongCubatureVolume" + suffix; + platform->kernels.add(section + kernelName, subCycleKernel); + } + + kernelName = "subCycleStrongVolume" + suffix; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "subCycleRKUpdate"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + kernelName = "subCycleRK"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "subCycleInitU0"; + fileName = oklpath + "cds/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + } +} diff --git a/src/core/alignment.cpp b/src/core/alignment.cpp new file mode 100644 index 000000000..0765574a1 --- /dev/null +++ b/src/core/alignment.cpp @@ -0,0 +1,32 @@ +#include "alignment.hpp" +std::string to_string(boundaryAlignment_t a) +{ + switch (a) { + case boundaryAlignment_t::X: + return "X"; + case boundaryAlignment_t::Y: + return "Y"; + case boundaryAlignment_t::Z: + return "Z"; + case boundaryAlignment_t::UNALIGNED: + return "UNALIGNED"; + } + + return ""; +} +boundaryAlignment_t computeAlignment(const std::array &n) +{ + const dfloat alignmentTol = 1e-4; + const dfloat nxDiff = std::abs(std::abs(n[0]) - 1.0); + const dfloat nyDiff = std::abs(std::abs(n[1]) - 1.0); + const dfloat nzDiff = std::abs(std::abs(n[2]) - 1.0); + + if (nxDiff < alignmentTol) + return boundaryAlignment_t::X; + if (nyDiff < alignmentTol) + return boundaryAlignment_t::Y; + if (nzDiff < alignmentTol) + return boundaryAlignment_t::Z; + + return boundaryAlignment_t::UNALIGNED; +} \ No newline at end of file diff --git a/src/core/alignment.hpp b/src/core/alignment.hpp new file mode 100644 index 000000000..bad887ad2 --- /dev/null +++ b/src/core/alignment.hpp @@ -0,0 +1,9 @@ +#if !defined(alignment_hpp_) +#define alignment_hpp_ +#include "nrssys.hpp" +#include +#include +enum class boundaryAlignment_t { X, Y, Z, UNALIGNED }; +std::string to_string(boundaryAlignment_t a); +boundaryAlignment_t computeAlignment(const std::array &n); +#endif \ No newline at end of file diff --git a/src/core/bcMap.cpp b/src/core/bcMap.cpp deleted file mode 100644 index 9996ed4a1..000000000 --- a/src/core/bcMap.cpp +++ /dev/null @@ -1,345 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "nrs.hpp" -#include "platform.hpp" -#include "udf.hpp" - -#define NOTBOUNDARY 0 -#define DIRICHLET 1 -#define NEUMANN 2 - -// stores for every (field, boundaryID) pair a bcID -static std::map, int> bToBc; -static int nbid[] = {0, 0}; - -static std::map vBcTextToID = { - {"periodic", 0}, - {"zerovalue", 1}, - {"fixedvalue", 2}, - {"zerogradient", 3}, - {"zeroxvalue/zerogradient", 4}, - {"zeroyvalue/zerogradient", 5}, - {"zerozvalue/zerogradient", 6} -}; - -static std::map vBcIDToText = { - {0, "periodic" }, - {1, "zeroValue" }, - {2, "fixedValue" }, - {3, "zeroGradient" }, - {4, "zeroXValue/zeroGradient"}, - {5, "zeroYValue/zeroGradient"}, - {6, "zeroZValue/zeroGradient"} -}; - -static std::map sBcTextToID = { - {"periodic", 0}, - {"fixedvalue", 1}, - {"zerogradient", 2}, - {"fixedgradient", 3} -}; - -static std::map sBcIDToText = { - {0, "periodic" }, - {1, "fixedValue" }, - {2, "zeroGradient" }, - {3, "fixedGradient"} -}; - -static void v_setup(std::string s); -static void m_setup(std::string s); -static void s_setup(std::string s); - -static void m_setup(std::string field, std::vector slist) -{ - for(int i = 0; i < slist.size(); i++) { - std::string key = slist[i]; - if (key.compare("p") == 0) key = "periodic"; - if (key.compare("w") == 0) key = "zerovalue"; - if (key.compare("wall") == 0) key = "zerovalue"; - if (key.compare("inlet") == 0) key = "fixedvalue"; - if (key.compare("v") == 0) key = "zerovalue"; // non-moving boundary, which is the same as a wall - if (key.compare("mv") == 0) key = "fixedvalue"; - if (key.compare("outlet") == 0) key = "zerogradient"; - if (key.compare("outflow") == 0) key = "zerogradient"; - if (key.compare("o") == 0) key = "zerogradient"; - if (key.compare("slipx") == 0) key = "zeroxvalue/zerogradient"; - if (key.compare("slipy") == 0) key = "zeroyvalue/zerogradient"; - if (key.compare("slipz") == 0) key = "zerozvalue/zerogradient"; - if (key.compare("symx") == 0) key = "zeroxvalue/zerogradient"; - if (key.compare("symy") == 0) key = "zeroyvalue/zerogradient"; - if (key.compare("symz") == 0) key = "zerozvalue/zerogradient"; - - if (vBcTextToID.find(key) == vBcTextToID.end()) { - std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; - ABORT(1); - } - - try - { - bToBc[make_pair(field, i)] = vBcTextToID.at(key); - } - catch (const std::out_of_range& oor) - { - std::cout << "Out of Range error: " << oor.what() << "!\n"; - ABORT(1); - } - } -} -static void v_setup(std::string field, std::vector slist) -{ - for(int i = 0; i < slist.size(); i++) { - std::string key = slist[i]; - if (key.compare("p") == 0) key = "periodic"; - if (key.compare("w") == 0) key = "zerovalue"; - if (key.compare("wall") == 0) key = "zerovalue"; - if (key.compare("inlet") == 0) key = "fixedvalue"; - if (key.compare("v") == 0 || key.compare("mv") == 0) key = "fixedvalue"; - if (key.compare("outlet") == 0) key = "zerogradient"; - if (key.compare("outflow") == 0) key = "zerogradient"; - if (key.compare("o") == 0) key = "zerogradient"; - if (key.compare("slipx") == 0) key = "zeroxvalue/zerogradient"; - if (key.compare("slipy") == 0) key = "zeroyvalue/zerogradient"; - if (key.compare("slipz") == 0) key = "zerozvalue/zerogradient"; - if (key.compare("symx") == 0) key = "zeroxvalue/zerogradient"; - if (key.compare("symy") == 0) key = "zeroyvalue/zerogradient"; - if (key.compare("symz") == 0) key = "zerozvalue/zerogradient"; - - if (vBcTextToID.find(key) == vBcTextToID.end()) { - std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; - ABORT(1); - } - - try - { - bToBc[make_pair(field, i)] = vBcTextToID.at(key); - } - catch (const std::out_of_range& oor) - { - std::cout << "Out of Range error: " << oor.what() << "!\n"; - ABORT(1); - } - } -} - -static void s_setup(std::string field, std::vector slist) -{ - for(int i = 0; i < slist.size(); i++) { - std::string key = slist[i]; - if (key.compare("p") == 0) key = "periodic"; - if (key.compare("t") == 0) key = "fixedvalue"; - if (key.compare("inlet") == 0) key = "fixedvalue"; - if (key.compare("flux") == 0) key = "fixedgradient"; - if (key.compare("f") == 0) key = "fixedgradient"; - if (key.compare("zeroflux") == 0) key = "zerogradient"; - if (key.compare("i") == 0) key = "zerogradient"; - if (key.compare("insulated") == 0) key = "zerogradient"; - if (key.compare("outflow") == 0) key = "zerogradient"; - if (key.compare("outlet") == 0) key = "zerogradient"; - if (key.compare("o") == 0) key = "zerogradient"; - - if (sBcTextToID.find(key) == sBcTextToID.end()) { - std::cout << "Invalid bcType " << "\'" << key << "\'" << "!\n"; - ABORT(1); - } - - try - { - bToBc[make_pair(field, i)] = sBcTextToID.at(key); - } - catch (const std::out_of_range& oor) - { - std::cout << "Out of Range error: " << oor.what() << "!\n"; - ABORT(1); - } - } -} - -namespace bcMap -{ -void setup(std::vector slist, std::string field) -{ - if (slist.size() == 0 || slist[0].compare("none") == 0) return; - - nbid[0] = slist.size(); - if (field.compare(0, 8, "scalar00") == 0) nbid[1] = slist.size(); - - if (field.compare("velocity") == 0) - v_setup(field, slist); - else if (field.compare("mesh") == 0) - m_setup(field, slist); - else if (field.compare(0, 6, "scalar") == 0) - s_setup(field, slist); -} - -int id(int bid, std::string field) -{ - if (bid < 1) return NOTBOUNDARY; - - return bToBc[{field, bid - 1}]; -} - -int type(int bid, std::string field) -{ - if (bid < 1) return NOTBOUNDARY; - - int bcType = -1; - - if (field.compare("x-velocity") == 0) { - const int bcID = bToBc[{"velocity", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = DIRICHLET; - if (bcID == 5) bcType = NEUMANN; - if (bcID == 6) bcType = NEUMANN; - if (bcID == 2) oudfFindDirichlet(field); - } else if (field.compare("y-velocity") == 0) { - const int bcID = bToBc[{"velocity", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = NEUMANN; - if (bcID == 5) bcType = DIRICHLET; - if (bcID == 6) bcType = NEUMANN; - if (bcID == 2) oudfFindDirichlet(field); - } else if (field.compare("z-velocity") == 0) { - const int bcID = bToBc[{"velocity", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = NEUMANN; - if (bcID == 5) bcType = NEUMANN; - if (bcID == 6) bcType = DIRICHLET; - if (bcID == 2) oudfFindDirichlet(field); - } else if (field.compare("x-mesh") == 0) { - const int bcID = bToBc[{"mesh", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = DIRICHLET; - if (bcID == 5) bcType = NEUMANN; - if (bcID == 6) bcType = NEUMANN; - } else if (field.compare("y-mesh") == 0) { - const int bcID = bToBc[{"mesh", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = NEUMANN; - if (bcID == 5) bcType = DIRICHLET; - if (bcID == 6) bcType = NEUMANN; - } else if (field.compare("z-mesh") == 0) { - const int bcID = bToBc[{"mesh", bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = DIRICHLET; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 4) bcType = NEUMANN; - if (bcID == 5) bcType = NEUMANN; - if (bcID == 6) bcType = DIRICHLET; - } else if (field.compare("pressure") == 0) { - const int bcID = bToBc[{"velocity", bid - 1}]; - if (bcID == 1) bcType = NEUMANN; - if (bcID == 2) bcType = NEUMANN; - if (bcID == 3) bcType = DIRICHLET; - if (bcID == 4) bcType = NEUMANN; - if (bcID == 5) bcType = NEUMANN; - if (bcID == 6) bcType = NEUMANN; - if (bcID == 3) oudfFindDirichlet(field); - } else if (field.compare(0, 6, "scalar") == 0) { - const int bcID = bToBc[{field, bid - 1}]; - if (bcID == 1) bcType = DIRICHLET; - if (bcID == 2) bcType = NEUMANN; - if (bcID == 3) bcType = NEUMANN; - if (bcID == 1) oudfFindDirichlet(field); - if (bcID == 3) oudfFindNeumann(field); - } - - if(bcType < 0) { - std::cout << __func__ << "(): Unexpected error occured!" << std::endl; - ABORT(1); - } - - return bcType; -} - -std::string text(int bid, std::string field) -{ - if (bid < 1) return std::string(); - - const int bcID = bToBc[{field, bid - 1}]; - if (field.compare("velocity") == 0 || field.compare("mesh") == 0) - - return vBcIDToText[bcID]; - - else if (field.compare(0, 6, "scalar") == 0) - - return sBcIDToText[bcID]; - - - std::cout << __func__ << "(): Unexpected error occured!" << std::endl; - ABORT(1); - return 0; -} - -int size(int isTmesh) -{ - return isTmesh ? nbid[1] : nbid[0]; -} - -void check(mesh_t* mesh) -{ - - int nid = nbid[0]; - if(mesh->cht) nid = nbid[1]; - - int retval = 0; - - retval = 0; - for (int id = 1; id <= nid; id++) { - retval = 0; - for (int f = 0; f < mesh->Nelements * mesh->Nfaces; f++) { - if (mesh->EToB[f] == id) retval = 1; - } - MPI_Allreduce(MPI_IN_PLACE, &retval, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); - if (retval == 0) { - if (platform->comm.mpiRank == 0) printf("Cannot find boundary ID %d in mesh!\n", id); - ABORT(1); - } - } - - retval = 0; - for (int f = 0; f < mesh->Nelements * mesh->Nfaces; f++) - if (mesh->EToB[f] < -1 || mesh->EToB[f] == 0 || mesh->EToB[f] > nid) retval = 1; - MPI_Allreduce(MPI_IN_PLACE, &retval, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); - if (retval > 0) { - if (platform->comm.mpiRank == 0) printf("Mesh has unmapped boundary IDs!\n"); - ABORT(1); - } - - -} - -void setBcMap(std::string field, int* map, int nIDs) -{ - if (field.compare(0, 8, "scalar00") == 0) - nbid[1] = nIDs; - else - nbid[0] = nIDs; - - try - { - for(int i = 0; i < nIDs; i++) bToBc[make_pair(field, i)] = map[i]; - } - catch (const std::out_of_range& oor) - { - std::cout << "Out of Range error: " << oor.what() << "!\n"; - ABORT(1); - } -} -} // namespace diff --git a/src/core/compileKernels.cpp b/src/core/compileKernels.cpp index 607059286..973fed5ea 100644 --- a/src/core/compileKernels.cpp +++ b/src/core/compileKernels.cpp @@ -1,14 +1,13 @@ +#include #include "elliptic.h" #include "mesh.h" #include "ogs.hpp" #include "ogsKernels.hpp" #include "udf.hpp" -#include #include +#include -namespace { -static occa::properties kernelInfoBC; -std::string createOptionsPrefix(const std::string §ion) { +std::string createOptionsPrefix(std::string section) { std::string prefix = section + std::string(" "); if (section.find("temperature") != std::string::npos) { prefix = std::string("scalar00 "); @@ -20,1416 +19,90 @@ std::string createOptionsPrefix(const std::string §ion) { return prefix; } -void registerGMRESKernels(const std::string §ion, int Nfields) { - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/elliptic/"; - std::string filename; - const bool serial = (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"); - - const std::string sectionIdentifier = std::to_string(Nfields) + "-"; - - occa::properties gmresKernelInfo = platform->kernelInfo; - gmresKernelInfo["defines/p_Nfields"] = Nfields; - std::string kernelName = "gramSchmidtOrthogonalization"; - filename = serial ? oklpath + "ellipticGramSchmidtOrthogonalization.c" - : oklpath + "ellipticGramSchmidtOrthogonalization.okl"; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, gmresKernelInfo); - filename = serial ? oklpath + "ellipticUpdatePGMRES.c" - : oklpath + "ellipticUpdatePGMRES.okl"; - kernelName = "updatePGMRESSolution"; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, gmresKernelInfo); - filename = serial ? oklpath + "ellipticFusedResidualAndNorm.c" - : oklpath + "ellipticFusedResidualAndNorm.okl"; - kernelName = "fusedResidualAndNorm"; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, gmresKernelInfo); -} - -void registerNrsKernels() { - const device_t &device = platform->device; - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - // build kernels - std::string fileName, kernelName; - const std::string suffix = "Hex3D"; - const std::string oklpath = install_dir + "/okl/"; - int N, cubN; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); - const int Nq = N + 1; - const int cubNq = cubN + 1; - const int Np = Nq * Nq * Nq; - const int cubNp = cubNq * cubNq * cubNq; - constexpr int Nfaces{6}; - - occa::properties kernelInfo = platform->kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - kernelInfo["include_paths"].asArray(); - - constexpr int NVfields{3}; - kernelInfo["defines/p_NVfields"] = NVfields; - - int Nsubsteps = 0; - int nBDF = 0; - int nEXT = 0; - platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); - - if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { - nBDF = 1; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { - nBDF = 2; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { - nBDF = 3; - } - nEXT = 3; - if (Nsubsteps) - nEXT = nBDF; - - { - fileName = oklpath + "core/nStagesSum.okl"; - kernelName = "nStagesSum3"; - const std::string section = "nrs-"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, platform->kernelInfo); - - fileName = oklpath + "nrs/computeFieldDotNormal.okl"; - kernelName = "computeFieldDotNormal"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, platform->kernelInfo); - - occa::properties centroidProp = kernelInfo; - centroidProp["defines/p_Nfp"] = Nq * Nq; - centroidProp["defines/p_Nfaces"] = Nfaces; - fileName = oklpath + "nrs/computeFaceCentroid.okl"; - kernelName = "computeFaceCentroid"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, centroidProp); - - occa::properties meshProps = kernelInfo; - meshProps += populateMeshProperties(N); - - { - occa::properties prop = meshProps; - prop["defines/p_cubNq"] = cubNq; - prop["defines/p_cubNp"] = cubNp; - fileName = oklpath + "nrs/advection" + suffix + ".okl"; - kernelName = "strongAdvectionVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - kernelName = "strongAdvectionCubatureVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - } - - fileName = oklpath + "nrs/curl" + suffix + ".okl"; - kernelName = "curl" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "nrs/gradient" + suffix + ".okl"; - kernelName = "gradientVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - kernelName = "nrswGradientVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - { - occa::properties prop = kernelInfo; - const int movingMesh = - platform->options.compareArgs("MOVING MESH", "TRUE"); - prop["defines/p_nEXT"] = nEXT; - prop["defines/p_nBDF"] = nBDF; - prop["defines/p_MovingMesh"] = movingMesh; - if (Nsubsteps) - prop["defines/p_SUBCYCLING"] = 1; - else - prop["defines/p_SUBCYCLING"] = 0; - - fileName = oklpath + "nrs/sumMakef.okl"; - kernelName = "sumMakef"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - } - - fileName = oklpath + "nrs/divergence" + suffix + ".okl"; - kernelName = "nrswDivergenceVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - kernelName = "divergenceVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - - kernelName = "divergenceSurfaceTOMBO" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - - fileName = oklpath + "nrs/advectMeshVelocityHex3D.okl"; - kernelName = "advectMeshVelocityHex3D"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "nrs/pressureRhs" + suffix + ".okl"; - kernelName = "pressureRhsTOMBO" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "nrs/pressureStress" + suffix + ".okl"; - kernelName = "pressureStress" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "nrs/pressureBC" + suffix + ".okl"; - kernelName = "pressureDirichletBC" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - - fileName = oklpath + "nrs/velocityRhs" + suffix + ".okl"; - kernelName = "velocityRhsTOMBO" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "nrs/velocityBC" + suffix + ".okl"; - kernelName = "velocityDirichletBC" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - - kernelName = "velocityNeumannBC" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - - occa::properties prop = meshProps; - const int movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); - prop["defines/p_relative"] = movingMesh && Nsubsteps; - prop["defines/p_cubNq"] = cubNq; - prop["defines/p_cubNp"] = cubNp; - fileName = oklpath + "nrs/Urst" + suffix + ".okl"; - - const bool serial = (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"); - if(serial) fileName = oklpath + "nrs/Urst" + suffix + ".c"; - kernelName = "UrstCubature" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - fileName = oklpath + "nrs/Urst" + suffix + ".okl"; - kernelName = "Urst" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - { - occa::properties prop = meshProps; - const int movingMesh = - platform->options.compareArgs("MOVING MESH", "TRUE"); - prop["defines/p_MovingMesh"] = movingMesh; - prop["defines/p_nEXT"] = nEXT; - prop["defines/p_nBDF"] = nBDF; - prop["defines/p_cubNq"] = cubNq; - prop["defines/p_cubNp"] = cubNp; - - fileName = oklpath + "nrs/subCycle" + suffix + ".okl"; - occa::properties subCycleStrongCubatureProps = prop; - if (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP") { - fileName = oklpath + "nrs/subCycle" + suffix + ".c"; - } - kernelName = "subCycleStrongCubatureVolume" + suffix; - platform->kernels.add_kernel(section + kernelName, - fileName, - kernelName, - subCycleStrongCubatureProps); - fileName = oklpath + "nrs/subCycle" + suffix + ".okl"; - - kernelName = "subCycleStrongVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - fileName = oklpath + "nrs/subCycleRKUpdate" + ".okl"; - kernelName = "subCycleERKUpdate"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - kernelName = "subCycleRK"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - kernelName = "subCycleInitU0"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - } - - fileName = oklpath + "nrs/extrapolate" + ".okl"; - kernelName = "multiExtrapolate"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "core/mask" + ".okl"; - kernelName = "maskCopy"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfo); - kernelName = "mask"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfo); - - fileName = oklpath + "nrs/regularization/filterRT" + suffix + ".okl"; - kernelName = "filterRT" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - occa::properties cflProps = meshProps; - cflProps["defines/p_MovingMesh"] = movingMesh; - fileName = oklpath + "nrs/cfl" + suffix + ".okl"; - kernelName = "cfl" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, cflProps); - - fileName = oklpath + "nrs/pressureAddQtl" + ".okl"; - kernelName = "pressureAddQtl"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "core/setEllipticCoeff.okl"; - kernelName = "setEllipticCoeff"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfo); - kernelName = "setEllipticCoeffPressure"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfo); - } -} -void registerCdsKernels() { - const device_t &device = platform->device; - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - occa::properties kernelInfo = platform->kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - kernelInfo["include_paths"].asArray(); - - int N, cubN; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); - const int Nq = N + 1; - const int cubNq = cubN + 1; - const int Np = Nq * Nq * Nq; - const int cubNp = cubNq * cubNq * cubNq; - constexpr int Nfaces{6}; - - constexpr int NVfields{3}; - kernelInfo["defines/p_NVfields"] = NVfields; - - int Nsubsteps = 0; - int nBDF = 0; - int nEXT = 0; - platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); - - if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { - nBDF = 1; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { - nBDF = 2; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { - nBDF = 3; - } - nEXT = 3; - if (Nsubsteps) - nEXT = nBDF; - - - std::string fileName, kernelName; - const std::string suffix = "Hex3D"; - const std::string oklpath = install_dir + "/okl/"; - const std::string section = "cds-"; - occa::properties meshProps = kernelInfo; - meshProps += populateMeshProperties(N); - { - { - occa::properties prop = meshProps; - prop["defines/p_cubNq"] = cubNq; - prop["defines/p_cubNp"] = cubNp; - fileName = oklpath + "cds/advection" + suffix + ".okl"; - - kernelName = "strongAdvectionVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - kernelName = "strongAdvectionCubatureVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - } - - fileName = oklpath + "cds/advectMeshVelocityHex3D.okl"; - kernelName = "advectMeshVelocityHex3D"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); - - fileName = oklpath + "core/mask.okl"; - kernelName = "maskCopy"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); +void compileKernels() { - { - occa::properties prop = kernelInfo; - const int movingMesh = - platform->options.compareArgs("MOVING MESH", "TRUE"); - prop["defines/p_MovingMesh"] = movingMesh; - prop["defines/p_nEXT"] = nEXT; - prop["defines/p_nBDF"] = nBDF; - if (Nsubsteps) - prop["defines/p_SUBCYCLING"] = 1; - else - prop["defines/p_SUBCYCLING"] = 0; + MPI_Barrier(platform->comm.mpiComm); + const double tStart = MPI_Wtime(); + if (platform->comm.mpiRank == 0) + printf("loading kernels (this may take awhile) ...\n"); + fflush(stdout); - fileName = oklpath + "cds/sumMakef.okl"; - kernelName = "sumMakef"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - } + const occa::properties kernelInfoBC = compileUDFKernels(); - fileName = oklpath + "cds/helmholtzBC" + suffix + ".okl"; - kernelName = "helmholtzBC" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); - kernelName = "dirichletBC"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfoBC); + registerLinAlgKernels(); - fileName = oklpath + "core/setEllipticCoeff.okl"; - kernelName = "setEllipticCoeff"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, kernelInfo); + registerMeshKernels(kernelInfoBC); - fileName = oklpath + "cds/regularization/filterRT" + suffix + ".okl"; - kernelName = "filterRT" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, meshProps); + registerNrsKernels(kernelInfoBC); - fileName = oklpath + "core/nStagesSum.okl"; - kernelName = "nStagesSum3"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, platform->kernelInfo); + int Nscalars; + platform->options.getArgs("NUMBER OF SCALARS", Nscalars); + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); - { - occa::properties prop = meshProps; - const int movingMesh = - platform->options.compareArgs("MOVING MESH", "TRUE"); - prop["defines/p_MovingMesh"] = movingMesh; - prop["defines/p_nEXT"] = nEXT; - prop["defines/p_nBDF"] = nBDF; - prop["defines/p_cubNq"] = cubNq; - prop["defines/p_cubNp"] = cubNp; + if (Nscalars) { + registerCdsKernels(kernelInfoBC); + for(int is = 0; is < Nscalars; is++){ + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << is; + std::string sid = ss.str(); + const std::string section = "scalar" + sid; + const int poisson = 0; - fileName = oklpath + "cds/subCycle" + suffix + ".okl"; - occa::properties subCycleStrongCubatureProps = prop; - if (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP") { - fileName = oklpath + "cds/subCycle" + suffix + ".c"; + if(!platform->options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")){ + registerEllipticKernels(section, poisson); + registerEllipticPreconditionerKernels(section, poisson); } - kernelName = "subCycleStrongCubatureVolume" + suffix; - platform->kernels.add_kernel(section + kernelName, - fileName, - kernelName, - subCycleStrongCubatureProps); - fileName = oklpath + "cds/subCycle" + suffix + ".okl"; - kernelName = "subCycleStrongVolume" + suffix; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - fileName = oklpath + "cds/subCycleRKUpdate.okl"; - kernelName = "subCycleERKUpdate"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - kernelName = "subCycleRK"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); - - kernelName = "subCycleInitU0"; - platform->kernels.add_kernel( - section + kernelName, fileName, kernelName, prop); } } -} -void registerCommonMGPreconditionerKernels(int N, occa::properties kernelInfo) { - const std::string prefix = "Hex3D"; - std::string filename, kernelName; - - kernelInfo["defines/pfloat"] = pfloatString; - - kernelInfo["defines/p_Nfields"] = 1; - occa::properties pfloatKernelInfo = kernelInfo; - pfloatKernelInfo["defines/dfloat"] = pfloatString; - pfloatKernelInfo["defines/pfloat"] = pfloatString; - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - - const std::string orderSuffix = std::string("_") + std::to_string(N); - - { - const std::string oklpath = install_dir + "/okl/core/"; - std::string filename; - - filename = oklpath + "mask.okl"; - kernelName = "mask"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - - filename = oklpath + "mask.okl"; - platform->kernels.add_kernel(kernelName + orderSuffix + "pfloat", - filename, - kernelName, - pfloatKernelInfo, - orderSuffix + "pfloat"); - filename = install_dir + "/okl/elliptic/ellipticLinAlg.okl"; - kernelName = "fusedCopyDfloatToPfloat"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - kernelName = "copyDfloatToPfloat"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - - kernelName = "copyPfloatToDfloat"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - - kernelName = "scaledAdd"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - kernelName = "dotMultiply"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - filename = install_dir + "/okl/elliptic/chebyshev.okl"; - kernelName = "updateSmoothedSolutionVec"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - kernelName = "updateChebyshevSolutionVec"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - - kernelName = "updateIntermediateSolutionVec"; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - kernelInfo, - orderSuffix); - } -} - -void registerSchwarzKernels(const std::string §ion, int N) { - const std::string optionsPrefix = createOptionsPrefix(section); - const int Nq = N + 1; - const int Nq_e = Nq + 2; - const int Np = Nq * Nq * Nq; - const int Np_e = Nq_e * Nq_e * Nq_e; - - bool overlap = false; - const bool serial = (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"); - if (Nq >= 5 && !serial) - overlap = true; - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/elliptic/"; - std::string filename, kernelName; - - { - occa::properties properties = platform->kernelInfo; - properties["defines/p_Nq"] = Nq; - properties["defines/p_Nq_e"] = Nq_e; - properties["defines/p_restrict"] = 0; - const std::string suffix = - std::string("_") + std::to_string(Nq_e - 1) + std::string("pfloat"); - properties["defines/p_overlap"] = (int)overlap; - if (platform->options.compareArgs( - optionsPrefix + "MULTIGRID SMOOTHER", "RAS")) - properties["defines/p_restrict"] = 1; - - filename = oklpath + "ellipticSchwarzSolverHex3D.okl"; - if (serial) { - filename = oklpath + "ellipticSchwarzSolverHex3D.c"; - } - platform->kernels.add_kernel( - "preFDM" + suffix, filename, "preFDM", properties, suffix); - platform->kernels.add_kernel( - "fusedFDM" + suffix, filename, "fusedFDM", properties, suffix); - platform->kernels.add_kernel( - "postFDM" + suffix, filename, "postFDM", properties, suffix); - } -} -void registerFineLevelKernels(const std::string §ion, int N) { - auto gen_suffix = [N](const char *floatString) { - const std::string precision = std::string(floatString); - if (precision.find(pfloatString) != std::string::npos) { - return std::string("_") + std::to_string(N) + std::string("pfloat"); - } else { - return std::string("_") + std::to_string(N); - } + // Scalar section is omitted + // as pressure section kernels are the same. + const std::vector> sections = { + {"pressure", 1}, + {"velocity", 0} }; - auto kernelInfo = ellipticKernelInfo(N); - registerCommonMGPreconditionerKernels(N, kernelInfo); - - const std::string suffix = "Hex3D"; - constexpr int Nverts{8}; - constexpr int Nfields{1}; - - kernelInfo["defines/p_Nverts"] = Nverts; - kernelInfo["defines/p_Nfields"] = Nfields; - - std::string filename, kernelName; - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/elliptic/"; - const bool serial = platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"; - - { - occa::properties AxKernelInfo = kernelInfo; - - filename = oklpath + "ellipticAx" + suffix + ".okl"; - kernelName = "ellipticAx" + suffix; - if (serial) { - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - } - { - const std::string kernelSuffix = gen_suffix(dfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - } - - if (!strstr(pfloatString, dfloatString)) { - AxKernelInfo["defines/dfloat"] = pfloatString; - kernelName = "ellipticAx" + suffix; - const std::string kernelSuffix = gen_suffix(pfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - AxKernelInfo["defines/dfloat"] = dfloatString; - } - - if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) - kernelName = "ellipticPartialAxTrilinear" + suffix; - else - kernelName = "ellipticPartialAx" + suffix; - - if (!serial) { - { - const std::string kernelSuffix = gen_suffix(dfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - } - if (!strstr(pfloatString, dfloatString)) { - AxKernelInfo["defines/dfloat"] = pfloatString; - const std::string kernelSuffix = gen_suffix(pfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - AxKernelInfo["defines/dfloat"] = dfloatString; - } - } + std::string section; + int poissonEquation; + for (auto&& entry : sections) { + std::tie(section, poissonEquation) = entry; + registerEllipticKernels(section, poissonEquation); + registerEllipticPreconditionerKernels(section, poissonEquation); } - registerSchwarzKernels(section, N); -} -void registerSEMFEMKernels(const std::string §ion, int N); -void registerLevelKernels(const std::string §ion, int Nf, int N) { - const int Nc = N; - auto gen_suffix = [N](const char *floatString) { - const std::string precision = std::string(floatString); - if (precision.find(pfloatString) != std::string::npos) { - return std::string("_") + std::to_string(N) + std::string("pfloat"); - } else { - return std::string("_") + std::to_string(N); - } - }; - - occa::properties kernelInfo = ellipticKernelInfo(N); - - const std::string suffix = "Hex3D"; - - std::string filename, kernelName; - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/elliptic/"; - registerCommonMGPreconditionerKernels(N, kernelInfo); - - constexpr int Nverts{8}; - const bool serial = platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"; - - constexpr int elementType = HEXAHEDRA; { - kernelInfo["defines/p_Nverts"] = Nverts; - occa::properties AxKernelInfo = kernelInfo; - filename = oklpath + "ellipticAx" + suffix + ".okl"; - kernelName = "ellipticAx" + suffix; - if (serial) { - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - } - { - const std::string kernelSuffix = gen_suffix(dfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - } - if (!strstr(pfloatString, dfloatString)) { - AxKernelInfo["defines/dfloat"] = pfloatString; - kernelName = "ellipticAx" + suffix; - { - const std::string kernelSuffix = gen_suffix(pfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - } - AxKernelInfo["defines/dfloat"] = dfloatString; - } - - // check for trilinear - if (elementType != HEXAHEDRA) { - kernelName = "ellipticPartialAx" + suffix; - } else { - if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) - kernelName = "ellipticPartialAxTrilinear" + suffix; - else - kernelName = "ellipticPartialAx" + suffix; - } - - if (!serial) { - { - const std::string kernelSuffix = gen_suffix(dfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - } - if (!strstr(pfloatString, dfloatString)) { - AxKernelInfo["defines/dfloat"] = pfloatString; - const std::string kernelSuffix = gen_suffix(pfloatString); - platform->kernels.add_kernel(kernelName + kernelSuffix, - filename, - kernelName, - AxKernelInfo, - kernelSuffix); - AxKernelInfo["defines/dfloat"] = dfloatString; - } - } - } - - { - filename = oklpath + "ellipticBlockJacobiPrecon.okl"; - kernelName = "ellipticBlockJacobiPrecon"; - // sizes for the coarsen and prolongation kernels. degree NFine to degree N - int NqFine = (Nf + 1); - int NqCoarse = (Nc + 1); - occa::properties coarsenProlongateKernelInfo = kernelInfo; - coarsenProlongateKernelInfo["defines/p_NqFine"] = Nf + 1; - coarsenProlongateKernelInfo["defines/p_NqCoarse"] = Nc + 1; - - const int NpFine = (Nf + 1) * (Nf + 1) * (Nf + 1); - const int NpCoarse = (Nc + 1) * (Nc + 1) * (Nc + 1); - coarsenProlongateKernelInfo["defines/p_NpFine"] = NpFine; - coarsenProlongateKernelInfo["defines/p_NpCoarse"] = NpCoarse; - - const std::string orderSuffix = std::string("_") + std::to_string(Nf); - - if (serial) { - filename = oklpath + "ellipticPreconCoarsen" + suffix + ".c"; - kernelName = "ellipticPreconCoarsen" + suffix; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - coarsenProlongateKernelInfo, - orderSuffix); - filename = oklpath + "ellipticPreconProlongate" + suffix + ".c"; - kernelName = "ellipticPreconProlongate" + suffix; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - coarsenProlongateKernelInfo, - orderSuffix); - } else { - filename = oklpath + "ellipticPreconCoarsen" + suffix + ".okl"; - kernelName = "ellipticPreconCoarsen" + suffix; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - coarsenProlongateKernelInfo, - orderSuffix); - filename = oklpath + "ellipticPreconProlongate" + suffix + ".okl"; - kernelName = "ellipticPreconProlongate" + suffix; - platform->kernels.add_kernel(kernelName + orderSuffix, - filename, - kernelName, - coarsenProlongateKernelInfo, - orderSuffix); - } - } - { registerSchwarzKernels(section, N); } -} -void registerMultiGridKernels(const std::string §ion) { - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - const std::string optionsPrefix = createOptionsPrefix(section); - - registerFineLevelKernels(section, N); - - auto levels = determineMGLevels(section); - - for (unsigned levelIndex = 1U; levelIndex < levels.size(); ++levelIndex) { - const int levelFine = levels[levelIndex - 1]; - const int levelCoarse = levels[levelIndex]; - registerLevelKernels(section, levelFine, levelCoarse); - } - const int coarseLevel = levels.back(); - if (platform->options.compareArgs( - optionsPrefix + "MULTIGRID COARSE SOLVE", "TRUE")) { - if (platform->options.compareArgs( - optionsPrefix + "MULTIGRID COARSE SEMFEM", "TRUE")) { - registerSEMFEMKernels(section, coarseLevel); - } else { - { - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/"; - std::string fileName = oklpath + "parAlmond/convertFP64ToFP32.okl"; - std::string kernelName = "convertFP64ToFP32"; - platform->kernels.add_kernel( - kernelName, fileName, kernelName, platform->kernelInfo); - - fileName = oklpath + "parAlmond/convertFP32ToFP64.okl"; - kernelName = "convertFP32ToFP64"; - platform->kernels.add_kernel( - kernelName, fileName, kernelName, platform->kernelInfo); - fileName = oklpath + "parAlmond/vectorDotStar.okl"; - kernelName = "vectorDotStar2"; - platform->kernels.add_kernel( - kernelName, fileName, kernelName, platform->kernelInfo); - } - } - } -} -void registerSEMFEMKernels(const std::string §ion, int N) { - const int Nq = N + 1; - const int Np = Nq * Nq * Nq; - const std::string optionsPrefix = createOptionsPrefix(section); - const int useFP32 = platform->options.compareArgs( - optionsPrefix + "SEMFEM SOLVER PRECISION", "FP32"); - occa::properties SEMFEMKernelProps = platform->kernelInfo; - if (useFP32) { - SEMFEMKernelProps["defines/pfloat"] = "float"; - } else { - SEMFEMKernelProps["defines/pfloat"] = "double"; - } - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/elliptic/"; - std::string filename = oklpath + "ellipticGather.okl"; - platform->kernels.add_kernel("gather", filename, "gather", SEMFEMKernelProps); - filename = oklpath + "ellipticScatter.okl"; - platform->kernels.add_kernel( - "scatter", filename, "scatter", SEMFEMKernelProps); - occa::properties stiffnessKernelInfo = platform->kernelInfo; - filename = oklpath + "ellipticSEMFEMStiffness.okl"; - stiffnessKernelInfo["defines/p_Nq"] = Nq; - stiffnessKernelInfo["defines/p_Np"] = Np; - stiffnessKernelInfo["defines/p_rows_sorted"] = 1; - stiffnessKernelInfo["defines/p_cols_sorted"] = 0; - - const bool constructOnHost = - platform->device.mode() == std::string("OpenCL") || - platform->device.mode() == std::string("HIP") || - platform->device.mode() == std::string("Serial"); - - if (!constructOnHost) { - platform->kernels.add_kernel("computeStiffnessMatrix", - filename, - "computeStiffnessMatrix", - stiffnessKernelInfo); - } -} -void registerJacobiKernels(const std::string §ion) { - const std::string optionsPrefix = createOptionsPrefix(section); - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/"; - std::string fileName = oklpath + "elliptic/ellipticJacobi.okl"; - std::string kernelName = "axmyzManyPfloat"; - platform->kernels.add_kernel( - kernelName, fileName, kernelName, platform->kernelInfo); - - kernelName = "adyManyPfloat"; - platform->kernels.add_kernel( - kernelName, fileName, kernelName, platform->kernelInfo); -} -void registerEllipticPreconditionerKernels(const std::string §ion) { - const std::string optionsPrefix = createOptionsPrefix(section); - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - - if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "MULTIGRID")) { - registerMultiGridKernels(section); - } else if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "SEMFEM")) { - registerSEMFEMKernels(section, N); - } else if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "JACOBI")) { - registerJacobiKernels(section); - } else if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "NONE")) { - // nothing - } else { - printf("ERROR: Unknown preconditioner!\n"); - ABORT(EXIT_FAILURE); - } -} -void registerEllipticKernels(const std::string §ion) { - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - const std::string optionsPrefix = createOptionsPrefix(section); - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - occa::properties kernelInfo = platform->kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - kernelInfo["include_paths"].asArray(); - kernelInfo += ellipticKernelInfo(N); - const int Nfields = (section.find("velocity") != std::string::npos) ? 3 : 1; - constexpr int Nverts{8}; - - const bool blockSolver = [§ion]() { - if (section.find("velocity") == std::string::npos) - return false; - if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) - return true; - if (platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) - return true; - return false; - }(); - const bool stressForm = [§ion]() { - if (section.find("velocity") == std::string::npos) - return false; - if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) - return true; - return false; - }(); - - const bool serial = platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"; - const std::string sectionIdentifier = std::to_string(Nfields) + "-"; - - if (platform->options.compareArgs( - optionsPrefix + "KRYLOV SOLVER", "PGMRES")) { - registerGMRESKernels(section, Nfields); - } - - // solution projection kernels - { - const std::string oklpath = install_dir + "/okl/elliptic/"; - std::string filename, kernelName; - - { - occa::properties properties = platform->kernelInfo; - properties["defines/p_Nfields"] = Nfields; - - filename = oklpath + "ellipticResidualProjection.okl"; - kernelName = "multiScaledAddwOffset"; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, properties); - kernelName = "accumulate"; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, properties); - } - } - - { - const std::string oklpath = install_dir + "/okl/core/"; - std::string filename; - - filename = oklpath + "mask.okl"; - platform->kernels.add_kernel("mask", filename, "mask", kernelInfo); - } - - kernelInfo["defines/p_Nverts"] = Nverts; - kernelInfo["defines/p_Nfields"] = Nfields; - - occa::properties dfloatKernelInfo = kernelInfo; - occa::properties floatKernelInfo = kernelInfo; - floatKernelInfo["defines/pfloat"] = pfloatString; - floatKernelInfo["defines/dfloat"] = pfloatString; - - constexpr bool var_coeff = true; - constexpr int elementType{HEXAHEDRA}; - - const std::string suffix = "Hex3D"; - - occa::properties AxKernelInfo = dfloatKernelInfo; - { - const std::string oklpath = install_dir + "/okl/elliptic/"; - std::string filename; - std::string kernelName; - - filename = oklpath + "ellipticBuildDiagonal" + suffix + ".okl"; - kernelName = "ellipticBlockBuildDiagonal" + suffix; - dfloatKernelInfo["defines/dfloat"] = dfloatString; - dfloatKernelInfo["defines/pfloat"] = pfloatString; - platform->kernels.add_kernel( - sectionIdentifier + kernelName, filename, kernelName, dfloatKernelInfo); - dfloatKernelInfo["defines/pfloat"] = dfloatString; - if (blockSolver) { - filename = oklpath + "ellipticBlockAx" + suffix + ".okl"; - if (serial) - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - if (var_coeff && elementType == HEXAHEDRA) { - if (stressForm) - kernelName = "ellipticStressAxVar" + suffix; - else - kernelName = - "ellipticBlockAxVar" + suffix + "_N" + std::to_string(Nfields); - } else { - if (stressForm) - kernelName = "ellipticStressAx" + suffix; - else - kernelName = "ellipticBlockAx", - suffix + "_N" + std::to_string(Nfields); - } - } else { - filename = oklpath + "ellipticAx" + suffix + ".okl"; - if (serial) - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - if (var_coeff && elementType == HEXAHEDRA) - kernelName = "ellipticAxVar" + suffix; - else - kernelName = "ellipticAx" + suffix; - } - platform->kernels.add_kernel( - kernelName, filename, kernelName, AxKernelInfo); - if (blockSolver) { - filename = oklpath + "ellipticBlockAx" + suffix + ".okl"; - if (serial) - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - if (var_coeff && elementType == HEXAHEDRA) - kernelName = - "ellipticBlockAxVar" + suffix + "_N" + std::to_string(Nfields); - else - kernelName = - "ellipticBlockAx" + suffix + "_N" + std::to_string(Nfields); - } else { - filename = oklpath + "ellipticAx" + suffix + ".okl"; - if (serial) - filename = oklpath + "ellipticSerialAx" + suffix + ".c"; - if (var_coeff && elementType == HEXAHEDRA) - kernelName = "ellipticAxVar" + suffix; - else - kernelName = "ellipticAx" + suffix; - } - // Keep other kernel around - platform->kernels.add_kernel( - kernelName, filename, kernelName, AxKernelInfo); - - if (!serial) { - if (elementType != HEXAHEDRA) { - kernelName = "ellipticPartialAx" + suffix; - } else { - if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) { - if (var_coeff || blockSolver) { - printf( - "ERROR: TRILINEAR form is not implemented for varibale coefficient and block solver yet \n"); - ABORT(EXIT_FAILURE); - } - kernelName = "ellipticPartialAxTrilinear" + suffix; - } else { - if (blockSolver) { - if (var_coeff) { - if (stressForm) - kernelName = "ellipticStressPartialAxVar" + suffix; - else - kernelName = "ellipticBlockPartialAxVar" + suffix + "_N" + - std::to_string(Nfields); - } else { - if (stressForm) - kernelName = "ellipticStessPartialAx" + suffix; - else - kernelName = "ellipticBlockPartialAx" + suffix + "_N" + - std::to_string(Nfields); - } - } else { - if (var_coeff) - kernelName = "ellipticPartialAxVar" + suffix; - else - kernelName = "ellipticPartialAx" + suffix; - } - } - } - platform->kernels.add_kernel( - kernelName, filename, kernelName, AxKernelInfo); - platform->kernels.add_kernel( - kernelName, filename, kernelName, AxKernelInfo); - } - - // combined PCG update and r.r kernel - if (serial) { - filename = oklpath + "ellipticSerialUpdatePCG.c"; - } else { - filename = oklpath + "ellipticUpdatePCG.okl"; - } - platform->kernels.add_kernel(sectionIdentifier + "ellipticBlockUpdatePCG", - filename, - "ellipticBlockUpdatePCG", - dfloatKernelInfo); + const bool buildNodeLocal = useNodeLocalCache(); + const bool buildOnly = platform->options.compareArgs("BUILD ONLY", "TRUE"); + auto communicator = buildNodeLocal ? platform->comm.mpiCommLocal : platform->comm.mpiComm; + oogs::compile( + platform->device.occaDevice(), platform->device.mode(), communicator, buildOnly); } - // projection - {} -} -void registerMeshKernels() { - int N, cubN; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); - const int Nq = N + 1; - const int cubNq = cubN + 1; - const int Np = Nq * Nq * Nq; - const int cubNp = cubNq * cubNq * cubNq; - - int nAB; - platform->options.getArgs("MESH INTEGRATION ORDER", nAB); + platform->kernels.compile(); - auto kernelInfo = populateMeshProperties(N); - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - std::string oklpath = install_dir + "/okl/"; + // load platform related kernels std::string kernelName; + kernelName = "copyDfloatToPfloat"; + platform->copyDfloatToPfloatKernel = platform->kernels.get(kernelName); - const std::string meshPrefix = "mesh-"; - { - std::string filename = oklpath + "mesh/velocityBCHex3D.okl"; - kernelName = "velocityDirichletBCHex3D"; - platform->kernels.add_kernel(meshPrefix + kernelName, filename, kernelName, kernelInfo); - occa::properties meshKernelInfo = kernelInfo; - meshKernelInfo["defines/p_cubNq"] = cubNq; - meshKernelInfo["defines/p_cubNp"] = cubNp; - - filename = oklpath + "mesh/geometricFactorsHex3D.okl"; - kernelName = "geometricFactorsHex3D"; - platform->kernels.add_kernel( - meshPrefix + kernelName, filename, kernelName, meshKernelInfo); - filename = oklpath + "mesh/surfaceGeometricFactorsHex3D.okl"; - kernelName = "surfaceGeometricFactorsHex3D"; - platform->kernels.add_kernel( - meshPrefix + kernelName, filename, kernelName, meshKernelInfo); - - meshKernelInfo = kernelInfo; - meshKernelInfo["defines/p_nAB"] = nAB; - filename = oklpath + "core/nStagesSum.okl"; - kernelName = "nStagesSumVector"; - platform->kernels.add_kernel( - meshPrefix + kernelName, filename, kernelName, meshKernelInfo); - } -} - -void registerLinAlgKernels() { - occa::properties kernelInfo = platform->kernelInfo; - - std::string oklDir; - oklDir.assign(getenv("NEKRS_INSTALL_DIR")); - oklDir += "/okl/linAlg/"; - std::string filename; - const bool serial = (platform->device.mode() == "Serial" || - platform->device.mode() == "OpenMP"); - - platform->kernels.add_kernel( - "fill", oklDir + "linAlgFill.okl", "fill", kernelInfo); - platform->kernels.add_kernel( - "vabs", oklDir + "linAlgAbs.okl", "vabs", kernelInfo); - platform->kernels.add_kernel( - "add", oklDir + "linAlgAdd.okl", "add", kernelInfo); - platform->kernels.add_kernel( - "scale", oklDir + "linAlgScale.okl", "scale", kernelInfo); - platform->kernels.add_kernel( - "scaleMany", oklDir + "linAlgScale.okl", "scaleMany", kernelInfo); - filename = std::string("linAlgAXPBY") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel("axpby", oklDir + filename, "axpby", kernelInfo); - filename = std::string("linAlgAXPBY") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "axpbyMany", oklDir + filename, "axpbyMany", kernelInfo); - platform->kernels.add_kernel( - "axpbyz", oklDir + "linAlgAXPBY.okl", "axpbyz", kernelInfo); - platform->kernels.add_kernel( - "axpbyzMany", oklDir + "linAlgAXPBY.okl", "axpbyzMany", kernelInfo); - filename = std::string("linAlgAXMY") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel("axmy", oklDir + filename, "axmy", kernelInfo); - filename = std::string("linAlgAXMY") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "axmyMany", oklDir + filename, "axmyMany", kernelInfo); - filename = std::string("linAlgAXMY") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "axmyVector", oklDir + filename, "axmyVector", kernelInfo); - platform->kernels.add_kernel( - "axmyz", oklDir + "linAlgAXMY.okl", "axmyz", kernelInfo); - platform->kernels.add_kernel( - "axmyzMany", oklDir + "linAlgAXMY.okl", "axmyzMany", kernelInfo); - platform->kernels.add_kernel( - "ady", oklDir + "linAlgAXDY.okl", "ady", kernelInfo); - platform->kernels.add_kernel( - "adyMany", oklDir + "linAlgAXDY.okl", "adyMany", kernelInfo); - platform->kernels.add_kernel( - "axdy", oklDir + "linAlgAXDY.okl", "axdy", kernelInfo); - platform->kernels.add_kernel( - "aydx", oklDir + "linAlgAXDY.okl", "aydx", kernelInfo); - platform->kernels.add_kernel( - "aydxMany", oklDir + "linAlgAXDY.okl", "aydxMany", kernelInfo); - platform->kernels.add_kernel( - "axdyz", oklDir + "linAlgAXDY.okl", "axdyz", kernelInfo); - platform->kernels.add_kernel( - "sum", oklDir + "linAlgSum.okl", "sum", kernelInfo); - platform->kernels.add_kernel( - "sumMany", oklDir + "linAlgSum.okl", "sumMany", kernelInfo); - platform->kernels.add_kernel( - "min", oklDir + "linAlgMin.okl", "min", kernelInfo); - platform->kernels.add_kernel( - "max", oklDir + "linAlgMax.okl", "max", kernelInfo); - filename = std::string("linAlgNorm2") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "norm2", oklDir + filename, "norm2", kernelInfo); - platform->kernels.add_kernel( - "norm2Many", oklDir + filename, "norm2Many", kernelInfo); - filename = std::string("linAlgNorm1") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "norm1", oklDir + filename, "norm1", kernelInfo); - platform->kernels.add_kernel( - "norm1Many", oklDir + filename, "norm1Many", kernelInfo); - filename = std::string("linAlgWeightedNorm1") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "weightedNorm1", oklDir + filename, "weightedNorm1", kernelInfo); - platform->kernels.add_kernel( - "weightedNorm1Many", oklDir + filename, "weightedNorm1Many", kernelInfo); - filename = std::string("linAlgWeightedNorm2") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "weightedNorm2", oklDir + filename, "weightedNorm2", kernelInfo); - filename = std::string("linAlgWeightedNorm2") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "weightedNorm2Many", oklDir + filename, "weightedNorm2Many", kernelInfo); - platform->kernels.add_kernel( - "innerProd", oklDir + "linAlgInnerProd.okl", "innerProd", kernelInfo); - filename = std::string("linAlgWeightedInnerProd") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel( - "weightedInnerProd", oklDir + filename, "weightedInnerProd", kernelInfo); - filename = std::string("linAlgWeightedInnerProd") + - (serial ? std::string(".c") : std::string(".okl")); - platform->kernels.add_kernel("weightedInnerProdMany", - oklDir + filename, - "weightedInnerProdMany", - kernelInfo); - platform->kernels.add_kernel("weightedInnerProdMulti", - oklDir + "linAlgWeightedInnerProd.okl", - "weightedInnerProdMulti", - kernelInfo); -} -void compileUDFKernels() -{ - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); - - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - occa::properties kernelInfo = platform->kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - kernelInfo["include_paths"].asArray(); - - auto rank = buildNodeLocal ? platform->comm.localRank : platform->comm.mpiRank; - auto communicator = buildNodeLocal ? platform->comm.mpiCommLocal : platform->comm.mpiComm; + kernelName = "copyPfloatToDfloat"; + platform->copyPfloatToDfloatKernel = platform->kernels.get(kernelName); MPI_Barrier(platform->comm.mpiComm); - const double tStart = MPI_Wtime(); - if (platform->comm.mpiRank == 0) - printf("loading udf kernels ... "); - fflush(stdout); + const double loadTime = MPI_Wtime() - tStart; - for(int pass = 0; pass < 2; ++pass) - { - bool executePass = (pass == 0) && (rank == 0); - executePass |= (pass == 1) && (rank != 0); - if(executePass){ - kernelInfoBC = kernelInfo; - if (udf.loadKernels) { - // side-effect: kernelInfoBC will include any relevant user-defined kernel - // info - udf.loadKernels(kernelInfoBC); - } - const std::string bcDataFile = install_dir + "/include/core/bcData.h"; - kernelInfoBC["includes"] += bcDataFile.c_str(); - std::string boundaryHeaderFileName; - platform->options.getArgs("DATA FILE", boundaryHeaderFileName); - kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL); - kernelInfoBC += populateMeshProperties(N); - } - MPI_Barrier(communicator); + fflush(stdout); + if (platform->comm.mpiRank == 0) { + std::ofstream ofs; + ofs.open(occa::env::OCCA_CACHE_DIR + "cache/compile.timestamp", + std::ofstream::out | std::ofstream::trunc); + ofs.close(); } - - MPI_Barrier(platform->comm.mpiComm); - const double loadTime = MPI_Wtime() - tStart; + + platform->timer.set("loadKernels", loadTime); if (platform->comm.mpiRank == 0) printf("done (%gs)\n\n", loadTime); fflush(stdout); } -void compileDummyKernel() -{ - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); - auto rank = buildNodeLocal ? platform->comm.localRank : platform->comm.mpiRank; - const std::string dummyKernelName = "myDummyKernelName"; - const std::string dummyKernelStr = std::string( - "@kernel void myDummyKernelName(int N) {" - " for (int i = 0; i < N; ++i; @tile(64, @outer, @inner)) {}" - "}" - ); - - if(rank == 0){ - platform->device.buildKernelFromString( - dummyKernelStr, - dummyKernelName, - platform->kernelInfo - ); - } - -} -} // namespace - -void compileKernels() { - - compileDummyKernel(); // trigger occa's compilerVendorTest - - compileUDFKernels(); - - registerLinAlgKernels(); - - registerMeshKernels(); - - registerNrsKernels(); - - { - int Nscalars; - platform->options.getArgs("NUMBER OF SCALARS", Nscalars); - if (Nscalars) { - registerCdsKernels(); - } - } - - { - const std::vector sections = { - "pressure", - "velocity", - }; - for (auto &§ion : sections) { - registerEllipticKernels(section); - registerEllipticPreconditionerKernels(section); - } - } - - { - MPI_Barrier(platform->comm.mpiComm); - const double tStart = MPI_Wtime(); - if (platform->comm.mpiRank == 0) - printf("loading kernels ... "); - fflush(stdout); - - { - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); - const bool buildOnly = platform->options.compareArgs("BUILD ONLY", "TRUE"); - auto communicator = buildNodeLocal ? platform->comm.mpiCommLocal : platform->comm.mpiComm; - oogs::compile( - platform->device, platform->device.mode(), communicator, buildOnly); - } - - platform->kernels.compile(); - - MPI_Barrier(platform->comm.mpiComm); - const double loadTime = MPI_Wtime() - tStart; - - - fflush(stdout); - if (platform->comm.mpiRank == 0) { - std::ofstream ofs; - ofs.open(occa::env::OCCA_CACHE_DIR + "cache/compile.timestamp", - std::ofstream::out | std::ofstream::trunc); - ofs.close(); - } - - platform->timer.set("loadKernels", loadTime); - if (platform->comm.mpiRank == 0) - printf("done (%gs)\n\n", loadTime); - fflush(stdout); - } -} diff --git a/src/core/compileKernels.hpp b/src/core/compileKernels.hpp new file mode 100644 index 000000000..39702a6e9 --- /dev/null +++ b/src/core/compileKernels.hpp @@ -0,0 +1,16 @@ +#if !defined(compile_kernels_hpp_) +#define compile_kernels_hpp_ + +#include +#include + +occa::properties compileUDFKernels(); +void registerLinAlgKernels(); +void registerMeshKernels(occa::properties kernelInfoBC); +void registerNrsKernels(occa::properties kernelInfoBC); +void registerCdsKernels(occa::properties kernelInfoBC); +void registerEllipticKernels(std::string section, int poissonEquation); +void registerEllipticPreconditionerKernels(std::string section, int poissonEquation); + +std::string createOptionsPrefix(std::string section); +#endif \ No newline at end of file diff --git a/src/core/configReader.cpp b/src/core/configReader.cpp index 05b1fce86..6d2aad342 100644 --- a/src/core/configReader.cpp +++ b/src/core/configReader.cpp @@ -27,8 +27,8 @@ void configRead(MPI_Comm comm) std::cout << "\nERROR: The environment variable NEKRS_HOME is not defined!\n"; EXIT_AND_FINALIZE(1); } - std::string install_dir{nekrs_home}; - std::string configFile = install_dir + "/nekrs.conf"; + std::string installDir{nekrs_home}; + std::string configFile = installDir + "/nekrs.conf"; const char* ptr = realpath(configFile.c_str(), NULL); if (!ptr) { @@ -101,7 +101,7 @@ void configRead(MPI_Comm comm) ini.extract("general", "nekrs_gpu_mpi", buf); if(!getenv("NEKRS_GPU_MPI")) setenv("NEKRS_GPU_MPI", buf.c_str(), 1); - buf = install_dir; + buf = installDir; setenv("NEKRS_INSTALL_DIR", buf.c_str(), 1); ini.extract("general", "occa_mode_default", buf); diff --git a/src/core/device.cpp b/src/core/device.cpp index 41bec0d71..99f72e926 100644 --- a/src/core/device.cpp +++ b/src/core/device.cpp @@ -1,32 +1,92 @@ #include "device.hpp" #include "platform.hpp" #include +#include occa::kernel -device_t::buildNativeKernel(const std::string &filename, +device_t::buildNativeKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props) const { occa::properties nativeProperties = props; nativeProperties["okl/enabled"] = false; - if(platform->options.compareArgs("BUILD ONLY", "TRUE")) + if(_verbose) nativeProperties["verbose"] = true; - if(platform->device.mode() == "OpenMP") + if(this->mode() == "OpenMP") nativeProperties["defines/__NEKRS__OMP__"] = 1; - return occa::device::buildKernel(filename, kernelName, nativeProperties); + return _device.buildKernel(fileName, kernelName, nativeProperties); } + occa::kernel -device_t::buildKernel(const std::string &filename, - const std::string &kernelName, - const occa::properties &props, - std::string suffix) const +device_t::buildKernel(const std::string &fullPath, + const occa::properties &props) const +{ + const std::string noSuffix = std::string(""); + return this->buildKernel(fullPath, props, noSuffix); +} + +occa::kernel +device_t::buildKernel(const std::string &fullPath, + const occa::properties &props, + const std::string & suffix) const +{ + const std::string fileName = fullPath; + std::string kernelName; + std::regex kernelNameRegex(R"((.+)\/(.+)\.)"); + std::smatch kernelNameMatch; + const bool foundKernelName = std::regex_search(fullPath, kernelNameMatch, kernelNameRegex); + + // e.g. /path/to/install/nekrs/okl/cds/advectMeshVelocityHex3D.okl + + // Full string + // 0: /path/to/install/nekrs/okl/cds/advectMeshVelocityHex3D.okl + + // First capture group + // 1: /path/to/install/nekrs/okl/cds + + // Second capture group (kernel name) + // 2: advectMeshVelocityHex3D.okl + if(foundKernelName){ + if(kernelNameMatch.size() == 3){ + kernelName = kernelNameMatch[2].str(); + } + } + + return this->buildKernel(fileName, kernelName, props, suffix); +} + +occa::kernel +device_t::buildKernel(const std::string &fileName, + const std::string &kernelName, + const occa::properties &props, + const std::string& suffix) const { - if(filename.find(".okl") != std::string::npos){ + + if(fileName.find(".okl") != std::string::npos){ occa::properties propsWithSuffix = props; propsWithSuffix["kernelNameSuffix"] = suffix; - if(platform->options.compareArgs("BUILD ONLY", "TRUE")) + if(_verbose) propsWithSuffix["verbose"] = true; - return occa::device::buildKernel(filename, kernelName, propsWithSuffix); + + if (this->mode() == "CUDA") + propsWithSuffix["defines/smXX"] = 1; + if (this->mode() == "HIP") + propsWithSuffix["defines/gfxXX"] = 1; + + const std::string floatingPointType = static_cast(propsWithSuffix["defines/dfloat"]); + + if (floatingPointType.find("float") != std::string::npos) { + propsWithSuffix["defines/FP32"] = 1; + } + + // if p_knl is defined, add _v(p_knl) to the kernel name + std::string newKernelName = kernelName; + if (props.has("defines/p_knl")) { + const int kernelVariant = static_cast(props["defines/p_knl"]); + newKernelName += "_v" + std::to_string(kernelVariant); + }; + + return _device.buildKernel(fileName, newKernelName, propsWithSuffix); } else{ occa::properties propsWithSuffix = props; @@ -35,77 +95,129 @@ device_t::buildKernel(const std::string &filename, propsWithSuffix["defines/TOKEN_PASTE(a,b)"] = std::string("TOKEN_PASTE_(a,b)"); propsWithSuffix["defines/FUNC(a)"] = std::string("TOKEN_PASTE(a,SUFFIX)"); const std::string alteredName = kernelName + suffix; - return this->buildNativeKernel(filename, alteredName, propsWithSuffix); + return this->buildNativeKernel(fileName, alteredName, propsWithSuffix); + } +} + +occa::kernel +device_t::buildKernel(const std::string &fileName, + const std::string &kernelName, + const occa::properties &props) const +{ + + const std::string suffix(""); + const bool buildNodeLocal = useNodeLocalCache(); + const int rank = buildNodeLocal ? _comm.localRank : _comm.mpiRank; + MPI_Comm localCommunicator = buildNodeLocal ? _comm.mpiCommLocal : _comm.mpiComm; + + occa::kernel constructedKernel; + for(int pass = 0; pass < 2; ++pass){ + if((pass == 0 && rank == 0) || (pass == 1 && rank != 0)){ + constructedKernel = this->buildKernel(fileName, kernelName, props, suffix); + } + MPI_Barrier(localCommunicator); } + return constructedKernel; + +} + +occa::kernel +device_t::buildKernel(const std::string &fullPath, + const occa::properties &props, + const std::string & suffix, + bool buildRank0) const +{ + + if(buildRank0){ + + const bool buildNodeLocal = useNodeLocalCache(); + const int rank = buildNodeLocal ? _comm.localRank : _comm.mpiRank; + MPI_Comm localCommunicator = buildNodeLocal ? _comm.mpiCommLocal : _comm.mpiComm; + occa::kernel constructedKernel; + for(int pass = 0; pass < 2; ++pass){ + if((pass == 0 && rank == 0) || (pass == 1 && rank != 0)){ + constructedKernel = this->buildKernel(fullPath, props, suffix); + } + MPI_Barrier(localCommunicator); + } + return constructedKernel; + + } + + return this->buildKernel(fullPath, props, suffix); + +} + +occa::kernel +device_t::buildKernel(const std::string &fullPath, + const occa::properties &props, + bool buildRank0) const +{ + std::string noSuffix = std::string(""); + return this->buildKernel(fullPath, props, noSuffix, buildRank0); } + occa::memory -device_t::mallocHost(const hlong Nbytes) +device_t::mallocHost(const size_t Nbytes) { occa::properties props; props["host"] = true; void* buffer = std::calloc(Nbytes, 1); - occa::memory h_scratch = occa::device::malloc(Nbytes, buffer, props); + occa::memory h_scratch = _device.malloc(Nbytes, buffer, props); std::free(buffer); return h_scratch; } + occa::memory -device_t::malloc(const hlong Nbytes, const occa::properties& properties) +device_t::malloc(const size_t Nbytes, const occa::properties& properties) { void* buffer = std::calloc(Nbytes, 1); - occa::memory o_returnValue = occa::device::malloc(Nbytes, buffer, properties); + occa::memory o_returnValue = _device.malloc(Nbytes, buffer, properties); std::free(buffer); return o_returnValue; } + occa::memory -device_t::malloc(const hlong Nbytes, const void* src, const occa::properties& properties) +device_t::malloc(const size_t Nbytes, const void* src, const occa::properties& properties) { void* buffer; - if(!src){ - buffer = std::calloc(Nbytes, 1); - } + buffer = std::calloc(Nbytes, 1); const void* init_ptr = (src) ? src : buffer; - occa::memory o_returnValue = occa::device::malloc(Nbytes, init_ptr, properties); - if(!src){ - std::free(buffer); - } + occa::memory o_returnValue = _device.malloc(Nbytes, init_ptr, properties); + std::free(buffer); return o_returnValue; } + occa::memory device_t::malloc(const hlong Nword , const dlong wordSize, occa::memory src) { - return occa::device::malloc(Nword * wordSize, src); + return _device.malloc(Nword * wordSize, src); } + occa::memory device_t::malloc(const hlong Nword , const dlong wordSize) { - const hlong Nbytes = Nword * wordSize; + const size_t Nbytes = Nword * wordSize; void* buffer = std::calloc(Nword, wordSize); - occa::memory o_returnValue = occa::device::malloc(Nword * wordSize, buffer); + occa::memory o_returnValue = _device.malloc(Nword * wordSize, buffer); std::free(buffer); return o_returnValue; } -device_t::device_t(setupAide& options, MPI_Comm comm) + +device_t::device_t(setupAide& options, comm_t& comm) +:_comm(comm) { + _verbose = options.compareArgs("BUILD ONLY", "TRUE"); + // OCCA build stuff char deviceConfig[BUFSIZ]; - int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + int worldRank = _comm.mpiRank; int device_id = 0; if(options.compareArgs("DEVICE NUMBER", "LOCAL-RANK")) { - long int hostId = gethostid(); - - long int* hostIds = (long int*) std::calloc(size,sizeof(long int)); - MPI_Allgather(&hostId,1,MPI_LONG,hostIds,1,MPI_LONG,comm); - - int totalDevices = 0; - for (int r = 0; r < rank; r++) - if (hostIds[r] == hostId) device_id++; - for (int r = 0; r < size; r++) - if (hostIds[r] == hostId) totalDevices++; + device_id = _comm.localRank; } else { options.getArgs("DEVICE NUMBER",device_id); } @@ -123,7 +235,7 @@ device_t::device_t(setupAide& options, MPI_Comm comm) options.getArgs("PLATFORM NUMBER", plat); sprintf(deviceConfig, "{mode: 'OpenCL', device_id: %d, platform_id: %d}", device_id, plat); }else if(strcasecmp(requestedOccaMode.c_str(), "OPENMP") == 0) { - if(rank == 0) printf("OpenMP backend currently not supported!\n"); + if(worldRank == 0) printf("OpenMP backend currently not supported!\n"); ABORT(EXIT_FAILURE); sprintf(deviceConfig, "{mode: 'OpenMP'}"); }else if(strcasecmp(requestedOccaMode.c_str(), "CPU") == 0 || @@ -132,19 +244,18 @@ device_t::device_t(setupAide& options, MPI_Comm comm) options.setArgs("THREAD MODEL", "SERIAL"); options.getArgs("THREAD MODEL", requestedOccaMode); } else { - if(rank == 0) printf("Invalid requested backend!\n"); + if(worldRank == 0) printf("Invalid requested backend!\n"); ABORT(EXIT_FAILURE); } - if(rank == 0) printf("Initializing device \n"); - this->setup((std::string)deviceConfig); - this->comm = comm; + if(worldRank == 0) printf("Initializing device \n"); + this->_device.setup((std::string)deviceConfig); - if(rank == 0) + if(worldRank == 0) std::cout << "active occa mode: " << this->mode() << "\n\n"; if(strcasecmp(requestedOccaMode.c_str(), this->mode().c_str()) != 0) { - if(rank == 0) printf("active occa mode does not match selected backend!\n"); + if(worldRank == 0) printf("active occa mode does not match selected backend!\n"); ABORT(EXIT_FAILURE); } @@ -159,4 +270,6 @@ device_t::device_t(setupAide& options, MPI_Comm comm) } _device_id = device_id; + + deviceAtomic = this->mode() == "CUDA"; } \ No newline at end of file diff --git a/src/core/device.hpp b/src/core/device.hpp index c8fddc207..f4d6602f5 100644 --- a/src/core/device.hpp +++ b/src/core/device.hpp @@ -6,27 +6,58 @@ #include "nrssys.hpp" class setupAide; +class comm_t; -class device_t : public occa::device{ +class device_t { public: - device_t(setupAide& options, MPI_Comm comm); - MPI_Comm comm; - occa::memory malloc(const hlong Nbytes, const void* src = nullptr, const occa::properties& properties = occa::properties()); - occa::memory malloc(const hlong Nbytes, const occa::properties& properties); + device_t(setupAide& options, comm_t& comm); + occa::memory malloc(const size_t Nbytes, const void* src = nullptr, const occa::properties& properties = occa::properties()); + occa::memory malloc(const size_t Nbytes, const occa::properties& properties); occa::memory malloc(const hlong Nwords, const dlong wordSize, occa::memory src); occa::memory malloc(const hlong Nwords, const dlong wordSize); - occa::memory mallocHost(const hlong Nbytes); + occa::memory mallocHost(const size_t Nbytes); int id() const { return _device_id; } - occa::kernel buildNativeKernel(const std::string &filename, + const occa::device& occaDevice() const { return _device; } + std::string mode() const { return _device.mode(); } + occa::device& occaDevice() { return _device; } + void finish() { _device.finish(); } + + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props, + const std::string& suffix, + bool buildRank0) const; + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props, + bool buildRank0) const; + + // collective + occa::kernel buildKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props) const; - occa::kernel buildKernel(const std::string &filename, + + bool deviceAtomic; + + private: + + // non-collective + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props) const; + occa::kernel buildKernel(const std::string &fullPath, + const occa::properties &props, + const std::string& suffix) const; + occa::kernel buildKernel(const std::string &fileName, const std::string &kernelName, const occa::properties &props, - std::string suffix = std::string()) const; - private: + const std::string& suffix) const; + + occa::kernel buildNativeKernel(const std::string &fileName, + const std::string &kernelName, + const occa::properties &props) const; + comm_t& _comm; + occa::device _device; int _device_id; + bool _verbose; }; #endif diff --git a/src/core/flopCounter.cpp b/src/core/flopCounter.cpp new file mode 100644 index 000000000..5e2581c30 --- /dev/null +++ b/src/core/flopCounter.cpp @@ -0,0 +1,70 @@ +#include +#include "flopCounter.hpp" +#include "platform.hpp" +#include + +void flopCounter_t::add(const std::string &entry, dfloat work) +{ + if (!flopMap.count(entry)) { + flopMap[entry] = 0.0; + } + flopMap[entry] += work; +} + +dfloat flopCounter_t::get(const std::string &entry, MPI_Comm comm) const +{ + dfloat total = flopMap.at(entry); + if (comm != MPI_COMM_NULL) { + MPI_Allreduce(MPI_IN_PLACE, &total, 1, MPI_DFLOAT, MPI_SUM, comm); + } + return total; +} + +dfloat flopCounter_t::get(MPI_Comm comm) const +{ + dfloat err = 0; + dfloat total = 0.0; + for (auto const &entry : flopMap) { + if (entry.second < 0.0) + err += 1; + total += entry.second; + } + + std::array errAndTotal = {err, total}; + if (comm != MPI_COMM_NULL) { + MPI_Allreduce(MPI_IN_PLACE, errAndTotal.data(), 2, MPI_DFLOAT, MPI_SUM, comm); + } + + err = errAndTotal[0]; + total = errAndTotal[1]; + + if (err > 0.0) { + int rank = 0; + if (comm != MPI_COMM_NULL) { + MPI_Comm_rank(comm, &rank); + } + + if (rank == 0) { + std::cout << "Encountered error in flopCounter_t::get" << std::endl; + } + ABORT(1) + } + + return total; +} + +void flopCounter_t::clear() { flopMap.clear(); } + +std::vector flopCounter_t::entries(MPI_Comm comm) const +{ + std::vector loggedCategory; + for (auto const &entry : flopMap) { + loggedCategory.push_back(entry.first); + } + + // sort by flops (largest first) + std::sort(loggedCategory.begin(), loggedCategory.end(), [&](const std::string &a, const std::string &b) { + return get(a, comm) > get(b, comm); + }); + return loggedCategory; +} \ No newline at end of file diff --git a/src/core/flopCounter.hpp b/src/core/flopCounter.hpp new file mode 100644 index 000000000..28bd52a46 --- /dev/null +++ b/src/core/flopCounter.hpp @@ -0,0 +1,26 @@ +#if !defined(nekrs_counter_hpp_) +#define nekrs_counter_hpp_ +#include "nrssys.hpp" +#include +#include +class flopCounter_t { +public: + // Not collective + void clear(); + + // Not collective + void add(const std::string &entry, dfloat work); + + // Note: must be called collectively + dfloat get(const std::string &entry, MPI_Comm comm) const; + + // Note: must be called collectively + std::vector entries(MPI_Comm comm) const; + + // Note: must be called collectively + dfloat get(MPI_Comm comm) const; + +private: + std::map flopMap; +}; +#endif \ No newline at end of file diff --git a/src/core/kernelRequestManager.cpp b/src/core/kernelRequestManager.cpp new file mode 100644 index 000000000..4e906d5ba --- /dev/null +++ b/src/core/kernelRequestManager.cpp @@ -0,0 +1,147 @@ +#include +#include +kernelRequestManager_t::kernelRequestManager_t(const platform_t& m_platform) +: kernelsProcessed(false), + platformRef(m_platform) +{} + +void +kernelRequestManager_t::add(const std::string& m_requestName, + const std::string& m_fileName, + const occa::properties& m_props, + std::string m_suffix, + bool checkUnique) +{ + this->add(kernelRequest_t{m_requestName, m_fileName, m_props, m_suffix}, checkUnique); +} +void +kernelRequestManager_t::add(kernelRequest_t request, bool checkUnique) +{ + auto iterAndBoolPair = kernels.insert(request); + if(checkUnique) + { + int unique = (iterAndBoolPair.second) ? 1 : 0; + MPI_Allreduce(MPI_IN_PLACE, &unique, 1, MPI_INT, MPI_MIN, platformRef.comm.mpiComm); + if(!unique){ + if(platformRef.comm.mpiRank == 0) + { + std::cout << "Error in kernelRequestManager_t::add\n"; + std::cout << "Request details:\n"; + std::cout << request.to_string(); + } + ABORT(1); + } + } + + const std::string fileName = request.fileName; + fileNameToRequestMap[fileName].insert(request); +} +occa::kernel +kernelRequestManager_t::get(const std::string& request, bool checkValid) const +{ + if(checkValid){ + bool issueError = 0; + issueError = !processed(); + issueError = (requestToKernelMap.count(request) == 0); + + int errorFlag = issueError ? 1 : 0; + MPI_Allreduce(MPI_IN_PLACE, &errorFlag, 1, MPI_INT, MPI_MAX, platformRef.comm.mpiComm); + + if(errorFlag){ + if(platformRef.comm.mpiRank == 0) + { + std::cout << "\n"; + std::cout << "Error in kernelRequestManager_t::getKernel():\n"; + std::cout << "Cannot find requested kernel " << request << "!\n"; + + std::cout << "Available:\n"; + for(auto&& keyAndValue : requestToKernelMap) + { + std::cout << "\t" << keyAndValue.first << "\n"; + } + std::cout << "===========================================================\n"; + } + ABORT(1); + } + } + + return requestToKernelMap.at(request); +} + + + +void +kernelRequestManager_t::compile() +{ + + if(kernelsProcessed) return; + kernelsProcessed = true; + + constexpr int maxCompilingRanks {100}; + + const bool buildNodeLocal = useNodeLocalCache(); + + const int rank = buildNodeLocal ? platformRef.comm.localRank : platformRef.comm.mpiRank; + const int ranksCompiling = + std::min( + maxCompilingRanks, + buildNodeLocal ? + platformRef.comm.mpiCommLocalSize : + platformRef.comm.mpiCommSize + ); + + std::vector kernelFiles(fileNameToRequestMap.size()); + + unsigned ctr = 0; + for(auto&& fileNameAndRequests : fileNameToRequestMap) + { + kernelFiles[ctr] = fileNameAndRequests.first; + ctr++; + } + + const auto& device = platformRef.device; + auto& requestToKernel = requestToKernelMap; + auto& fileNameToRequest = fileNameToRequestMap; + auto compileKernels = [&kernelFiles, &requestToKernel, &fileNameToRequest, &device, rank, ranksCompiling](){ + if(rank >= ranksCompiling) return; + const unsigned nFiles = kernelFiles.size(); + for(unsigned fileId = 0; fileId < nFiles; ++fileId) + { + if(fileId % ranksCompiling == rank){ + const std::string fileName = kernelFiles[fileId]; + for(auto && kernelRequest : fileNameToRequest[fileName]){ + const std::string requestName = kernelRequest.requestName; + const std::string fileName = kernelRequest.fileName; + const std::string suffix = kernelRequest.suffix; + const occa::properties props = kernelRequest.props; + + // MPI staging already handled + auto kernel = device.buildKernel(fileName, props, suffix, false); + requestToKernel[requestName] = kernel; + } + } + } + }; + + const auto& kernelRequests = this->kernels; + auto loadKernels = [&requestToKernel, &kernelRequests,&device](){ + for(auto&& kernelRequest : kernelRequests) + { + const std::string requestName = kernelRequest.requestName; + if(requestToKernel.count(requestName) == 0){ + const std::string fileName = kernelRequest.fileName; + const std::string suffix = kernelRequest.suffix; + const occa::properties props = kernelRequest.props; + + // MPI staging already handled + auto kernel = device.buildKernel(fileName, props, suffix, false); + requestToKernel[requestName] = kernel; + } + } + }; + + MPI_Barrier(platform->comm.mpiComm); + compileKernels(); + MPI_Barrier(platform->comm.mpiComm); + loadKernels(); +} \ No newline at end of file diff --git a/src/core/kernelRequestManager.hpp b/src/core/kernelRequestManager.hpp new file mode 100644 index 000000000..30b7853c0 --- /dev/null +++ b/src/core/kernelRequestManager.hpp @@ -0,0 +1,83 @@ +#ifndef kernelRequestManager_hpp_ +#define kernelRequestManager_hpp_ +#include +#include +#include +#include +#include +#include + +class platform_t; + +class kernelRequestManager_t +{ + + struct kernelRequest_t + { + inline bool operator==(const kernelRequest_t& other) const + { + return requestName == other.requestName; + } + inline bool operator<(const kernelRequest_t& other) const + { + return requestName < other.requestName; + } + inline bool operator> (const kernelRequest_t& other) const { return *this < other; } + inline bool operator<=(const kernelRequest_t& other) const { return !(*this > other); } + inline bool operator>=(const kernelRequest_t& other) const { return !(*this < other); } + inline bool operator!=(const kernelRequest_t& other) const { return !(*this == other); } + + kernelRequest_t(const std::string& m_requestName, + const std::string& m_fileName, + const occa::properties& m_props, + std::string m_suffix = std::string()) + : + requestName(m_requestName), + fileName(m_fileName), + suffix(m_suffix), + props(m_props) + {} + std::string requestName; + std::string fileName; + std::string suffix; + occa::properties props; + + std::string to_string() const { + std::ostringstream ss; + ss << "requestName : " << requestName << "\n"; + ss << "fileName : " << fileName << "\n"; + ss << "suffix : " << suffix << "\n"; + ss << "props : " << props << "\n";; + return ss.str(); + } + }; +public: + kernelRequestManager_t(const platform_t& m_platform); + void add(const std::string& m_requestName, + const std::string& m_fileName, + const occa::properties& m_props, + std::string m_suffix = std::string(), + bool assertUnique = false); + void add(const std::string& requestName, occa::kernel kernel){ + requestToKernelMap[requestName] = kernel; + } + + void compile(); + + occa::kernel + get(const std::string& request, bool checkValid = true) const; + + bool + processed() const { return kernelsProcessed; } + +private: + const platform_t& platformRef; + bool kernelsProcessed; + std::set kernels; + std::map requestToKernelMap; + std::map> fileNameToRequestMap; + + void add(kernelRequest_t request, bool assertUnique = true); + +}; +#endif /** kernelRequestManager_hpp_ **/ \ No newline at end of file diff --git a/src/core/kernelTuner.hpp b/src/core/kernelTuner.hpp new file mode 100644 index 000000000..755b1f2af --- /dev/null +++ b/src/core/kernelTuner.hpp @@ -0,0 +1,8 @@ +#include "occa.hpp" +#include +#include + +std::pair tuneKernel(std::function kernelBuilder, + std::function kernelRunner, + occa::properties baseProps, + int NKernels); \ No newline at end of file diff --git a/src/core/nrssys.hpp b/src/core/nrssys.hpp index af74ec901..a90ba1094 100644 --- a/src/core/nrssys.hpp +++ b/src/core/nrssys.hpp @@ -2,6 +2,7 @@ #define nekrs_nrssys_hpp_ #define BLOCKSIZE 256 +#define ALIGN_SIZE 4096 //float data type #if 0 @@ -71,6 +72,8 @@ static occa::memory o_NULL; struct platform_t; extern platform_t* platform; +bool useNodeLocalCache(); + #define EXIT_AND_FINALIZE(a) { fflush(stdout); MPI_Finalize(); exit(a); } #define ABORT(a) { fflush(stdout); MPI_Abort(MPI_COMM_WORLD, a); } diff --git a/src/core/numberActiveFields.cpp b/src/core/numberActiveFields.cpp new file mode 100644 index 000000000..7e34d4a96 --- /dev/null +++ b/src/core/numberActiveFields.cpp @@ -0,0 +1,11 @@ +#include + +int numberActiveFields(nrs_t* nrs) +{ + int fields = 0; + if(!platform->options.compareArgs("VELOCITY SOLVER", "NONE")) fields++; + for(int is = 0; is < nrs->Nscalar; ++is){ + if(nrs->cds->compute[is]) fields++; + } + return fields; +} \ No newline at end of file diff --git a/src/core/platform.cpp b/src/core/platform.cpp index 219575a26..2a1efed41 100644 --- a/src/core/platform.cpp +++ b/src/core/platform.cpp @@ -5,9 +5,37 @@ #include "linAlg.hpp" #include "omp.h" #include +#include "flopCounter.hpp" -comm_t::comm_t(MPI_Comm _comm) +namespace{ + +static void compileDummyKernel(const platform_t& plat) { + const bool buildNodeLocal = useNodeLocalCache(); + auto rank = buildNodeLocal ? plat.comm.localRank : plat.comm.mpiRank; + const std::string dummyKernelName = "myDummyKernelName"; + const std::string dummyKernelStr = std::string( + "@kernel void myDummyKernelName(int N) {" + " for (int i = 0; i < N; ++i; @tile(64, @outer, @inner)) {}" + "}" + ); + + if(rank == 0){ + plat.device.occaDevice().buildKernelFromString( + dummyKernelStr, + dummyKernelName, + plat.kernelInfo + ); + } + +} + +} + +comm_t::comm_t(MPI_Comm _commg, MPI_Comm _comm) +{ + + mpiCommParent = _commg; mpiComm = _comm; MPI_Comm_rank(_comm, &mpiRank); MPI_Comm_size(_comm, &mpiCommSize); @@ -18,17 +46,22 @@ comm_t::comm_t(MPI_Comm _comm) } -deviceVector_t::deviceVector_t(const dlong _vectorSize, const dlong _nVectors, const dlong _wordSize, const std::string _vectorName) -: vectorSize(_vectorSize), +deviceVector_t::deviceVector_t(const size_t _offset, const size_t _nVectors, const size_t _wordSize, const std::string _vectorName) +: nVectors(_nVectors), wordSize(_wordSize), - vectorName(_vectorName) + vectorName(_vectorName), + offset(_offset) { - if(vectorSize <= 0 || nVectors <= 0 || wordSize <= 0) return; // bail - o_vector = platform->device.malloc(vectorSize * nVectors, wordSize); - // set slices - for(int s = 0 ; s < nVectors; ++s){ - slices.push_back(o_vector + s * vectorSize * wordSize); + if(offset <= 0 || nVectors <= 0 || wordSize <= 0) { + if(platform->comm.mpiRank == 0) + printf("ERROR: deviceVector_t invalid input!\n"); + ABORT(EXIT_FAILURE); + } + + o_vector = platform->device.malloc(nVectors * offset * wordSize); + for(int s = 0; s < nVectors; ++s){ + slices.push_back(o_vector + s * offset * wordSize); } } @@ -37,7 +70,7 @@ deviceVector_t::at(const int i) { if(i >= nVectors){ if(platform->comm.mpiRank == 0){ - printf("ERROR: deviceVector_t(%s) has %d size, but an attempt to access entry %i was made!\n", + printf("ERROR: deviceVector_t(%s) has %zu size, but an attempt to access entry %i was made!\n", vectorName.c_str(), nVectors, i @@ -46,20 +79,22 @@ deviceVector_t::at(const int i) ABORT(EXIT_FAILURE); return o_vector; } - occa::memory slice = o_vector + i * vectorSize * wordSize; return slices[i]; } platform_t* platform_t::singleton = nullptr; -platform_t::platform_t(setupAide& _options, MPI_Comm _comm) +platform_t::platform_t(setupAide& _options, MPI_Comm _commg, MPI_Comm _comm) : options(_options), - warpSize(32), // CUDA specific warp size - device(options, _comm), - timer(_comm, device, 0), - comm(_comm), + warpSize(32), + comm(_commg, _comm), + device(options, comm), + timer(_comm, device.occaDevice(), 0), kernels(*this) { + + flopCounter = std::make_unique(); + kernelInfo["defines/" "p_NVec"] = 3; kernelInfo["defines/" "p_blockSize"] = BLOCKSIZE; kernelInfo["defines/" "dfloat"] = dfloatString; @@ -68,12 +103,12 @@ platform_t::platform_t(setupAide& _options, MPI_Comm _comm) kernelInfo["defines/" "hlong"] = hlongString; if(device.mode() == "CUDA" && !getenv("OCCA_CUDA_COMPILER_FLAGS")) { + kernelInfo["compiler_flags"] += " -O3 "; kernelInfo["compiler_flags"] += "--ftz=true "; kernelInfo["compiler_flags"] += "--prec-div=false "; kernelInfo["compiler_flags"] += "--prec-sqrt=false "; kernelInfo["compiler_flags"] += "--use_fast_math "; kernelInfo["compiler_flags"] += "--fmad=true "; - //kernelInfo["compiler_flags"] += "-Xptxas -dlcm=ca"; } @@ -89,12 +124,30 @@ platform_t::platform_t(setupAide& _options, MPI_Comm _comm) } if(device.mode() == "HIP" && !getenv("OCCA_HIP_COMPILER_FLAGS")) { - warpSize = 64; + warpSize = 64; // can be arch specific kernelInfo["compiler_flags"] += " -O3 "; kernelInfo["compiler_flags"] += " -ffp-contract=fast "; kernelInfo["compiler_flags"] += " -funsafe-math-optimizations "; kernelInfo["compiler_flags"] += " -ffast-math "; } + + serial = device.mode() == "Serial" || + device.mode() == "OpenMP"; + + const std::string extension = serial ? ".c" : ".okl"; + + compileDummyKernel(*this); + + std::string installDir, kernelName, fileName; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/"; + kernelName = "copyDfloatToPfloat"; + fileName = installDir + "/okl/core/" + kernelName + extension; + this->kernels.add(kernelName, fileName, this->kernelInfo); + + kernelName = "copyPfloatToDfloat"; + fileName = installDir + "/okl/core/" + kernelName + extension; + this->kernels.add(kernelName, fileName, this->kernelInfo); } void memPool_t::allocate(const dlong offset, const dlong fields) { @@ -137,148 +190,3 @@ platform_t::create_mempool(const dlong offset, const dlong fields) mempool.allocate(offset, fields); o_mempool.allocate(mempool, offset, fields); } - -void -kernelRequestManager_t::add_kernel(const std::string& m_requestName, - const std::string& m_fileName, - const std::string& m_kernelName, - const occa::properties& m_props, - std::string m_suffix, - bool checkUnique) -{ - this->add_kernel(kernelRequest_t{m_requestName, m_fileName, m_kernelName, m_props, m_suffix}, checkUnique); -} -void -kernelRequestManager_t::add_kernel(kernelRequest_t request, bool checkUnique) -{ - auto iterAndBoolPair = kernels.insert(request); - if(checkUnique) - { - int unique = (iterAndBoolPair.second) ? 1 : 0; - MPI_Allreduce(MPI_IN_PLACE, &unique, 1, MPI_INT, MPI_MIN, platformRef.comm.mpiComm); - if(!unique){ - if(platformRef.comm.mpiRank == 0) - { - std::cout << "Error in kernelRequestManager_t::add_kernel\n"; - std::cout << "Request details:\n"; - std::cout << request.to_string(); - } - ABORT(1); - } - } - - const std::string fileName = request.fileName; - fileNameToRequestMap[fileName].insert(request); -} -occa::kernel -kernelRequestManager_t::getKernel(const std::string& request, bool checkValid) const -{ - if(checkValid){ - bool issueError = 0; - issueError = !processed(); - issueError = (requestToKernelMap.count(request) == 0); - - int errorFlag = issueError ? 1 : 0; - MPI_Allreduce(MPI_IN_PLACE, &errorFlag, 1, MPI_INT, MPI_MAX, platformRef.comm.mpiComm); - - if(errorFlag){ - if(platformRef.comm.mpiRank == 0) - { - std::cout << "\n"; - std::cout << "===========================================================\n"; - std::cout << "Error in kernelRequestManager_t::get. Failing now.\n"; - std::cout << "Requested kernel : " << request << "\n"; - - std::cout << "All entries:\n"; - for(auto&& keyAndValue : requestToKernelMap) - { - std::cout << "\t" << keyAndValue.first << "\n"; - } - std::cout << "===========================================================\n"; - } - ABORT(1); - } - } - - return requestToKernelMap.at(request); -} - - - -void -kernelRequestManager_t::compile() -{ - - if(kernelsProcessed) return; - kernelsProcessed = true; - - constexpr int maxCompilingRanks {100}; - - int buildNodeLocal = 0; - if(getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); - - const int rank = buildNodeLocal ? platformRef.comm.localRank : platformRef.comm.mpiRank; - const int ranksCompiling = - std::min( - maxCompilingRanks, - buildNodeLocal ? - platformRef.comm.mpiCommLocalSize : - platformRef.comm.mpiCommSize - ); - - std::vector kernelFiles(fileNameToRequestMap.size()); - - unsigned ctr = 0; - for(auto&& fileNameAndRequests : fileNameToRequestMap) - { - kernelFiles[ctr] = fileNameAndRequests.first; - ctr++; - } - - const auto& device = platformRef.device; - auto& requestToKernel = requestToKernelMap; - auto& fileNameToRequest = fileNameToRequestMap; - auto compileKernels = [&kernelFiles, &requestToKernel, &fileNameToRequest, &device, rank, ranksCompiling](){ - if(rank >= ranksCompiling) return; - const unsigned nFiles = kernelFiles.size(); - for(unsigned fileId = 0; fileId < nFiles; ++fileId) - { - if(fileId % ranksCompiling == rank){ - const std::string fileName = kernelFiles[fileId]; - for(auto && kernelRequest : fileNameToRequest[fileName]){ - const std::string requestName = kernelRequest.requestName; - const std::string fileName = kernelRequest.fileName; - const std::string kernelName = kernelRequest.kernelName; - const std::string suffix = kernelRequest.suffix; - const occa::properties props = kernelRequest.props; - auto kernel = device.buildKernel(fileName, kernelName, props, suffix); - requestToKernel[requestName] = kernel; - } - } - } - }; - - const auto& kernelRequests = this->kernels; - auto loadKernels = [&requestToKernel, &kernelRequests,&device](){ - for(auto&& kernelRequest : kernelRequests) - { - const std::string requestName = kernelRequest.requestName; - if(requestToKernel.count(requestName) == 0){ - const std::string fileName = kernelRequest.fileName; - const std::string kernelName = kernelRequest.kernelName; - const std::string suffix = kernelRequest.suffix; - const occa::properties props = kernelRequest.props; - auto kernel = device.buildKernel(fileName, kernelName, props, suffix); - requestToKernel[requestName] = kernel; - } - } - }; - - MPI_Barrier(platform->comm.mpiComm); - compileKernels(); - MPI_Barrier(platform->comm.mpiComm); - loadKernels(); -} - - diff --git a/src/core/platform.hpp b/src/core/platform.hpp index ae57801f9..b792621b6 100644 --- a/src/core/platform.hpp +++ b/src/core/platform.hpp @@ -1,18 +1,20 @@ #ifndef platform_hpp_ #define platform_hpp_ #include -#include #include +#include "flopCounter.hpp" #include "nrssys.hpp" #include "timer.hpp" #include "inipp.hpp" #include "device.hpp" +#include "kernelRequestManager.hpp" #include #include #include +#include class setupAide; class linAlg_t; -class kernelRequestManager_t; +class flopCounter_t; class deviceVector_t{ public: @@ -20,14 +22,14 @@ class deviceVector_t{ operator occa::memory&(){ return o_vector; } // allow implicit conversion between this and kernelArg (for passing to kernels) operator occa::kernelArg(){ return o_vector; } - deviceVector_t(const dlong _vectorSize, const dlong _nVectors, const dlong _wordSize, std::string _vectorName = ""); + deviceVector_t(const size_t _offset, const size_t _nVectors, const size_t _wordSize, std::string _vectorName = ""); occa::memory& at(const int); + const size_t offset; private: occa::memory o_vector; std::vector slices; - const dlong vectorSize; - const dlong nVectors; - const dlong wordSize; + const size_t nVectors; + const size_t wordSize; const std::string vectorName; }; @@ -64,88 +66,13 @@ struct deviceMemPool_t{ occa::memory slice18; occa::memory slice19; occa::memory o_ptr; - long long bytesAllocated; + size_t bytesAllocated; }; -class kernelRequestManager_t -{ - - struct kernelRequest_t - { - inline bool operator==(const kernelRequest_t& other) const - { - return requestName == other.requestName; - } - inline bool operator<(const kernelRequest_t& other) const - { - return requestName < other.requestName; - } - inline bool operator> (const kernelRequest_t& other) const { return *this < other; } - inline bool operator<=(const kernelRequest_t& other) const { return !(*this > other); } - inline bool operator>=(const kernelRequest_t& other) const { return !(*this < other); } - inline bool operator!=(const kernelRequest_t& other) const { return !(*this == other); } - - kernelRequest_t(const std::string& m_requestName, - const std::string& m_fileName, - const std::string& m_kernelName, - const occa::properties& m_props, - std::string m_suffix = std::string()) - : - requestName(m_requestName), - fileName(m_fileName), - kernelName(m_kernelName), - suffix(m_suffix), - props(m_props) - {} - std::string requestName; - std::string fileName; - std::string kernelName; - std::string suffix; - occa::properties props; - - std::string to_string() const { - std::ostringstream ss; - ss << "requestName : " << requestName << "\n"; - ss << "fileName : " << fileName << "\n"; - ss << "kernelName : " << kernelName << "\n"; - ss << "suffix : " << suffix << "\n"; - ss << "props : " << props << "\n";; - return ss.str(); - } - }; -public: - kernelRequestManager_t(const platform_t& m_platform) - : kernelsProcessed(false), - platformRef(m_platform) - {} - void add_kernel(const std::string& m_requestName, - const std::string& m_fileName, - const std::string& m_kernelName, - const occa::properties& m_props, - std::string m_suffix = std::string(), - bool assertUnique = false); - - void compile(); - - occa::kernel - getKernel(const std::string& request, bool checkValid = true) const; - - bool - processed() const { return kernelsProcessed; } - -private: - const platform_t& platformRef; - bool kernelsProcessed; - std::set kernels; - std::map requestToKernelMap; - std::map> fileNameToRequestMap; - - void add_kernel(kernelRequest_t request, bool assertUnique = true); - -}; struct comm_t{ - comm_t(MPI_Comm); + comm_t(MPI_Comm, MPI_Comm); + MPI_Comm mpiCommParent; MPI_Comm mpiComm; int mpiRank; int mpiCommSize; @@ -153,32 +80,49 @@ struct comm_t{ MPI_Comm mpiCommLocal; int mpiCommLocalSize; int localRank; + + std::string to_string() const { + std::ostringstream ss; + ss << "mpiRank = " << mpiRank << std::endl; + ss << "mpiCommSize = " << mpiCommSize << std::endl; + ss << "mpiCommLocalSize = " << mpiCommLocalSize << std::endl; + ss << "localRank = " << localRank << std::endl; + return ss.str(); + } }; struct platform_t{ - setupAide& options; - int warpSize; - device_t device; - occa::properties kernelInfo; - timer::timer_t timer; - comm_t comm; - linAlg_t* linAlg; - memPool_t mempool; - deviceMemPool_t o_mempool; - kernelRequestManager_t kernels; +public: void create_mempool(const dlong offset, const dlong fields); - platform_t(setupAide& _options, MPI_Comm _comm); - inipp::Ini *par; + platform_t(setupAide& _options, MPI_Comm _commg, MPI_Comm _comm); - static platform_t* getInstance(setupAide& _options, MPI_Comm _comm){ + static platform_t* getInstance(setupAide& _options, MPI_Comm _commg, MPI_Comm _comm){ if(!singleton) - singleton = new platform_t(_options, _comm); + singleton = new platform_t(_options, _commg, _comm); return singleton; } static platform_t* getInstance(){ return singleton; } - private: +private: static platform_t * singleton; + +public: + setupAide& options; + int warpSize; + comm_t comm; + device_t device; + occa::properties kernelInfo; + timer::timer_t timer; + deviceMemPool_t o_mempool; + kernelRequestManager_t kernels; + inipp::Ini *par; + bool serial; + linAlg_t* linAlg; + std::unique_ptr flopCounter; + memPool_t mempool; + + occa::kernel copyDfloatToPfloatKernel; + occa::kernel copyPfloatToDfloatKernel; }; #endif diff --git a/src/core/printHeader.cpp b/src/core/printHeader.cpp index b72aa5bc7..86c49e788 100644 --- a/src/core/printHeader.cpp +++ b/src/core/printHeader.cpp @@ -1,12 +1,16 @@ +#include +#include "nrs.hpp" + void printHeader() { std::cout << R"( __ ____ _____)" << std::endl - << R"( ____ ___ / /__ / __ \/ ___/)" << std::endl - << R"( / __ \ / _ \ / //_// /_/ /\__ \ )" << std::endl - << R"( / / / // __// ,< / _, _/___/ / )" << std::endl - << R"(/_/ /_/ \___//_/|_|/_/ |_|/____/ )" - << "v" << NEKRS_VERSION << "." << NEKRS_SUBVERSION << "." << NEKRS_PATCHVERSION << GITCOMMITHASH << std::endl - << std::endl - << "COPYRIGHT (c) 2019-2021 UCHICAGO ARGONNE, LLC" << std::endl - << std::endl; + << R"( ____ ___ / /__ / __ \/ ___/)" << std::endl + << R"( / __ \ / _ \ / //_// /_/ /\__ \ )" << std::endl + << R"( / / / // __// ,< / _, _/___/ / )" << std::endl + << R"(/_/ /_/ \___//_/|_|/_/ |_|/____/ )" + << "v" << NEKRS_VERSION << "." << NEKRS_SUBVERSION << "." << NEKRS_PATCHVERSION + << " (" GITCOMMITHASH << ")" << std::endl + << std::endl + << "COPYRIGHT (c) 2019-2022 UCHICAGO ARGONNE, LLC" << std::endl + << std::endl; } diff --git a/src/core/printHeader.hpp b/src/core/printHeader.hpp new file mode 100644 index 000000000..2bfdd756f --- /dev/null +++ b/src/core/printHeader.hpp @@ -0,0 +1,4 @@ +#include +#include "nrs.hpp" + +void printHeader(); diff --git a/src/core/registerNrsKernels.cpp b/src/core/registerNrsKernels.cpp new file mode 100644 index 000000000..60f221fb8 --- /dev/null +++ b/src/core/registerNrsKernels.cpp @@ -0,0 +1,329 @@ +#include "nrs.hpp" +#include "mesh.h" +#include + +#include "re2Reader.hpp" +#include "benchmarkAdvsub.hpp" + +void registerNrsKernels(occa::properties kernelInfoBC) +{ + const bool serial = platform->serial; + const std::string extension = serial ? ".c" : ".okl"; + const device_t &device = platform->device; + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + // build kernels + std::string fileName, kernelName; + const std::string suffix = "Hex3D"; + const std::string oklpath = installDir + "/okl/"; + int N, cubN; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); + const int Nq = N + 1; + const int cubNq = cubN + 1; + const int Np = Nq * Nq * Nq; + const int cubNp = cubNq * cubNq * cubNq; + constexpr int Nfaces{6}; + + occa::properties kernelInfo = platform->kernelInfo; + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + kernelInfo["include_paths"].asArray(); + + constexpr int NVfields{3}; + kernelInfo["defines/p_NVfields"] = NVfields; + + int Nsubsteps = 0; + int nBDF = 0; + int nEXT = 0; + platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); + + if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { + nBDF = 1; + } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { + nBDF = 2; + } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { + nBDF = 3; + } + nEXT = 3; + if (Nsubsteps) + nEXT = nBDF; + + { + kernelName = "nStagesSum3"; + fileName = oklpath + "core/" + kernelName + ".okl"; + const std::string section = "nrs-"; + platform->kernels.add( + section + kernelName, fileName, platform->kernelInfo); + + kernelName = "computeFieldDotNormal"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, platform->kernelInfo); + + occa::properties centroidProp = kernelInfo; + centroidProp["defines/p_Nfp"] = Nq * Nq; + centroidProp["defines/p_Nfaces"] = Nfaces; + { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const int Nq = N + 1; + if (BLOCKSIZE < Nq * Nq) { + if (platform->comm.mpiRank == 0) + printf("ERROR: computeFaceCentroid kernel requires BLOCKSIZE >= Nq * Nq." + "BLOCKSIZE = %d, Nq*Nq = %d\n", + BLOCKSIZE, + Nq * Nq); + ABORT(EXIT_FAILURE); + } + } + kernelName = "computeFaceCentroid"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, centroidProp); + + occa::properties meshProps = kernelInfo; + meshProps += meshKernelProperties(N); + + { + occa::properties prop = meshProps; + prop["defines/p_cubNq"] = cubNq; + prop["defines/p_cubNp"] = cubNp; + + kernelName = "strongAdvectionVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + kernelName = "strongAdvectionCubatureVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + + kernelName = "curl" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "gradientVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "wGradientVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + { + occa::properties prop = kernelInfo; + const int movingMesh = + platform->options.compareArgs("MOVING MESH", "TRUE"); + prop["defines/p_nEXT"] = nEXT; + prop["defines/p_nBDF"] = nBDF; + prop["defines/p_MovingMesh"] = movingMesh; + if (Nsubsteps) + prop["defines/p_SUBCYCLING"] = 1; + else + prop["defines/p_SUBCYCLING"] = 0; + + kernelName = "sumMakef"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + + kernelName = "wDivergenceVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + kernelName = "divergenceVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + kernelName = "divergenceSurface" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + kernelName = "advectMeshVelocityHex3D"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "pressureRhs" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "pressureStress" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "pressureDirichletBC" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + kernelName = "velocityRhs" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + auto zeroNormalProps = kernelInfoBC; + zeroNormalProps["defines/p_ZERO_NORMAL"] = ZERO_NORMAL; + zeroNormalProps["defines/p_NO_OP"] = NO_OP; + kernelName = "averageNormalBcType"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add(section + kernelName, fileName, zeroNormalProps); + + kernelName = "fixZeroNormalMask"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add(section + kernelName, fileName, zeroNormalProps); + + kernelName = "applyZeroNormalMask"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add(section + kernelName, fileName, zeroNormalProps); + + kernelName = "initializeZeroNormalMask"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add(section + kernelName, fileName, zeroNormalProps); + + kernelName = "velocityDirichletBC" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + kernelName = "velocityNeumannBC" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfoBC); + + occa::properties prop = meshProps; + const int movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); + prop["defines/p_relative"] = movingMesh && Nsubsteps; + prop["defines/p_cubNq"] = cubNq; + prop["defines/p_cubNp"] = cubNp; + fileName = oklpath + "nrs/Urst" + suffix + ".okl"; + + kernelName = "UrstCubature" + suffix; + fileName = oklpath + "nrs/" + kernelName + extension; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "Urst" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + { + occa::properties prop = meshProps; + const int movingMesh = + platform->options.compareArgs("MOVING MESH", "TRUE"); + prop["defines/p_MovingMesh"] = movingMesh; + prop["defines/p_nEXT"] = nEXT; + prop["defines/p_nBDF"] = nBDF; + prop["defines/p_cubNq"] = cubNq; + prop["defines/p_cubNp"] = cubNp; + + occa::properties subCycleStrongCubatureProps = prop; + + int nelgt, nelgv; + const std::string meshFile = platform->options.getArgs("MESH FILE"); + re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); + const int NelemBenchmark = nelgv/platform->comm.mpiCommSize; + + bool verbose = platform->options.compareArgs("VERBOSE", "TRUE"); + const int verbosity = verbose ? 2 : 1; + + int Nsubsteps; + platform->options.getArgs("SUBCYCLING STEPS", Nsubsteps); + + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE") && Nsubsteps) { + auto subCycleKernel = + benchmarkAdvsub(3, NelemBenchmark, Nq, cubNq, nEXT, true, false, verbosity, nrs_t::targetBenchmark, false); + + kernelName = "subCycleStrongCubatureVolume" + suffix; + platform->kernels.add(section + kernelName, subCycleKernel); + } + + kernelName = "subCycleStrongVolume" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "subCycleRKUpdate"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + kernelName = "subCycleRK"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + + kernelName = "subCycleInitU0"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, prop); + } + + kernelName = "extrapolate"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "maskCopy"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfo); + + kernelName = "mask"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfo); + + kernelName = "filterRT" + suffix; + fileName = oklpath + "nrs/regularization/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const int Nq = N + 1; + if (BLOCKSIZE < Nq * Nq) { + if (platform->comm.mpiRank == 0) + printf("ERROR: cfl kernel requires BLOCKSIZE >= Nq * Nq." + "BLOCKSIZE = %d, Nq*Nq = %d\n", + BLOCKSIZE, + Nq * Nq); + ABORT(EXIT_FAILURE); + } + } + + occa::properties cflProps = meshProps; + cflProps["defines/p_MovingMesh"] = movingMesh; + kernelName = "cfl" + suffix; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, cflProps); + + kernelName = "pressureAddQtl"; + fileName = oklpath + "nrs/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, meshProps); + + kernelName = "setEllipticCoeff"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfo); + kernelName = "setEllipticCoeffPressure"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add( + section + kernelName, fileName, kernelInfo); + } +} \ No newline at end of file diff --git a/src/core/setup.cpp b/src/core/setup.cpp deleted file mode 100644 index b67ba25f0..000000000 --- a/src/core/setup.cpp +++ /dev/null @@ -1,1288 +0,0 @@ -#include "nrs.hpp" -#include "meshSetup.hpp" -#include "nekInterfaceAdapter.hpp" -#include "udf.hpp" -#include "bcMap.hpp" -#include -#include -#include "filter.hpp" -#include "avm.hpp" -#include - -namespace{ -cds_t* cdsSetup(nrs_t* nrs, setupAide options); -} - -std::vector -determineMGLevels(std::string section) -{ - const std::string optionsPrefix = [section](){ - std::string prefix = section + std::string(" "); - if(section.find("temperature") != std::string::npos){ - prefix = std::string("scalar00 "); - } - std::transform(prefix.begin(), prefix.end(), prefix.begin(), - [](unsigned char c){ return std::toupper(c); }); - return prefix; - }(); - - std::vector levels; - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - - std::string p_mglevels; - if(platform->options.getArgs(optionsPrefix + "MULTIGRID COARSENING", p_mglevels)) { - const std::vector mgLevelList = serializeString(p_mglevels,','); - for(auto && s : mgLevelList){ - levels.push_back(std::stoi(s)); - } - - - bool invalid = false; - invalid |= (levels[0] != N); // top level order must match - for(unsigned i = 0U; i < levels.size(); ++i){ - invalid |= (levels[i] < 0); // each level must be positive - if(i > 0) - invalid |= (levels[i] >= levels[i-1]); // each successive level must be smaller - } - - if(invalid){ - if(platform->comm.mpiRank == 0) printf("ERROR: Invalid multigrid coarsening!\n"); - ABORT(EXIT_FAILURE);; - } - if(levels.back() > 1) - { - if(platform->options.compareArgs(optionsPrefix + "MULTIGRID COARSE SOLVE", "TRUE")){ - // if the coarse level has p > 1 and requires solving the coarsest level, - // rather than just smoothing, then use the SEMFEM discretization - platform->options.setArgs(optionsPrefix + "MULTIGRID COARSE SEMFEM", "TRUE"); - platform->options.setArgs(optionsPrefix + "MULTIGRID COARSE SEMFEM", "TRUE"); - - // However, if the user explicitly asked for the FEM discretization, bail - if(platform->options.compareArgs(optionsPrefix + "USER SPECIFIED FEM COARSE SOLVER", "TRUE")) - { - if(platform->comm.mpiRank == 0){ - printf("Error! FEM coarse discretization only supports p=1 for the coarsest level!\n"); - } - ABORT(1); - } - } - } - - return levels; - - } else if(platform->options.compareArgs(optionsPrefix + "MULTIGRID DOWNWARD SMOOTHER","ASM") || - platform->options.compareArgs(optionsPrefix + "MULTIGRID DOWNWARD SMOOTHER","RAS")) { - std::map > mg_level_lookup = - { - {1,{1}}, - {2,{2,1}}, - {3,{3,1}}, - {4,{4,2,1}}, - {5,{5,3,1}}, - {6,{6,3,1}}, - {7,{7,3,1}}, - {8,{8,5,1}}, - {9,{9,5,1}}, - {10,{10,6,1}}, - {11,{11,6,1}}, - {12,{12,7,1}}, - {13,{13,7,1}}, - {14,{14,8,1}}, - {15,{15,9,1}}, - }; - - return mg_level_lookup.at(N); - } else if(platform->options.compareArgs(optionsPrefix + "MULTIGRID DOWNWARD SMOOTHER","JAC")) { - std::map > mg_level_lookup = - { - {1,{1}}, - {2,{2,1}}, - {3,{3,1}}, - {4,{4,2,1}}, - {5,{5,3,1}}, - {6,{6,4,2,1}}, - {7,{7,5,3,1}}, - {8,{8,6,4,1}}, - {9,{9,7,5,1}}, - {10,{10,8,5,1}}, - {11,{11,9,5,1}}, - {12,{12,10,5,1}}, - {13,{13,11,5,1}}, - {14,{14,12,5,1}}, - {15,{15,13,5,1}}, - }; - - return mg_level_lookup.at(N); - } - - return {}; -} - -void nrsSetup(MPI_Comm comm, setupAide &options, nrs_t *nrs) -{ - { - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - const int Nq = N+1; - if( BLOCKSIZE < Nq * Nq ){ - if(platform->comm.mpiRank == 0) - printf("ERROR: several kernels require BLOCKSIZE >= Nq * Nq." - "BLOCKSIZE = %d, Nq*Nq = %d\n", BLOCKSIZE, Nq * Nq); - ABORT(EXIT_FAILURE); - } - } - - platform_t* platform = platform_t::getInstance(); - device_t& device = platform->device; - nrs->kernelInfo = new occa::properties(); - *(nrs->kernelInfo) = platform->kernelInfo; - occa::properties& kernelInfo = *nrs->kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - kernelInfo["include_paths"].asArray(); - - int N, cubN; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); - platform->options.getArgs("NUMBER OF SCALARS", nrs->Nscalar); - platform->options.getArgs("MESH DIMENSION", nrs->dim); - platform->options.getArgs("ELEMENT TYPE", nrs->elementType); - if(platform->device.mode() == "Serial") - platform->options.setArgs("ENABLE OVERLAP", "FALSE"); - - nrs->flow = 1; - if(platform->options.compareArgs("VELOCITY", "FALSE")) nrs->flow = 0; - if(platform->options.compareArgs("VELOCITY SOLVER", "NONE")) nrs->flow = 0; - - if(nrs->flow) { - if(platform->options.compareArgs("STRESSFORMULATION", "TRUE")) - platform->options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); - } - - platform->options.setArgs("CHECKPOINT OUTPUT MESH", "FALSE"); - - if(platform->options.compareArgs("CONSTANT FLOW RATE", "TRUE")) - { - platform->options.getArgs("FLOW RATE", nrs->flowRate); - nrs->fromBID = -1; - nrs->toBID = -1; - platform->options.getArgs("CONSTANT FLOW FROM BID", nrs->fromBID); - platform->options.getArgs("CONSTANT FLOW TO BID", nrs->toBID); - if(platform->options.compareArgs("CONSTANT FLOW DIRECTION", "X")) - { - nrs->flowDirection[0] = 1.0; - nrs->flowDirection[1] = 0.0; - nrs->flowDirection[2] = 0.0; - } - if(platform->options.compareArgs("CONSTANT FLOW DIRECTION", "Y")) - { - nrs->flowDirection[0] = 0.0; - nrs->flowDirection[1] = 1.0; - nrs->flowDirection[2] = 0.0; - } - if(platform->options.compareArgs("CONSTANT FLOW DIRECTION", "Z")) - { - nrs->flowDirection[0] = 0.0; - nrs->flowDirection[1] = 0.0; - nrs->flowDirection[2] = 1.0; - } - } - - - // init nek - { - int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); - std::string casename; - platform->options.getArgs("CASENAME", casename); - - nek::setup(nrs); - nek::setic(); - nek::userchk(); - if (platform->comm.mpiRank == 0) std::cout << "\n"; - } - - nrs->cht = 0; - if (nekData.nelv != nekData.nelt && nrs->Nscalar) nrs->cht = 1; - if (nrs->cht && !platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE")) { - if (platform->comm.mpiRank == 0) std::cout << "Conjugate heat transfer requires solving for temperature!\n"; - ABORT(EXIT_FAILURE);; - } - - nrs->_mesh = createMesh(comm, N, cubN, nrs->cht, kernelInfo); - nrs->meshV = (mesh_t*) nrs->_mesh->fluid; - mesh_t* mesh = nrs->meshV; - - { - double val = (double)mesh->NlocalGatherElements/mesh->Nelements; - MPI_Allreduce(MPI_IN_PLACE,&val,1,MPI_DOUBLE,MPI_MIN,platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) - printf("min %2.0f%% of the local elements are internal\n", 100*val); - } - - nrs->NVfields = 3; - nrs->NTfields = nrs->NVfields + 1; // Total Velocity + Pressure - mesh->Nfields = 1; - - platform->options.getArgs("SUBCYCLING STEPS",nrs->Nsubsteps); - platform->options.getArgs("DT", nrs->dt[0]); - - if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { - nrs->nBDF = 1; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { - nrs->nBDF = 2; - } else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { - nrs->nBDF = 3; - } - nrs->nEXT = 3; - if(nrs->Nsubsteps) nrs->nEXT = nrs->nBDF; - nrs->coeffEXT = (dfloat*) calloc(nrs->nEXT, sizeof(dfloat)); - nrs->coeffBDF = (dfloat*) calloc(nrs->nBDF, sizeof(dfloat)); - - nrs->nRK = 4; - nrs->coeffSubEXT = (dfloat*) calloc(3, sizeof(dfloat)); - - dfloat mue = 1; - dfloat rho = 1; - platform->options.getArgs("VISCOSITY", mue); - platform->options.getArgs("DENSITY", rho); - - const dlong Nlocal = mesh->Nlocal; - - { // setup fieldOffset - nrs->fieldOffset = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs); - mesh_t* meshT = nrs->_mesh; - nrs->fieldOffset = mymax(nrs->fieldOffset, meshT->Np * (meshT->Nelements + meshT->totalHaloPairs)); - - int PAGESIZE = 4096; // default is 4kB - char* tmp; - tmp = getenv("NEKRS_PAGE_SIZE"); - if (tmp != NULL) PAGESIZE = std::stoi(tmp); - const int pageW = PAGESIZE / sizeof(dfloat); - if (nrs->fieldOffset % pageW) nrs->fieldOffset = (nrs->fieldOffset / pageW + 1) * pageW; - } - - nrs->_mesh->fieldOffset = nrs->fieldOffset; - - if(nrs->Nsubsteps) { - int Sorder; - platform->options.getArgs("SUBCYCLING TIME ORDER", Sorder); - if(Sorder == 4 && nrs->nRK == 4) { // ERK(4,4) - dfloat rka[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0}; - dfloat rkb[4] = {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}; - dfloat rkc[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0}; - nrs->coeffsfRK = (dfloat*) calloc(nrs->nRK, sizeof(dfloat)); - nrs->weightsRK = (dfloat*) calloc(nrs->nRK, sizeof(dfloat)); - nrs->nodesRK = (dfloat*) calloc(nrs->nRK, sizeof(dfloat)); - memcpy(nrs->coeffsfRK, rka, nrs->nRK * sizeof(dfloat)); - memcpy(nrs->weightsRK, rkb, nrs->nRK * sizeof(dfloat)); - memcpy(nrs->nodesRK, rkc, nrs->nRK * sizeof(dfloat)); - }else{ - if(platform->comm.mpiRank == 0) std::cout << "Unsupported subcycling scheme!\n"; - ABORT(1); - } - nrs->o_coeffsfRK = device.malloc(nrs->nRK * sizeof(dfloat), nrs->coeffsfRK); - nrs->o_weightsRK = device.malloc(nrs->nRK * sizeof(dfloat), nrs->weightsRK); - } - - // setup mempool - int ellipticMaxFields = 1; - if(platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) - ellipticMaxFields = nrs->NVfields; - const int ellipticWrkFields = elliptic_t::NScratchFields*ellipticMaxFields; - - int wrkFields = 9; - if(nrs->Nsubsteps) wrkFields += 3*nrs->NVfields; - if(options.compareArgs("MOVING MESH", "TRUE")) wrkFields += nrs->NVfields; - - const int mempoolNflds = std::max(wrkFields, 2*nrs->NVfields + ellipticWrkFields); - platform->create_mempool(nrs->fieldOffset, mempoolNflds); - - // offset mempool available for elliptic because we pool is also used for ellipticSolve input/output - auto const o_mempoolElliptic = - platform->o_mempool.o_ptr.slice(2*nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - - if(options.compareArgs("MOVING MESH", "TRUE")){ - const int nBDF = std::max(nrs->nBDF, nrs->nEXT); - platform->o_mempool.slice0.copyFrom(mesh->o_LMM, mesh->Nlocal * sizeof(dfloat)); - mesh->o_LMM.free(); - mesh->o_LMM = platform->device.malloc(nrs->fieldOffset * nBDF , sizeof(dfloat)); - mesh->o_LMM.copyFrom(platform->o_mempool.slice0, mesh->Nlocal * sizeof(dfloat)); - platform->o_mempool.slice0.copyFrom(mesh->o_invLMM, mesh->Nlocal * sizeof(dfloat)); - mesh->o_invLMM.free(); - mesh->o_invLMM = platform->device.malloc(nrs->fieldOffset * nBDF , sizeof(dfloat)); - mesh->o_invLMM.copyFrom(platform->o_mempool.slice0, mesh->Nlocal * sizeof(dfloat)); - - const int nAB = std::max(nrs->nEXT, mesh->nAB); - mesh->U = (dfloat*) calloc(nrs->NVfields * nrs->fieldOffset * nAB, sizeof(dfloat)); - mesh->o_U = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * nAB * sizeof(dfloat), mesh->U); - if(nrs->Nsubsteps) - mesh->o_divU = platform->device.malloc(nrs->fieldOffset * nAB, sizeof(dfloat)); - } - - { - dlong offset; - if(platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - offset = std::max(nrs->fieldOffset, mesh->Nelements * mesh->cubNp); - else - offset = nrs->fieldOffset; - - const dlong Nstates = nrs->Nsubsteps ? std::max(nrs->nBDF, nrs->nEXT) : 1; - if(nrs->Nsubsteps && platform->options.compareArgs("MOVING MESH", "TRUE")) - nrs->o_relUrst = platform->device.malloc(Nstates * nrs->NVfields * offset, sizeof(dfloat)); - else - nrs->o_Urst = platform->device.malloc(Nstates * nrs->NVfields * offset, sizeof(dfloat)); - } - - nrs->U = (dfloat*) calloc(nrs->NVfields * std::max(nrs->nBDF, nrs->nEXT) * nrs->fieldOffset,sizeof(dfloat)); - nrs->Ue = (dfloat*) calloc(nrs->NVfields * nrs->fieldOffset,sizeof(dfloat)); - nrs->P = (dfloat*) calloc(nrs->fieldOffset,sizeof(dfloat)); - nrs->BF = (dfloat*) calloc(nrs->NVfields * nrs->fieldOffset,sizeof(dfloat)); - nrs->FU = (dfloat*) calloc(nrs->NVfields * nrs->nEXT * nrs->fieldOffset,sizeof(dfloat)); - - nrs->o_U = platform->device.malloc(nrs->NVfields * std::max(nrs->nBDF,nrs->nEXT) * nrs->fieldOffset * sizeof(dfloat), nrs->U); - nrs->o_Ue = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->Ue); - nrs->o_P = platform->device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->P); - nrs->o_BF = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->BF); - nrs->o_FU = platform->device.malloc(nrs->NVfields * nrs->nEXT * nrs->fieldOffset * sizeof(dfloat), nrs->FU); - - nrs->var_coeff = 1; // use always var coeff elliptic - nrs->ellipticCoeff = (dfloat*) calloc(2 * nrs->fieldOffset,sizeof(dfloat)); - nrs->o_ellipticCoeff = device.malloc(2 * nrs->fieldOffset * sizeof(dfloat), - nrs->ellipticCoeff); - - int nProperties = 2; - if(options.compareArgs("MESH SOLVER", "ELASTICITY")) nProperties = 4; - nrs->prop = (dfloat*) calloc(nProperties * nrs->fieldOffset,sizeof(dfloat)); - for (int e = 0; e < mesh->Nelements; e++) - for (int n = 0; n < mesh->Np; n++) { - nrs->prop[0 * nrs->fieldOffset + e * mesh->Np + n] = mue; - nrs->prop[1 * nrs->fieldOffset + e * mesh->Np + n] = rho; - } - - nrs->o_prop = device.malloc(nProperties * nrs->fieldOffset * sizeof(dfloat), nrs->prop); - nrs->o_mue = nrs->o_prop.slice(0 * nrs->fieldOffset * sizeof(dfloat)); - nrs->o_rho = nrs->o_prop.slice(1 * nrs->fieldOffset * sizeof(dfloat)); - if(options.compareArgs("MESH SOLVER", "ELASTICITY")){ - nrs->o_meshMue = nrs->o_prop.slice(2 * nrs->fieldOffset * sizeof(dfloat)); - nrs->o_meshRho = nrs->o_prop.slice(3 * nrs->fieldOffset * sizeof(dfloat)); - } - - if(platform->options.compareArgs("CONSTANT FLOW RATE", "TRUE")){ - nrs->o_Uc = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - nrs->o_Pc = platform->device.malloc(nrs->fieldOffset * sizeof(dfloat)); - nrs->o_prevProp = device.malloc(2 * nrs->fieldOffset * sizeof(dfloat), nrs->prop); - } - - nrs->div = (dfloat*) calloc(nrs->fieldOffset,sizeof(dfloat)); - nrs->o_div = device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->div); - - nrs->o_coeffEXT = platform->device.malloc(nrs->nEXT * sizeof(dfloat), nrs->coeffEXT); - nrs->o_coeffBDF = platform->device.malloc(nrs->nBDF * sizeof(dfloat), nrs->coeffBDF); - nrs->o_coeffSubEXT = platform->device.malloc(nrs->nEXT * sizeof(dfloat), nrs->coeffEXT); - - meshParallelGatherScatterSetup(mesh, mesh->Nlocal, mesh->globalIds, platform->comm.mpiComm, 0); - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - nrs->gsh = oogs::setup(mesh->ogs, nrs->NVfields, nrs->fieldOffset, ogsDfloat, NULL, oogsMode); - - linAlg_t * linAlg = platform->linAlg; - - int err = 0; - dlong gNelements = mesh->Nelements; - MPI_Allreduce(MPI_IN_PLACE, &gNelements, 1, MPI_DLONG, MPI_SUM, platform->comm.mpiComm); - const dfloat sum2 = (dfloat)gNelements * mesh->Np; - linAlg->fillKernel(nrs->fieldOffset, 1.0, platform->o_mempool.slice0); - ogsGatherScatter(platform->o_mempool.slice0, ogsDfloat, ogsAdd, mesh->ogs); - linAlg->axmyKernel(Nlocal, 1.0, mesh->ogs->o_invDegree, platform->o_mempool.slice0); - dfloat* tmp = (dfloat*) calloc(Nlocal, sizeof(dfloat)); - platform->o_mempool.slice0.copyTo(tmp, Nlocal * sizeof(dfloat)); - dfloat sum1 = 0; - for(int i = 0; i < Nlocal; i++) sum1 += tmp[i]; - MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); - sum1 = abs(sum1 - sum2) / sum2; - if(sum1 > 1e-15) { - if(platform->comm.mpiRank == 0) printf("ogsGatherScatter test err=%g!\n", sum1); - fflush(stdout); - err++; - } - - mesh->ogs->o_invDegree.copyTo(tmp, Nlocal * sizeof(dfloat)); - double* vmult = (double*) nek::ptr("vmult"); - sum1 = 0; - for(int i = 0; i < Nlocal; i++) sum1 += abs(tmp[i] - vmult[i]); - MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); - if(sum1 > 1e-15) { - if(platform->comm.mpiRank == 0) printf("multiplicity test err=%g!\n", sum1); - fflush(stdout); - err++; - } - - if(err) ABORT(1); - free(tmp); - - nrs->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); - int cnt = 0; - for (int e = 0; e < mesh->Nelements; e++) { - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "velocity"); - nrs->EToB[cnt] = bc; - cnt++; - } - } - nrs->o_EToB = device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),nrs->EToB); - - if(platform->options.compareArgs("MESH SOLVER", "ELASTICITY")) { - nrs->EToBMesh = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); - int cnt = 0; - for (int e = 0; e < mesh->Nelements; e++) { - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "mesh"); - nrs->EToBMesh[cnt] = bc; - cnt++; - } - } - nrs->o_EToBMesh = device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int),nrs->EToBMesh); - } - - if(platform->options.compareArgs("VELOCITY REGULARIZATION METHOD", "RELAXATION")){ - - nrs->filterNc = -1; - dfloat filterS; - platform->options.getArgs("VELOCITY HPFRT STRENGTH", filterS); - platform->options.getArgs("VELOCITY HPFRT MODES", nrs->filterNc); - filterS = -1.0 * fabs(filterS); - nrs->filterS = filterS; - - dfloat* A = filterSetup(nrs->meshV, nrs->filterNc); - - const dlong Nmodes = nrs->meshV->N + 1; - - nrs->o_filterMT = platform->device.malloc(Nmodes * Nmodes * sizeof(dfloat), A); - - free(A); - } - - // build kernels - std::string kernelName; - const std::string suffix = "Hex3D"; - - MPI_Barrier(platform->comm.mpiComm); - double tStartLoadKernel = MPI_Wtime(); - if(platform->comm.mpiRank == 0) printf("loading ns kernels ... "); fflush(stdout); - - { - const std::string section = "nrs-"; - kernelName = "nStagesSum3"; - nrs->nStagesSum3Kernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "computeFieldDotNormal"; - nrs->computeFieldDotNormalKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "computeFaceCentroid"; - nrs->computeFaceCentroidKernel = - platform->kernels.getKernel( section + kernelName); - - { - kernelName = "strongAdvectionVolume" + suffix; - nrs->advectionStrongVolumeKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "strongAdvectionCubatureVolume" + suffix; - nrs->advectionStrongCubatureVolumeKernel = - platform->kernels.getKernel( section + kernelName); - } - - kernelName = "curl" + suffix; - nrs->curlKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "gradientVolume" + suffix; - nrs->gradientVolumeKernel = platform->kernels.getKernel( section + kernelName); - - kernelName = "nrswGradientVolume" + suffix; - nrs->wgradientVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - { - kernelName = "sumMakef"; - nrs->sumMakefKernel = platform->kernels.getKernel( section + kernelName); - } - - kernelName = "nrswDivergenceVolume" + suffix; - nrs->wDivergenceVolumeKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "divergenceVolume" + suffix; - nrs->divergenceVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "divergenceSurfaceTOMBO" + suffix; - nrs->divergenceSurfaceKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "advectMeshVelocityHex3D"; - nrs->advectMeshVelocityKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "pressureRhsTOMBO" + suffix; - nrs->pressureRhsKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "pressureStress" + suffix; - nrs->pressureStressKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "pressureDirichletBC" + suffix; - nrs->pressureDirichletBCKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "velocityRhsTOMBO" + suffix; - nrs->velocityRhsKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "velocityDirichletBC" + suffix; - nrs->velocityDirichletBCKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "velocityNeumannBC" + suffix; - nrs->velocityNeumannBCKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "UrstCubature" + suffix; - nrs->UrstCubatureKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "Urst" + suffix; - nrs->UrstKernel = - platform->kernels.getKernel( section + kernelName); - - - if(nrs->Nsubsteps){ - kernelName = "subCycleStrongCubatureVolume" + suffix; - nrs->subCycleStrongCubatureVolumeKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "subCycleStrongVolume" + suffix; - nrs->subCycleStrongVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "subCycleERKUpdate"; - nrs->subCycleRKUpdateKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "subCycleRK"; - nrs->subCycleRKKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "subCycleInitU0"; - nrs->subCycleInitU0Kernel = platform->kernels.getKernel( section + kernelName); - } - - kernelName = "multiExtrapolate"; - nrs->extrapolateKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "maskCopy"; - nrs->maskCopyKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "mask"; - nrs->maskKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "filterRT" + suffix; - nrs->filterRTKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "cfl" + suffix; - nrs->cflKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "pressureAddQtl"; - nrs->pressureAddQtlKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "setEllipticCoeff"; - nrs->setEllipticCoeffKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "setEllipticCoeffPressure"; - nrs->setEllipticCoeffPressureKernel = - platform->kernels.getKernel( section + kernelName); - } - - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout); - - if(nrs->Nscalar) { - nrs->cds = cdsSetup(nrs, platform->options); - } - - // get IC + t0 from nek - double startTime; - nek::copyFromNek(startTime); - platform->options.setArgs("START TIME", to_string_f(startTime)); - - if(platform->comm.mpiRank == 0) printf("calling udf_setup ... "); fflush(stdout); - udf.setup(nrs); - if(platform->comm.mpiRank == 0) printf("done\n"); fflush(stdout); - - // setup elliptic solvers - - const int nbrBIDs = bcMap::size(0); - int NBCType = nbrBIDs + 1; - - if(nrs->Nscalar) { - cds_t* cds = nrs->cds; - - for (int is = 0; is < cds->NSfields; is++) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is; - std::string sid = ss.str(); - - if(!cds->compute[is]) continue; - - mesh_t* mesh; - (is) ? mesh = cds->meshV : mesh = cds->mesh[0]; // only first scalar can be a CHT mesh - - if (platform->comm.mpiRank == 0) - std::cout << "================= ELLIPTIC SETUP SCALAR" << sid << " ===============\n"; - - int nbrBIDs = bcMap::size(0); - if(nrs->cht && is == 0) nbrBIDs = bcMap::size(1); - int* sBCType = (int*) calloc(nbrBIDs + 1, sizeof(int)); - - for (int bID = 1; bID <= nbrBIDs; bID++) { - std::string bcTypeText(bcMap::text(bID, "scalar" + sid)); - if(platform->comm.mpiRank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); - sBCType[bID] = bcMap::type(bID, "scalar" + sid); - } - - cds->solver[is] = new elliptic_t(); - cds->solver[is]->name = "scalar" + sid; - cds->solver[is]->blockSolver = 0; - cds->solver[is]->Nfields = 1; - cds->solver[is]->Ntotal = nrs->fieldOffset; - cds->solver[is]->o_wrk = o_mempoolElliptic; - cds->solver[is]->mesh = mesh; - cds->solver[is]->dim = cds->dim; - cds->solver[is]->elementType = cds->elementType; - cds->solver[is]->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int)); - memcpy(cds->solver[is]->BCType,sBCType,(nbrBIDs + 1) * sizeof(int)); - free(sBCType); - cds->solver[is]->var_coeff = cds->var_coeff; - for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 1; - cds->solver[is]->lambda = cds->ellipticCoeff; - cds->solver[is]->o_lambda = cds->o_ellipticCoeff; - cds->solver[is]->loffset = 0; - - cds->solver[is]->options = cds->options[is]; - ellipticSolveSetup(cds->solver[is]); - } - } - - if (nrs->flow) { - if (platform->comm.mpiRank == 0) printf("================ ELLIPTIC SETUP VELOCITY ================\n"); - - nrs->uvwSolver = NULL; - - if(platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) - nrs->uvwSolver = new elliptic_t(); - - int* uvwBCType = (int*) calloc(3 * NBCType, sizeof(int)); - int* uBCType = uvwBCType + 0 * NBCType; - int* vBCType = uvwBCType + 1 * NBCType; - int* wBCType = uvwBCType + 2 * NBCType; - for (int bID = 1; bID <= nbrBIDs; bID++) { - std::string bcTypeText(bcMap::text(bID, "velocity")); - if(platform->comm.mpiRank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); - - uBCType[bID] = bcMap::type(bID, "x-velocity"); - vBCType[bID] = bcMap::type(bID, "y-velocity"); - wBCType[bID] = bcMap::type(bID, "z-velocity"); - } - - nrs->vOptions = options; - nrs->vOptions.setArgs("PGMRES RESTART", options.getArgs("VELOCITY PGMRES RESTART")); - nrs->vOptions.setArgs("KRYLOV SOLVER", options.getArgs("VELOCITY KRYLOV SOLVER")); - nrs->vOptions.setArgs("SOLVER TOLERANCE", options.getArgs("VELOCITY SOLVER TOLERANCE")); - nrs->vOptions.setArgs("LINEAR SOLVER STOPPING CRITERION", options.getArgs("VELOCITY LINEAR SOLVER STOPPING CRITERION")); - nrs->vOptions.setArgs("DISCRETIZATION", options.getArgs("VELOCITY DISCRETIZATION")); - nrs->vOptions.setArgs("BASIS", options.getArgs("VELOCITY BASIS")); - nrs->vOptions.setArgs("PRECONDITIONER", options.getArgs("VELOCITY PRECONDITIONER")); - nrs->vOptions.setArgs("INITIAL GUESS", options.getArgs("VELOCITY INITIAL GUESS")); - nrs->vOptions.setArgs("RESIDUAL PROJECTION VECTORS", options.getArgs("VELOCITY RESIDUAL PROJECTION VECTORS")); - nrs->vOptions.setArgs("RESIDUAL PROJECTION START", options.getArgs("VELOCITY RESIDUAL PROJECTION START")); - nrs->vOptions.setArgs("MULTIGRID COARSENING", options.getArgs("VELOCITY MULTIGRID COARSENING")); - nrs->vOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("VELOCITY MULTIGRID SMOOTHER")); - nrs->vOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", - options.getArgs("VELOCITY MULTIGRID CHEBYSHEV DEGREE")); - nrs->vOptions.setArgs("PARALMOND CYCLE", options.getArgs("VELOCITY PARALMOND CYCLE")); - nrs->vOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("VELOCITY PARALMOND SMOOTHER")); - nrs->vOptions.setArgs("PARALMOND PARTITION", options.getArgs("VELOCITY PARALMOND PARTITION")); - nrs->vOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", - options.getArgs("VELOCITY PARALMOND CHEBYSHEV DEGREE")); - nrs->vOptions.setArgs("PARALMOND AGGREGATION STRATEGY", - options.getArgs("VELOCITY PARALMOND AGGREGATION STRATEGY")); - nrs->vOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("VELOCITY MAXIMUM ITERATIONS")); - nrs->vOptions.setArgs("STABILIZATION METHOD", options.getArgs("VELOCITY STABILIZATION METHOD")); - nrs->vOptions.setArgs("HPFRT STRENGTH", options.getArgs("VELOCITY HPFRT STRENGTH")); - nrs->vOptions.setArgs("HPFRT MODES", options.getArgs("VELOCITY HPFRT MODES")); - - nrs->mOptions = options; - nrs->mOptions.setArgs("PGMRES RESTART", options.getArgs("MESH PGMRES RESTART")); - nrs->mOptions.setArgs("KRYLOV SOLVER", options.getArgs("MESH KRYLOV SOLVER")); - nrs->mOptions.setArgs("SOLVER TOLERANCE", options.getArgs("MESH SOLVER TOLERANCE")); - nrs->mOptions.setArgs("DISCRETIZATION", options.getArgs("MESH DISCRETIZATION")); - nrs->mOptions.setArgs("BASIS", options.getArgs("MESH BASIS")); - nrs->mOptions.setArgs("PRECONDITIONER", options.getArgs("MESH PRECONDITIONER")); - nrs->mOptions.setArgs("INITIAL GUESS", options.getArgs("MESH INITIAL GUESS")); - nrs->mOptions.setArgs("RESIDUAL PROJECTION VECTORS", options.getArgs("MESH RESIDUAL PROJECTION VECTORS")); - nrs->mOptions.setArgs("RESIDUAL PROJECTION START", options.getArgs("MESH RESIDUAL PROJECTION START")); - nrs->mOptions.setArgs("MULTIGRID COARSENING", options.getArgs("MESH MULTIGRID COARSENING")); - nrs->mOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("MESH MULTIGRID SMOOTHER")); - nrs->mOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", - options.getArgs("MESH MULTIGRID CHEBYSHEV DEGREE")); - nrs->mOptions.setArgs("PARALMOND CYCLE", options.getArgs("MESH PARALMOND CYCLE")); - nrs->mOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("MESH PARALMOND SMOOTHER")); - nrs->mOptions.setArgs("PARALMOND PARTITION", options.getArgs("MESH PARALMOND PARTITION")); - nrs->mOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", - options.getArgs("MESH PARALMOND CHEBYSHEV DEGREE")); - nrs->mOptions.setArgs("PARALMOND AGGREGATION STRATEGY", - options.getArgs("MESH PARALMOND AGGREGATION STRATEGY")); - nrs->mOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("MESH MAXIMUM ITERATIONS")); - - // coeff used by ellipticSetup to detect allNeumann - for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 1; - - if(nrs->uvwSolver) { - nrs->uvwSolver->blockSolver = 1; - nrs->uvwSolver->stressForm = 0; - if(options.compareArgs("STRESSFORMULATION", "TRUE")) - nrs->uvwSolver->stressForm = 1; - nrs->uvwSolver->Nfields = nrs->NVfields; - nrs->uvwSolver->Ntotal = nrs->fieldOffset; - nrs->uvwSolver->o_wrk = o_mempoolElliptic; - nrs->uvwSolver->mesh = mesh; - nrs->uvwSolver->options = nrs->vOptions; - nrs->uvwSolver->dim = nrs->dim; - nrs->uvwSolver->elementType = nrs->elementType; - nrs->uvwSolver->NBCType = NBCType; - nrs->uvwSolver->BCType = (int*) calloc(nrs->NVfields * NBCType,sizeof(int)); - memcpy(nrs->uvwSolver->BCType,uvwBCType,nrs->NVfields * NBCType * sizeof(int)); - nrs->uvwSolver->var_coeff = nrs->var_coeff; - nrs->uvwSolver->lambda = nrs->ellipticCoeff; - nrs->uvwSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->uvwSolver->loffset = 0; // use same ellipticCoeff for u,v and w - - ellipticSolveSetup(nrs->uvwSolver); - } else { - nrs->uSolver = new elliptic_t(); - nrs->uSolver->blockSolver = 0; - nrs->uSolver->Nfields = 1; - nrs->uSolver->Ntotal = nrs->fieldOffset; - nrs->uSolver->o_wrk = o_mempoolElliptic; - nrs->uSolver->mesh = mesh; - nrs->uSolver->options = nrs->vOptions; - nrs->uSolver->dim = nrs->dim; - nrs->uSolver->elementType = nrs->elementType; - nrs->uSolver->NBCType = NBCType; - nrs->uSolver->BCType = (int*) calloc(NBCType,sizeof(int)); - memcpy(nrs->uSolver->BCType,uBCType,NBCType * sizeof(int)); - nrs->uSolver->var_coeff = nrs->var_coeff; - nrs->uSolver->lambda = nrs->ellipticCoeff; - nrs->uSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->uSolver->loffset = 0; - - ellipticSolveSetup(nrs->uSolver); - - nrs->vSolver = new elliptic_t(); - nrs->vSolver->blockSolver = 0; - nrs->vSolver->Nfields = 1; - nrs->vSolver->Ntotal = nrs->fieldOffset; - nrs->vSolver->o_wrk = o_mempoolElliptic; - nrs->vSolver->mesh = mesh; - nrs->vSolver->options = nrs->vOptions; - nrs->vSolver->dim = nrs->dim; - nrs->vSolver->elementType = nrs->elementType; - nrs->vSolver->NBCType = NBCType; - nrs->vSolver->BCType = (int*) calloc(NBCType,sizeof(int)); - memcpy(nrs->vSolver->BCType,vBCType,NBCType * sizeof(int)); - nrs->vSolver->var_coeff = nrs->var_coeff; - nrs->vSolver->lambda = nrs->ellipticCoeff; - nrs->vSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->vSolver->loffset = 0; - - ellipticSolveSetup(nrs->vSolver); - - if (nrs->dim == 3) { - nrs->wSolver = new elliptic_t(); - nrs->wSolver->blockSolver = 0; - nrs->wSolver->Nfields = 1; - nrs->wSolver->Ntotal = nrs->fieldOffset; - nrs->wSolver->o_wrk = o_mempoolElliptic; - nrs->wSolver->mesh = mesh; - nrs->wSolver->options = nrs->vOptions; - nrs->wSolver->dim = nrs->dim; - nrs->wSolver->elementType = nrs->elementType; - nrs->wSolver->NBCType = NBCType; - nrs->wSolver->BCType = (int*) calloc(NBCType,sizeof(int)); - memcpy(nrs->wSolver->BCType,wBCType,NBCType * sizeof(int)); - nrs->wSolver->var_coeff = nrs->var_coeff; - nrs->wSolver->lambda = nrs->ellipticCoeff; - nrs->wSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->wSolver->loffset = 0; - - ellipticSolveSetup(nrs->wSolver); - } - } - - if(platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) { - nrs->uvwSolver->name = "velocity"; - } else { - nrs->uSolver->name = "x-velocity"; - nrs->vSolver->name = "y-velocity"; - nrs->wSolver->name = "v-velocity"; - } - } // flow - - if (nrs->flow) { - if (platform->comm.mpiRank == 0) printf("================ ELLIPTIC SETUP PRESSURE ================\n"); - - int* pBCType = (int*) calloc(NBCType, sizeof(int)); - for (int bID = 1; bID <= nbrBIDs; bID++) - pBCType[bID] = bcMap::type(bID, "pressure"); - - nrs->pOptions = options; - nrs->pOptions.setArgs("PGMRES RESTART", options.getArgs("PRESSURE PGMRES RESTART")); - nrs->pOptions.setArgs("KRYLOV SOLVER", options.getArgs("PRESSURE KRYLOV SOLVER")); - nrs->pOptions.setArgs("SOLVER TOLERANCE", options.getArgs("PRESSURE SOLVER TOLERANCE")); - nrs->pOptions.setArgs("LINEAR SOLVER STOPPING CRITERION", options.getArgs("PRESSURE LINEAR SOLVER STOPPING CRITERION")); - nrs->pOptions.setArgs("DISCRETIZATION", options.getArgs("PRESSURE DISCRETIZATION")); - nrs->pOptions.setArgs("BASIS", options.getArgs("PRESSURE BASIS")); - nrs->pOptions.setArgs("PRECONDITIONER", options.getArgs("PRESSURE PRECONDITIONER")); - nrs->pOptions.setArgs("SEMFEM SOLVER", options.getArgs("PRESSURE SEMFEM SOLVER")); - nrs->pOptions.setArgs("SEMFEM SOLVER PRECISION", options.getArgs("PRESSURE SEMFEM SOLVER PRECISION")); - nrs->pOptions.setArgs("MULTIGRID COARSENING", options.getArgs("PRESSURE MULTIGRID COARSENING")); - nrs->pOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("PRESSURE MULTIGRID SMOOTHER")); - nrs->pOptions.setArgs("MULTIGRID COARSE SOLVE", options.getArgs("PRESSURE MULTIGRID COARSE SOLVE")); - nrs->pOptions.setArgs("MULTIGRID COARSE SEMFEM", options.getArgs("PRESSURE MULTIGRID COARSE SEMFEM")); - nrs->pOptions.setArgs("MULTIGRID DOWNWARD SMOOTHER", - options.getArgs("PRESSURE MULTIGRID DOWNWARD SMOOTHER")); - nrs->pOptions.setArgs("MULTIGRID UPWARD SMOOTHER", - options.getArgs("PRESSURE MULTIGRID UPWARD SMOOTHER")); - nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", - options.getArgs("PRESSURE MULTIGRID CHEBYSHEV DEGREE")); - nrs->pOptions.setArgs("PARALMOND CYCLE", options.getArgs("PRESSURE PARALMOND CYCLE")); - nrs->pOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("PRESSURE MULTIGRID SMOOTHER")); - nrs->pOptions.setArgs("PARALMOND PARTITION", options.getArgs("PRESSURE PARALMOND PARTITION")); - nrs->pOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", - options.getArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE")); - nrs->pOptions.setArgs("PARALMOND AGGREGATION STRATEGY", - options.getArgs("PRESSURE PARALMOND AGGREGATION STRATEGY")); - nrs->pOptions.setArgs("INITIAL GUESS", options.getArgs("PRESSURE INITIAL GUESS")); - nrs->pOptions.setArgs("RESIDUAL PROJECTION VECTORS", - options.getArgs("PRESSURE RESIDUAL PROJECTION VECTORS")); - nrs->pOptions.setArgs("RESIDUAL PROJECTION START", - options.getArgs("PRESSURE RESIDUAL PROJECTION START")); - nrs->pOptions.setArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE"); - nrs->pOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("PRESSURE MAXIMUM ITERATIONS")); - nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV MAX EIGENVALUE BOUND FACTOR", options.getArgs("PRESSURE MULTIGRID CHEBYSHEV MAX EIGENVALUE BOUND FACTOR")); - nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV MIN EIGENVALUE BOUND FACTOR", options.getArgs("PRESSURE MULTIGRID CHEBYSHEV MIN EIGENVALUE BOUND FACTOR")); - - nrs->pSolver = new elliptic_t(); - nrs->pSolver->name = "pressure"; - nrs->pSolver->blockSolver = 0; - nrs->pSolver->Nfields = 1; - nrs->pSolver->Ntotal = nrs->fieldOffset; - nrs->pSolver->o_wrk = o_mempoolElliptic; - nrs->pSolver->mesh = mesh; - nrs->pSolver->dim = nrs->dim; - nrs->pSolver->elementType = nrs->elementType; - nrs->pSolver->BCType = (int*) calloc(nbrBIDs + 1,sizeof(int)); - memcpy(nrs->pSolver->BCType,pBCType,(nbrBIDs + 1) * sizeof(int)); - - nrs->pSolver->var_coeff = 1; - - // coeff used by ellipticSetup to detect allNeumann - for (int i = 0; i < 2 * nrs->fieldOffset; i++) nrs->ellipticCoeff[i] = 0; - nrs->pSolver->lambda = nrs->ellipticCoeff; - nrs->pSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->pSolver->loffset = 0; - - { - const std::vector levels = determineMGLevels("pressure"); - nrs->pSolver->nLevels = levels.size(); - nrs->pSolver->levels = (int*) calloc(nrs->pSolver->nLevels,sizeof(int)); - for(int i = 0; i < nrs->pSolver->nLevels; ++i) - nrs->pSolver->levels[i] = levels.at(i); - } - - nrs->pSolver->options = nrs->pOptions; - ellipticSolveSetup(nrs->pSolver); - - } // flow - if(nrs->flow){ - if(options.compareArgs("MESH SOLVER", "ELASTICITY")){ - if (platform->comm.mpiRank == 0) printf("================ ELLIPTIC SETUP MESH ================\n"); - int* uvwMeshBCType = (int*) calloc(3 * NBCType, sizeof(int)); - int* uMeshBCType = uvwMeshBCType + 0 * NBCType; - int* vMeshBCType = uvwMeshBCType + 1 * NBCType; - int* wMeshBCType = uvwMeshBCType + 2 * NBCType; - for (int bID = 1; bID <= nbrBIDs; bID++) { - std::string bcTypeText(bcMap::text(bID, "mesh")); - if(platform->comm.mpiRank == 0) printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); - - uMeshBCType[bID] = bcMap::type(bID, "x-mesh"); - vMeshBCType[bID] = bcMap::type(bID, "y-mesh"); - wMeshBCType[bID] = bcMap::type(bID, "z-mesh"); - } - nrs->meshSolver = new elliptic_t(); - nrs->meshSolver->name = "mesh"; - nrs->meshSolver->blockSolver = 1; - nrs->meshSolver->stressForm = 1; - nrs->meshSolver->Nfields = nrs->NVfields; - nrs->meshSolver->Ntotal = nrs->fieldOffset; - nrs->meshSolver->o_wrk = o_mempoolElliptic; - nrs->meshSolver->mesh = mesh; - nrs->meshSolver->options = nrs->mOptions; - nrs->meshSolver->dim = nrs->dim; - nrs->meshSolver->elementType = nrs->elementType; - nrs->meshSolver->NBCType = NBCType; - nrs->meshSolver->BCType = (int*) calloc(nrs->NVfields * NBCType,sizeof(int)); - memcpy(nrs->meshSolver->BCType,uvwMeshBCType,nrs->NVfields * NBCType * sizeof(int)); - nrs->meshSolver->var_coeff = 1; - nrs->meshSolver->lambda = nrs->ellipticCoeff; - nrs->meshSolver->o_lambda = nrs->o_ellipticCoeff; - nrs->meshSolver->loffset = 0; // use same ellipticCoeff for u,v and w - - ellipticSolveSetup(nrs->meshSolver); - } - } - // set I.C. for U, W - if(platform->options.compareArgs("MESH SOLVER", "ELASTICITY")) - { - double startTime; - platform->options.getArgs("START TIME", startTime); - platform->linAlg->fill(nrs->NVfields*nrs->fieldOffset, -1.0*std::numeric_limits::max(), platform->o_mempool.slice0); - for (int sweep = 0; sweep < 2; sweep++) { - nrs->velocityDirichletBCKernel(mesh->Nelements, - nrs->fieldOffset, - startTime, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - nrs->o_EToB, - nrs->o_usrwrk, - nrs->o_U, - platform->o_mempool.slice0); - if (sweep == 0) oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMax, nrs->gsh); - if (sweep == 1) oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, nrs->gsh); - } - nrs->o_U.copyFrom(platform->o_mempool.slice0, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - - platform->linAlg->fill(nrs->NVfields*nrs->fieldOffset, 0.0, platform->o_mempool.slice0); - for (int sweep = 0; sweep < 2; sweep++) { - nrs->meshV->velocityDirichletKernel(mesh->Nelements, - nrs->fieldOffset, - mesh->o_vmapM, - nrs->o_EToBMesh, - nrs->o_U, - platform->o_mempool.slice0); - //take care of Neumann-Dirichlet shared edges across elements - if(sweep == 0) oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMax, nrs->gsh); - if(sweep == 1) oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, nrs->gsh); - } - oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); - platform->linAlg->axmyMany( - mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - 0, - 1.0, - nrs->meshSolver->o_invDegree, - platform->o_mempool.slice0 - ); - mesh->o_U.copyFrom(platform->o_mempool.slice0, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - } - - -} - -namespace{ -cds_t* cdsSetup(nrs_t* nrs, setupAide options) -{ - const std::string section = "cds-"; - cds_t* cds = new cds_t(); - platform_t* platform = platform_t::getInstance(); - device_t& device = platform->device; - - cds->mesh[0] = nrs->_mesh; - mesh_t* mesh = cds->mesh[0]; - cds->meshV = nrs->_mesh->fluid; - cds->elementType = nrs->elementType; - cds->dim = nrs->dim; - cds->NVfields = nrs->NVfields; - cds->NSfields = nrs->Nscalar; - - cds->coeffEXT = nrs->coeffEXT; - cds->coeffBDF = nrs->coeffBDF; - cds->coeffSubEXT = nrs->coeffSubEXT; - cds->nBDF = nrs->nBDF; - cds->nEXT = nrs->nEXT; - cds->o_coeffEXT = nrs->o_coeffEXT; - cds->o_coeffBDF = nrs->o_coeffBDF; - cds->o_coeffSubEXT = nrs->o_coeffSubEXT; - - cds->o_usrwrk = &(nrs->o_usrwrk); - - cds->vFieldOffset = nrs->fieldOffset; - cds->fieldOffset[0] = nrs->fieldOffset; - cds->fieldOffsetScan[0] = 0; - dlong sum = cds->fieldOffset[0]; - for(int s = 1; s < cds->NSfields; ++s){ - cds->fieldOffset[s] = cds->fieldOffset[0]; - cds->fieldOffsetScan[s] = sum; - sum += cds->fieldOffset[s]; - cds->mesh[s] = cds->mesh[0]; - } - cds->fieldOffsetSum = sum; - - cds->gsh = nrs->gsh; - - if(nrs->cht) { - meshParallelGatherScatterSetup(mesh, mesh->Nlocal, mesh->globalIds, platform->comm.mpiComm, 0); - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - cds->gshT = oogs::setup(mesh->ogs, 1, cds->fieldOffset[0], ogsDfloat, NULL, oogsMode); - } else { - cds->gshT = cds->gsh; - } - - // Solution storage at interpolation nodes - cds->U = nrs->U; // Point to INS side Velocity - cds->S = - (dfloat*) calloc(std::max(cds->nBDF, cds->nEXT) * cds->fieldOffsetSum,sizeof(dfloat)); - cds->BF = (dfloat*) calloc(cds->fieldOffsetSum,sizeof(dfloat)); - cds->FS = - (dfloat*) calloc(cds->nEXT * cds->fieldOffsetSum,sizeof(dfloat)); - - cds->Nsubsteps = nrs->Nsubsteps; - if(cds->Nsubsteps) { - cds->nRK = nrs->nRK; - cds->coeffsfRK = nrs->coeffsfRK; - cds->weightsRK = nrs->weightsRK; - cds->nodesRK = nrs->nodesRK; - cds->o_coeffsfRK = nrs->o_coeffsfRK; - cds->o_weightsRK = nrs->o_weightsRK; - } - - cds->dt = nrs->dt; - cds->sdt = nrs->sdt; - - cds->prop = (dfloat*) calloc(2 * cds->fieldOffsetSum,sizeof(dfloat)); - - for(int is = 0; is < cds->NSfields; is++) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is; - std::string sid = ss.str(); - - if(options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) continue; - - dfloat diff = 1; - dfloat rho = 1; - options.getArgs("SCALAR" + sid + " DIFFUSIVITY", diff); - options.getArgs("SCALAR" + sid + " DENSITY", rho); - - const dlong off = cds->fieldOffsetSum; - for (int e = 0; e < mesh->Nelements; e++) - for (int n = 0; n < mesh->Np; n++) { - cds->prop[0 * off + cds->fieldOffsetScan[is] + e * mesh->Np + n] = diff; - cds->prop[1 * off + cds->fieldOffsetScan[is] + e * mesh->Np + n] = rho; - } - } - - cds->o_prop = - device.malloc(2 * cds->fieldOffsetSum * sizeof(dfloat), cds->prop); - cds->o_diff = cds->o_prop.slice(0 * cds->fieldOffsetSum * sizeof(dfloat)); - cds->o_rho = cds->o_prop.slice(1 * cds->fieldOffsetSum * sizeof(dfloat)); - - cds->var_coeff = 1; // use always var coeff elliptic - cds->ellipticCoeff = nrs->ellipticCoeff; - cds->o_ellipticCoeff = nrs->o_ellipticCoeff; - - cds->o_U = nrs->o_U; - cds->o_Ue = nrs->o_Ue; - cds->o_S = - platform->device.malloc(std::max(cds->nBDF, cds->nEXT) * cds->fieldOffsetSum * sizeof(dfloat), cds->S); - cds->o_Se = - platform->device.malloc(cds->fieldOffsetSum , sizeof(dfloat)); - cds->o_BF = platform->device.malloc(cds->fieldOffsetSum * sizeof(dfloat), cds->BF); - cds->o_FS = - platform->device.malloc(cds->nEXT * cds->fieldOffsetSum * sizeof(dfloat), - cds->FS); - - cds->o_relUrst = nrs->o_relUrst; - cds->o_Urst = nrs->o_Urst; - - for (int is = 0; is < cds->NSfields; is++) { - std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is; - std::string sid = ss.str(); - - cds->compute[is] = 1; - if (options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) { - cds->compute[is] = 0; - continue; - } - - mesh_t* mesh; - (is) ? mesh = cds->meshV : mesh = cds->mesh[0]; // only first scalar can be a CHT mesh - - cds->options[is] = options; - - cds->options[is].setArgs("REGULARIZATION RAMP CONSTANT", options.getArgs("SCALAR" + sid + " REGULARIZATION RAMP CONSTANT")); - cds->options[is].setArgs("REGULARIZATION AVM C0", options.getArgs("SCALAR" + sid + " REGULARIZATION AVM C0")); - cds->options[is].setArgs("REGULARIZATION METHOD", options.getArgs("SCALAR" + sid + " REGULARIZATION METHOD")); - cds->options[is].setArgs("REGULARIZATION VISMAX COEFF", options.getArgs("SCALAR" + sid + " REGULARIZATION VISMAX COEFF")); - cds->options[is].setArgs("REGULARIZATION SCALING COEFF", options.getArgs("SCALAR" + sid + " REGULARIZATION SCALING COEFF")); - cds->options[is].setArgs("HPFRT STRENGTH", options.getArgs("SCALAR" + sid + " HPFRT STRENGTH")); - cds->options[is].setArgs("HPFRT MODES", options.getArgs("SCALAR" + sid + " HPFRT MODES")); - cds->options[is].setArgs("KRYLOV SOLVER", options.getArgs("SCALAR" + sid + " KRYLOV SOLVER")); - cds->options[is].setArgs("PGMRES RESTART", options.getArgs("SCALAR" + sid + " PGMRES RESTART")); - cds->options[is].setArgs("DISCRETIZATION", options.getArgs("SCALAR DISCRETIZATION")); - cds->options[is].setArgs("BASIS", options.getArgs("SCALAR BASIS")); - cds->options[is].setArgs("PRECONDITIONER", options.getArgs("SCALAR" + sid + " PRECONDITIONER")); - cds->options[is].setArgs("SOLVER TOLERANCE", - options.getArgs("SCALAR" + sid + " SOLVER TOLERANCE")); - cds->options[is].setArgs("LINEAR SOLVER STOPPING CRITERION", options.getArgs("SCALAR" + sid + " LINEAR SOLVER STOPPING CRITERION")); - cds->options[is].setArgs("INITIAL GUESS", options.getArgs("SCALAR" + sid + " INITIAL GUESS")); - cds->options[is].setArgs("RESIDUAL PROJECTION VECTORS", options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS")); - cds->options[is].setArgs("RESIDUAL PROJECTION START", options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION START")); - cds->options[is].setArgs("MAXIMUM ITERATIONS", options.getArgs("SCALAR" + sid + " MAXIMUM ITERATIONS")); - - dfloat largeNumber = 1 << 20; - cds->EToB[is] = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); - int* EToB = cds->EToB[is]; - int cnt = 0; - for (int e = 0; e < mesh->Nelements; e++) { - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "scalar" + sid); - EToB[cnt] = bc; - cnt++; - } - } - cds->o_EToB[is] = device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), EToB); - } - - bool scalarFilteringEnabled = false; - bool avmEnabled = false; - { - for(int is = 0; is < cds->NSfields; is++) { - if(!cds->options[is].compareArgs("REGULARIZATION METHOD", "NONE")) scalarFilteringEnabled = true; - if(cds->options[is].compareArgs("REGULARIZATION METHOD", "HPF_RESIDUAL")) avmEnabled = true; - if(cds->options[is].compareArgs("REGULARIZATION METHOD", "HIGHEST_MODAL_DECAY")) avmEnabled = true; - } - } - - if(scalarFilteringEnabled) - { - const dlong Nmodes = cds->mesh[0]->N + 1; - cds->o_filterMT = platform->device.malloc(cds->NSfields * Nmodes * Nmodes, sizeof(dfloat)); - for(int is = 0; is < cds->NSfields; is++) - { - if(cds->options[is].compareArgs("REGULARIZATION METHOD", "NONE")) continue; - if(!cds->compute[is]) continue; - int filterNc = -1; - cds->options[is].getArgs("HPFRT MODES", filterNc); - dfloat filterS; - cds->options[is].getArgs("HPFRT STRENGTH", filterS); - filterS = -1.0 * fabs(filterS); - cds->filterS[is] = filterS; - - dfloat* A = filterSetup(cds->mesh[is], filterNc); - - const dlong Nmodes = cds->mesh[is]->N + 1; - cds->o_filterMT.copyFrom(A, Nmodes * Nmodes * sizeof(dfloat), is * Nmodes * Nmodes * sizeof(dfloat)); - - free(A); - } - } - - if(avmEnabled) avm::setup(cds); - - std::string kernelName; - const std::string suffix = "Hex3D"; - - MPI_Barrier(platform->comm.mpiComm); - double tStartLoadKernel = MPI_Wtime(); - if(platform->comm.mpiRank == 0) printf("loading cds kernels ... "); fflush(stdout); - - { - kernelName = "strongAdvectionVolume" + suffix; - cds->advectionStrongVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "strongAdvectionCubatureVolume" + suffix; - cds->advectionStrongCubatureVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "advectMeshVelocityHex3D"; - cds->advectMeshVelocityKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "maskCopy"; - cds->maskCopyKernel = - platform->kernels.getKernel( section + kernelName); - - { - kernelName = "sumMakef"; - cds->sumMakefKernel = platform->kernels.getKernel( section + kernelName); - } - - kernelName = "helmholtzBC" + suffix; - cds->helmholtzRhsBCKernel = platform->kernels.getKernel( section + kernelName); - kernelName = "dirichletBC"; - cds->dirichletBCKernel = platform->kernels.getKernel( section + kernelName); - - kernelName = "setEllipticCoeff"; - cds->setEllipticCoeffKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "filterRT" + suffix; - cds->filterRTKernel = - platform->kernels.getKernel( section + kernelName); - - kernelName = "nStagesSum3"; - cds->nStagesSum3Kernel = - platform->kernels.getKernel( section + kernelName); - - if(cds->Nsubsteps) { - kernelName = "subCycleStrongCubatureVolume" + suffix; - cds->subCycleStrongCubatureVolumeKernel = - platform->kernels.getKernel( section + kernelName); - kernelName = "subCycleStrongVolume" + suffix; - cds->subCycleStrongVolumeKernel = - platform->kernels.getKernel( section + kernelName); - - - kernelName = "subCycleERKUpdate"; - cds->subCycleRKUpdateKernel = platform->kernels.getKernel( section + kernelName); - kernelName = "subCycleRK"; - cds->subCycleRKKernel = platform->kernels.getKernel( section + kernelName); - - kernelName = "subCycleInitU0"; - cds->subCycleInitU0Kernel = platform->kernels.getKernel( section + kernelName); - } - } - - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); fflush(stdout); - - return cds; -} -} diff --git a/src/core/setupAide.cpp b/src/core/setupAide.cpp deleted file mode 100644 index c6bfa249f..000000000 --- a/src/core/setupAide.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -//#include "headers2d.hpp" - -#include "setupAide.hpp" - -setupAide::setupAide(){} - -setupAide::setupAide(std::string setupFile) -{ - read(setupFile); -} - -setupAide::setupAide(const setupAide& sa) -{ - *this = sa; -} - -setupAide& setupAide::operator = (const setupAide& sa){ - int size = sa.data.size(); - - data.resize(size); - keyword.resize(size); - - for(int i = 0; i < size; i++) { // TW - data[i] = sa.data[i]; - keyword[i] = sa.keyword[i]; - } - - return *this; -} - -std::string setupAide::readFile(std::string filename) -{ - struct stat statbuf; - - FILE* fh = fopen(filename.c_str(), "r"); - if (fh == 0) { - printf("Failed to open: %s\n", filename.c_str()); - ABORT(EXIT_FAILURE); - } - - stat(filename.c_str(), &statbuf); - char* source = (char*) malloc(statbuf.st_size + 1); - size_t status = fread(source, statbuf.st_size, 1, fh); - source[statbuf.st_size] = '\0'; - - std::string ret = source; - - return ret; -} - -void setupAide::read(std::string setupFile) -{ - std::vector < std::string > data2; - std::vector < std::string > keyword2; - - std::string args = readFile(setupFile); - - int size = args.length(); - std::string current = ""; - std::stringstream ss; - char c; - - for(int i = 0; i < size; i++) { - c = args[i]; - - // Batch std::strings together - if(c == '\'' || c == '"') { - current += c; - i++; - - while(i < size && args[i] != c) - current += args[i++]; - - if(i >= size) - break; - - if( i < (size - 1) ) - current += args[i]; - } - // Batch comments - else if(c == '/' && i < size && args[i + 1] == '*') { - i += 2; - - while( args[i] != '*' || (i < size && args[i + 1] != '/') ) - i++; - - if(i >= size) - break; - - i++; - } - // Removing # comments - else if(c == '#') { - i++; - - while(i < size && args[i] != '\n') - i++; - } - // Change \[\] to [] - else if(c == '\\' && i < size && (args[i + 1] == '[' || args[i + 1] == ']')) { - current += args[i + 1]; - i += 2; - } - // Split keywords [] - else if(c == '[') { - data2.push_back(current); - current = ""; - i++; - - while(i < size && args[i] != ']') - current += args[i++]; - - keyword2.push_back(current); - current = ""; - } - // Else add the character - else - if(!isspace(c)) { // new check to remove whitespace - current += c; - } - - if(i >= (size - 1) && current.length()) - data2.push_back(current); - } - - int argc = (data2.size() - 1); - - data.resize(argc); - keyword.resize(argc); - - for(int i = 0; i < argc; i++) { // TW - data[i] = data2[i + 1]; - keyword[i] = keyword2[i]; - } -} - -std::string setupAide::getArgs(std::string key) -{ - for(int i = 0; i < keyword.size(); i++) // TW - if(!( keyword[i].compare(key) )) - return data[i]; - - //printf("Warning: Failed to find [%s].\n", key.c_str()); - return ""; -} - -void setupAide::setArgs(std::string key, std::string value) -{ - for(int i = 0; i < keyword.size(); i++) // TW - if(!( keyword[i].compare(key) )) { - data[i] = value; - return; - } - - //add the key value pair - keyword.push_back(key); - data.push_back(value); - return; -} - -int setupAide::getArgs(std::string key, std::vector < std::string >& m, std::string delimeter) -{ - std::string args, current; - std::vector < std::string > argv; - int argc, size; - - args = getArgs(key); - - size = args.length(); - - current = ""; - - for(int i = 0; i < size; i++) { // TW - while( i < size && delimeter.find(args[i]) == std::string::npos ) - current += args[i++]; - - if(current.length()) - argv.push_back(current); - - current = ""; - } - - argc = argv.size(); - - if(!argc) - //printf("Warning: Failed to find [%s].\n", key.c_str()); - return 0; - - m.resize(argc); - - for(int i = 0; i < argc; i++) // TW - m[i] = argv[i]; - - return 1; -} - -int setupAide::compareArgs(std::string key, std::string token) -{ - std::string foundToken; - if(getArgs(key,foundToken)) { - if(foundToken == token) - return 1; - if(foundToken.find(token) != std::string::npos) - return 2; - } - - return 0; -} - -std::ostream & operator << (std::ostream &os, const setupAide &aide){ - int maxLength = 0; - for(int i = 0; i < aide.keyword.size(); i++) { - int L = aide.keyword[i].length(); - if(L > maxLength) - maxLength = L; - } - for(int i = 0; i < aide.keyword.size(); i++) { - os << "key: " << aide.keyword[i] << ","; - - for(int j = aide.keyword[i].length(); j < maxLength; ++j) - os << " "; - - os << "value: " << aide.data[i] << std::endl; - } - - return os; - } diff --git a/src/core/timer.cpp b/src/core/timer.cpp index 68023653d..b8ba10a5d 100644 --- a/src/core/timer.cpp +++ b/src/core/timer.cpp @@ -7,13 +7,6 @@ #include "platform.hpp" #include "ogs.hpp" -std::string printPercentage(double num, double dom) -{ - char buf[4096]; - double frac = num/dom; - snprintf(buf, sizeof(buf), "(%.2f)", frac); - return std::string(buf); -} namespace timer { @@ -21,7 +14,7 @@ namespace { struct tagData { - int count; + long long int count; double hostElapsed; double deviceElapsed; double startTime; @@ -50,18 +43,18 @@ void timer_t::init(MPI_Comm comm,occa::device device,int ifSync) comm_ = comm; } -void timer_t::set(const std::string tag, double time) +void timer_t::set(const std::string tag, double time, long long int count) { m_[tag].startTime = time; auto it = m_.find(tag); if(it == m_.end()) { - printf("Error in set: Invalid tag name. %s:%u\n",__FILE__,__LINE__); + printf("Error in set: Invalid tag name %s\n",tag.c_str()); MPI_Abort(comm_,1); } it->second.hostElapsed = time; it->second.deviceElapsed = it->second.hostElapsed; - it->second.count++; + it->second.count = count; } void timer_t::reset() @@ -99,7 +92,7 @@ void timer_t::deviceToc(const std::string tag) std::map::iterator it = m_.find(tag); if(it == m_.end()) { - printf("Error in deviceToc: Invalid tag name. %s:%u\n",__FILE__,__LINE__); + printf("Error in deviceToc: Invalid tag name %s\n",tag.c_str()); MPI_Abort(comm_,1); } @@ -125,7 +118,7 @@ void timer_t::hostToc(const std::string tag) auto it = m_.find(tag); if(it == m_.end()) { - printf("Error in deviceToc: Invalid tag name. %s:%u\n",__FILE__,__LINE__); + printf("Error in deviceToc: Invalid tag name %s\n",tag.c_str()); MPI_Abort(comm_,1); } @@ -154,7 +147,7 @@ void timer_t::toc(const std::string tag) auto it = m_.find(tag); if(it == m_.end()) { - printf("Error in deviceToc: Invalid tag name. %s:%u\n",__FILE__,__LINE__); + printf("Error in deviceToc: Invalid tag name %s\n",tag.c_str()); MPI_Abort(comm_,1); } @@ -177,7 +170,7 @@ double timer_t::deviceElapsed(const std::string tag) return it->second.deviceElapsed; } -int timer_t::count(const std::string tag) +long long int timer_t::count(const std::string tag) { auto it = m_.find(tag); if(it == m_.end()) return NEKRS_TIMER_INVALID_KEY; @@ -235,110 +228,146 @@ double timer_t::query(const std::string tag,const std::string metric) return NEKRS_TIMER_INVALID_METRIC; } +std::string printPercentage(double num, double dom) +{ + char buf[4096]; + double frac = num/dom; + snprintf(buf, sizeof(buf), "%4.2f", frac); + return std::string(buf); +} + +void timer_t::printStatEntry(std::string name, std::string tag, std::string type, double tNorm) +{ + int rank; + MPI_Comm_rank(comm_, &rank); + const long long int nCalls = count(tag); + const double tTag = query(tag, type); + if(tTag > 0) { + if(rank == 0){ + std::cout << name + << tTag << "s" + << " " << printPercentage(tTag, tNorm) + << " " << nCalls << "\n"; + } + } +} + +void timer_t::printStatEntry(std::string name, double time, double tNorm) +{ + int rank; + MPI_Comm_rank(comm_, &rank); + if(time > 0) { + if(rank == 0){ + std::cout << name + << time << "s" + << " " << printPercentage(time, tNorm) << "\n"; + } + } +} + void timer_t::printRunStat(int step) { int rank; MPI_Comm_rank(comm_, &rank); - double dEtime[20]; - dEtime[0] = query("makef", "DEVICE:MAX"); - dEtime[1] = query("velocitySolve", "DEVICE:MAX"); - dEtime[17] = query("velocity proj pre", "DEVICE:MAX"); - dEtime[17]+= query("velocity proj post", "DEVICE:MAX"); - dEtime[2] = query("pressureSolve", "DEVICE:MAX"); - dEtime[3] = query("makeq", "DEVICE:MAX"); - dEtime[4] = query("scalarSolve", "DEVICE:MAX"); - dEtime[18] = query("meshSolve", "DEVICE:MAX"); - dEtime[5] = query("pressure preconditioner", "DEVICE:MAX"); - dEtime[16] = query("pressure preconditioner smoother", "DEVICE:MAX"); - dEtime[6] = query("pressure proj pre", "DEVICE:MAX"); - dEtime[6]+= query("pressure proj post", "DEVICE:MAX"); - - dEtime[8] = query("dotp", "DEVICE:MAX"); - - dEtime[9] = query("solve", "DEVICE:MAX"); - dEtime[10] = query("setup", "DEVICE:MAX"); - dEtime[11] = query("checkpointing", "DEVICE:MAX"); - - dEtime[12] = query("udfExecuteStep", "DEVICE:MAX"); - dEtime[13] = query("udfUEqnSource", "DEVICE:MAX"); - dEtime[14] = query("udfSEqnSource", "DEVICE:MAX"); - dEtime[15] = query("udfProperties", "DEVICE:MAX"); - - double hEtime[10]; - hEtime[0] = query("BoomerAMGSolve", "HOST:MAX"); - const double amgxTime = query("AmgXSolve", "DEVICE:MAX"); - hEtime[0] = hEtime[0] > amgxTime ? hEtime[0] : amgxTime; - const double semfemTime = query("Coarse SEMFEM Solve", "DEVICE:MAX"); - hEtime[0] = hEtime[0] > semfemTime ? hEtime[0] : semfemTime; - hEtime[1] = ogsTime(/* reportHostTime */ true); - MPI_Allreduce(MPI_IN_PLACE, &hEtime[1], 1, MPI_DOUBLE, MPI_MAX, comm_); - - hEtime[2] = query("minSolveStep", "HOST:MAX"); - hEtime[3] = query("maxSolveStep", "HOST:MAX"); - hEtime[4] = query("loadKernels", "HOST:MAX"); - - if (rank == 0) { - std::cout << "step= " << step << " runtime statistics\n\n"; - - std::cout.setf(std::ios::scientific); - int outPrecisionSave = std::cout.precision(); - std::cout.precision(5); - - std::cout << " setup " << dEtime[10] << "s " << printPercentage(dEtime[10],dEtime[9]) << "\n"; - std::cout << " loadKernels " << hEtime[4] << "s " << printPercentage(hEtime[4],dEtime[9]) << "\n"; - - if(dEtime[11] > 0) - std::cout << " checkpointing " << dEtime[11]<< "s " << printPercentage(dEtime[11],dEtime[9]) << "\n"; - - if(dEtime[12] > 0) - std::cout << " udfExecuteStep " << dEtime[12] << "s " << printPercentage(dEtime[12],dEtime[9]) << "\n"; - - if(hEtime[2] > 0 && hEtime[3] > 0) - std::cout << " solve step min/max " << hEtime[2] << "s / " << hEtime[3] << "s (first 10 steps excluded)\n"; - - std::cout << " total solve " << dEtime[9] << "s\n"; - std::cout << " makef " << dEtime[0] << "s " << printPercentage(dEtime[0],dEtime[9]) << "\n"; - if(dEtime[13] > 0) - std::cout << " udfUEqnSource " << dEtime[13] << "s " << printPercentage(dEtime[13],dEtime[9]) << "\n"; - std::cout << " velocitySolve " << dEtime[1] << "s " << printPercentage(dEtime[1],dEtime[9]) << "\n"; - if(dEtime[17] > 0) - std::cout << " projection " << dEtime[17] << "s " << printPercentage(dEtime[17],dEtime[9]) << "\n";; - - std::cout << " pressureSolve " << dEtime[2] << "s " << printPercentage(dEtime[2],dEtime[9]) << "\n"; - - std::cout << " preconditioner " << dEtime[5] << "s " << printPercentage(dEtime[5],dEtime[9]) << "\n"; - if(dEtime[16] > 0) - std::cout << " pMG smoother " << dEtime[16] << "s " << printPercentage(dEtime[16],dEtime[9]) << "\n"; - if(hEtime[0] > 0) - std::cout << " coarse grid " << hEtime[0] << "s " << printPercentage(hEtime[0],dEtime[9]) << "\n"; - if(dEtime[6] > 0) - std::cout << " projection " << dEtime[6] << "s " << printPercentage(dEtime[6],dEtime[9]) << "\n"; - - if(dEtime[4] > 0) - std::cout << " scalarSolve " << dEtime[4] << "s " << printPercentage(dEtime[4],dEtime[9]) << "\n"; - if(dEtime[4] > 0) { - std::cout << " makeq " << dEtime[3] << "s " << printPercentage(dEtime[3],dEtime[9]) << "\n"; - if(dEtime[14] > 0) - std::cout << " udfSEqnSource " << dEtime[14] << "s " << printPercentage(dEtime[14],dEtime[9]) << "\n"; - } + set("velocity proj", + query("velocity proj pre", "DEVICE:MAX") + query("velocity proj post", "DEVICE:MAX"), + count("velocity proj pre")); + + set("pressure proj", + query("pressure proj pre", "DEVICE:MAX") + query("pressure proj post", "DEVICE:MAX"), + count("pressure proj pre")); + + set("scalar proj", + query("scalar proj pre", "DEVICE:MAX") + query("scalar proj post", "DEVICE:MAX"), + count("scalar proj pre")); + + + double gsTime = ogsTime(/* reportHostTime */ true); + MPI_Allreduce(MPI_IN_PLACE, &gsTime, 1, MPI_DOUBLE, MPI_MAX, comm_); + + const double tElapsedTime = query("elapsed", "DEVICE:MAX"); + + if (rank == 0) + std::cout << "\n>>> runtime statistics (step= " << step << " elapsed= " << tElapsedTime << "s" + << "):\n"; + + std::cout.setf(std::ios::scientific); + int outPrecisionSave = std::cout.precision(); + std::cout.precision(5); + + if(rank == 0) std::cout << "name " << "time " << " % " << "calls\n"; + + const double tElapsedTimeSolve = query("elapsedStepSum", "DEVICE:MAX"); + const double tSetup = query("setup", "DEVICE:MAX"); + + printStatEntry(" setup ", "setup", "DEVICE:MAX", tElapsedTime); + printStatEntry(" loadKernels ", "loadKernels", "HOST:MAX", tSetup); - if(dEtime[15] > 0) - std::cout << " udfProperties " << dEtime[15] << "s " << printPercentage(dEtime[15],dEtime[9]) << "\n"; + printStatEntry(" checkpointing ", "checkpointing", "DEVICE:MAX", tElapsedTime); - if(dEtime[18] > 0) - std::cout << " meshSolve " << dEtime[18] << " s\n"; + printStatEntry(" udfExecuteStep ", "udfExecuteStep", "DEVICE:MAX", tElapsedTime); - if(hEtime[1] > 0) - std::cout << " gsMPI " << hEtime[1] << "s " << printPercentage(hEtime[1],dEtime[9]) << "\n"; - if(dEtime[8] > 0) - std::cout << " dotp " << dEtime[8] << "s " << printPercentage(dEtime[8],dEtime[9]) << "\n"; + const double tSolve = query("solve", "DEVICE:MAX"); + const double tMinSolveStep = query("minSolveStep", "HOST:MAX"); + const double tMaxSolveStep = query("maxSolveStep", "HOST:MAX"); + const double flops = platform->flopCounter->get(platform->comm.mpiComm); + bool printFlops = !platform->options.compareArgs("PRESSURE PRECONDITIONER", "SEMFEM"); - std::cout << std::endl; + if(tSolve > 0 && rank == 0) { - std::cout.unsetf(std::ios::scientific); - std::cout.precision(outPrecisionSave); + printStatEntry(" elapsedStepSum ", tElapsedTimeSolve, tElapsedTime); + printStatEntry(" solve ", tSolve, tElapsedTime); + std::cout << " min " << tMinSolveStep << "s\n"; + std::cout << " max " << tMaxSolveStep << "s\n"; + if (flops > 0 && printFlops) + std::cout << " flop/s " << flops/tSolve << "\n\n"; } + + printStatEntry(" meshUpdate ", "meshUpdate", "DEVICE:MAX", tSolve); + + const double tMakef = query("makef", "DEVICE:MAX"); + printStatEntry(" makef ", "makef", "DEVICE:MAX", tSolve); + printStatEntry(" udfUEqnSource ", "udfUEqnSource", "DEVICE:MAX", tMakef); + + const double tMakeq = query("makeq", "DEVICE:MAX"); + printStatEntry(" makeq ", "makeq", "DEVICE:MAX", tSolve); + printStatEntry(" udfSEqnSource ", "udfSEqnSource", "DEVICE:MAX", tMakeq); + + printStatEntry(" udfProperties ", "udfProperties", "DEVICE:MAX", tSolve); + + const double tVelocity = query("velocitySolve", "DEVICE:MAX"); + printStatEntry(" velocitySolve ", "velocitySolve", "DEVICE:MAX", tSolve); + printStatEntry(" rhs ", "velocity rhs", "DEVICE:MAX", tVelocity); + printStatEntry(" initial guess ", "velocity proj", "DEVICE:MAX", tVelocity); + + const double tPressure = query("pressureSolve", "DEVICE:MAX"); + printStatEntry(" pressureSolve ", "pressureSolve", "DEVICE:MAX", tSolve); + printStatEntry(" rhs ", "pressure rhs", "DEVICE:MAX", tPressure); + + const double tPressurePreco = query("pressure preconditioner", "DEVICE:MAX"); + printStatEntry(" preconditioner ", "pressure preconditioner", "DEVICE:MAX", tPressure); + printStatEntry(" pMG smoother ", "pressure preconditioner smoother", "DEVICE:MAX", tPressurePreco); + printStatEntry(" coarse grid ", "coarseSolve", "DEVICE:MAX", tPressurePreco); + printStatEntry(" initial guess ", "pressure proj", "DEVICE:MAX", tPressure); + + const double tScalar = query("scalarSolve", "DEVICE:MAX"); + printStatEntry(" scalarSolve ", "scalarSolve", "DEVICE:MAX", tSolve); + printStatEntry(" rhs ", "scalar rhs", "DEVICE:MAX", tScalar); + printStatEntry(" initial guess ", "scalar proj", "DEVICE:MAX", tScalar); + + const double tMesh = query("meshSolve", "DEVICE:MAX"); + printStatEntry(" meshSolve ", "meshSolve", "DEVICE:MAX", tSolve); + printStatEntry(" initial guess ", "mesh proj", "DEVICE:MAX", tMesh); + + printStatEntry(" gsMPI ", gsTime, tSolve); + + printStatEntry(" dotp ", "dotp", "DEVICE:MAX", tSolve); + + if(rank == 0) std::cout << std::endl; + + std::cout.unsetf(std::ios::scientific); + std::cout.precision(outPrecisionSave); } } // namespace diff --git a/src/core/timer.hpp b/src/core/timer.hpp index b9e3d4fc9..a4a6ccf37 100644 --- a/src/core/timer.hpp +++ b/src/core/timer.hpp @@ -24,13 +24,15 @@ void deviceTic(const std::string tag); void deviceTic(const std::string tag,int ifSync); void deviceToc(const std::string tag); -void set(const std::string tag, double time); +void set(const std::string tag, double time, long long int count = 1); double hostElapsed(const std::string tag); double deviceElapsed(const std::string tag); -int count(const std::string tag); +long long int count(const std::string tag); double query(const std::string tag,std::string metric); void printRunStat(int step); +void printStatEntry(std::string name, std::string tag, std::string type, double tNorm); +void printStatEntry(std::string name, double time, double tNorm); }; } diff --git a/src/elliptic/amgSolver/amgx/amgx.c b/src/elliptic/amgSolver/amgx/amgx.c index 6fcf5555e..13e73817d 100644 --- a/src/elliptic/amgSolver/amgx/amgx.c +++ b/src/elliptic/amgSolver/amgx/amgx.c @@ -173,6 +173,10 @@ void AMGXfree() free(handle); handle = NULL; } +int AMGXenabled() +{ + return 1; +} #else int AMGXsetup(const int nLocalRows, const int nnz, @@ -197,4 +201,9 @@ int AMGXsolve(void *x, void *rhs) void AMGXfree() { } + +int AMGXenabled() +{ + return 0; +} #endif diff --git a/src/elliptic/amgSolver/amgx/amgx.h b/src/elliptic/amgSolver/amgx/amgx.h index b388fac80..aa3f735e5 100644 --- a/src/elliptic/amgSolver/amgx/amgx.h +++ b/src/elliptic/amgSolver/amgx/amgx.h @@ -13,6 +13,7 @@ int AMGXsetup(const int nLocalRows, const int nnz, int useFP32, int MPIDIRECT, const char* cfgFile); int AMGXsolve(void *x, void *rhs); void AMGXfree(); +int AMGXenabled(); #ifdef __cplusplus } diff --git a/src/elliptic/amgSolver/hypre/__HYPRE.h b/src/elliptic/amgSolver/hypre/__HYPRE.h new file mode 100644 index 000000000..198b0c8ea --- /dev/null +++ b/src/elliptic/amgSolver/hypre/__HYPRE.h @@ -0,0 +1,375 @@ +#ifndef HYPRE_WRAPPER_H +#define HYPRE_WRAPPER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "HYPRE.h" +#include "HYPRE_parcsr_ls.h" + +#define DECLARE(a,b) typedef HYPRE_Int (*t_##a) b; static t_##a __##a + +// +// definitions have match HYPRE_parcsr_ls.h +// +DECLARE( +HYPRE_BoomerAMGSolve, +( + HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x +) +); + +DECLARE( +HYPRE_BoomerAMGSetup, +( + HYPRE_Solver solver, + HYPRE_ParCSRMatrix A, + HYPRE_ParVector b, + HYPRE_ParVector x +) +); + +DECLARE( +HYPRE_BoomerAMGCreate, +( + HYPRE_Solver *solver +) +); + +DECLARE( +HYPRE_BoomerAMGSetCoarsenType, +( + HYPRE_Solver solver, + HYPRE_Int coarsen_type +) +); + +DECLARE( +HYPRE_BoomerAMGSetCycleRelaxType, +( + HYPRE_Solver solver, + HYPRE_Int relax_type, + HYPRE_Int k +) +); + +DECLARE( +HYPRE_BoomerAMGSetCycleNumSweeps, +( + HYPRE_Solver solver, + HYPRE_Int num_sweeps, + HYPRE_Int k +) +); + +DECLARE( +HYPRE_BoomerAMGSetMinCoarseSize, +( + HYPRE_Solver solver, + HYPRE_Int min_coarse_size +) +); + +DECLARE( +HYPRE_BoomerAMGSetStrongThreshold, +( + HYPRE_Solver solver, + HYPRE_Real strong_threshold +) +); + +DECLARE( +HYPRE_BoomerAMGSetNonGalerkinTol, +( + HYPRE_Solver solver, + HYPRE_Real nongalerkin_tol +) +); + +DECLARE( +HYPRE_BoomerAMGSetLevelNonGalerkinTol, +( + HYPRE_Solver solver, + HYPRE_Real nongalerkin_tol, + HYPRE_Int level +) +); + +DECLARE( +HYPRE_BoomerAMGSetAggNumLevels, +( + HYPRE_Solver solver, + HYPRE_Int agg_num_levels +) +); + +DECLARE( +HYPRE_BoomerAMGSetMaxIter, +( + HYPRE_Solver solver, + HYPRE_Int max_iter +) +); + +DECLARE( +HYPRE_BoomerAMGSetTol, +( + HYPRE_Solver solver, + HYPRE_Real tol +) +); + +DECLARE( +HYPRE_BoomerAMGSetPrintLevel, +( + HYPRE_Solver solver, + HYPRE_Int print_level +) +); + +DECLARE( +HYPRE_BoomerAMGSetInterpType, +( + HYPRE_Solver solver, + HYPRE_Int interp_type +) +); + +DECLARE( +HYPRE_BoomerAMGDestroy, +( + HYPRE_Solver solver +) +); + +// +// definitions have match HYPRE_IJ_mv.h +// +DECLARE( +HYPRE_IJVectorCreate, +( + MPI_Comm comm, + HYPRE_BigInt jlower, + HYPRE_BigInt jupper, + HYPRE_IJVector *vector +) +); + +DECLARE( +HYPRE_IJVectorSetObjectType, +( + HYPRE_IJVector vector, + HYPRE_Int type +) +); + +DECLARE( +HYPRE_IJVectorInitialize, +( + HYPRE_IJVector vector +) +); + +DECLARE( +HYPRE_IJVectorAssemble, +( + HYPRE_IJVector vector +) +); + +DECLARE( +HYPRE_IJVectorSetValues, +( + HYPRE_IJVector vector, + HYPRE_Int nvalues, + const HYPRE_BigInt *indices, + const HYPRE_Complex *values +) +); + +DECLARE( +HYPRE_IJVectorGetValues, +( + HYPRE_IJVector vector, + HYPRE_Int nvalues, + const HYPRE_BigInt *indices, + HYPRE_Complex *values +) +); + +DECLARE( +HYPRE_IJVectorGetObject, +( + HYPRE_IJVector vector, + void **object +) +); + +DECLARE( +HYPRE_IJVectorDestroy, +( + HYPRE_IJVector vector +) +); + +DECLARE( +HYPRE_IJMatrixGetObject, +( + HYPRE_IJMatrix matrix, + void **object +) +); + +DECLARE( +HYPRE_IJMatrixDestroy, +( + HYPRE_IJMatrix matrix +) +); + +DECLARE( +HYPRE_IJMatrixSetValues, +( + HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + const HYPRE_BigInt *rows, + const HYPRE_BigInt *cols, + const HYPRE_Complex *values +) +); + +DECLARE( +HYPRE_IJMatrixAssemble, +( + HYPRE_IJMatrix matrix +) +); + +DECLARE( +HYPRE_IJMatrixCreate, +( + MPI_Comm comm, + HYPRE_BigInt ilower, + HYPRE_BigInt iupper, + HYPRE_BigInt jlower, + HYPRE_BigInt jupper, + HYPRE_IJMatrix *matrix +) +); + +DECLARE( +HYPRE_IJMatrixSetObjectType, +( + HYPRE_IJMatrix matrix, + HYPRE_Int type +) +); + +DECLARE( +HYPRE_IJMatrixInitialize, +( + HYPRE_IJMatrix matrix +) +); + +DECLARE( +HYPRE_IJMatrixGetRowCounts, +( + HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_BigInt *rows, + HYPRE_Int *ncols +) +); + +DECLARE( +HYPRE_IJMatrixGetValues, +( + HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + HYPRE_BigInt *rows, + HYPRE_BigInt *cols, + HYPRE_Complex *values +) +); + +DECLARE( +HYPRE_IJMatrixAddToValues, +( + HYPRE_IJMatrix matrix, + HYPRE_Int nrows, + HYPRE_Int *ncols, + const HYPRE_BigInt *rows, + const HYPRE_BigInt *cols, + const HYPRE_Complex *values +) +); + +#undef DECLARE + +static void check_error(const char* error) +{ + if(error != NULL) { + fprintf(stderr, "Error: %s!\n", error); + MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); + } +} + +static void __HYPRE_Load(const char* lib_path) +{ + //TODO: bcast + load from node-local storage + void* lib_handle = dlopen(lib_path, RTLD_LAZY | RTLD_LOCAL); + if(!lib_handle) check_error(dlerror()); + +#define LIB_LOAD(a) __##a = (t_##a) dlsym(lib_handle, #a); check_error(dlerror());; + LIB_LOAD(HYPRE_BoomerAMGSolve); + LIB_LOAD(HYPRE_BoomerAMGSetup); + LIB_LOAD(HYPRE_BoomerAMGCreate); + LIB_LOAD(HYPRE_BoomerAMGSetCoarsenType); + LIB_LOAD(HYPRE_BoomerAMGSetCycleRelaxType); + LIB_LOAD(HYPRE_BoomerAMGSetCycleNumSweeps); + LIB_LOAD(HYPRE_BoomerAMGSetMinCoarseSize); + LIB_LOAD(HYPRE_BoomerAMGSetStrongThreshold); + LIB_LOAD(HYPRE_BoomerAMGSetNonGalerkinTol); + LIB_LOAD(HYPRE_BoomerAMGSetLevelNonGalerkinTol); + LIB_LOAD(HYPRE_BoomerAMGSetAggNumLevels); + LIB_LOAD(HYPRE_BoomerAMGSetMaxIter); + LIB_LOAD(HYPRE_BoomerAMGSetTol); + LIB_LOAD(HYPRE_BoomerAMGSetPrintLevel); + LIB_LOAD(HYPRE_BoomerAMGSetInterpType); + LIB_LOAD(HYPRE_BoomerAMGDestroy); + LIB_LOAD(HYPRE_IJVectorCreate); + LIB_LOAD(HYPRE_IJVectorSetObjectType); + LIB_LOAD(HYPRE_IJVectorInitialize); + LIB_LOAD(HYPRE_IJVectorAssemble); + LIB_LOAD(HYPRE_IJVectorSetValues); + LIB_LOAD(HYPRE_IJVectorGetValues); + LIB_LOAD(HYPRE_IJVectorGetObject); + LIB_LOAD(HYPRE_IJVectorDestroy); + LIB_LOAD(HYPRE_IJMatrixGetObject); + LIB_LOAD(HYPRE_IJMatrixDestroy); + LIB_LOAD(HYPRE_IJMatrixSetValues); + LIB_LOAD(HYPRE_IJMatrixAssemble); + LIB_LOAD(HYPRE_IJMatrixCreate); + LIB_LOAD(HYPRE_IJMatrixSetObjectType); + LIB_LOAD(HYPRE_IJMatrixInitialize); + LIB_LOAD(HYPRE_IJMatrixGetRowCounts); + LIB_LOAD(HYPRE_IJMatrixGetValues); + LIB_LOAD(HYPRE_IJMatrixAddToValues); + +#undef LIB_LOAD +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/elliptic/amgSolver/hypre/boomerAMG.c b/src/elliptic/amgSolver/hypre/boomerAMG.c index 4f69a7674..ea1838ac4 100644 --- a/src/elliptic/amgSolver/hypre/boomerAMG.c +++ b/src/elliptic/amgSolver/hypre/boomerAMG.c @@ -7,14 +7,12 @@ #include "boomerAMG.h" +static int _Nthreads = 1; static double boomerAMGParam[BOOMERAMG_NPARAM]; #ifdef HYPRE -#include "_hypre_utilities.h" -#include "HYPRE_parcsr_ls.h" -#include "_hypre_parcsr_ls.h" -#include "HYPRE.h" +#include "__HYPRE.h" typedef struct hypre_data { MPI_Comm comm; @@ -31,12 +29,24 @@ typedef struct hypre_data { } hypre_data; static hypre_data *data; - int boomerAMGSetup(int nrows, int nz, const long long int *Ai, const long long int *Aj, const double *Av, const int null_space, const MPI_Comm ce, int Nthreads, int deviceID, - const int useFP32, const double *param) + const int useFP32, const double *param, const int verbose) { + + const char* install_dir = getenv("NEKRS_HOME"); +#define MAX_PATH 4096 + char lib_path[MAX_PATH]; +#ifdef __APPLE__ + snprintf(lib_path, MAX_PATH, "%s/lib/libHYPRE.dylib", install_dir); +#else + snprintf(lib_path, MAX_PATH, "%s/lib/libHYPRE.so", install_dir); +#endif +#undef MAX_PATH + + __HYPRE_Load(lib_path); + data = (hypre_data*) malloc(sizeof(struct hypre_data)); data->Nthreads = Nthreads; @@ -61,10 +71,10 @@ int boomerAMGSetup(int nrows, data->ilower = ilower; HYPRE_BigInt iupper = ilower + (HYPRE_BigInt) nrows - 1; - HYPRE_IJMatrixCreate(comm,ilower,iupper,ilower,iupper,&data->A); + __HYPRE_IJMatrixCreate(comm,ilower,iupper,ilower,iupper,&data->A); HYPRE_IJMatrix A_ij = data->A; - HYPRE_IJMatrixSetObjectType(A_ij,HYPRE_PARCSR); - HYPRE_IJMatrixInitialize(A_ij); + __HYPRE_IJMatrixSetObjectType(A_ij,HYPRE_PARCSR); + __HYPRE_IJMatrixInitialize(A_ij); int i; for(i=0; isolver); + __HYPRE_BoomerAMGCreate(&data->solver); HYPRE_Solver solver = data->solver; int uparam = (int) param[0]; @@ -93,7 +103,7 @@ int boomerAMGSetup(int nrows, boomerAMGParam[0] = 10; /* coarsening */ boomerAMGParam[1] = 6; /* interpolation */ boomerAMGParam[2] = 1; /* number of cycles */ - boomerAMGParam[3] = 6; /* smoother for crs level */ + boomerAMGParam[3] = 16; /* smoother for crs level */ boomerAMGParam[4] = 3; /* sweeps */ boomerAMGParam[5] = -1; /* smoother */ boomerAMGParam[6] = 1; /* sweeps */ @@ -101,72 +111,74 @@ int boomerAMGSetup(int nrows, boomerAMGParam[8] = 0.0; /* non galerkin tolerance */ } - HYPRE_BoomerAMGSetCoarsenType(solver,boomerAMGParam[0]); - HYPRE_BoomerAMGSetInterpType(solver,boomerAMGParam[1]); + __HYPRE_BoomerAMGSetCoarsenType(solver,boomerAMGParam[0]); + __HYPRE_BoomerAMGSetInterpType(solver,boomerAMGParam[1]); - //HYPRE_BoomerAMGSetKeepTranspose(solver, 1); - //HYPRE_BoomerAMGSetChebyFraction(solver, 0.2); + //__HYPRE_BoomerAMGSetKeepTranspose(solver, 1); + //__HYPRE_BoomerAMGSetChebyFraction(solver, 0.2); if (boomerAMGParam[5] > 0) { - HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[5], 1); - HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[5], 2); + __HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[5], 1); + __HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[5], 2); } - HYPRE_BoomerAMGSetCycleRelaxType(solver, 9, 3); + __HYPRE_BoomerAMGSetCycleRelaxType(solver, 9, 3); - HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[6], 1); - HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[6], 2); - HYPRE_BoomerAMGSetCycleNumSweeps(solver, 1, 3); + __HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[6], 1); + __HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[6], 2); + __HYPRE_BoomerAMGSetCycleNumSweeps(solver, 1, 3); if (null_space) { - HYPRE_BoomerAMGSetMinCoarseSize(solver, 2); - HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[3], 3); - HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[4], 3); + __HYPRE_BoomerAMGSetMinCoarseSize(solver, 2); + __HYPRE_BoomerAMGSetCycleRelaxType(solver, boomerAMGParam[3], 3); + __HYPRE_BoomerAMGSetCycleNumSweeps(solver, boomerAMGParam[4], 3); } - HYPRE_BoomerAMGSetStrongThreshold(solver,boomerAMGParam[7]); + __HYPRE_BoomerAMGSetStrongThreshold(solver,boomerAMGParam[7]); if (boomerAMGParam[8] > 1e-3) { - HYPRE_BoomerAMGSetNonGalerkinTol(solver,boomerAMGParam[8]); - HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.0 , 0); - HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.01, 1); - HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.05, 2); + __HYPRE_BoomerAMGSetNonGalerkinTol(solver,boomerAMGParam[8]); + __HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.0 , 0); + __HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.01, 1); + __HYPRE_BoomerAMGSetLevelNonGalerkinTol(solver,0.05, 2); } - HYPRE_BoomerAMGSetAggNumLevels(solver, boomerAMGParam[9]); + __HYPRE_BoomerAMGSetAggNumLevels(solver, boomerAMGParam[9]); - HYPRE_BoomerAMGSetMaxIter(solver,boomerAMGParam[2]); // number of V-cycles - HYPRE_BoomerAMGSetTol(solver,0); + __HYPRE_BoomerAMGSetMaxIter(solver,boomerAMGParam[2]); // number of V-cycles + __HYPRE_BoomerAMGSetTol(solver,0); - HYPRE_BoomerAMGSetPrintLevel(solver,1); + if(verbose) + __HYPRE_BoomerAMGSetPrintLevel(solver,3); + else + __HYPRE_BoomerAMGSetPrintLevel(solver,1); // Create and initialize rhs and solution vectors - HYPRE_IJVectorCreate(comm,ilower,iupper,&data->b); + __HYPRE_IJVectorCreate(comm,ilower,iupper,&data->b); HYPRE_IJVector b = data->b; - HYPRE_IJVectorSetObjectType(b,HYPRE_PARCSR); - HYPRE_IJVectorInitialize(b); - HYPRE_IJVectorAssemble(b); + __HYPRE_IJVectorSetObjectType(b,HYPRE_PARCSR); + __HYPRE_IJVectorInitialize(b); + __HYPRE_IJVectorAssemble(b); - HYPRE_IJVectorCreate(comm,ilower,iupper,&data->x); + __HYPRE_IJVectorCreate(comm,ilower,iupper,&data->x); HYPRE_IJVector x = data->x; - HYPRE_IJVectorSetObjectType(x,HYPRE_PARCSR); - HYPRE_IJVectorInitialize(x); - HYPRE_IJVectorAssemble(x); + __HYPRE_IJVectorSetObjectType(x,HYPRE_PARCSR); + __HYPRE_IJVectorInitialize(x); + __HYPRE_IJVectorAssemble(x); // Perform AMG setup HYPRE_ParVector par_b; HYPRE_ParVector par_x; - HYPRE_IJVectorGetObject(b,(void**) &par_b); - HYPRE_IJVectorGetObject(x,(void**) &par_x); + __HYPRE_IJVectorGetObject(b,(void**) &par_b); + __HYPRE_IJVectorGetObject(x,(void**) &par_x); HYPRE_ParCSRMatrix par_A; - HYPRE_IJMatrixGetObject(data->A,(void**) &par_A); + __HYPRE_IJMatrixGetObject(data->A,(void**) &par_A); - int _Nthreads = 1; #pragma omp parallel { int tid = omp_get_thread_num(); if(tid==0) _Nthreads = omp_get_num_threads(); } omp_set_num_threads(data->Nthreads); - HYPRE_BoomerAMGSetup(solver,par_A,par_b,par_x); + __HYPRE_BoomerAMGSetup(solver,par_A,par_b,par_x); omp_set_num_threads(_Nthreads); data->ii = (HYPRE_BigInt*) malloc(data->nRows*sizeof(HYPRE_BigInt)); @@ -180,33 +192,26 @@ int boomerAMGSetup(int nrows, int boomerAMGSolve(void *x, void *b) { - int i; int err; - const HYPRE_Real *xx = (HYPRE_Real*) x; + HYPRE_Real *xx = (HYPRE_Real*) x; const HYPRE_Real *bb = (HYPRE_Real*) b; HYPRE_ParVector par_x; HYPRE_ParVector par_b; HYPRE_ParCSRMatrix par_A; - HYPRE_IJVectorSetValues(data->b,data->nRows,data->ii,bb); - HYPRE_IJVectorAssemble(data->b); - HYPRE_IJVectorGetObject(data->b,(void**) &par_b); + __HYPRE_IJVectorSetValues(data->b,data->nRows,data->ii,bb); + __HYPRE_IJVectorAssemble(data->b); + __HYPRE_IJVectorGetObject(data->b,(void**) &par_b); - HYPRE_IJVectorAssemble(data->x); - HYPRE_IJVectorGetObject(data->x,(void **) &par_x); + __HYPRE_IJVectorAssemble(data->x); + __HYPRE_IJVectorGetObject(data->x,(void **) &par_x); - HYPRE_IJMatrixGetObject(data->A,(void**) &par_A); + __HYPRE_IJMatrixGetObject(data->A,(void**) &par_A); - int _Nthreads = 1; - #pragma omp parallel - { - int tid = omp_get_thread_num(); - if(tid==0) _Nthreads = omp_get_num_threads(); - } omp_set_num_threads(data->Nthreads); - err = HYPRE_BoomerAMGSolve(data->solver,par_A,par_b,par_x); + err = __HYPRE_BoomerAMGSolve(data->solver,par_A,par_b,par_x); if(err > 0) { int rank; MPI_Comm_rank(data->comm,&rank); @@ -215,17 +220,17 @@ int boomerAMGSolve(void *x, void *b) } omp_set_num_threads(_Nthreads); - HYPRE_IJVectorGetValues(data->x,data->nRows,data->ii,(HYPRE_Real*)xx); + __HYPRE_IJVectorGetValues(data->x,data->nRows,data->ii,xx); return 0; } void boomerAMGFree() { - HYPRE_BoomerAMGDestroy(data->solver); - HYPRE_IJMatrixDestroy(data->A); - HYPRE_IJVectorDestroy(data->x); - HYPRE_IJVectorDestroy(data->b); + __HYPRE_BoomerAMGDestroy(data->solver); + __HYPRE_IJMatrixDestroy(data->A); + __HYPRE_IJVectorDestroy(data->x); + __HYPRE_IJVectorDestroy(data->b); free(data); } @@ -239,7 +244,7 @@ void hypre_blas_lsame() { int boomerAMGSetup(int nrows, int nz, const long long int *Ai, const long long int *Aj, const double *Av, const int null_space, const MPI_Comm ce, int Nthreads, int deviceID - const double *param) + const double *param, const int verbose) { int rank; MPI_Comm_rank(ce,&rank); diff --git a/src/elliptic/amgSolver/hypre/boomerAMG.h b/src/elliptic/amgSolver/hypre/boomerAMG.h index e92e6a588..c8b65f54d 100644 --- a/src/elliptic/amgSolver/hypre/boomerAMG.h +++ b/src/elliptic/amgSolver/hypre/boomerAMG.h @@ -10,7 +10,7 @@ extern "C" { int boomerAMGSetup(int nrows, int nz, const long long int *Ai, const long long int *Aj, const double *Av, const int null_space, const MPI_Comm ce, int Nthreads, int deviceID, - const int useFP32, const double *param); + const int useFP32, const double *param, const int verbose); int boomerAMGSolve(void *x, void *b); diff --git a/src/elliptic/amgSolver/parAlmond/coarse.hpp b/src/elliptic/amgSolver/parAlmond/coarse.hpp index 0163b5ab1..d3c84db11 100644 --- a/src/elliptic/amgSolver/parAlmond/coarse.hpp +++ b/src/elliptic/amgSolver/parAlmond/coarse.hpp @@ -42,6 +42,8 @@ class coarseSolver { int N; dfloat *invCoarseA=NULL; + occa::memory h_xLocal; + occa::memory h_rhsLocal; dfloat *xLocal=NULL; dfloat *rhsLocal=NULL; diff --git a/src/elliptic/amgSolver/parAlmond/coarseSolver.cpp b/src/elliptic/amgSolver/parAlmond/coarseSolver.cpp index 743482328..045f2d9f6 100644 --- a/src/elliptic/amgSolver/parAlmond/coarseSolver.cpp +++ b/src/elliptic/amgSolver/parAlmond/coarseSolver.cpp @@ -71,21 +71,20 @@ void coarseSolver::setup( MPI_Comm_rank(comm,&rank); MPI_Comm_size(comm,&size); + const int verbose = (options.compareArgs("VERBOSE","TRUE")) ? 1: 0; + if(options.compareArgs("PARALMOND SMOOTH COARSEST", "TRUE")) return; // bail early as this will not get used - if ((rank==0)&&(options.compareArgs("VERBOSE","TRUE"))) - printf("Setting up coarse solver...");fflush(stdout); - { std::string kernelName = "convertFP64ToFP32"; - convertFP64ToFP32Kernel = platform->kernels.getKernel(kernelName); + convertFP64ToFP32Kernel = platform->kernels.get(kernelName); kernelName = "convertFP32ToFP64"; - convertFP32ToFP64Kernel = platform->kernels.getKernel(kernelName); + convertFP32ToFP64Kernel = platform->kernels.get(kernelName); kernelName = "vectorDotStar2"; - vectorDotStarKernel2 = platform->kernels.getKernel(kernelName); + vectorDotStarKernel2 = platform->kernels.get(kernelName); } @@ -136,11 +135,14 @@ void coarseSolver::setup( Nthreads, -1, /* device ID, if negative run on host */ 0, /* useFP32 */ - settings); + settings, + verbose); N = (int) Nrows; - xLocal = (dfloat*) calloc(N,sizeof(dfloat)); - rhsLocal = (dfloat*) calloc(N,sizeof(dfloat)); + h_xLocal = platform->device.mallocHost(N*sizeof(dfloat)); + h_rhsLocal = platform->device.mallocHost(N*sizeof(dfloat)); + xLocal = (dfloat*) h_xLocal.ptr(); + rhsLocal = (dfloat*) h_rhsLocal.ptr(); } else if (options.compareArgs("AMG SOLVER", "AMGX")){ const int useFP32 = options.compareArgs("AMG SOLVER PRECISION", "FP32"); @@ -404,13 +406,9 @@ void coarseSolver::scatter(occa::memory o_rhs, occa::memory o_x) } } void coarseSolver::BoomerAMGSolve() { - platform->timer.hostTic("BoomerAMGSolve", 1); boomerAMGSolve(xLocal, rhsLocal); - platform->timer.hostToc("BoomerAMGSolve"); } void coarseSolver::AmgXSolve(occa::memory o_rhs, occa::memory o_x) { - platform->timer.tic("AmgXSolve", 1); - const int useFP32 = options.compareArgs("SEMFEM SOLVER PRECISION", "FP32"); if(useFP32){ convertFP64ToFP32Kernel(N, o_rhs, o_rhsBuffer); @@ -419,60 +417,59 @@ void coarseSolver::AmgXSolve(occa::memory o_rhs, occa::memory o_x) { } else { AMGXsolve(o_x.ptr(), o_rhs.ptr()); } - - platform->timer.toc("AmgXSolve"); } void coarseSolver::solve(occa::memory o_rhs, occa::memory o_x) { + platform->timer.tic("coarseSolve", 1); if(useSEMFEM){ - platform->timer.tic("Coarse SEMFEM Solve", 1); semfemSolver(o_rhs, o_x); - platform->timer.toc("Coarse SEMFEM Solve"); - return; - } - - const bool useDevice = options.compareArgs("AMG SOLVER", "AMGX"); - if (gatherLevel) { - //weight - vectorDotStar(ogs->N, 1.0, ogs->o_invDegree, o_rhs, 0.0, o_Sx); - ogsGather(o_Gx, o_Sx, ogsDfloat, ogsAdd, ogs); - if(N && !useDevice) - o_Gx.copyTo(rhsLocal, N*sizeof(dfloat), 0); - } else { - if(N && !useDevice) - o_rhs.copyTo(rhsLocal, N*sizeof(dfloat), 0); } + else { + const bool useDevice = options.compareArgs("AMG SOLVER", "AMGX"); + if (gatherLevel) { + //weight + vectorDotStar(ogs->N, 1.0, ogs->o_invDegree, o_rhs, 0.0, o_Sx); + ogsGather(o_Gx, o_Sx, ogsDfloat, ogsAdd, ogs); + if(N && !useDevice) + o_Gx.copyTo(rhsLocal, N*sizeof(dfloat), 0); + } else { + if(N && !useDevice) + o_rhs.copyTo(rhsLocal, N*sizeof(dfloat), 0); + } - if (options.compareArgs("AMG SOLVER", "BOOMERAMG")){ - BoomerAMGSolve(); - } else if (options.compareArgs("AMG SOLVER", "AMGX")){ - occa::memory o_b = gatherLevel ? o_Gx : o_rhs; - AmgXSolve(o_b, o_x); - } else { - //gather the full vector - MPI_Allgatherv(rhsLocal, N, MPI_DFLOAT, - rhsCoarse, coarseCounts, coarseOffsets, MPI_DFLOAT, comm); - - //multiply by local part of the exact matrix inverse - // #pragma omp parallel for - for (int n=0;ntimer.toc("coarseSolve"); + } } //namespace parAlmond diff --git a/src/elliptic/amgSolver/parAlmond/parAlmond.cpp b/src/elliptic/amgSolver/parAlmond/parAlmond.cpp index 01b83651c..8c8783166 100644 --- a/src/elliptic/amgSolver/parAlmond/parAlmond.cpp +++ b/src/elliptic/amgSolver/parAlmond/parAlmond.cpp @@ -54,7 +54,7 @@ void AMGSetup(solver_t *MM, MPI_Barrier(M->comm); double startTime = MPI_Wtime(); - if(rank==0) printf("Setting up AMG...");fflush(stdout); + if(rank==0) printf("Setting up coarse solver ...");fflush(stdout); M->coarseLevel = new coarseSolver(M->options, M->comm); M->coarseLevel->setup(numLocalRows, globalRowStarts, nnz, Ai, Aj, Avals, nullSpace); diff --git a/src/elliptic/elliptic.h b/src/elliptic/elliptic.h index 7bd0e1a6b..a80b75b71 100644 --- a/src/elliptic/elliptic.h +++ b/src/elliptic/elliptic.h @@ -39,21 +39,30 @@ #include "platform.hpp" #include "timer.hpp" +#include +#include "ellipticApplyMask.hpp" #define ELLIPTIC_ENABLE_TIMER -class ResidualProjection; +#define NO_OP 0 +#define DIRICHLET 1 +#define NEUMANN 2 +#define ZERO_NORMAL 3 +#define ZERO_TANGENTIAL 4 + +class SolutionProjection; class elliptic_t; struct GmresData{ GmresData(elliptic_t*); - int restart; + int nRestartVectors; int flexible; deviceVector_t o_V; deviceVector_t o_Z; occa::memory o_y; occa::memory o_scratch; occa::memory h_scratch; + occa::memory h_y; dfloat* y; dfloat* H; dfloat* sn; @@ -64,11 +73,15 @@ struct GmresData{ struct elliptic_t { + static constexpr double targetBenchmark {0.1}; static constexpr int NScratchFields {4}; + static constexpr int minNFDMOverlap{4}; int dim; int elementType; // number of edges (3=tri, 4=quad, 6=tet, 12=hex) - int var_coeff; // flag for variable coefficient + int coeffField; // flag for variable coefficient (solver) + int coeffFieldPreco; // flag for variable coefficient (preconditioner) int blockSolver, Nfields, stressForm; // flag for vector solver and number of fields + int poisson; std::string name; @@ -91,13 +104,7 @@ struct elliptic_t dfloat tau; - int* BCType; - int NBCType; - - int* allBlockNeumann; bool allNeumann; - dfloat allNeumannPenalty; - dfloat allNeumannScale; // HOST shadow copies dfloat* invDegree; @@ -106,19 +113,16 @@ struct elliptic_t occa::memory o_wrk; - //C0-FEM mask data - int* mapB; // boundary flag of face nodes + // C0-FEM mask data dlong Nmasked; - dlong* fNmasked; - - dlong* maskIds; - hlong* maskedGlobalIds; + dlong NmaskedLocal; + dlong NmaskedGlobal; occa::memory o_maskIds; - occa::memory o_mapB; + occa::memory o_maskIdsGlobal; + occa::memory o_maskIdsLocal; - occa::stream defaultStream; - occa::stream dataStream; + occa::memory o_EToB; occa::memory o_x; occa::memory o_x0; @@ -128,21 +132,13 @@ struct elliptic_t occa::memory o_res; occa::memory o_Ap; // A*search direction occa::memory o_invDegree; - occa::memory o_EToB; + occa::memory o_interp; // interpolate (r,s,t)F -> (r,s,t)C for variable properties occa::memory o_EXYZ; // element vertices for reconstructing geofacs (trilinear hexes only) - occa::memory o_gllzw; // GLL nodes and weights occa::kernel AxKernel; - occa::kernel AxStressKernel; occa::kernel AxPfloatKernel; - occa::kernel partialAxKernel; - occa::kernel partialAxKernel2; - occa::kernel partialAxPfloatKernel; - occa::kernel partialCubatureAxKernel; - occa::kernel rhsBCKernel; - occa::kernel addBCKernel; occa::kernel scaledAddPfloatKernel; occa::kernel dotMultiplyPfloatKernel; occa::kernel copyDfloatToPfloatKernel; @@ -172,19 +168,21 @@ struct elliptic_t hlong NelementsGlobal; - occa::kernel updateDiagonalKernel; + occa::kernel ellipticBlockBuildDiagonalKernel; occa::memory o_lambda; - dfloat* lambda; + occa::memory o_lambdaPfloat; dlong loffset; int nLevels; int* levels; - ResidualProjection* residualProjection; - GmresData* gmresData; + SolutionProjection* solutionProjection; + GmresData *gmresData; + + std::function applyZeroNormalMask; }; #include "ellipticMultiGrid.h" -#include "ellipticResidualProjection.h" +#include "ellipticSolutionProjection.h" elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* elliptic); @@ -224,7 +222,8 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, void ellipticOperator(elliptic_t* elliptic, occa::memory &o_q, occa::memory &o_Aq, - const char* precision); + const char* precision, + bool masked = true); void ellipticAx(elliptic_t* elliptic, dlong NelementsList, @@ -238,17 +237,13 @@ void ellipticBuildContinuous(elliptic_t* elliptic, nonZero_t** A, void ellipticBuildContinuousGalerkinHex3D(elliptic_t* elliptic, elliptic_t* ellipticFine, - dfloat lambda, nonZero_t** A, dlong* nnz, ogs_t** ogs, hlong* globalStarts); -void ellipticBuildJacobi(elliptic_t* elliptic, dfloat** invDiagA); -void ellipticUpdateJacobi(elliptic_t* elliptic); - -void ellipticBuildLocalPatches(elliptic_t* elliptic, dfloat lambda, dfloat rateTolerance, - dlong* Npataches, dlong** patchesIndex, dfloat** patchesInvA); +void ellipticMultiGridUpdateLambda(elliptic_t* elliptic); +void ellipticUpdateJacobi(elliptic_t* elliptic, occa::memory& o_invDiagA); void ellipticMultiGridSetup(elliptic_t* elliptic, precon_t* precon); elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf); @@ -256,8 +251,18 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf dfloat ellipticUpdatePCG(elliptic_t* elliptic, occa::memory &o_p, occa::memory &o_Ap, dfloat alpha, occa::memory &o_x, occa::memory &o_r); -occa::properties ellipticKernelInfo(int N); - void ellipticZeroMean(elliptic_t* elliptic, occa::memory &o_q); +void ellipticOgs(mesh_t *mesh, + dlong mNlocal, + int nFields, + dlong offset, + int *EToB, + dlong &Nmasked, + occa::memory &o_maskIds, + dlong &NmaskedLocal, + occa::memory &o_maskIdsLocal, + dlong &NmaskedGlobal, + occa::memory &o_maskIdsGlobal, + ogs_t **ogs); #endif diff --git a/src/elliptic/ellipticApplyMask.cpp b/src/elliptic/ellipticApplyMask.cpp new file mode 100644 index 000000000..e70061c51 --- /dev/null +++ b/src/elliptic/ellipticApplyMask.cpp @@ -0,0 +1,29 @@ +#include +#include +void ellipticApplyMask(elliptic_t *solver, occa::memory &o_x, std::string precision) +{ + mesh_t *mesh = solver->mesh; + ellipticApplyMask(solver, mesh->Nelements, solver->Nmasked, mesh->o_elementList, solver->o_maskIds, o_x, precision); +} +void ellipticApplyMask(elliptic_t *solver, + dlong Nelements, + dlong Nmasked, + occa::memory &o_elementList, + occa::memory &o_maskIds, + occa::memory &o_x, + std::string precision) +{ + mesh_t *mesh = solver->mesh; + occa::kernel &maskKernel = (precision != dfloatString) ? mesh->maskPfloatKernel : mesh->maskKernel; + + if (solver->applyZeroNormalMask) { + if (precision != dfloatString) { + std::cout << "Precision level (" << precision << ") not supported in applyZeroNormalMask\n"; + ABORT(EXIT_FAILURE); + } + solver->applyZeroNormalMask(Nelements, o_elementList, o_x); + } + if (Nmasked) { + maskKernel(Nmasked, o_maskIds, o_x); + } +} diff --git a/src/elliptic/ellipticApplyMask.hpp b/src/elliptic/ellipticApplyMask.hpp new file mode 100644 index 000000000..4fd526ceb --- /dev/null +++ b/src/elliptic/ellipticApplyMask.hpp @@ -0,0 +1,17 @@ +#ifndef ellipticApplyMask_hpp +#define ellipticApplyMask_hpp + +#include +#include "occa.hpp" + +class elliptic_t; + +void ellipticApplyMask(elliptic_t *solver, occa::memory &o_x, std::string precision); +void ellipticApplyMask(elliptic_t *solver, + dlong Nelements, + dlong Nmasked, + occa::memory &o_elementList, + occa::memory &o_maskIds, + occa::memory &o_x, + std::string precision); +#endif \ No newline at end of file diff --git a/src/elliptic/ellipticBuildContinuous.cpp b/src/elliptic/ellipticBuildContinuous.cpp index 02319f5cb..4e9451aa6 100644 --- a/src/elliptic/ellipticBuildContinuous.cpp +++ b/src/elliptic/ellipticBuildContinuous.cpp @@ -78,9 +78,10 @@ void ellipticBuildContinuousHex3D(elliptic_t* elliptic, { mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; - // currently constant coefficient case only - const dfloat lambda = elliptic->lambda[0]; + setupAide& options = elliptic->options; + + // Poisson only + const dfloat lambda = 0.0; int rank = platform->comm.mpiRank; @@ -124,7 +125,12 @@ void ellipticBuildContinuousHex3D(elliptic_t* elliptic, int* ArecvOffsets = (int*) calloc(platform->comm.mpiCommSize + 1, sizeof(int)); int* mask = (int*) calloc(mesh->Np * mesh->Nelements,sizeof(int)); - for (dlong n = 0; n < elliptic->Nmasked; n++) mask[elliptic->maskIds[n]] = 1; + if(elliptic->Nmasked > 0){ + dlong* maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->o_maskIds.copyTo(maskIds, elliptic->Nmasked * sizeof(dlong)); + for (dlong i = 0; i < elliptic->Nmasked; i++) mask[maskIds[i]] = 1.; + free(maskIds); + } dlong cnt = 0; for (dlong e = 0; e < mesh->Nelements; e++) diff --git a/src/elliptic/ellipticBuildContinuousGalerkin.cpp b/src/elliptic/ellipticBuildContinuousGalerkin.cpp index 8a6f6347d..14e3fd6e8 100644 --- a/src/elliptic/ellipticBuildContinuousGalerkin.cpp +++ b/src/elliptic/ellipticBuildContinuousGalerkin.cpp @@ -44,7 +44,6 @@ static int parallelCompareRowColumn(const void* a, const void* b) void ellipticBuildContinuousGalerkinHex3D (elliptic_t* elliptic, elliptic_t* ellipticFine, - dfloat lambda, nonZero_t** A, dlong* nnz, ogs_t** ogs, @@ -52,7 +51,6 @@ void ellipticBuildContinuousGalerkinHex3D (elliptic_t* elliptic, void ellipticBuildContinuousGalerkin(elliptic_t* elliptic, elliptic_t* ellipticFine, - dfloat lambda, nonZero_t** A, dlong* nnz, ogs_t** ogs, @@ -73,7 +71,7 @@ void ellipticBuildContinuousGalerkin(elliptic_t* elliptic, break; case HEXAHEDRA: ellipticBuildContinuousGalerkinHex3D(elliptic,ellipticFine, - lambda,A,nnz,ogs,globalStarts); + A,nnz,ogs,globalStarts); break; default: break; @@ -123,7 +121,6 @@ void ellipticGenerateCoarseBasisHex3D(dfloat* b,int j_,elliptic_t* elliptic) void ellipticBuildContinuousGalerkinHex3D(elliptic_t* elliptic, elliptic_t* ellipticFine, - dfloat lambda, nonZero_t** A, dlong* nnz, ogs_t** ogs, @@ -131,7 +128,7 @@ void ellipticBuildContinuousGalerkinHex3D(elliptic_t* elliptic, { mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; + setupAide& options = elliptic->options; MPI_Barrier(platform->comm.mpiComm); const double tStart = MPI_Wtime(); @@ -180,7 +177,12 @@ void ellipticBuildContinuousGalerkinHex3D(elliptic_t* elliptic, int* ArecvOffsets = (int*) calloc(platform->comm.mpiCommSize + 1, sizeof(int)); int* mask = (int*) calloc(mesh->Np * mesh->Nelements,sizeof(int)); - for (dlong n = 0; n < elliptic->Nmasked; n++) mask[elliptic->maskIds[n]] = 1; + if(elliptic->Nmasked > 0){ + dlong* maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->o_maskIds.copyTo(maskIds, elliptic->Nmasked * sizeof(dlong)); + for (dlong i = 0; i < elliptic->Nmasked; i++) mask[maskIds[i]] = 1.; + free(maskIds); + } mesh_t* meshf = ellipticFine->mesh; @@ -217,9 +219,7 @@ void ellipticBuildContinuousGalerkinHex3D(elliptic_t* elliptic, meshf->Np * sizeof(dfloat)); o_q.copyFrom(q); - ellipticFine->AxKernel(mesh->Nelements,meshf->o_ggeo, - meshf->o_D,meshf->o_DT, - lambda,o_q,o_Aq); + ellipticOperator(ellipticFine, o_q, o_Aq, dfloatString, false); o_Aq.copyTo(Aq); for(dlong e = 0; e < mesh->Nelements; e++) diff --git a/src/elliptic/ellipticBuildMultigridLevel.cpp b/src/elliptic/ellipticBuildMultigridLevel.cpp index d6cba5228..59e92459e 100644 --- a/src/elliptic/ellipticBuildMultigridLevel.cpp +++ b/src/elliptic/ellipticBuildMultigridLevel.cpp @@ -28,6 +28,7 @@ #include "platform.hpp" namespace{ + std::string gen_suffix(const elliptic_t * elliptic, const char * floatString) { const std::string precision = std::string(floatString); @@ -39,189 +40,37 @@ std::string gen_suffix(const elliptic_t * elliptic, const char * floatString) } } + } -// create elliptic and mesh structs for multigrid levels elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf) { - const int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; - elliptic_t* elliptic = new elliptic_t(); - memcpy(elliptic,baseElliptic,sizeof(elliptic_t)); - //populate the mini-mesh using the mesh struct - mesh_t* mesh = new mesh_t(); - memcpy(mesh,baseElliptic->mesh,sizeof(mesh_t)); - + mesh_t* mesh = createMeshMG(baseElliptic->mesh, Nc); elliptic->mesh = mesh; - setupAide options = elliptic->options; - - switch(elliptic->elementType) { - case HEXAHEDRA: - meshLoadReferenceNodesHex3D(mesh, Nc, 1); - meshHaloSetup(mesh); - meshPhysicalNodesHex3D(mesh); - meshHaloPhysicalNodes(mesh); - meshGeometricFactorsHex3D(mesh); - - if(!options.compareArgs("BOX DOMAIN", "TRUE")) { - meshConnectFaceNodes3D(mesh); - }else { - if(platform->comm.mpiRank == 0) - printf("WARNING: connecting periodic box\n"); - - dfloat XMIN = -1, XMAX = +1; // default bi-unit cube - dfloat YMIN = -1, YMAX = +1; - dfloat ZMIN = -1, ZMAX = +1; - - options.getArgs("BOX XMIN", XMIN); - options.getArgs("BOX YMIN", YMIN); - options.getArgs("BOX ZMIN", ZMIN); - - options.getArgs("BOX XMAX", XMAX); - options.getArgs("BOX YMAX", YMAX); - options.getArgs("BOX ZMAX", ZMAX); - - meshConnectPeriodicFaceNodes3D(mesh, XMAX - XMIN, YMAX - YMIN, ZMAX - ZMIN); - } - meshSurfaceGeometricFactorsHex3D(mesh); - break; - } - - // global nodes - meshGlobalIds(mesh); - -#if 0 - mesh->o_x = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->x); - mesh->o_y = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->y); - mesh->o_z = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->z); -#endif - - //dont need these once vmap is made - free(mesh->x); - free(mesh->y); - if (elliptic->dim == 3) - free(mesh->z); - - dlong Ntotal = mesh->Np * mesh->Nelements; - - if (elliptic->elementType == HEXAHEDRA) { - //lumped mass matrix - mesh->MM = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat)); - dfloat* DT = (dfloat*) calloc(mesh->Nq * mesh->Nq, sizeof(dfloat)); - - for (int j = 0; j < mesh->Nq; j++) - for (int i = 0; i < mesh->Nq; i++) - DT[j * mesh->Nq + i] = mesh->D[i * mesh->Nq + j]; - - for (int k = 0; k < mesh->Nq; k++) - for (int j = 0; j < mesh->Nq; j++) - for (int i = 0; i < mesh->Nq; i++) { - int n = i + j * mesh->Nq + k * mesh->Nq * mesh->Nq; - mesh->MM[n + n * mesh->Np] = mesh->gllw[i] * mesh->gllw[j] * mesh->gllw[k]; - } - - mesh->o_D = platform->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D); - mesh->o_DT = platform->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), DT); // transpose(D) - -#if 0 - mesh->o_cubD = platform->device.malloc(mesh->cubNq * mesh->cubNq * sizeof(dfloat), mesh->cubD); - - dfloat* cubInterpT = (dfloat*) calloc(mesh->cubNq * mesh->Nq, sizeof(dfloat)); - for(int n = 0; n < mesh->Nq; ++n) - for(int m = 0; m < mesh->cubNq; ++m) - cubInterpT[m + n * mesh->cubNq] = mesh->cubInterp[m * mesh->Nq + n]; - - mesh->o_cubInterpT = platform->device.malloc(mesh->cubNq * mesh->Nq * sizeof(dfloat), cubInterpT); - - free(cubInterpT); -#endif - - mesh->o_ggeo = - platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat), - mesh->ggeo); -#if 0 - mesh->o_vgeo = - platform->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->Np * sizeof(dfloat), - mesh->vgeo); - mesh->o_sgeo = - platform->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat), - mesh->sgeo); - - mesh->o_vmapM = - platform->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong), - mesh->vmapM); - - mesh->o_vmapP = - platform->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong), - mesh->vmapP); -#endif - - } - - //set the normalization constant for the allNeumann Poisson problem on this coarse mesh - hlong localElements = (hlong) mesh->Nelements; - hlong totalElements = 0; - MPI_Allreduce(&localElements, &totalElements, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - elliptic->allNeumannScale = 1.0 / sqrt(mesh->Np * totalElements); - - elliptic->allNeumannPenalty = 0; - elliptic->allNeumannScale = 0; - - //setup an unmasked gs handle - int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; - meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, platform->comm.mpiComm, verbose); - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - elliptic->mapB = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int)); - for (dlong e = 0; e < mesh->Nelements; e++) { - for (int n = 0; n < mesh->Np; n++) elliptic->mapB[n + e * mesh->Np] = 1E9; - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = mesh->EToB[f + e * mesh->Nfaces]; - if (bc > 0) { - for (int n = 0; n < mesh->Nfp; n++) { - int BCFlag = elliptic->BCType[bc]; - int fid = mesh->faceNodes[n + f * mesh->Nfp]; - elliptic->mapB[fid + e * mesh->Np] = mymin(BCFlag,elliptic->mapB[fid + e * mesh->Np]); - } - } - } - } - ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); - - //use the bc flags to find masked ids - elliptic->Nmasked = 0; - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) { - if (elliptic->mapB[n] == 1E9) - elliptic->mapB[n] = 0.; - else if (elliptic->mapB[n] == 1) //Dirichlet boundary - elliptic->Nmasked++; + { // setup an unmasked gs handle + ogs_t *ogs = NULL; + ellipticOgs(mesh, + mesh->Nlocal, + /* nFields */ 1, + /* offset */ 0, + elliptic->EToB, + elliptic->Nmasked, + elliptic->o_maskIds, + elliptic->NmaskedLocal, + elliptic->o_maskIdsLocal, + elliptic->NmaskedGlobal, + elliptic->o_maskIdsGlobal, + &ogs); + elliptic->ogs = ogs; + elliptic->o_invDegree = elliptic->ogs->o_invDegree; } - elliptic->o_mapB = platform->device.malloc(mesh->Nelements * mesh->Np * sizeof(int), elliptic->mapB); - - elliptic->maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); - elliptic->Nmasked = 0; //reset - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) - if (elliptic->mapB[n] == 1) elliptic->maskIds[elliptic->Nmasked++] = n; - - if (elliptic->Nmasked) - elliptic->o_maskIds = platform->device.malloc(elliptic->Nmasked * sizeof(dlong), elliptic->maskIds); - - //make a masked version of the global id numbering - hlong* maskedGlobalIds = (hlong*) calloc(Ntotal,sizeof(hlong)); - memcpy(maskedGlobalIds, mesh->globalIds, Ntotal * sizeof(hlong)); - for (dlong n = 0; n < elliptic->Nmasked; n++) - maskedGlobalIds[elliptic->maskIds[n]] = 0; - //use the masked ids to make another gs handle - elliptic->ogs = ogsSetup(Ntotal, maskedGlobalIds, platform->comm.mpiComm, verbose, platform->device); - elliptic->o_invDegree = elliptic->ogs->o_invDegree; - free(maskedGlobalIds); - - std::string suffix = "Hex3D"; + const std::string suffix = "Hex3D"; std::string kernelName; @@ -230,93 +79,66 @@ elliptic_t* ellipticBuildMultigridLevel(elliptic_t* baseElliptic, int Nc, int Nf ellipticBuildPreconditionerKernels(elliptic); - { - kernelName = "ellipticAx" + suffix; - { - const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); - elliptic->AxKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } - if(!strstr(pfloatString,dfloatString)) { - kernelName = "ellipticAx" + suffix; - { - const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); - elliptic->AxPfloatKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } - } + const std::string poissonPrefix = elliptic->poisson ? "poisson-" : ""; + if(Nc > 1 || elliptic->options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) + { + const std::string AxSuffix = elliptic->coeffFieldPreco ? "CoeffHex3D" : "Hex3D"; // check for trilinear if(elliptic->elementType != HEXAHEDRA) { - kernelName = "ellipticPartialAx" + suffix; + kernelName = "ellipticPartialAx" + AxSuffix; }else { if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")) - kernelName = "ellipticPartialAxTrilinear" + suffix; + kernelName = "ellipticPartialAxTrilinear" + AxSuffix; else - kernelName = "ellipticPartialAx" + suffix; + kernelName = "ellipticPartialAx" + AxSuffix; } - if(!serial) { - { - const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); - elliptic->partialAxKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } - if(!strstr(pfloatString,dfloatString)) { - const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); - elliptic->partialAxPfloatKernel = - platform->kernels.getKernel( kernelName + kernelSuffix); - } + { + const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); + elliptic->AxKernel = platform->kernels.get(poissonPrefix + kernelName + kernelSuffix); + } + { + const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); + elliptic->AxPfloatKernel = + platform->kernels.get(poissonPrefix + kernelName + kernelSuffix); } } - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); - fflush(stdout); - - //new precon struct elliptic->precon = new precon_t(); { - - const std::string kernelSuffix = std::string("_") + std::to_string(Nf); + const std::string kernelSuffix = + std::string("_Nf_") + std::to_string(Nf) + std::string("_Nc_") + std::to_string(Nc); kernelName = "ellipticPreconCoarsen" + suffix; - elliptic->precon->coarsenKernel = platform->kernels.getKernel(kernelName + kernelSuffix); + elliptic->precon->coarsenKernel = platform->kernels.get(kernelName + kernelSuffix); kernelName = "ellipticPreconProlongate" + suffix; - elliptic->precon->prolongateKernel = platform->kernels.getKernel(kernelName + kernelSuffix); + elliptic->precon->prolongateKernel = platform->kernels.get(kernelName + kernelSuffix); } - if(elliptic->elementType == HEXAHEDRA) { - // pack gllz, gllw, and elementwise EXYZ - dfloat* gllzw = (dfloat*) calloc(2 * mesh->Nq, sizeof(dfloat)); + elliptic->o_lambdaPfloat = platform->device.malloc(2 * mesh->Nelements * mesh->Np, sizeof(pfloat)); + elliptic->o_lambda = platform->device.malloc(2 * mesh->Nelements * mesh->Np, sizeof(dfloat)); - int sk = 0; - for(int n = 0; n < mesh->Nq; ++n) - gllzw[sk++] = mesh->gllz[n]; - for(int n = 0; n < mesh->Nq; ++n) - gllzw[sk++] = mesh->gllw[n]; + const int Nfq = Nf+1; + const int Ncq = Nc+1; + dfloat* fToCInterp = (dfloat*) calloc(Nfq * Ncq, sizeof(dfloat)); + InterpolationMatrix1D(Nf, Nfq, baseElliptic->mesh->r, Ncq, mesh->r, fToCInterp); + elliptic->o_interp = platform->device.malloc(Nfq * Ncq * sizeof(dfloat), fToCInterp); - elliptic->o_gllzw = platform->device.malloc(2 * mesh->Nq * sizeof(dfloat), gllzw); - free(gllzw); - } + elliptic->precon->coarsenKernel(2 * mesh->Nelements, elliptic->o_interp, baseElliptic->o_lambda, elliptic->o_lambda); + + elliptic->copyDfloatToPfloatKernel(2 * mesh->Nelements * mesh->Np, + elliptic->o_lambda, + elliptic->o_lambdaPfloat); + + free(fToCInterp); - if(!strstr(pfloatString,dfloatString)) { - mesh->o_ggeoPfloat = platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo , sizeof(pfloat)); - mesh->o_DPfloat = platform->device.malloc(mesh->Nq * mesh->Nq , sizeof(pfloat)); - mesh->o_DTPfloat = platform->device.malloc(mesh->Nq * mesh->Nq , sizeof(pfloat)); - elliptic->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo, - elliptic->mesh->o_ggeoPfloat, - mesh->o_ggeo); -#if 0 - mesh->o_ggeo.free(); -#endif - elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, - elliptic->mesh->o_DPfloat, - mesh->o_D); - elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, - elliptic->mesh->o_DTPfloat, - mesh->o_DT); - } + MPI_Barrier(platform->comm.mpiComm); + if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); + fflush(stdout); return elliptic; } diff --git a/src/elliptic/ellipticBuildMultigridLevelFine.cpp b/src/elliptic/ellipticBuildMultigridLevelFine.cpp index baa1ec42c..fe02db0c8 100644 --- a/src/elliptic/ellipticBuildMultigridLevelFine.cpp +++ b/src/elliptic/ellipticBuildMultigridLevelFine.cpp @@ -50,10 +50,8 @@ elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* baseElliptic) mesh_t* mesh = elliptic->mesh; ellipticBuildPreconditionerKernels(elliptic); - const int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; - - elliptic->var_coeff = 0; - elliptic->lambda = (dfloat*) calloc(elliptic->Nfields, sizeof(dfloat)); // enforce lambda = 0 + elliptic->coeffField = baseElliptic->coeffField; + elliptic->coeffFieldPreco = baseElliptic->coeffFieldPreco; if(!strstr(pfloatString,dfloatString)) { @@ -62,50 +60,36 @@ elliptic_t* ellipticBuildMultigridLevelFine(elliptic_t* baseElliptic) mesh->o_DTPfloat = platform->device.malloc(mesh->Nq * mesh->Nq , sizeof(pfloat)); elliptic->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo, - elliptic->mesh->o_ggeoPfloat, - mesh->o_ggeo); + mesh->o_ggeo, + elliptic->mesh->o_ggeoPfloat); elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, - elliptic->mesh->o_DPfloat, - mesh->o_D); + mesh->o_D, + elliptic->mesh->o_DPfloat); elliptic->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, - elliptic->mesh->o_DTPfloat, - mesh->o_DT); + mesh->o_DT, + elliptic->mesh->o_DTPfloat); } - std::string suffix; - if(elliptic->elementType == HEXAHEDRA) - suffix = "Hex3D"; + std::string suffix = elliptic->coeffFieldPreco ? "CoeffHex3D" : "Hex3D"; std::string kernelName; - { - kernelName = "ellipticAx" + suffix; - { - const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); - elliptic->AxKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } - - if(!strstr(pfloatString,dfloatString)) { - kernelName = "ellipticAx" + suffix; - const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); - elliptic->AxPfloatKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } + const std::string poissonPrefix = elliptic->poisson ? "poisson-" : ""; + { if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")) kernelName = "ellipticPartialAxTrilinear" + suffix; else kernelName = "ellipticPartialAx" + suffix; - if(!serial) { - { - const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); - elliptic->partialAxKernel = platform->kernels.getKernel(kernelName + kernelSuffix); - } - if(!strstr(pfloatString,dfloatString)) { - const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); - elliptic->partialAxPfloatKernel = - platform->kernels.getKernel( kernelName + kernelSuffix); - } + { + const std::string kernelSuffix = gen_suffix(elliptic, dfloatString); + elliptic->AxKernel = platform->kernels.get(poissonPrefix + kernelName + kernelSuffix); + } + { + const std::string kernelSuffix = gen_suffix(elliptic, pfloatString); + elliptic->AxPfloatKernel = + platform->kernels.get(poissonPrefix + kernelName + kernelSuffix); } } diff --git a/src/elliptic/ellipticBuildPreconditionerKernels.cpp b/src/elliptic/ellipticBuildPreconditionerKernels.cpp index 52e6cb097..8aca6a0b0 100644 --- a/src/elliptic/ellipticBuildPreconditionerKernels.cpp +++ b/src/elliptic/ellipticBuildPreconditionerKernels.cpp @@ -37,54 +37,52 @@ void ellipticBuildPreconditionerKernels(elliptic_t* elliptic) std::string prefix = "Hex3D"; std::string kernelName; - MPI_Barrier(platform->comm.mpiComm); - double tStartLoadKernel = MPI_Wtime(); - if(platform->comm.mpiRank == 0) printf("loading elliptic preconditioner kernels ... "); - fflush(stdout); - const std::string orderSuffix = std::string("_") + std::to_string(mesh->N); { kernelName = "mask"; mesh->maskKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); mesh->maskPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix + "pfloat"); + platform->kernels.get(kernelName + orderSuffix + "pfloat"); kernelName = "fusedCopyDfloatToPfloat"; elliptic->fusedCopyDfloatToPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "copyDfloatToPfloat"; elliptic->copyDfloatToPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "copyPfloatToDfloat"; elliptic->copyPfloatToDPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "scaledAdd"; elliptic->scaledAddPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "dotMultiply"; elliptic->dotMultiplyPfloatKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "updateSmoothedSolutionVec"; elliptic->updateSmoothedSolutionVecKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "updateChebyshevSolutionVec"; elliptic->updateChebyshevSolutionVecKernel = - platform->kernels.getKernel(kernelName + orderSuffix); + platform->kernels.get(kernelName + orderSuffix); kernelName = "updateIntermediateSolutionVec"; elliptic->updateIntermediateSolutionVecKernel = - platform->kernels.getKernel(kernelName + orderSuffix); - } + platform->kernels.get(kernelName + orderSuffix); - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); - fflush(stdout); + kernelName = "ellipticBlockBuildDiagonalHex3D"; + const std::string poissonPrefix = elliptic->poisson ? "poisson-" : ""; + elliptic->ellipticBlockBuildDiagonalKernel = + platform->kernels.get(poissonPrefix + kernelName + orderSuffix); + elliptic->axmyzManyPfloatKernel = platform->kernels.get("axmyzManyPfloat"); + elliptic->adyManyPfloatKernel = platform->kernels.get("adyManyPfloat"); + } } diff --git a/src/elliptic/ellipticBuildSEMFEM.cpp b/src/elliptic/ellipticBuildSEMFEM.cpp index dadcd03f5..1b23cf853 100644 --- a/src/elliptic/ellipticBuildSEMFEM.cpp +++ b/src/elliptic/ellipticBuildSEMFEM.cpp @@ -6,10 +6,7 @@ #include #include -#include "_hypre_utilities.h" -#include "HYPRE_parcsr_ls.h" -#include "_hypre_parcsr_ls.h" -#include "HYPRE.h" +#include "__HYPRE.h" #include "gslib.h" #include "ellipticBuildSEMFEM.hpp" @@ -162,7 +159,7 @@ void inverse(double invA[3][3], double A[3][3]) { } } -occa::memory scratchOrAllocateMemory(int nWords, int sizeT, void* src, long long& bytesRemaining, long long& byteOffset, long long& bytesAllocated, bool& allocated); +occa::memory scratchOrAllocateMemory(int nWords, int sizeT, void* src, size_t& bytesRemaining, size_t& byteOffset, size_t& bytesAllocated, bool& allocated); static occa::kernel computeStiffnessMatrixKernel; static occa::memory o_stiffness; static occa::memory o_x; @@ -242,17 +239,23 @@ SEMFEMData* ellipticBuildSEMFEM(const int N_, const int n_elem_, /* mode */ 0); } - constructOnHost = - platform->device.mode() == std::string("OpenCL") - || - platform->device.mode() == std::string("HIP") - || - platform->device.mode() == std::string("Serial"); + constructOnHost = !platform->device.deviceAtomic; if(!constructOnHost) load(); matrix_distribution(); + { + std::string libPath(getenv("NEKRS_INSTALL_DIR")); + libPath += "/lib/libHYPRE"; +#ifdef __APPLE__ + libPath += ".dylib"; +#else + libPath += ".so"; +#endif + __HYPRE_Load(libPath.c_str()); + } + fem_assembly(); SEMFEMData* data; @@ -267,7 +270,7 @@ SEMFEMData* ellipticBuildSEMFEM(const int N_, const int n_elem_, comm_allreduce(&comm, gs_long_long, gs_add, &numRows64, 1, &numRowsGlobal64); if(numRowsGlobal64 > std::numeric_limits::max()) { if(comm.id == 0) printf("Number of global rows requires BigInt support!"); - MPI_Abort(comm.c, EXIT_FAILURE); + ABORT(EXIT_FAILURE); } } @@ -277,7 +280,7 @@ SEMFEMData* ellipticBuildSEMFEM(const int N_, const int n_elem_, ownedRows[ctr++] = row; HYPRE_Int *ncols = (HYPRE_Int*) calloc(numRows, sizeof(HYPRE_Int)); - HYPRE_IJMatrixGetRowCounts(A_bc, + __HYPRE_IJMatrixGetRowCounts(A_bc, numRows, ownedRows, ncols); @@ -289,7 +292,7 @@ SEMFEMData* ellipticBuildSEMFEM(const int N_, const int n_elem_, // construct COO matrix from Hypre matrix HYPRE_BigInt *hAj = (HYPRE_BigInt*) calloc(nnz, sizeof(HYPRE_BigInt)); HYPRE_Real *hAv = (HYPRE_Real*) calloc(nnz, sizeof(HYPRE_Real)); - HYPRE_IJMatrixGetValues(A_bc, + __HYPRE_IJMatrixGetValues(A_bc, -numRows, ncols, ownedRows, @@ -314,7 +317,7 @@ SEMFEMData* ellipticBuildSEMFEM(const int N_, const int n_elem_, free(hAv); free(ownedRows); free(ncols); - HYPRE_IJMatrixDestroy(A_bc); + __HYPRE_IJMatrixDestroy(A_bc); data = (SEMFEMData*) malloc(sizeof(SEMFEMData)); data->Ai = Ai; @@ -664,11 +667,11 @@ void fem_assembly_host() { } - int err = HYPRE_IJMatrixAddToValues(A_bc, nrows, ncols, rows, cols, vals); + int err = __HYPRE_IJMatrixAddToValues(A_bc, nrows, ncols, rows, cols, vals); if (err != 0) { if (comm.id == 0) - printf("err!\n"); - exit(EXIT_FAILURE); + printf("HYPRE_IJMatrixAddToValues failed!\n"); + ABORT(EXIT_FAILURE); } free(rows); @@ -699,9 +702,9 @@ void fem_assembly_device() { bool o_valsAlloc; }; AllocationTracker allocations; - long long bytesRemaining = platform->o_mempool.bytesAllocated; - long long byteOffset = 0; - long long bytesAllocated = 0; + size_t bytesRemaining = platform->o_mempool.bytesAllocated; + size_t byteOffset = 0; + size_t bytesAllocated = 0; occa::memory o_mask = scratchOrAllocateMemory( n_xyze, sizeof(double), @@ -779,11 +782,11 @@ void fem_assembly_device() { if(allocations.o_colsAlloc) o_cols.free(); if(allocations.o_valsAlloc) o_vals.free(); - int err = HYPRE_IJMatrixAddToValues(A_bc, nrows, ncols, rows, cols, vals); + int err = __HYPRE_IJMatrixAddToValues(A_bc, nrows, ncols, rows, cols, vals); if (err != 0) { if (comm.id == 0) - printf("err!\n"); - exit(EXIT_FAILURE); + printf("HYPRE_IJMatrixAddToValues failed!\n"); + ABORT(EXIT_FAILURE); } free(rows); @@ -830,9 +833,9 @@ void fem_assembly() { } /* Assemble FE matrices with boundary conditions applied */ - HYPRE_IJMatrixCreate(comm.c, row_start, row_end, row_start, row_end, &A_bc); - HYPRE_IJMatrixSetObjectType(A_bc, HYPRE_PARCSR); - HYPRE_IJMatrixInitialize(A_bc); + __HYPRE_IJMatrixCreate(comm.c, row_start, row_end, row_start, row_end, &A_bc); + __HYPRE_IJMatrixSetObjectType(A_bc, HYPRE_PARCSR); + __HYPRE_IJMatrixInitialize(A_bc); construct_coo_graph(); @@ -846,7 +849,7 @@ void fem_assembly() { } { - HYPRE_IJMatrixAssemble(A_bc); + __HYPRE_IJMatrixAssemble(A_bc); } free(glo_num); @@ -856,7 +859,7 @@ void fem_assembly() { } void load(){ - computeStiffnessMatrixKernel = platform->kernels.getKernel( + computeStiffnessMatrixKernel = platform->kernels.get( "computeStiffnessMatrix" ); } @@ -921,7 +924,7 @@ void mesh_connectivity(int v_coord[8][3], int t_map[8][4]) { (t_map)[7][3] = 5; } -occa::memory scratchOrAllocateMemory(int nWords, int sizeT, void* src, long long& bytesRemaining, long long& byteOffset, long long& bytesAllocated, bool& allocated) +occa::memory scratchOrAllocateMemory(int nWords, int sizeT, void* src, size_t& bytesRemaining, size_t& byteOffset, size_t& bytesAllocated, bool& allocated) { occa::memory o_mem; if(nWords * sizeT < bytesRemaining){ diff --git a/src/elliptic/ellipticJacobi.cpp b/src/elliptic/ellipticJacobi.cpp deleted file mode 100644 index 1917ba9b9..000000000 --- a/src/elliptic/ellipticJacobi.cpp +++ /dev/null @@ -1,355 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -#include "elliptic.h" -#include "linAlg.hpp" - -void BuildLocalContinuousDiagHex3D (elliptic_t* elliptic, - mesh_t* mesh, - dlong eM, - dfloat* B, - dfloat* Br, - dfloat* Bs, - dfloat* Bt, - dfloat* A); -void BuildLocalContinuousBlockDiagHex3D (elliptic_t* elliptic, - mesh_t* mesh, - dfloat* B, - dfloat* Br, - dfloat* Bs, - dfloat* Bt, - dfloat* A); - -void ellipticUpdateJacobi(elliptic_t* elliptic) -{ - mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; - precon_t* precon = elliptic->precon; - - - const dfloat allNeumannScale = elliptic->allNeumannPenalty * elliptic->allNeumannScale * - elliptic->allNeumannScale; - const dlong Nlocal = mesh->Np * mesh->Nelements; - - elliptic->updateDiagonalKernel(mesh->Nelements, - elliptic->Ntotal, - elliptic->loffset, - elliptic->allNeumann, - allNeumannScale, - elliptic->o_mapB, - mesh->o_ggeo, - mesh->o_D, - mesh->o_DT, - elliptic->o_lambda, - precon->o_invDiagA); - - oogs::startFinish(precon->o_invDiagA, elliptic->Nfields, elliptic->Ntotal, ogsPfloat, ogsAdd, elliptic->oogs); - - const pfloat one = 1.0; - elliptic->adyManyPfloatKernel(Nlocal, elliptic->Nfields, elliptic->Ntotal, one, precon->o_invDiagA); -} - -void ellipticBuildJacobi(elliptic_t* elliptic, dfloat** invDiagA) -{ - mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; - - MPI_Barrier(platform->comm.mpiComm); - const double tStart = MPI_Wtime(); - if(platform->comm.mpiRank == 0) printf("building Jacobi ... "); - fflush(stdout); - - // build some monolithic basis arrays (for quads and hexes) - dfloat* B = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat)); - dfloat* Br = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat)); - dfloat* Bs = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat)); - dfloat* Bt = (dfloat*) calloc(mesh->Np * mesh->Np, sizeof(dfloat)); - - if (elliptic->elementType == QUADRILATERALS) { - int mode = 0; - for(int nj = 0; nj < mesh->N + 1; ++nj) - for(int ni = 0; ni < mesh->N + 1; ++ni) { - int node = 0; - - for(int j = 0; j < mesh->N + 1; ++j) - for(int i = 0; i < mesh->N + 1; ++i) { - if(nj == j && ni == i) - B[mode * mesh->Np + node] = 1; - if(nj == j) - Br[mode * mesh->Np + node] = mesh->D[ni + mesh->Nq * i]; - if(ni == i) - Bs[mode * mesh->Np + node] = mesh->D[nj + mesh->Nq * j]; - - ++node; - } - ++mode; - } - } - - if (elliptic->elementType == HEXAHEDRA) { - int mode = 0; - for(int nk = 0; nk < mesh->N + 1; ++nk) - for(int nj = 0; nj < mesh->N + 1; ++nj) - for(int ni = 0; ni < mesh->N + 1; ++ni) { - int node = 0; - - for(int k = 0; k < mesh->N + 1; ++k) - for(int j = 0; j < mesh->N + 1; ++j) - for(int i = 0; i < mesh->N + 1; ++i) { - if(nk == k && nj == j && ni == i) - B[mode * mesh->Np + node] = 1; - if(nj == j && nk == k) - Br[mode * mesh->Np + node] = mesh->D[ni + mesh->Nq * i]; - if(ni == i && nk == k) - Bs[mode * mesh->Np + node] = mesh->D[nj + mesh->Nq * j]; - if(ni == i && nj == j) - Bt[mode * mesh->Np + node] = mesh->D[nk + mesh->Nq * k]; - - ++node; - } - - ++mode; - } - } - - dlong diagNnum; - if(elliptic->blockSolver) - diagNnum = elliptic->Ntotal * elliptic->Nfields; - else - diagNnum = mesh->Np * mesh->Nelements; - - dfloat* diagA = (dfloat*) calloc(diagNnum, sizeof(dfloat)); - - switch(elliptic->elementType) { - case HEXAHEDRA: - if(elliptic->blockSolver) { - BuildLocalContinuousBlockDiagHex3D(elliptic, mesh, B, Br, Bs, Bt, diagA); - break; - }else{ - for(dlong eM = 0; eM < mesh->Nelements; ++eM) - BuildLocalContinuousDiagHex3D(elliptic, mesh, eM, B, Br, Bs, Bt, diagA + eM * mesh->Np); - break; - } - } - - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - if(elliptic->blockSolver) { - ogsGatherScatterMany(diagA, - elliptic->Nfields, - elliptic->Ntotal, - ogsDfloat, - ogsAdd, - elliptic->ogs); - *invDiagA = (dfloat*) calloc(diagNnum, sizeof(dfloat)); - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) - (*invDiagA)[n] = 1; - - for(int fld = 0; fld < elliptic->Nfields; fld++) - for(int n = 0; n < mesh->Nelements * mesh->Np; n++) - (*invDiagA)[n + fld * elliptic->Ntotal] = 1 / diagA[n + fld * elliptic->Ntotal]; - }else{ - ogsGatherScatter(diagA, ogsDfloat, ogsAdd, elliptic->ogs); - *invDiagA = (dfloat*) calloc(diagNnum, sizeof(dfloat)); - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) - (*invDiagA)[n] = 1 / diagA[n]; - } - } - - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStart); - - free(diagA); - free(B); - free(Br); - free(Bs); - free(Bt); -} - -void BuildLocalContinuousDiagHex3D(elliptic_t* elliptic, - mesh_t* mesh, - dlong eM, - dfloat* B, - dfloat* Br, - dfloat* Bs, - dfloat* Bt, - dfloat* A) -{ - int var_coeff = elliptic->var_coeff; - - for (int nz = 0; nz < mesh->Nq; nz++) - for (int ny = 0; ny < mesh->Nq; ny++) - for (int nx = 0; nx < mesh->Nq; nx++) { - int idn = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq; - if (elliptic->mapB[idn + eM * mesh->Np] != 1) { - A[idn] = 0; - - int id = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dlong base = eM * mesh->Np * mesh->Nggeo; - - dfloat lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + id + 0 * elliptic->Ntotal] : 1.0; - dfloat lambda_1 = - var_coeff ? elliptic->lambda[eM * mesh->Np + id + 1 * - elliptic->Ntotal] : elliptic->lambda[0]; - - dfloat Grs = mesh->ggeo[base + id + G01ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Grs * mesh->D[nx + nx * mesh->Nq] * mesh->D[ny + ny * mesh->Nq]; - - dfloat Grt = mesh->ggeo[base + id + G02ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Grt * mesh->D[nx + nx * mesh->Nq] * mesh->D[nz + nz * mesh->Nq]; - - dfloat Gst = mesh->ggeo[base + id + G12ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Gst * mesh->D[ny + ny * mesh->Nq] * mesh->D[nz + nz * mesh->Nq]; - - for (int k = 0; k < mesh->Nq; k++) { - int iid = k + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dfloat Grr = mesh->ggeo[base + iid + G00ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal] : 1.0; - A[idn] += Grr * lambda_0 * mesh->D[nx + k * mesh->Nq] * mesh->D[nx + k * mesh->Nq]; - } - - for (int k = 0; k < mesh->Nq; k++) { - int iid = nx + k * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dfloat Gss = mesh->ggeo[base + iid + G11ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal] : 1.0; - A[idn] += Gss * lambda_0 * mesh->D[ny + k * mesh->Nq] * mesh->D[ny + k * mesh->Nq]; - } - - for (int k = 0; k < mesh->Nq; k++) { - int iid = nx + ny * mesh->Nq + k * mesh->Nq * mesh->Nq; - dfloat Gtt = mesh->ggeo[base + iid + G22ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal] : 1.0; - A[idn] += Gtt * lambda_0 * mesh->D[nz + k * mesh->Nq] * mesh->D[nz + k * mesh->Nq]; - } - - dfloat JW = mesh->ggeo[base + id + GWJID * mesh->Np]; - A[idn] += JW * lambda_1; - } else { - A[idn] = 1; //just put a 1 so A is invertable - } - } - - //add the rank boost for the allNeumann Poisson problem - if (elliptic->allNeumann) - for(int n = 0; n < mesh->Np; ++n) - if (elliptic->mapB[n + eM * mesh->Np] != 1) //dont fill rows for masked nodes - A[n] += elliptic->allNeumannPenalty * elliptic->allNeumannScale * elliptic->allNeumannScale; -} - -void BuildLocalContinuousBlockDiagHex3D(elliptic_t* elliptic, - mesh_t* mesh, - dfloat* B, - dfloat* Br, - dfloat* Bs, - dfloat* Bt, - dfloat* A) -{ - int var_coeff = elliptic->var_coeff; - for(dlong eM = 0; eM < mesh->Nelements; ++eM) { - for(int fld = 0; fld < elliptic->Nfields; fld++) { - const dlong offset = fld * elliptic->Ntotal; - - for (int nz = 0; nz < mesh->Nq; nz++) - for (int ny = 0; ny < mesh->Nq; ny++) - for (int nx = 0; nx < mesh->Nq; nx++) { - int idn = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq + eM * mesh->Np + offset; - if (elliptic->mapB[idn] != 1) { - A[idn] = 0; - - int id = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq + eM * mesh->Np; - int gid = nx + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dlong base = eM * mesh->Np * mesh->Nggeo; - - dfloat lambda_0 = - var_coeff ? elliptic->lambda[id + 0 * elliptic->Ntotal + fld * - elliptic->loffset] : 1.0; - dfloat lambda_1 = - var_coeff ? elliptic->lambda[id + 1 * elliptic->Ntotal + fld * - elliptic->loffset] : elliptic->lambda[fld * - elliptic->loffset - ]; - - dfloat Grs = mesh->ggeo[base + gid + G01ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Grs * mesh->D[nx + nx * mesh->Nq] * - mesh->D[ny + ny * mesh->Nq]; - - dfloat Grt = mesh->ggeo[base + gid + G02ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Grt * mesh->D[nx + nx * mesh->Nq] * - mesh->D[nz + nz * mesh->Nq]; - - dfloat Gst = mesh->ggeo[base + gid + G12ID * mesh->Np]; - A[idn] += 2 * lambda_0 * Gst * mesh->D[ny + ny * mesh->Nq] * - mesh->D[nz + nz * mesh->Nq]; - - for (int k = 0; k < mesh->Nq; k++) { - int iid = k + ny * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dfloat Grr = mesh->ggeo[base + iid + G00ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal + fld * - elliptic->loffset] : 1.0; - A[idn] += Grr * lambda_0 * mesh->D[nx + k * mesh->Nq] * mesh->D[nx + k * mesh->Nq]; - } - - for (int k = 0; k < mesh->Nq; k++) { - int iid = nx + k * mesh->Nq + nz * mesh->Nq * mesh->Nq; - dfloat Gss = mesh->ggeo[base + iid + G11ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal + fld * - elliptic->loffset] : 1.0; - A[idn] += Gss * lambda_0 * mesh->D[ny + k * mesh->Nq] * mesh->D[ny + k * mesh->Nq]; - } - - for (int k = 0; k < mesh->Nq; k++) { - int iid = nx + ny * mesh->Nq + k * mesh->Nq * mesh->Nq; - dfloat Gtt = mesh->ggeo[base + iid + G22ID * mesh->Np]; - lambda_0 = - var_coeff ? elliptic->lambda[eM * mesh->Np + iid + 0 * elliptic->Ntotal + fld * - elliptic->loffset] : 1.0; - A[idn] += Gtt * lambda_0 * mesh->D[nz + k * mesh->Nq] * mesh->D[nz + k * mesh->Nq]; - } - - dfloat JW = mesh->ggeo[base + gid + GWJID * mesh->Np]; - A[idn] += JW * lambda_1; - } else { - A[idn] = 1; //just put a 1 so A is invertable - } - } - } - - //add the rank boost for the allNeumann Poisson problem - for(int fld = 0; fld < elliptic->Nfields; fld++) { - const dlong offset = fld * elliptic->Ntotal; - if (elliptic->allBlockNeumann[fld]) - for(int n = 0; n < mesh->Np; ++n) - if (elliptic->mapB[n + eM * mesh->Np + offset] != 1) //dont fill rows for masked nodes - A[n + eM * mesh->Np + offset] += elliptic->allNeumannPenalty * - elliptic->allNeumannScale * elliptic->allNeumannScale; - } - } -} diff --git a/src/elliptic/ellipticKernelInfo.cpp b/src/elliptic/ellipticKernelInfo.cpp deleted file mode 100644 index c6d16743a..000000000 --- a/src/elliptic/ellipticKernelInfo.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -#include "elliptic.h" -#include "platform.hpp" - -occa::properties ellipticKernelInfo(int N) -{ - - // info for kernel construction - occa::properties kernelInfo = platform->kernelInfo; - - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - - const int Nq = N+1; - const int Np = Nq * Nq * Nq; - const int Nfp = Nq * Nq; - constexpr int Nfaces {6}; - - constexpr int Nvgeo {12}; - constexpr int Nggeo {7}; - constexpr int Nsgeo {17}; - - kernelInfo["defines/" "p_dim"] = 3; - kernelInfo["defines/" "p_Nfields"] = 1; - kernelInfo["defines/" "p_N"] = N; - kernelInfo["defines/" "p_Nq"] = Nq; - kernelInfo["defines/" "p_Np"] = Np; - kernelInfo["defines/" "p_Nfp"] = Nfp; - kernelInfo["defines/" "p_Nfaces"] = Nfaces; - kernelInfo["defines/" "p_NfacesNfp"] = Nfp * Nfaces; - kernelInfo["defines/" "p_Nvgeo"] = Nvgeo; - kernelInfo["defines/" "p_Nsgeo"] = Nsgeo; - kernelInfo["defines/" "p_Nggeo"] = Nggeo; - - kernelInfo["defines/" "p_NXID"] = NXID; - kernelInfo["defines/" "p_NYID"] = NYID; - kernelInfo["defines/" "p_NZID"] = NZID; - kernelInfo["defines/" "p_SJID"] = SJID; - kernelInfo["defines/" "p_IJID"] = IJID; - kernelInfo["defines/" "p_WSJID"] = WSJID; - kernelInfo["defines/" "p_IHID"] = IHID; - - kernelInfo["defines/" "p_G00ID"] = G00ID; - kernelInfo["defines/" "p_G01ID"] = G01ID; - kernelInfo["defines/" "p_G02ID"] = G02ID; - kernelInfo["defines/" "p_G11ID"] = G11ID; - kernelInfo["defines/" "p_G12ID"] = G12ID; - kernelInfo["defines/" "p_G22ID"] = G22ID; - kernelInfo["defines/" "p_GWJID"] = GWJID; - - kernelInfo["defines/" "p_RXID"] = RXID; - kernelInfo["defines/" "p_SXID"] = SXID; - kernelInfo["defines/" "p_TXID"] = TXID; - - kernelInfo["defines/" "p_RYID"] = RYID; - kernelInfo["defines/" "p_SYID"] = SYID; - kernelInfo["defines/" "p_TYID"] = TYID; - - kernelInfo["defines/" "p_RZID"] = RZID; - kernelInfo["defines/" "p_SZID"] = SZID; - kernelInfo["defines/" "p_TZID"] = TZID; - - kernelInfo["defines/" "p_JID"] = JID; - kernelInfo["defines/" "p_JWID"] = JWID; - - return kernelInfo; -} diff --git a/src/elliptic/ellipticMultiGrid.h b/src/elliptic/ellipticMultiGrid.h index d43cf32f4..16a3e080b 100644 --- a/src/elliptic/ellipticMultiGrid.h +++ b/src/elliptic/ellipticMultiGrid.h @@ -45,7 +45,6 @@ class MGLevel : public parAlmond::multigridLevel elliptic_t* elliptic; mesh_t* mesh; - dfloat lambda; int degree; @@ -92,8 +91,8 @@ class MGLevel : public parAlmond::multigridLevel //local patch data occa::memory o_invAP, o_patchesIndex, o_invDegreeAP; - //ogs_t* extendedOgs = nullptr; - void* extendedOgs; + void* ogsExt; + void* ogsExtOverlap; void* ogs; void build( elliptic_t* pSolver); @@ -104,7 +103,7 @@ class MGLevel : public parAlmond::multigridLevel bool isCoarse; //build a single level - MGLevel(elliptic_t* ellipticBase, dfloat lambda_, int Nc, + MGLevel(elliptic_t* ellipticBase, int Nc, setupAide options_, parAlmond::KrylovType ktype_, MPI_Comm comm_, bool _isCoarse = false ); @@ -113,7 +112,6 @@ class MGLevel : public parAlmond::multigridLevel mesh_t** meshLevels, elliptic_t* ellipticFine, //previous level elliptic_t* ellipticCoarse, //current level - dfloat lambda_, int Nf, int Nc, setupAide options_, parAlmond::KrylovType ktype_, diff --git a/src/elliptic/ellipticMultiGridLevel.cpp b/src/elliptic/ellipticMultiGridLevel.cpp index 072610bab..8270ef112 100644 --- a/src/elliptic/ellipticMultiGridLevel.cpp +++ b/src/elliptic/ellipticMultiGridLevel.cpp @@ -23,7 +23,7 @@ SOFTWARE. */ - +#include #include "elliptic.h" #include "linAlg.hpp" #include @@ -37,15 +37,7 @@ void MGLevel::residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res) if(stype != SmootherType::SCHWARZ) { ellipticOperator(elliptic,o_x,o_res, dfloatString); // subtract r = b - A*x - platform->linAlg->axpbyMany( - Nrows, - elliptic->Nfields, - elliptic->Ntotal, - 1.0, - o_rhs, - -1.0, - o_res - ); + platform->linAlg->axpbyMany(Nrows, elliptic->Nfields, elliptic->Ntotal, 1.0, o_rhs, -1.0, o_res); } else { o_res.copyFrom(o_rhs, Nrows*sizeof(dfloat)); } @@ -53,20 +45,36 @@ void MGLevel::residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res) void MGLevel::coarsen(occa::memory o_x, occa::memory o_Rx) { - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) + double flopCounter = 0.0; + if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { platform->linAlg->axmy(mesh->Nelements * NpF, 1.0, o_invDegree, o_x); + flopCounter += static_cast(mesh->Nelements) * NpF; + } + + const auto NqC = elliptic->mesh->Nq; + const auto NqF = std::cbrt(NpF); elliptic->precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); + const auto workPerElem = 2 * (NqF * NqF * NqF * NqC + NqF * NqF * NqC * NqC + NqF * NqC * NqC * NqC); + flopCounter += static_cast(mesh->Nelements) * workPerElem; if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { oogs::startFinish(o_Rx, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); - //if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx); + // ellipticApplyMask(elliptic, o_Rx, dfloatString); } + + platform->flopCounter->add("MGLevel::coarsen, N=" + std::to_string(mesh->N), flopCounter); } void MGLevel::prolongate(occa::memory o_x, occa::memory o_Px) { elliptic->precon->prolongateKernel(mesh->Nelements, o_R, o_x, o_Px); + const auto NqC = elliptic->mesh->Nq; + const auto NqF = std::cbrt(NpF); + double flopCounter = 2 * (NqF * NqF * NqF * NqC + NqF * NqF * NqC * NqC + NqF * NqC * NqC * NqC); + flopCounter += NqF * NqF * NqF; + flopCounter *= static_cast(mesh->Nelements); + platform->flopCounter->add("MGLevel::prolongate, N=" + std::to_string(mesh->N), flopCounter); } void MGLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero) @@ -120,16 +128,24 @@ void MGLevel::smoothJacobi (occa::memory &o_r, occa::memory &o_x, bool xIsZero) const pfloat mone = -1.0; const pfloat zero = 0.0; + double flopCount = 0.0; + if(xIsZero) { //skip the Ax if x is zero //res = Sr elliptic->dotMultiplyPfloatKernel(Nrows,o_invDiagA,o_r,o_x); + flopCount += Nrows; } else { //res = S(r-Ax) this->Ax(o_x,o_res); elliptic->scaledAddPfloatKernel(Nrows, one, o_r, mone, o_res); elliptic->dotMultiplyPfloatKernel(Nrows, o_invDiagA, o_res, o_d); elliptic->scaledAddPfloatKernel(Nrows, one, o_d, one, o_x); + // two saxpy's + collocation + flopCount += 7 * Nrows; } + auto mesh = elliptic->mesh; + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add("MGLevel::smoothJacobi, N=" + std::to_string(mesh->N), factor * flopCount); } void MGLevel::smoothChebyshevOneIteration (occa::memory &o_r, occa::memory &o_x, bool xIsZero) { @@ -146,16 +162,21 @@ void MGLevel::smoothChebyshevOneIteration (occa::memory &o_r, occa::memory &o_x, occa::memory o_Ad = o_smootherResidual2; occa::memory o_d = o_smootherUpdate; + double flopCount = 0.0; + if(xIsZero) { //skip the Ax if x is zero //res = Sr this->smoother(o_r, o_res, xIsZero); elliptic->updateSmoothedSolutionVecKernel(Nrows, invTheta, o_res, one, o_d, zero, o_x); + flopCount += 4 * Nrows; } else { //res = S(r-Ax) this->Ax(o_x,o_res); elliptic->scaledAddPfloatKernel(Nrows, one, o_r, mone, o_res); this->smoother(o_res, o_res, xIsZero); elliptic->updateSmoothedSolutionVecKernel(Nrows, invTheta, o_res, one, o_d, one, o_x); + + flopCount += 7 * Nrows; } //r_k+1 = r_k - SAd_k @@ -164,6 +185,14 @@ void MGLevel::smoothChebyshevOneIteration (occa::memory &o_r, occa::memory &o_x, rho_np1 = 1.0 / (2. * sigma - rho_n); pfloat rhoDivDelta = 2.0 * rho_np1 / delta; elliptic->updateChebyshevSolutionVecKernel(Nrows, rhoDivDelta, rho_np1, rho_n, o_Ad, o_res, o_d, o_x); + ellipticApplyMask(elliptic, o_x, pfloatString); + + flopCount += 6 * Nrows; + + auto mesh = elliptic->mesh; + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add("MGLevel::smoothChebyshevOneIteration, N=" + std::to_string(mesh->N), + factor * flopCount); } void MGLevel::smoothChebyshevTwoIteration (occa::memory &o_r, occa::memory &o_x, bool xIsZero) { @@ -180,11 +209,14 @@ void MGLevel::smoothChebyshevTwoIteration (occa::memory &o_r, occa::memory &o_x, occa::memory o_Ad = o_smootherResidual2; occa::memory o_d = o_smootherUpdate; + double flopCount = 0.0; + if(xIsZero) { //skip the Ax if x is zero //res = Sr this->smoother(o_r, o_res, xIsZero); elliptic->updateSmoothedSolutionVecKernel(Nrows, invTheta, o_res, one, o_d, zero, o_x); + flopCount += 4 * Nrows; } else { //res = S(r-Ax) this->Ax(o_x,o_res); @@ -192,6 +224,7 @@ void MGLevel::smoothChebyshevTwoIteration (occa::memory &o_r, occa::memory &o_x, this->smoother(o_res, o_res, xIsZero); elliptic->updateSmoothedSolutionVecKernel(Nrows, invTheta, o_res, one, o_d, one, o_x); + flopCount += 7 * Nrows; } @@ -202,6 +235,7 @@ void MGLevel::smoothChebyshevTwoIteration (occa::memory &o_r, occa::memory &o_x, pfloat rhoDivDelta = 2.0 * rho_np1 / delta; elliptic->updateIntermediateSolutionVecKernel(Nrows, rhoDivDelta, rho_n, rho_np1, o_Ad, o_res, o_d, o_x); + flopCount += 6 * Nrows; rho_n = rho_np1; //r_k+1 = r_k - SAd_k @@ -211,7 +245,12 @@ void MGLevel::smoothChebyshevTwoIteration (occa::memory &o_r, occa::memory &o_x, rhoDivDelta = 2.0 * rho_np1 / delta; elliptic->updateIntermediateSolutionVecKernel(Nrows, rhoDivDelta, rho_n, rho_np1, o_Ad, o_res, o_d, o_x); + flopCount += 6 * Nrows; + ellipticApplyMask(elliptic, o_x, pfloatString); + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add("MGLevel::smoothChebyshevTwoIteration, N=" + std::to_string(mesh->N), + factor * flopCount); } void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero) @@ -236,47 +275,63 @@ void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZer occa::memory o_Ad = o_smootherResidual2; occa::memory o_d = o_smootherUpdate; + double flopCount = 0.0; + if(xIsZero) { //skip the Ax if x is zero //res = Sr this->smoother(o_r, o_res, xIsZero); //d = invTheta*res elliptic->scaledAddPfloatKernel(Nrows, invTheta, o_res, zero, o_d); + flopCount += Nrows; } else { //res = S(r-Ax) this->Ax(o_x,o_res); elliptic->scaledAddPfloatKernel(Nrows, one, o_r, mone, o_res); this->smoother(o_res, o_res, xIsZero); + flopCount += 2 * Nrows; //d = invTheta*res elliptic->scaledAddPfloatKernel(Nrows, invTheta, o_res, zero, o_d); + flopCount += Nrows; } for (int k = 0; k < ChebyshevIterations; k++) { //x_k+1 = x_k + d_k - if (xIsZero && (k == 0)) + if (xIsZero && (k == 0)) { elliptic->scaledAddPfloatKernel(Nrows, one, o_d, zero, o_x); - else + } + else { elliptic->scaledAddPfloatKernel(Nrows, one, o_d, one, o_x); + flopCount += 1 * Nrows; + } //r_k+1 = r_k - SAd_k this->Ax(o_d,o_Ad); this->smoother(o_Ad, o_Ad, xIsZero); elliptic->scaledAddPfloatKernel(Nrows, mone, o_Ad, one, o_res); + flopCount += Nrows; rho_np1 = 1.0 / (2. * sigma - rho_n); pfloat rhoDivDelta = 2.0 * rho_np1 / delta; //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta elliptic->scaledAddPfloatKernel(Nrows, rhoDivDelta, o_res, rho_np1 * rho_n, o_d); + flopCount += 4 * Nrows; rho_n = rho_np1; } //x_k+1 = x_k + d_k elliptic->scaledAddPfloatKernel(Nrows, one, o_d, one, o_x); + flopCount += Nrows; + ellipticApplyMask(elliptic, o_x, pfloatString); + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add("MGLevel::smoothChebyshev, N=" + std::to_string(mesh->N), factor * flopCount); } void MGLevel::smootherJacobi(occa::memory &o_r, occa::memory &o_Sr) { elliptic->dotMultiplyPfloatKernel(Nrows, o_invDiagA, o_r, o_Sr); + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add("MGLevel::smootherJacobi, N=" + std::to_string(mesh->N), factor * Nrows); } diff --git a/src/elliptic/ellipticMultiGridLevelSetup.cpp b/src/elliptic/ellipticMultiGridLevelSetup.cpp index 3e34e371e..b1081938b 100644 --- a/src/elliptic/ellipticMultiGridLevelSetup.cpp +++ b/src/elliptic/ellipticMultiGridLevelSetup.cpp @@ -35,7 +35,7 @@ occa::memory MGLevel::o_smootherResidual2; occa::memory MGLevel::o_smootherUpdate; //build a single level -MGLevel::MGLevel(elliptic_t* ellipticBase, dfloat lambda_, int Nc, +MGLevel::MGLevel(elliptic_t* ellipticBase, int Nc, setupAide options_, parAlmond::KrylovType ktype_, MPI_Comm comm_, bool _isCoarse) : multigridLevel(ellipticBase->mesh->Nelements * ellipticBase->mesh->Np, (ellipticBase->mesh->Nelements + ellipticBase->mesh->totalHaloPairs) * ellipticBase->mesh->Np, @@ -47,7 +47,6 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, dfloat lambda_, int Nc, elliptic = ellipticBase; mesh = elliptic->mesh; options = options_; - lambda = lambda_; degree = Nc; weighted = false; @@ -58,7 +57,7 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, dfloat lambda_, int Nc, weight = elliptic->invDegree; } - if(!isCoarse || options.compareArgs("MULTIGRID COARSE SOLVE", "TRUE")) + if(!isCoarse || options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) this->setupSmoother(ellipticBase); o_xPfloat = platform->device.malloc(Nrows , sizeof(pfloat)); @@ -70,7 +69,6 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, //finest level mesh_t** meshLevels, elliptic_t* ellipticFine, //previous level elliptic_t* ellipticCoarse, //current level - dfloat lambda_, int Nf, int Nc, setupAide options_, parAlmond::KrylovType ktype_, @@ -88,7 +86,6 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, //finest level elliptic = ellipticCoarse; mesh = elliptic->mesh; options = options_; - lambda = lambda_; degree = Nc; weighted = false; @@ -102,12 +99,12 @@ MGLevel::MGLevel(elliptic_t* ellipticBase, //finest level o_invDegree = ellipticFine->ogs->o_invDegree; } - if(!isCoarse || options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) - this->setupSmoother(ellipticBase); - /* build coarsening and prologation operators to connect levels */ this->buildCoarsenerQuadHex(meshLevels, Nf, Nc); + if(!isCoarse || options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) + this->setupSmoother(ellipticBase); + o_xPfloat = platform->device.malloc(Nrows , sizeof(pfloat)); o_rhsPfloat = platform->device.malloc(Nrows , sizeof(pfloat)); } @@ -141,12 +138,8 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase) } if(options.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JACOBI") || options.compareArgs("MULTIGRID UPWARD SMOOTHER","JACOBI")) { - dfloat* invDiagA; - std::vector casted_invDiagA(mesh->Np * mesh->Nelements, 0.0); - ellipticBuildJacobi(elliptic,&invDiagA); - for(dlong i = 0; i < mesh->Np * mesh->Nelements; ++i) - casted_invDiagA[i] = static_cast(invDiagA[i]); - o_invDiagA = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat), casted_invDiagA.data()); + o_invDiagA = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat)); + ellipticUpdateJacobi(elliptic,o_invDiagA); if(options.compareArgs("MULTIGRID UPWARD SMOOTHER","JACOBI")) smtypeUp = SecondarySmootherType::JACOBI; if(options.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JACOBI")) @@ -156,13 +149,9 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase) stype = SmootherType::JACOBI; smtypeUp = SecondarySmootherType::JACOBI; smtypeDown = SecondarySmootherType::JACOBI; - dfloat* invDiagA; - ellipticBuildJacobi(elliptic,&invDiagA); - std::vector casted_invDiagA(mesh->Np * mesh->Nelements, 0.0); - for(dlong i = 0; i < mesh->Np * mesh->Nelements; ++i) - casted_invDiagA[i] = static_cast(invDiagA[i]); - o_invDiagA = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat), casted_invDiagA.data()); + o_invDiagA = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(pfloat)); + ellipticUpdateJacobi(elliptic,o_invDiagA); if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { stype = SmootherType::CHEBYSHEV; @@ -176,7 +165,6 @@ void MGLevel::setupSmoother(elliptic_t* ellipticBase) lambda1 = maxMultiplier * rho; lambda0 = minMultiplier * rho; } - free(invDiagA); } } @@ -225,40 +213,24 @@ void MGLevel::Report() void MGLevel::buildCoarsenerQuadHex(mesh_t** meshLevels, int Nf, int Nc) { - - int NqFine = Nf + 1; - int NqCoarse = Nc + 1; - dfloat* P = (dfloat*) calloc(NqFine * NqCoarse,sizeof(dfloat)); - dfloat* Ptmp = (dfloat*) calloc(NqFine * NqCoarse,sizeof(dfloat)); - - //initialize P as identity - for (int i = 0; i < NqCoarse; i++) P[i * NqCoarse + i] = 1.0; - - for (int n = Nc; n < Nf; n++) { - int Nqp1 = n + 2; - int Nq = n + 1; - - //copy P - for (int i = 0; i < Nq * NqCoarse; i++) Ptmp[i] = P[i]; - - //Multiply by the raise op - for (int i = 0; i < Nqp1; i++) - for (int j = 0; j < NqCoarse; j++) { - P[i * NqCoarse + j] = 0.; - for (int k = 0; k < Nq; k++) - P[i * NqCoarse + j] += meshLevels[n]->interpRaise[i * Nq + k] * Ptmp[k * NqCoarse + j]; - } + + const int Nfq = Nf + 1; + const int Ncq = Nc + 1; + dfloat *cToFInterp = (dfloat *)calloc(Nfq * Ncq, sizeof(dfloat)); + dfloat *R = (dfloat *)calloc(Nfq * Ncq, sizeof(dfloat)); + InterpolationMatrix1D(Nc, Ncq, meshLevels[Nc]->r, Nfq, meshLevels[Nf]->r, cToFInterp); + + // transpose + for (int i = 0; i < Ncq; i++) { + for (int j = 0; j < Nfq; j++) { + R[i * Nfq + j] = cToFInterp[j * Ncq + i]; + } } - //the coarsen matrix is P^T - R = (dfloat*) calloc(NqFine * NqCoarse,sizeof(dfloat)); - for (int i = 0; i < NqCoarse; i++) - for (int j = 0; j < NqFine; j++) - R[i * NqFine + j] = P[j * NqCoarse + i]; - o_R = platform->device.malloc(NqFine * NqCoarse * sizeof(dfloat), R); + o_R = platform->device.malloc(Nfq * Ncq * sizeof(dfloat), R); - free(P); - free(Ptmp); + free(R); + free(cToFInterp); } static void eig(const int Nrows, double* A, double* WR, double* WI) @@ -312,28 +284,54 @@ dfloat MGLevel::maxEigSmoothAx() // allocate memory for basis dfloat* Vx = (dfloat*) calloc(M, sizeof(dfloat)); - // occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory)); occa::memory* o_V = new occa::memory[k + 1]; - occa::memory o_Vx = platform->device.malloc(M * sizeof(dfloat),Vx); - occa::memory o_AVx = platform->device.malloc(M * sizeof(dfloat),Vx); - occa::memory o_AVxPfloat = platform->device.malloc(M , sizeof(pfloat)); - occa::memory o_VxPfloat = platform->device.malloc(M , sizeof(pfloat)); + size_t offset = 0; + const size_t vectorSize = ((M * sizeof(dfloat))/ALIGN_SIZE + 1) * ALIGN_SIZE ; - for(int i = 0; i <= k; i++) - o_V[i] = platform->device.malloc(M * sizeof(dfloat),Vx); + for(int i = 0; i <= k; i++) { + if(offset + vectorSize < platform->o_mempool.o_ptr.size()) { + o_V[i] = platform->o_mempool.o_ptr.slice(offset, vectorSize); + offset += vectorSize; + } else { + o_V[i] = platform->device.malloc(vectorSize); + } + } + + occa::memory o_Vx; + if(offset + vectorSize < platform->o_mempool.o_ptr.size()) { + o_Vx = platform->o_mempool.o_ptr.slice(offset, vectorSize); + offset += vectorSize; + } else { + o_Vx = platform->device.malloc(vectorSize); + } + + occa::memory o_AVx; + if(offset + vectorSize < platform->o_mempool.o_ptr.size()) { + o_AVx = platform->o_mempool.o_ptr.slice(offset, vectorSize); + offset += vectorSize; + } else { + o_AVx = platform->device.malloc(vectorSize); + } + + occa::memory o_AVxPfloat = platform->device.malloc(M, sizeof(pfloat)); + occa::memory o_VxPfloat = platform->device.malloc(M, sizeof(pfloat)); // generate a random vector for initial basis vector for (dlong i = 0; i < N; i++) Vx[i] = (dfloat) drand48(); - //gather-scatter if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { ogsGatherScatter(Vx, ogsDfloat, ogsAdd, mesh->ogs); - for (dlong i = 0; i < elliptic->Nmasked; i++) Vx[elliptic->maskIds[i]] = 0.; + if(elliptic->Nmasked > 0){ + dlong* maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->o_maskIds.copyTo(maskIds, elliptic->Nmasked * sizeof(dlong)); + for (dlong i = 0; i < elliptic->Nmasked; i++) Vx[maskIds[i]] = 0.; + free(maskIds); + } } - o_Vx.copyFrom(Vx); //copy to device + o_Vx.copyFrom(Vx, M*sizeof(dfloat)); dfloat norm_vo = platform->linAlg->weightedInnerProdMany( Nlocal, elliptic->Nfields, @@ -359,7 +357,7 @@ dfloat MGLevel::maxEigSmoothAx() // v[j+1] = invD*(A*v[j]) //this->Ax(o_V[j],o_AVx); ellipticOperator(elliptic,o_V[j],o_AVx,dfloatString); - elliptic->copyDfloatToPfloatKernel(M, o_AVxPfloat, o_AVx); + elliptic->copyDfloatToPfloatKernel(M, o_AVx, o_AVxPfloat); this->smoother(o_AVxPfloat, o_VxPfloat, true); elliptic->copyPfloatToDPfloatKernel(M, o_VxPfloat, o_V[j + 1]); @@ -428,7 +426,6 @@ dfloat MGLevel::maxEigSmoothAx() rho = rho_i; } - // free memory free(H); free(WR); free(WI); @@ -438,8 +435,8 @@ dfloat MGLevel::maxEigSmoothAx() o_AVx.free(); o_AVxPfloat.free(); o_VxPfloat.free(); + for(int i = 0; i <= k; i++) o_V[i].free(); - //free((void*)o_V); delete[] o_V; MPI_Barrier(platform->comm.mpiComm); diff --git a/src/elliptic/ellipticMultiGridSchwarz.cpp b/src/elliptic/ellipticMultiGridSchwarz.cpp index f09df695e..86eb8cc3b 100644 --- a/src/elliptic/ellipticMultiGridSchwarz.cpp +++ b/src/elliptic/ellipticMultiGridSchwarz.cpp @@ -24,6 +24,7 @@ */ +#include #include "elliptic.h" #include #include @@ -251,7 +252,7 @@ void compute_element_boundary_conditions(int* lbr, for(int iface = 0; iface < 6; ++iface) { const int id = lookup[iface]; int bc = elliptic->EToB[6 * e + id]; - assert(bc > -1 && bc < 3); + assert(bc == NO_OP || DIRICHLET || NEUMANN); fbc[iface] = bc; } *lbr = fbc[0]; @@ -340,6 +341,8 @@ void compute_1d_stiffness_matrix( } #undef a #undef ah + free(ah); + free(tmp); } void compute_1d_mass_matrix( @@ -607,8 +610,6 @@ void gen_operators(FDMOperators* op, ElementLengths* lengths, elliptic_t* ellipt mesh_t* create_extended_mesh(elliptic_t* elliptic, hlong* maskedGlobalIds) { - - //platform_t* platform = platform_t::getInstance(); mesh_t* meshRoot = elliptic->mesh; mesh_t* mesh = new mesh_t(); @@ -634,7 +635,10 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic, hlong* maskedGlobalIds) memcpy(mesh->EToV, meshRoot->EToV, mesh->Nverts * mesh->Nelements * sizeof(hlong)); meshParallelConnect(mesh); - meshConnectBoundary(mesh); + + mesh->EToB = (int*) calloc(mesh->Nfaces * mesh->Nelements, sizeof(int)); + memcpy(mesh->EToB, meshRoot->EToB, mesh->Nfaces * mesh->Nelements * sizeof(int)); + meshLoadReferenceNodesHex3D(mesh, mesh->N, 1); meshHaloSetup(mesh); meshPhysicalNodesHex3D(mesh); @@ -642,25 +646,26 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic, hlong* maskedGlobalIds) meshConnectFaceNodes3D(mesh); meshGlobalIds(mesh); - mesh->ogs = ogsSetup(mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, 1, platform->device); + mesh->ogs = ogsSetup(mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, 1, platform->device.occaDevice()); const int bigNum = 1E9; dlong Ntotal = mesh->Np * mesh->Nelements; + //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) int* mapB = (int*) calloc(mesh->Nelements * mesh->Np,sizeof(int)); for (dlong e = 0; e < mesh->Nelements; e++) { for (int n = 0; n < mesh->Np; n++) mapB[n + e * mesh->Np] = bigNum; for (int f = 0; f < mesh->Nfaces; f++) { - int bc = mesh->EToB[f + e * mesh->Nfaces]; + const int bc = elliptic->EToB[f + e * mesh->Nfaces]; if (bc > 0) { for (int n = 0; n < mesh->Nfp; n++) { - int BCFlag = elliptic->BCType[bc]; - int fid = mesh->faceNodes[n + f * mesh->Nfp]; - mapB[fid + e * mesh->Np] = mymin(BCFlag,mapB[fid + e * mesh->Np]); + const int fid = mesh->faceNodes[n + f * mesh->Nfp]; + mapB[fid + e * mesh->Np] = mymin(bc, mapB[fid + e * mesh->Np]); } } } } + ogsGatherScatter(mapB, ogsInt, ogsMin, mesh->ogs); //use the bc flags to find masked ids dlong Nmasked = 0; @@ -673,7 +678,7 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic, hlong* maskedGlobalIds) Nmasked++; else if (mapB[n] == bigNum) mapB[n] = 0.; - else if (mapB[n] == 1) //Dirichlet boundary + else if (mapB[n] == DIRICHLET) Nmasked++; } dlong* maskIds = (dlong*) calloc(Nmasked, sizeof(dlong)); @@ -687,7 +692,7 @@ mesh_t* create_extended_mesh(elliptic_t* elliptic, hlong* maskedGlobalIds) else if (isEdgeNode) maskIds[Nmasked++] = n; } //make a masked version of the global id numbering - memcpy(maskedGlobalIds, mesh->globalIds, Ntotal * sizeof(hlong)); + memcpy(maskedGlobalIds, mesh->globalIds, mesh->Nlocal * sizeof(hlong)); for (dlong n = 0; n < Nmasked; n++) maskedGlobalIds[maskIds[n]] = 0; @@ -778,7 +783,7 @@ void MGLevel::generate_weights() } extrude(work2, 0, zero, work1, 0, one, elliptic->mesh); - oogs::startFinish(work1, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) extendedOgs); + oogs::startFinish(work1, 1, 0, ogsPfloat, ogsAdd, (oogs_t*) ogsExt); extrude(work1, 0, one, work2, 0, onem, elliptic->mesh); extrude(work1, 2, one, work1, 0, one, elliptic->mesh); @@ -808,28 +813,24 @@ void MGLevel::build( const int Nq = elliptic->mesh->Nq; const int Np = elliptic->mesh->Np; - overlap = false; + if(N == 1 && elliptic->options.compareArgs("MULTIGRID COARSE SOLVE", "TRUE")){ + return; + } + const bool serial = (platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"); - if(Nq >= 5 && !serial) overlap = true; - hlong* maskedGlobalIds; - maskedGlobalIds = (hlong*) calloc(Nelements*(Nq+2)*(Nq+2)*(Nq+2),sizeof(hlong)); - mesh_t* extendedMesh = create_extended_mesh(elliptic, maskedGlobalIds); + overlap = false; + if (Nq >= (elliptic_t::minNFDMOverlap + 1) && !serial) + overlap = true; + + hlong* maskedGlobalIdsExt; + maskedGlobalIdsExt = (hlong*) calloc(Nelements*(Nq+2)*(Nq+2)*(Nq+2),sizeof(hlong)); + mesh_t* extendedMesh = create_extended_mesh(elliptic, maskedGlobalIdsExt); const int Nq_e = extendedMesh->Nq; const int Np_e = extendedMesh->Np; const dlong Nlocal_e = Nelements * Np_e; - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - - extendedOgs = (void*) oogs::setup(Nelements * Np_e, maskedGlobalIds, 1, 0, - ogsPfloat, platform->comm.mpiComm, 1, platform->device, - NULL, oogsMode); - meshFree(extendedMesh); - - ogs = (void*) elliptic->oogs; - /** create the element lengths, using the most refined level **/ ElementLengths* lengths = (ElementLengths*) calloc(1,sizeof(ElementLengths)); compute_element_lengths(lengths, pSolver); @@ -878,6 +879,44 @@ void MGLevel::build( o_Sz.copyFrom(casted_Sz, Nq_e * Nq_e * Nelements * sizeof(pfloat)); o_invL.copyFrom(casted_D, Nlocal_e * sizeof(pfloat)); + { + const std::string suffix = std::string("_") + std::to_string(Nq_e-1) + std::string("pfloat"); + preFDMKernel = platform->kernels.get("preFDM" + suffix); + fusedFDMKernel = platform->kernels.get("fusedFDM" + suffix); + postFDMKernel = platform->kernels.get("postFDM" + suffix); + } + + const oogs_mode oogsMode = OOGS_AUTO; + ogsExt = (void*) oogs::setup(Nelements * Np_e, maskedGlobalIdsExt, 1, 0, + ogsPfloat, platform->comm.mpiComm, 1, platform->device.occaDevice(), + NULL, oogsMode); + + ogsExtOverlap = NULL; + if(overlap) { + occa::memory o_Su = platform->device.malloc(mesh->Nlocal * sizeof(pfloat)); + const dlong Nelements = elliptic->mesh->Nelements; + auto callback = [&]() { + if(options.compareArgs("MULTIGRID SMOOTHER","RAS")) { + if(mesh->NlocalGatherElements) + fusedFDMKernel(mesh->NlocalGatherElements,mesh->o_localGatherElementList, + o_Su,o_Sx,o_Sy,o_Sz,o_invL,elliptic->o_invDegree,o_work1); + } else { + if(mesh->NlocalGatherElements) + fusedFDMKernel(mesh->NlocalGatherElements,mesh->o_localGatherElementList, + o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); + } + }; + ogsExtOverlap = (void*) oogs::setup(Nelements * Np_e, maskedGlobalIdsExt, 1, 0, + ogsPfloat, platform->comm.mpiComm, 1, platform->device.occaDevice(), + callback, oogsMode); + o_Su.free(); + } + + free(maskedGlobalIdsExt); + meshFree(extendedMesh); + + ogs = (void*) elliptic->oogs; + generate_weights(); free(casted_Sx); @@ -885,13 +924,6 @@ void MGLevel::build( free(casted_Sz); free(casted_D); - const std::string suffix = std::string("_") + std::to_string(Nq_e-1) + std::string("pfloat"); - - { - preFDMKernel = platform->kernels.getKernel("preFDM" + suffix); - fusedFDMKernel = platform->kernels.getKernel("fusedFDM" + suffix); - postFDMKernel = platform->kernels.getKernel("postFDM" + suffix); - } } void MGLevel::smoothSchwarz(occa::memory& o_u, occa::memory& o_Su, bool xIsZero) @@ -902,44 +934,58 @@ void MGLevel::smoothSchwarz(occa::memory& o_u, occa::memory& o_Su, bool xIsZero) const dlong Nelements = elliptic->mesh->Nelements; preFDMKernel(Nelements, o_u, o_work1); - oogs::startFinish(o_work1, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) extendedOgs); + oogs_t *hogsExt = (overlap) ? (oogs_t*)ogsExtOverlap : (oogs_t*)ogsExt; + + oogs::startFinish(o_work1, 1, 0, ogsDataTypeString, ogsAdd, hogsExt); if(options.compareArgs("MULTIGRID SMOOTHER","RAS")) { if(!overlap){ - fusedFDMKernel(Nelements,mesh->NglobalGatherElements,mesh->o_globalGatherElementList, - o_Su,o_Sx,o_Sy,o_Sz,o_invL,elliptic->o_invDegree,o_work1); - } else if(overlap && mesh->NglobalGatherElements){ - fusedFDMKernel(Nelements,mesh->NglobalGatherElements,mesh->o_globalGatherElementList, + fusedFDMKernel(Nelements, o_Su,o_Sx,o_Sy,o_Sz,o_invL,elliptic->o_invDegree,o_work1); + } else { + if(mesh->NglobalGatherElements) + fusedFDMKernel(mesh->NglobalGatherElements,mesh->o_globalGatherElementList, + o_Su,o_Sx,o_Sy,o_Sz,o_invL,elliptic->o_invDegree,o_work1); } oogs::start(o_Su, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) ogs); if(overlap && mesh->NlocalGatherElements) - fusedFDMKernel(Nelements,mesh->NlocalGatherElements,mesh->o_localGatherElementList, + fusedFDMKernel(mesh->NlocalGatherElements,mesh->o_localGatherElementList, o_Su,o_Sx,o_Sy,o_Sz,o_invL,elliptic->o_invDegree,o_work1); oogs::finish(o_Su, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) ogs); } else { if(!overlap){ - fusedFDMKernel(Nelements,mesh->NglobalGatherElements,mesh->o_globalGatherElementList, - o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); - } else if(overlap && mesh->NglobalGatherElements){ - fusedFDMKernel(Nelements,mesh->NglobalGatherElements,mesh->o_globalGatherElementList, + fusedFDMKernel(Nelements, o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); + } else { + if(mesh->NglobalGatherElements) + fusedFDMKernel(mesh->NglobalGatherElements,mesh->o_globalGatherElementList, + o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); } - oogs::start(o_work2, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) extendedOgs); + oogs::start(o_work2, 1, 0, ogsDataTypeString, ogsAdd, hogsExt); - if(overlap && mesh->NlocalGatherElements) - fusedFDMKernel(Nelements,mesh->NlocalGatherElements,mesh->o_localGatherElementList, - o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); + if(overlap) { + if(mesh->NlocalGatherElements) + fusedFDMKernel(mesh->NlocalGatherElements,mesh->o_localGatherElementList, + o_work2,o_Sx,o_Sy,o_Sz,o_invL,o_work1); + } - oogs::finish(o_work2, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) extendedOgs); + oogs::finish(o_work2, 1, 0, ogsDataTypeString, ogsAdd, hogsExt); postFDMKernel(Nelements,o_work1,o_work2,o_Su, o_wts); oogs::startFinish(o_Su, 1, 0, ogsDataTypeString, ogsAdd, (oogs_t*) ogs); } - if (elliptic->Nmasked) mesh->maskPfloatKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Su); + ellipticApplyMask(elliptic, o_Su, pfloatString); + + const auto Nqe = mesh->Nq + 2; + const auto Npe = Nqe * Nqe * Nqe; + const double flopsPerElem = 12 * Nqe * Npe + Npe; + const double flops = static_cast(mesh->Nelements) * flopsPerElem; + + const double factor = std::is_same::value ? 0.5 : 1.0; + platform->flopCounter->add(elliptic->name + " Schwarz, N=" + std::to_string(mesh->N), factor * flops); } diff --git a/src/elliptic/ellipticMultiGridSetup.cpp b/src/elliptic/ellipticMultiGridSetup.cpp index 9683deaf2..46e843508 100644 --- a/src/elliptic/ellipticMultiGridSetup.cpp +++ b/src/elliptic/ellipticMultiGridSetup.cpp @@ -29,14 +29,10 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) { - - - // setup new object with constant coeff + // setup new object from fine grid but with constant coeff elliptic_t* elliptic = ellipticBuildMultigridLevelFine(elliptic_); + setupAide options = elliptic_->options; mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; - - const dfloat lambda = elliptic->lambda[0]; //read all the nodes files and load them in a dummy mesh array mesh_t** meshLevels = (mesh_t**) calloc(mesh->N + 1,sizeof(mesh_t*)); @@ -63,7 +59,7 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) int Nmin = levelDegree[numMGLevels - 1]; //initialize parAlmond - precon->parAlmond = parAlmond::Init(platform->device, platform->comm.mpiComm, options); + precon->parAlmond = parAlmond::Init(platform->device.occaDevice(), platform->comm.mpiComm, options); parAlmond::multigridLevel** levels = precon->parAlmond->levels; oogs_mode oogsMode = OOGS_AUTO; @@ -74,15 +70,22 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) if(platform->comm.mpiRank == 0) printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nmax); + elliptic->o_lambdaPfloat = platform->device.malloc(2 * mesh->Nelements * mesh->Np, sizeof(pfloat)); + elliptic->copyDfloatToPfloatKernel(2 * mesh->Nelements * mesh->Np, + elliptic->o_lambda, + elliptic->o_lambdaPfloat); + auto callback = [&]() { ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, elliptic->o_p, elliptic->o_Ap, pfloatString); }; elliptic->oogs = oogs::setup(elliptic->ogs, 1, 0, ogsPfloat, NULL, oogsMode); - elliptic->oogsAx = oogs::setup(elliptic->ogs, 1, 0, ogsPfloat, callback, oogsMode); + elliptic->oogsAx = elliptic->oogs; + if(options.compareArgs("GS OVERLAP", "TRUE")) + elliptic->oogsAx = oogs::setup(elliptic->ogs, 1, 0, ogsPfloat, callback, oogsMode); - levels[0] = new MGLevel(elliptic, lambda, Nmax, options, + levels[0] = new MGLevel(elliptic, Nmax, options, precon->parAlmond->ktype, platform->comm.mpiComm); MGLevelAllocateStorage((MGLevel*) levels[0], 0, precon->parAlmond->ctype); @@ -93,11 +96,12 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) for (int n = 1; n < numMGLevels - 1; n++) { int Nc = levelDegree[n]; int Nf = levelDegree[n - 1]; + elliptic_t* ellipticFine = ((MGLevel*) levels[n - 1])->elliptic; //build elliptic struct for this degree if(platform->comm.mpiRank == 0) printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nc); - elliptic_t* ellipticC = ellipticBuildMultigridLevel(elliptic,Nc,Nf); + elliptic_t* ellipticC = ellipticBuildMultigridLevel(ellipticFine,Nc,Nf); auto callback = [&]() { @@ -109,14 +113,15 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) pfloatString); }; ellipticC->oogs = oogs::setup(ellipticC->ogs, 1, 0, ogsPfloat, NULL, oogsMode); - ellipticC->oogsAx = oogs::setup(ellipticC->ogs, 1, 0, ogsPfloat, callback, oogsMode); + ellipticC->oogsAx = ellipticC->oogs; + if(options.compareArgs("GS OVERLAP", "TRUE")) + ellipticC->oogsAx = oogs::setup(ellipticC->ogs, 1, 0, ogsPfloat, callback, oogsMode); //add the level manually levels[n] = new MGLevel(elliptic, meshLevels, - ((MGLevel*) levels[n - 1])->elliptic, + ellipticFine, ellipticC, - lambda, Nf, Nc, options, precon->parAlmond->ktype, platform->comm.mpiComm); @@ -151,7 +156,9 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) pfloatString); }; ellipticCoarse->oogs = oogs::setup(ellipticCoarse->ogs, 1, 0, ogsPfloat, NULL, oogsMode); - ellipticCoarse->oogsAx = oogs::setup(ellipticCoarse->ogs, 1, 0, ogsPfloat, callback, oogsMode); + ellipticCoarse->oogsAx = ellipticCoarse->oogs; + if(options.compareArgs("GS OVERLAP", "TRUE") && options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) + ellipticCoarse->oogsAx = oogs::setup(ellipticCoarse->ogs, 1, 0, ogsPfloat, callback, oogsMode); } else { ellipticCoarse = elliptic; } @@ -174,7 +181,7 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) hlong* coarseGlobalStarts = (hlong*) calloc(platform->comm.mpiCommSize + 1, sizeof(hlong)); if(options.compareArgs("GALERKIN COARSE OPERATOR","TRUE")) - ellipticBuildContinuousGalerkinHex3D(ellipticCoarse,elliptic,lambda,&coarseA,&nnzCoarseA, + ellipticBuildContinuousGalerkinHex3D(ellipticCoarse,elliptic,&coarseA,&nnzCoarseA, &coarseogs,coarseGlobalStarts); else ellipticBuildContinuous(ellipticCoarse, &coarseA, &nnzCoarseA,&coarseogs, @@ -220,13 +227,12 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) meshLevels, ellipticFine, ellipticCoarse, - lambda, Nf, Nc, options, precon->parAlmond->ktype, platform->comm.mpiComm, true); } else { - levels[numMGLevels - 1] = new MGLevel(ellipticCoarse, lambda, Nmin, options, + levels[numMGLevels - 1] = new MGLevel(ellipticCoarse, Nmin, options, precon->parAlmond->ktype, platform->comm.mpiComm, true); } MGLevelAllocateStorage((MGLevel*) levels[numMGLevels - 1], numMGLevels - 1, @@ -277,9 +283,7 @@ void ellipticMultiGridSetup(elliptic_t* elliptic_, precon_t* precon) // for (int n=1;nN+1;n++) delete[] meshLevels[n]; free(meshLevels); - //report top levels - if (platform->comm.mpiRank == 0) { //report the upper multigrid levels - printf("--------------------Multigrid Report---------------------\n"); + if (platform->comm.mpiRank == 0) { printf("---------------------------------------------------------\n"); printf("level| Type | | Smoother |\n"); printf(" | | | |\n"); diff --git a/src/elliptic/ellipticMultiGridUpdateLambda.cpp b/src/elliptic/ellipticMultiGridUpdateLambda.cpp new file mode 100644 index 000000000..4ca3e6d53 --- /dev/null +++ b/src/elliptic/ellipticMultiGridUpdateLambda.cpp @@ -0,0 +1,46 @@ +#include + +void +ellipticMultiGridUpdateLambda(elliptic_t* elliptic) +{ + mesh_t* mesh = elliptic->mesh; + precon_t* precon = elliptic->precon; + parAlmond::multigridLevel** levels = precon->parAlmond->levels; + const int numMGLevels = elliptic->nLevels; + for(int levelIndex = 0; levelIndex < numMGLevels; levelIndex++){ + auto mgLevel = dynamic_cast(levels[levelIndex]); + + if(levelIndex == 0){ + elliptic_t* ellipticFine = mgLevel->elliptic; + ellipticFine->copyDfloatToPfloatKernel(2 * mesh->Nelements * mesh->Np, + elliptic->o_lambda, + ellipticFine->o_lambdaPfloat); + } + else { + auto prevLevel = dynamic_cast(levels[levelIndex-1]); + elliptic_t* ellipticFine = prevLevel->elliptic; + elliptic_t* ellipticCoarse = mgLevel->elliptic; + const int Nfq = ellipticFine->mesh->Nq; + const int Ncq = ellipticCoarse->mesh->Nq; + ellipticCoarse->copyPfloatToDPfloatKernel(2 * ellipticFine->mesh->Nelements * ellipticFine->mesh->Np, + ellipticFine->o_lambdaPfloat, + ellipticFine->o_lambda); + + ellipticCoarse->precon->coarsenKernel(2 * ellipticCoarse->mesh->Nelements, ellipticCoarse->o_interp, ellipticFine->o_lambda, ellipticCoarse->o_lambda); + + ellipticCoarse->copyDfloatToPfloatKernel(2 * ellipticCoarse->mesh->Nelements * ellipticCoarse->mesh->Np, + ellipticCoarse->o_lambda, + ellipticCoarse->o_lambdaPfloat); + } + + if(elliptic->options.compareArgs("MULTIGRID DOWNWARD SMOOTHER","JACOBI") || + elliptic->options.compareArgs("MULTIGRID UPWARD SMOOTHER","JACOBI") || + elliptic->options.compareArgs("MULTIGRID SMOOTHER", "DAMPEDJACOBI")) + { + const bool coarsestLevel = levelIndex == numMGLevels-1; + if(!coarsestLevel || elliptic->options.compareArgs("MULTIGRID COARSE SOLVE", "FALSE")) + ellipticUpdateJacobi(mgLevel->elliptic,mgLevel->o_invDiagA); + } + + } +} diff --git a/src/elliptic/ellipticOgs.cpp b/src/elliptic/ellipticOgs.cpp new file mode 100644 index 000000000..e2b3e253f --- /dev/null +++ b/src/elliptic/ellipticOgs.cpp @@ -0,0 +1,136 @@ +#include "elliptic.h" +#include "platform.hpp" + +void ellipticOgs(mesh_t *mesh, + dlong mNlocal, + int nFields, + dlong offset, + int *EToB, + dlong &Nmasked, + occa::memory &o_maskIds, + dlong &NmaskedLocal, + occa::memory &o_maskIdsLocal, + dlong &NmaskedGlobal, + occa::memory &o_maskIdsGlobal, + ogs_t **ogs) +{ + const int Nlocal = (nFields == 1) ? mNlocal : nFields * offset; + const int largeNumber = 1 << 20; + + int *mapB = (int*) calloc(Nlocal, sizeof(int)); + for(int fld = 0; fld < nFields; fld++) { + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int n = 0; n < mesh->Np; n++) + mapB[n + e * mesh->Np + fld * offset] = largeNumber; + for (int f = 0; f < mesh->Nfaces; f++) { + const int fOffset = fld * mesh->Nelements * mesh->Nfaces; + int bc = EToB[f + e * mesh->Nfaces + fOffset]; + if (bc > 0) { + for (int n = 0; n < mesh->Nfp; n++) { + int fid = mesh->faceNodes[n + f * mesh->Nfp]; + mapB[fid + e * mesh->Np + fld * offset] = mymin(bc, mapB[fid + e * mesh->Np + fld * offset]); + } + } + } + } + } + ogsGatherScatterMany(mapB, + nFields, + offset, + ogsInt, + ogsMin, + mesh->ogs); + + Nmasked = 0; + for(int fld = 0; fld < nFields; fld++) { + for (dlong n = 0; n < mesh->Nlocal; n++) { + if (mapB[n + fld * offset] == largeNumber) { + mapB[n + fld * offset] = 0; + } + else if (mapB[n + fld * offset] == DIRICHLET) { + Nmasked++; + } + } + } + dlong *maskIds = (dlong*) calloc(Nmasked, sizeof(dlong)); + + Nmasked = 0; + for(int fld = 0; fld < nFields; fld++) { + for (dlong n = 0; n < mesh->Nlocal; n++) { + if (mapB[n + fld * offset] == DIRICHLET) + maskIds[Nmasked++] = n + fld * offset; + } + } + if(Nmasked) o_maskIds = platform->device.malloc(Nmasked * sizeof(dlong), maskIds); + + NmaskedLocal = 0; + for (int fld = 0; fld < nFields; fld++) { + for (dlong el = 0; el < mesh->NlocalGatherElements; ++el) { + const dlong elemOffset = mesh->localGatherElementList[el] * mesh->Np; + for (dlong qp = 0; qp < mesh->Np; qp++) { + const dlong n = elemOffset + qp; + if (mapB[n + fld * offset] == DIRICHLET) + NmaskedLocal++; + } + } + } + dlong *localMaskIds = (dlong *)calloc(NmaskedLocal, sizeof(dlong)); + NmaskedLocal = 0; + for (int fld = 0; fld < nFields; fld++) { + for (dlong el = 0; el < mesh->NlocalGatherElements; ++el) { + const dlong elemOffset = mesh->localGatherElementList[el] * mesh->Np; + for (dlong qp = 0; qp < mesh->Np; qp++) { + const dlong n = elemOffset + qp; + if (mapB[n + fld * offset] == DIRICHLET) + localMaskIds[NmaskedLocal++] = n + fld * offset; + } + } + } + if (NmaskedLocal) + o_maskIdsLocal = platform->device.malloc(NmaskedLocal * sizeof(dlong), localMaskIds); + free(localMaskIds); + + NmaskedGlobal = 0; + for (int fld = 0; fld < nFields; fld++) { + for (dlong eg = 0; eg < mesh->NglobalGatherElements; ++eg) { + const dlong elemOffset = mesh->globalGatherElementList[eg] * mesh->Np; + for (dlong qp = 0; qp < mesh->Np; qp++) { + const dlong n = elemOffset + qp; + if (mapB[n + fld * offset] == DIRICHLET) + NmaskedGlobal++; + } + } + } + dlong *globalMaskIds = (dlong *)calloc(NmaskedGlobal, sizeof(dlong)); + NmaskedGlobal = 0; + for (int fld = 0; fld < nFields; fld++) { + for (dlong eg = 0; eg < mesh->NglobalGatherElements; ++eg) { + const dlong elemOffset = mesh->globalGatherElementList[eg] * mesh->Np; + for (dlong qp = 0; qp < mesh->Np; qp++) { + const dlong n = elemOffset + qp; + if (mapB[n + fld * offset] == DIRICHLET) + globalMaskIds[NmaskedGlobal++] = n + fld * offset; + } + } + } + if (NmaskedGlobal) + o_maskIdsGlobal = platform->device.malloc(NmaskedGlobal * sizeof(dlong), globalMaskIds); + free(globalMaskIds); + + free(mapB); + + if(! *ogs) { + if(nFields > 1) { + if(platform->comm.mpiRank == 0) + printf("Creating a masked gs handle for nFields > 1 is currently not supported!\n"); + ABORT(EXIT_FAILURE); + } + + hlong* maskedGlobalIds = (hlong*) calloc(mesh->Nlocal,sizeof(hlong)); + memcpy(maskedGlobalIds, mesh->globalIds, mesh->Nlocal * sizeof(hlong)); + for (dlong n = 0; n < Nmasked; n++) maskedGlobalIds[maskIds[n]] = 0; + *ogs = ogsSetup(mesh->Nlocal, maskedGlobalIds, platform->comm.mpiComm, 1, platform->device.occaDevice()); + free(maskedGlobalIds); + } + free(maskIds); +} diff --git a/src/elliptic/ellipticOperator.cpp b/src/elliptic/ellipticOperator.cpp index 6e60387a5..c0c893b69 100644 --- a/src/elliptic/ellipticOperator.cpp +++ b/src/elliptic/ellipticOperator.cpp @@ -24,11 +24,9 @@ */ +#include #include "elliptic.h" -//#include "ogsInterface.h" #include - -#include "omp.h" void ellipticAx(elliptic_t* elliptic, dlong NelementsList, occa::memory &o_elementsList, @@ -36,186 +34,84 @@ void ellipticAx(elliptic_t* elliptic, occa::memory &o_Aq, const char* precision) { + + if(NelementsList == 0) return; + mesh_t* mesh = elliptic->mesh; setupAide &options = elliptic->options; - const int continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); - const int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; + const bool continuous = options.compareArgs("DISCRETIZATION", "CONTINUOUS"); const int mapType = (elliptic->elementType == HEXAHEDRA && options.compareArgs("ELEMENT MAP", "TRILINEAR")) ? 1:0; const int integrationType = (elliptic->elementType == HEXAHEDRA && options.compareArgs("ELLIPTIC INTEGRATION", "CUBATURE")) ? 1:0; + const std::string precisionStr(precision); + const std::string dFloatStr(dfloatString); - { - bool valid = true; - valid &= continuous; - if(!strstr(precision, dfloatString)) { - valid &= !elliptic->var_coeff; - valid &= !elliptic->blockSolver; - if(!serial) { - valid &= mapType == 0; - valid &= integrationType == 0; - } - } - if(!valid) { - printf("Encountered invalid configuration inside ellipticAx!\n"); - if(elliptic->var_coeff) - printf("Precision level (%s) does not support variable coefficient\n", precision); - if(elliptic->blockSolver) - printf("Precision level (%s) does not support block solver\n", precision); - if(!serial) { - if(mapType != 0) - printf("Precision level (%s) does not support mapType %d\n", precision, mapType); - if(integrationType != 0) - printf("Precision level (%s) does not support integrationType %d\n", precision, integrationType); - } - ABORT(EXIT_FAILURE); - } + bool valid = true; + valid &= continuous; + if(precisionStr != dFloatStr) { + valid &= !elliptic->blockSolver; + valid &= mapType == 0; + valid &= integrationType == 0; } - - if(serial) { - if(continuous) { - if(elliptic->var_coeff) { - if(elliptic->blockSolver) { - occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo; - if(!elliptic->stressForm) - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors, - mesh->o_D, mesh->o_DT, elliptic->o_lambda, - o_q, o_Aq); - else - elliptic->AxStressKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors, - mesh->o_D, mesh->o_DT, elliptic->o_lambda, - o_q, o_Aq); - }else { - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, mesh->o_ggeo, mesh->o_D, - mesh->o_DT, elliptic->o_lambda, o_q, o_Aq); - } - }else{ - const dfloat lambda = elliptic->lambda[0]; - if(elliptic->blockSolver) { - occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo; - if(!elliptic->stressForm) - elliptic->AxKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors, - mesh->o_D, mesh->o_DT, elliptic->o_lambda, - o_q, o_Aq); - else - elliptic->AxStressKernel(mesh->Nelements, elliptic->Ntotal, elliptic->loffset, o_geom_factors, - mesh->o_D, mesh->o_DT, elliptic->o_lambda, - o_q, o_Aq); - }else { - occa::memory &o_ggeo = (!strstr(precision,dfloatString)) ? mesh->o_ggeoPfloat : mesh->o_ggeo; - occa::memory &o_D = (!strstr(precision,dfloatString)) ? mesh->o_DPfloat : mesh->o_D; - occa::memory &o_DT = (!strstr(precision,dfloatString)) ? mesh->o_DTPfloat : mesh->o_DT; - occa::kernel &AxKernel = (!strstr(precision,dfloatString)) ? elliptic->AxPfloatKernel : elliptic->AxKernel; - AxKernel(mesh->Nelements, o_ggeo, o_D, o_DT, elliptic->lambda[0], - o_q, o_Aq); - } - } - } else { - ABORT(EXIT_FAILURE); - } - return; + if(!valid) { + printf("Encountered invalid configuration inside ellipticAx!\n"); + if(elliptic->blockSolver) + printf("Precision level (%s) does not support block solver\n", precision); + if(mapType != 0) + printf("Precision level (%s) does not support mapType %d\n", precision, mapType); + if(integrationType != 0) + printf("Precision level (%s) does not support integrationType %d\n", precision, integrationType); + ABORT(EXIT_FAILURE); } - if(continuous) { - occa::kernel &partialAxKernel = - (!strstr(precision, dfloatString)) ? elliptic->partialAxPfloatKernel : elliptic->partialAxKernel; - - if(NelementsList) { - if(integrationType == 0) { // GLL or non-hex - if(mapType == 0) { - if(elliptic->var_coeff) { - if(elliptic->blockSolver) { - occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo; - partialAxKernel(NelementsList, - elliptic->Ntotal, - elliptic->loffset, - o_elementsList, - o_geom_factors, - mesh->o_D, - mesh->o_DT, - elliptic->o_lambda, - o_q, - o_Aq); - }else { - partialAxKernel(NelementsList, - elliptic->Ntotal, - o_elementsList, - mesh->o_ggeo, - mesh->o_D, - mesh->o_DT, - elliptic->o_lambda, - o_q, - o_Aq); - } - }else{ - if(elliptic->blockSolver) { - occa::memory & o_geom_factors = elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo; - partialAxKernel(NelementsList, - elliptic->Ntotal, - elliptic->loffset, - o_elementsList, - o_geom_factors, - mesh->o_D, - mesh->o_DT, - elliptic->o_lambda, - o_q, - o_Aq); - }else { - occa::memory &o_ggeo = (!strstr(precision,dfloatString)) ? mesh->o_ggeoPfloat : mesh->o_ggeo; - occa::memory &o_D = - (!strstr(precision,dfloatString)) ? mesh->o_DPfloat : mesh->o_D; - occa::memory &o_DT = - (!strstr(precision,dfloatString)) ? mesh->o_DTPfloat : mesh->o_DT; - partialAxKernel(NelementsList, - o_elementsList, - o_ggeo, - o_D, - o_DT, - elliptic->lambda[0], - o_q, - o_Aq); - } - } - }else{ - if(elliptic->var_coeff) { - if(elliptic->blockSolver) - printf("Trilinear version for block solver is not avalibale yet\n"); - else - partialAxKernel(NelementsList, - elliptic->Ntotal, - o_elementsList, - elliptic->o_EXYZ, - elliptic->o_gllzw, - mesh->o_D, - mesh->o_DT, - elliptic->o_lambda, - o_q, - o_Aq); - }else{ - if(elliptic->blockSolver) - printf("Trilinear version for block solver is not avalibale yet\n"); - else - partialAxKernel(NelementsList, - o_elementsList, - elliptic->o_EXYZ, - elliptic->o_gllzw, - mesh->o_D, - mesh->o_DT, - elliptic->lambda[0], - o_q, - o_Aq); - } - } - } + occa::memory & o_geom_factors = + (precisionStr != dFloatStr) ? mesh->o_ggeoPfloat : + elliptic->stressForm ? mesh->o_vgeo : mesh->o_ggeo; + occa::memory & o_D = (precisionStr != dFloatStr) ? mesh->o_DPfloat : mesh->o_D; + occa::memory & o_DT = (precisionStr != dFloatStr) ? mesh->o_DTPfloat : mesh->o_DT; + occa::memory & o_lambda = (precisionStr != dFloatStr) ? elliptic->o_lambdaPfloat : elliptic->o_lambda; + occa::kernel &AxKernel = + (precisionStr != dFloatStr) ? elliptic->AxPfloatKernel : elliptic->AxKernel; + + AxKernel(NelementsList, + elliptic->Ntotal, + elliptic->loffset, + o_elementsList, + o_geom_factors, + o_D, + o_DT, + o_lambda, + o_q, + o_Aq); + double flopCount = 0.0; + + if (elliptic->stressForm) { + // already factors in Nfields + flopCount = 36 * mesh->Np * mesh->Nq + 123 * mesh->Np; + flopCount *= static_cast(NelementsList); + } + else { + flopCount = 12 * mesh->Np * mesh->Nq + 15 * mesh->Np; + if (!elliptic->poisson) { + flopCount += 5 * mesh->Np; } + flopCount *= elliptic->Nfields * static_cast(NelementsList); } + + const double factor = std::is_same::value && (precisionStr != dFloatStr) ? 0.5 : 1.0; + + platform->flopCounter->add(elliptic->name + " Ax, N=" + std::to_string(mesh->N) + ", " + + std::string(precision), + factor * flopCount); } void ellipticOperator(elliptic_t* elliptic, occa::memory &o_q, occa::memory &o_Aq, - const char* precision) + const char* precision, + bool masked) { mesh_t* mesh = elliptic->mesh; setupAide &options = elliptic->options; @@ -225,17 +121,27 @@ void ellipticOperator(elliptic_t* elliptic, "TRUE") ? ogsFloatCommHalf : ogsPfloat : ogsDfloat; - int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; - if(serial) { - occa::memory o_dummy; - ellipticAx(elliptic, mesh->Nelements, o_dummy, o_q, o_Aq, precision); - oogs::startFinish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDataTypeString, ogsAdd, oogsAx); - } else { - ellipticAx(elliptic, mesh->NglobalGatherElements, mesh->o_globalGatherElementList, o_q, o_Aq, precision); - oogs::start(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDataTypeString, ogsAdd, oogsAx); - ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, o_q, o_Aq, precision); - oogs::finish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDataTypeString, ogsAdd, oogsAx); + ellipticAx(elliptic, mesh->NglobalGatherElements, mesh->o_globalGatherElementList, o_q, o_Aq, precision); + if (masked) { + ellipticApplyMask(elliptic, + mesh->NglobalGatherElements, + elliptic->NmaskedGlobal, + mesh->o_globalGatherElementList, + elliptic->o_maskIdsGlobal, + o_Aq, + precision); + } + oogs::start(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDataTypeString, ogsAdd, oogsAx); + ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, o_q, o_Aq, precision); + + if (masked) { + ellipticApplyMask(elliptic, + mesh->NlocalGatherElements, + elliptic->NmaskedLocal, + mesh->o_localGatherElementList, + elliptic->o_maskIdsLocal, + o_Aq, + precision); } - occa::kernel &maskKernel = (!strstr(precision, dfloatString)) ? mesh->maskPfloatKernel : mesh->maskKernel; - if (elliptic->Nmasked) maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Aq); + oogs::finish(o_Aq, elliptic->Nfields, elliptic->Ntotal, ogsDataTypeString, ogsAdd, oogsAx); } diff --git a/src/elliptic/ellipticPreconditioner.cpp b/src/elliptic/ellipticPreconditioner.cpp index 803baf757..268651b89 100644 --- a/src/elliptic/ellipticPreconditioner.cpp +++ b/src/elliptic/ellipticPreconditioner.cpp @@ -36,7 +36,7 @@ void ellipticPreconditioner(elliptic_t* elliptic, occa::memory &o_r, occa::memor mesh_t* mesh = elliptic->mesh; precon_t* precon = elliptic->precon; - setupAide options = elliptic->options; + setupAide& options = elliptic->options; const dlong Nlocal = mesh->Np * mesh->Nelements; @@ -52,12 +52,13 @@ void ellipticPreconditioner(elliptic_t* elliptic, occa::memory &o_r, occa::memor precon->o_invDiagA, o_z ); + platform->flopCounter->add("jacobiPrecon", static_cast(Nlocal) * elliptic->Nfields); }else if (options.compareArgs("PRECONDITIONER", "MULTIGRID")) { parAlmond::Precon(precon->parAlmond, o_z, o_r); }else if (options.compareArgs("PRECONDITIONER", "SEMFEM")) { ellipticSEMFEMSolve(elliptic, o_r, o_z); }else if (options.compareArgs("PRECONDITIONER", "NONE")) { - o_z.copyFrom(o_r); + o_z.copyFrom(o_r, elliptic->Ntotal*elliptic->Nfields*sizeof(dfloat)); }else { if(platform->comm.mpiRank == 0) printf("ERRROR: Unknown preconditioner\n"); MPI_Abort(platform->comm.mpiComm, 1); diff --git a/src/elliptic/ellipticPreconditionerSetup.cpp b/src/elliptic/ellipticPreconditionerSetup.cpp index 90526d44a..acef27feb 100644 --- a/src/elliptic/ellipticPreconditionerSetup.cpp +++ b/src/elliptic/ellipticPreconditionerSetup.cpp @@ -32,7 +32,7 @@ void ellipticPreconditionerSetup(elliptic_t* elliptic, ogs_t* ogs) mesh_t* mesh = elliptic->mesh; precon_t* precon = elliptic->precon; - setupAide options = elliptic->options; + setupAide& options = elliptic->options; MPI_Barrier(platform->comm.mpiComm); const double tStart = MPI_Wtime(); @@ -45,9 +45,7 @@ void ellipticPreconditionerSetup(elliptic_t* elliptic, ogs_t* ogs) } else if(options.compareArgs("PRECONDITIONER", "JACOBI")) { if(platform->comm.mpiRank == 0) printf("building Jacobi preconditioner ... "); fflush(stdout); precon->o_invDiagA = platform->device.malloc(elliptic->Nfields * elliptic->Ntotal , sizeof(pfloat)); - elliptic->axmyzManyPfloatKernel = platform->kernels.getKernel("axmyzManyPfloat"); - elliptic->adyManyPfloatKernel = platform->kernels.getKernel("adyManyPfloat"); - ellipticUpdateJacobi(elliptic); + ellipticUpdateJacobi(elliptic, precon->o_invDiagA); } else if(options.compareArgs("PRECONDITIONER", "NONE")) { // nothing } else { diff --git a/src/elliptic/ellipticSEMFEM.cpp b/src/elliptic/ellipticSEMFEM.cpp index 6d76bc392..912bd8935 100644 --- a/src/elliptic/ellipticSEMFEM.cpp +++ b/src/elliptic/ellipticSEMFEM.cpp @@ -19,10 +19,10 @@ dlong numRowsSEMFEM; void ellipticSEMFEMSetup(elliptic_t* elliptic) { - + const int verbose = (platform->options.compareArgs("VERBOSE","TRUE")) ? 1: 0; const int useFP32 = elliptic->options.compareArgs("SEMFEM SOLVER PRECISION", "FP32"); - gatherKernel = platform->kernels.getKernel("gather"); - scatterKernel = platform->kernels.getKernel("scatter"); + gatherKernel = platform->kernels.get("gather"); + scatterKernel = platform->kernels.get("scatter"); MPI_Barrier(platform->comm.mpiComm); double tStart = MPI_Wtime(); @@ -31,10 +31,13 @@ void ellipticSEMFEMSetup(elliptic_t* elliptic) mesh_t* mesh = elliptic->mesh; double* mask = (double*) malloc(mesh->Np*mesh->Nelements*sizeof(double)); for(int i = 0; i < mesh->Np*mesh->Nelements; ++i) mask[i] = 1.0; - for(dlong n = 0; n < elliptic->Nmasked; n++){ - mask[elliptic->maskIds[n]] = 0.0; + if(elliptic->Nmasked > 0){ + dlong* maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); + elliptic->o_maskIds.copyTo(maskIds, elliptic->Nmasked * sizeof(dlong)); + for (dlong i = 0; i < elliptic->Nmasked; i++) mask[maskIds[i]] = 0.; + free(maskIds); } - + SEMFEMData* data = ellipticBuildSEMFEM( mesh->Nq, mesh->Nelements, @@ -109,7 +112,8 @@ void ellipticSEMFEMSetup(elliptic_t* elliptic) 1, /* Nthreads */ useDevice ? platform->device.id() : -1, 0, /* do not use FP32 - hardwired as no runtime switch is available */ - settings + settings, + verbose ); } else if(elliptic->options.compareArgs("SEMFEM SOLVER", "AMGX")){ diff --git a/src/elliptic/ellipticSetup.cpp b/src/elliptic/ellipticSetup.cpp new file mode 100644 index 000000000..d677b49e4 --- /dev/null +++ b/src/elliptic/ellipticSetup.cpp @@ -0,0 +1,273 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +#include "elliptic.h" +#include +#include "platform.hpp" +#include "linAlg.hpp" + +void ellipticSolveSetup(elliptic_t* elliptic) +{ + + mesh_t* mesh = elliptic->mesh; + setupAide& options = elliptic->options; + + const int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; + + MPI_Barrier(platform->comm.mpiComm); + const double tStart = MPI_Wtime(); + + const dlong Nlocal = mesh->Np * mesh->Nelements; + elliptic->resNormFactor = 1 / (elliptic->Nfields * mesh->volume); + + if (elliptic->blockSolver && elliptic->elementType != HEXAHEDRA && + !options.compareArgs("DISCRETIZATION", + "CONTINUOUS") && !options.compareArgs("PRECONDITIONER","JACOBI") ) { + if(platform->comm.mpiRank == 0) + printf("ERROR: Block solver is implemented for C0-HEXAHEDRA with Jacobi preconditioner only\n"); + + ABORT(EXIT_FAILURE); + } + + if (options.compareArgs("COEFFICIENT","VARIABLE") && elliptic->elementType != HEXAHEDRA && + !options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { + if(platform->comm.mpiRank == 0) + printf("ERROR: Varibale coefficient solver is implemented for C0-HEXAHEDRA only\n"); + + ABORT(EXIT_FAILURE); + } + + if (options.compareArgs("COEFFICIENT","VARIABLE")) { + if(options.compareArgs("PRECONDITIONER", + "MULTIGRID") && + !options.compareArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE")) { + if(platform->comm.mpiRank == 0) + printf( + "ERROR: Varibale coefficient solver is implemented for constant multigrid preconditioner only\n"); + + ABORT(EXIT_FAILURE); + } + } + + if(options.compareArgs("KRYLOV SOLVER", "PGMRES")){ + initializeGmresData(elliptic); + const std::string sectionIdentifier = std::to_string(elliptic->Nfields) + "-"; + elliptic->gramSchmidtOrthogonalizationKernel = + platform->kernels.get(sectionIdentifier + "gramSchmidtOrthogonalization"); + elliptic->updatePGMRESSolutionKernel = + platform->kernels.get(sectionIdentifier + "updatePGMRESSolution"); + elliptic->fusedResidualAndNormKernel = + platform->kernels.get(sectionIdentifier + "fusedResidualAndNorm"); + } + + const size_t offsetBytes = elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat); + if(elliptic->o_wrk.size() < elliptic_t::NScratchFields * offsetBytes) { + if(platform->comm.mpiRank == 0) printf("ERROR: mempool assigned for elliptic too small!"); + ABORT(EXIT_FAILURE); + } + +#if 0 + elliptic->o_p = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); + elliptic->o_z = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); + elliptic->o_Ap = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); + elliptic->o_x0 = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); +#else + elliptic->o_p = elliptic->o_wrk + 0*offsetBytes; + elliptic->o_z = elliptic->o_wrk + 1*offsetBytes; + elliptic->o_Ap = elliptic->o_wrk + 2*offsetBytes; + elliptic->o_x0 = elliptic->o_wrk + 3*offsetBytes; +#endif + + const dlong Nblocks = (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE; + elliptic->tmpNormr = (dfloat*) calloc(Nblocks,sizeof(dfloat)); + elliptic->o_tmpNormr = platform->device.malloc(Nblocks * sizeof(dfloat), + elliptic->tmpNormr); + + int useFlexible = options.compareArgs("KRYLOV SOLVER", "FLEXIBLE"); + + elliptic->type = strdup(dfloatString); + + // count total number of elements + hlong NelementsLocal = mesh->Nelements; + hlong NelementsGlobal = 0; + + MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); + + elliptic->NelementsGlobal = NelementsGlobal; + + dfloat* lambda = (dfloat*) calloc(2*elliptic->Ntotal, sizeof(dfloat)); + elliptic->o_lambda.copyTo(lambda, 2*elliptic->Ntotal*sizeof(dfloat)); + + int *allNeumann = (int *)calloc(elliptic->Nfields, sizeof(int)); + // check based on the coefficient + for(int fld = 0; fld < elliptic->Nfields; fld++) { + if(elliptic->coeffField) { + int allzero = 1; + for(int n = 0; n < Nlocal; n++) { // check any non-zero value for each field + if(lambda[n + elliptic->Ntotal + fld * elliptic->loffset]) { + allzero = 0; + break; + } + } + allNeumann[fld] = allzero; + }else{ + allNeumann[fld] = (lambda[elliptic->Ntotal + fld * elliptic->loffset] == 0) ? 1 : 0; + } + } + + free(lambda); + + elliptic->o_EToB = platform->device.malloc(mesh->Nelements * mesh->Nfaces * elliptic->Nfields * sizeof(int), + elliptic->EToB); + + // check based on BC + for (int fld = 0; fld < elliptic->Nfields; fld++) { + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int offset = fld * mesh->Nelements * mesh->Nfaces; + const int bc = elliptic->EToB[f + e * mesh->Nfaces + offset]; + bool isDirichlet = (bc != NO_OP && bc != NEUMANN); + if (isDirichlet) + allNeumann[fld] = 0; + } + } + } + elliptic->allNeumann = 0; + int* allBlockNeumann = (int*)calloc(elliptic->Nfields, sizeof(int)); + for(int fld = 0; fld < elliptic->Nfields; fld++) { + int lallNeumann, gallNeumann; + lallNeumann = allNeumann[fld] ? 0:1; + MPI_Allreduce(&lallNeumann, &gallNeumann, 1, MPI_INT, MPI_SUM, platform->comm.mpiComm); + allBlockNeumann[fld] = (gallNeumann > 0) ? 0: 1; + if (allBlockNeumann[fld]) + elliptic->allNeumann = 1; + } + free(allBlockNeumann); + + if (platform->comm.mpiRank == 0 && elliptic->allNeumann) + printf("allNeumann = %d \n", elliptic->allNeumann); + + if(mesh->ogs == NULL) { + if(platform->comm.mpiRank == 0) printf("ERROR: mesh->ogs == NULL!"); + ABORT(EXIT_FAILURE); + } + + { //setup an unmasked gs handle + ogs_t *ogs = NULL; + if (elliptic->blockSolver) ogs = mesh->ogs; + ellipticOgs(mesh, + elliptic->Ntotal, + elliptic->Nfields, + /* offset */ elliptic->Ntotal, + elliptic->EToB, + elliptic->Nmasked, + elliptic->o_maskIds, + elliptic->NmaskedLocal, + elliptic->o_maskIdsLocal, + elliptic->NmaskedGlobal, + elliptic->o_maskIdsGlobal, + &ogs); + elliptic->ogs = ogs; + elliptic->o_invDegree = elliptic->ogs->o_invDegree; + } + + elliptic->precon = new precon_t(); + + std::string suffix = "Hex3D"; + std::string kernelName; + + { + mesh->maskKernel = platform->kernels.get("mask"); + mesh->maskPfloatKernel = platform->kernels.get("maskPfloat"); + } + + { + const std::string sectionIdentifier = std::to_string(elliptic->Nfields) + "-"; + kernelName = "ellipticBlockBuildDiagonal" + suffix; + elliptic->ellipticBlockBuildDiagonalKernel = platform->kernels.get(sectionIdentifier + kernelName); + elliptic->axmyzManyPfloatKernel = platform->kernels.get("axmyzManyPfloat"); + elliptic->adyManyPfloatKernel = platform->kernels.get("adyManyPfloat"); + + std::string kernelNamePrefix = ""; + if(elliptic->poisson) kernelNamePrefix += "poisson-"; + kernelNamePrefix += "elliptic"; + if (elliptic->blockSolver) + kernelNamePrefix += (elliptic->stressForm) ? "Stress" : "Block"; + + kernelName = "Ax"; + if (elliptic->coeffField) kernelName += "Coeff"; + if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) kernelName += "Trilinear"; + kernelName += suffix; + if (elliptic->blockSolver && !elliptic->stressForm) + kernelName += "_N" + std::to_string(elliptic->Nfields); + + elliptic->AxKernel = + platform->kernels.get(kernelNamePrefix + "Partial" + kernelName); + + elliptic->updatePCGKernel = + platform->kernels.get(sectionIdentifier + "ellipticBlockUpdatePCG"); + } + + oogs_mode oogsMode = OOGS_AUTO; + auto callback = [&]() // hardwired to FP64 variable coeff + { + ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, + elliptic->o_p, elliptic->o_Ap, dfloatString); + }; + elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); + elliptic->oogsAx = elliptic->oogs; + if(options.compareArgs("GS OVERLAP", "TRUE")) + elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); + + long long int pre = platform->device.occaDevice().memoryAllocated(); + ellipticPreconditionerSetup(elliptic, elliptic->ogs); + + long long int usedBytes = platform->device.occaDevice().memoryAllocated() - pre; + + elliptic->precon->preconBytes = usedBytes; + + if(options.compareArgs("INITIAL GUESS","PROJECTION") || + options.compareArgs("INITIAL GUESS", "PROJECTION-ACONJ")) + { + dlong nVecsProject = 8; + options.getArgs("RESIDUAL PROJECTION VECTORS", nVecsProject); + + dlong nStepsStart = 5; + options.getArgs("RESIDUAL PROJECTION START", nStepsStart); + + SolutionProjection::ProjectionType type = SolutionProjection::ProjectionType::CLASSIC; + if(options.compareArgs("INITIAL GUESS", "PROJECTION-ACONJ")) + type = SolutionProjection::ProjectionType::ACONJ; + else if (options.compareArgs("INITIAL GUESS", "PROJECTION")) + type = SolutionProjection::ProjectionType::CLASSIC; + + elliptic->solutionProjection = new SolutionProjection(*elliptic, type, nVecsProject, nStepsStart); + } + + MPI_Barrier(platform->comm.mpiComm); + if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStart); + fflush(stdout); +} diff --git a/src/elliptic/ellipticResidualProjection.cpp b/src/elliptic/ellipticSolutionProjection.cpp similarity index 77% rename from src/elliptic/ellipticResidualProjection.cpp rename to src/elliptic/ellipticSolutionProjection.cpp index 8b4918a2f..e2b2da956 100644 --- a/src/elliptic/ellipticResidualProjection.cpp +++ b/src/elliptic/ellipticSolutionProjection.cpp @@ -25,13 +25,13 @@ */ #include "mesh.h" #include "elliptic.h" -#include "ellipticResidualProjection.h" +#include "ellipticSolutionProjection.h" #include #include "timer.hpp" #include "platform.hpp" #include "linAlg.hpp" -void ResidualProjection::matvec(occa::memory& o_Ax, +void SolutionProjection::matvec(occa::memory& o_Ax, const dlong Ax_offset, occa::memory& o_x, const dlong x_offset) @@ -41,11 +41,13 @@ void ResidualProjection::matvec(occa::memory& o_Ax, matvecOperator(o_xtmp, o_Axtmp); } -void ResidualProjection::updateProjectionSpace() +void SolutionProjection::updateProjectionSpace() { if(numVecsProjection <= 0) return; + double flopCount = 0.0; + platform->linAlg->weightedInnerProdMulti( Nlocal, numVecsProjection, @@ -67,6 +69,10 @@ void ResidualProjection::updateProjectionSpace() const dfloat one = 1.0; multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_xx); if(type == ProjectionType::CLASSIC) multiScaledAddwOffsetKernel(Nlocal, numVecsProjection, Nfields * (numVecsProjection - 1) * fieldOffset, fieldOffset, o_alpha, one, o_bb); + + flopCount += 3 * static_cast(Nlocal) * Nfields * (numVecsProjection - 1); + flopCount *= (type == ProjectionType::CLASSIC) ? 2 : 1; + for(int k = 0; k < numVecsProjection - 1; ++k) norm_new = norm_new - alpha[k] * alpha[k]; norm_new = sqrt(norm_new); @@ -76,6 +82,8 @@ void ResidualProjection::updateProjectionSpace() const dfloat scale = 1.0 / norm_new; platform->linAlg->scaleMany(Nlocal, Nfields, fieldOffset, scale, o_xx, fieldOffset * Nfields * (numVecsProjection - 1)); if(type == ProjectionType::CLASSIC) platform->linAlg->scaleMany(Nlocal, Nfields, fieldOffset, scale, o_bb, fieldOffset * Nfields * (numVecsProjection - 1)); + flopCount += static_cast(Nlocal) * Nfields; + flopCount *= (type == ProjectionType::CLASSIC) ? 2 : 1; } else { if(verbose && platform->comm.mpiRank == 0) { std::cout << "Detected rank deficiency: " << test << ".\n"; @@ -83,11 +91,15 @@ void ResidualProjection::updateProjectionSpace() } numVecsProjection--; } + + platform->flopCounter->add(solverName + " SolutionProjection::updateProjectionSpace", flopCount); } -void ResidualProjection::computePreProjection(occa::memory& o_r) +void SolutionProjection::computePreProjection(occa::memory& o_r) { - + + dfloat flopCount = 0.0; + dfloat one = 1.0; dfloat zero = 0.0; dfloat mone = -1.0; @@ -107,20 +119,25 @@ void ResidualProjection::computePreProjection(occa::memory& o_r) o_alpha.copyFrom(alpha,sizeof(dfloat) * numVecsProjection); accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_xx, o_xbar); + + flopCount += Nfields * (1 + 2 * (numVecsProjection - 1)) * static_cast(Nlocal); if(type == ProjectionType::CLASSIC){ accumulateKernel(Nlocal, numVecsProjection, fieldOffset, o_alpha, o_bb, o_rtmp); platform->linAlg->axpbyMany(Nlocal, Nfields, fieldOffset, mone, o_rtmp, one, o_r); + + flopCount += Nfields * (1 + 2 * (numVecsProjection - 1)) * static_cast(Nlocal); // accumulation } else if (type == ProjectionType::ACONJ) { matvec(o_bb, 0, o_xbar, 0); platform->linAlg->axpbyMany(Nlocal, Nfields, fieldOffset, mone, o_bb, one, o_r); } + + platform->flopCounter->add(solverName + " SolutionProjection::computePreProjection", flopCount); } -void ResidualProjection::computePostProjection(occa::memory & o_x) +void SolutionProjection::computePostProjection(occa::memory & o_x) { - const dfloat one = 1.0; const dfloat zero = 0.0; @@ -152,26 +169,21 @@ void ResidualProjection::computePostProjection(occa::memory & o_x) } } -ResidualProjection::ResidualProjection(elliptic_t& elliptic, +SolutionProjection::SolutionProjection(elliptic_t &elliptic, const ProjectionType _type, const dlong _maxNumVecsProjection, const dlong _numTimeSteps) - : - maxNumVecsProjection(_maxNumVecsProjection), - numTimeSteps(_numTimeSteps), - type(_type), - Nlocal(elliptic.mesh->Np * elliptic.mesh->Nelements), - fieldOffset(elliptic.Ntotal), - Nfields(elliptic.Nfields), - o_invDegree(elliptic.mesh->ogs->o_invDegree), - o_rtmp(elliptic.o_z), - o_Ap(elliptic.o_Ap) + : maxNumVecsProjection(_maxNumVecsProjection), numTimeSteps(_numTimeSteps), type(_type), + alpha((dfloat *)calloc(maxNumVecsProjection, sizeof(dfloat))), numVecsProjection(0), + prevNumVecsProjection(0), Nlocal(elliptic.mesh->Np * elliptic.mesh->Nelements), + fieldOffset(elliptic.Ntotal), Nfields(elliptic.Nfields), timestep(0), + verbose(elliptic.options.compareArgs("VERBOSE", "TRUE")), o_invDegree(elliptic.mesh->ogs->o_invDegree), + o_rtmp(elliptic.o_z), o_Ap(elliptic.o_Ap) { + solverName = elliptic.name; + platform_t* platform = platform_t::getInstance(); - timestep = 0; - numVecsProjection = 0; - verbose = elliptic.options.compareArgs("VERBOSE","TRUE"); - alpha = (dfloat*) calloc(maxNumVecsProjection, sizeof(dfloat)); + o_alpha = platform->device.malloc(maxNumVecsProjection, sizeof(dfloat)); o_xbar = platform->device.malloc(Nfields * fieldOffset, sizeof(dfloat)); o_xx = platform->device.malloc(Nfields * fieldOffset * maxNumVecsProjection, sizeof(dfloat)); @@ -180,12 +192,11 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic, Nfields * fieldOffset , sizeof(dfloat)); - std::string kernelName; const std::string sectionIdentifier = std::to_string(Nfields) + "-"; { - multiScaledAddwOffsetKernel = platform->kernels.getKernel(sectionIdentifier + "multiScaledAddwOffset"); - accumulateKernel = platform->kernels.getKernel(sectionIdentifier + "accumulate"); + multiScaledAddwOffsetKernel = platform->kernels.get(sectionIdentifier + "multiScaledAddwOffset"); + accumulateKernel = platform->kernels.get(sectionIdentifier + "accumulate"); } matvecOperator = [&](occa::memory& o_x, occa::memory & o_Ax) @@ -194,17 +205,19 @@ ResidualProjection::ResidualProjection(elliptic_t& elliptic, }; } -void ResidualProjection::pre(occa::memory& o_r) +void SolutionProjection::pre(occa::memory& o_r) { ++timestep; if(timestep < numTimeSteps) return; if(numVecsProjection <= 0) return; + + prevNumVecsProjection = numVecsProjection; computePreProjection(o_r); } -void ResidualProjection::post(occa::memory& o_x) +void SolutionProjection::post(occa::memory& o_x) { if(timestep < numTimeSteps) return; diff --git a/src/elliptic/ellipticResidualProjection.h b/src/elliptic/ellipticSolutionProjection.h similarity index 87% rename from src/elliptic/ellipticResidualProjection.h rename to src/elliptic/ellipticSolutionProjection.h index b47a26dee..9e699edd6 100644 --- a/src/elliptic/ellipticResidualProjection.h +++ b/src/elliptic/ellipticSolutionProjection.h @@ -31,19 +31,22 @@ #include #include "elliptic.h" -class ResidualProjection final +class SolutionProjection final { public: enum class ProjectionType { CLASSIC, ACONJ, }; - ResidualProjection(elliptic_t& _elliptic, + SolutionProjection(elliptic_t& _elliptic, const ProjectionType _type, const dlong _maxNumVecsProjection = 8, const dlong _numTimeSteps = 5); void pre(occa::memory& o_r); void post(occa::memory& o_x); + dlong getNumVecsProjection() const { return numVecsProjection; } + dlong getPrevNumVecsProjection() const { return prevNumVecsProjection; } + dlong getMaxNumVecsProjection() const { return maxNumVecsProjection; } private: void computePreProjection(occa::memory& o_r); void computePostProjection(occa::memory& o_x); @@ -55,6 +58,8 @@ class ResidualProjection final dlong timestep; bool verbose; + std::string solverName; + occa::memory o_xbar; occa::memory o_xx; occa::memory o_bb; @@ -71,6 +76,7 @@ class ResidualProjection final dfloat* alpha; dlong numVecsProjection; + dlong prevNumVecsProjection; const dlong Nlocal; // vector size const dlong fieldOffset; // offset const dlong Nfields; diff --git a/src/elliptic/ellipticSolve.cpp b/src/elliptic/ellipticSolve.cpp index 3bbf4e6bc..0aa7c57d4 100644 --- a/src/elliptic/ellipticSolve.cpp +++ b/src/elliptic/ellipticSolve.cpp @@ -31,13 +31,23 @@ void ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x) { + setupAide& options = elliptic->options; + if(elliptic->coeffFieldPreco && options.compareArgs("PRECONDITIONER", "JACOBI")) + ellipticUpdateJacobi(elliptic, elliptic->precon->o_invDiagA); + else if(elliptic->coeffFieldPreco && options.compareArgs("PRECONDITIONER", "MULTIGRID")) + ellipticMultiGridUpdateLambda(elliptic); + mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; + + std::string name = elliptic->name; + if(name.find("scalar") != std::string::npos){ + name = "scalar"; + } int maxIter = 999; options.getArgs("MAXIMUM ITERATIONS", maxIter); const int verbose = options.compareArgs("VERBOSE", "TRUE"); - elliptic->resNormFactor = 1 / (elliptic->Nfields * mesh->volume); + elliptic->resNormFactor = 1 / mesh->volume; if(verbose) { const dfloat rhsNorm = @@ -67,9 +77,6 @@ void ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x) if(platform->comm.mpiRank == 0) printf("%s x0 norm: %.15e\n", elliptic->name.c_str(), rhsNorm); } - if(elliptic->var_coeff && options.compareArgs("PRECONDITIONER", "JACOBI")) - ellipticUpdateJacobi(elliptic); - // compute initial residual r = rhs - Ax0 ellipticAx(elliptic, mesh->NglobalGatherElements, mesh->o_globalGatherElementList, o_x, elliptic->o_Ap, dfloatString); ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, o_x, elliptic->o_Ap, dfloatString); @@ -83,14 +90,15 @@ void ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x) o_r ); if(elliptic->allNeumann) ellipticZeroMean(elliptic, o_r); + ellipticApplyMask(elliptic, o_r, dfloatString); oogs::startFinish(o_r, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, ogsAdd, elliptic->oogs); - if(elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_r); elliptic->o_x0.copyFrom(o_x, elliptic->Nfields * elliptic->Ntotal * sizeof(dfloat)); platform->linAlg->fill(elliptic->Ntotal * elliptic->Nfields, 0.0, o_x); if(options.compareArgs("INITIAL GUESS","PROJECTION") || options.compareArgs("INITIAL GUESS","PROJECTION-ACONJ")) { - platform->timer.tic(elliptic->name + " proj pre",1); + + platform->timer.tic(name + " proj pre",1); elliptic->res00Norm = platform->linAlg->weightedNorm2Many( mesh->Nlocal, @@ -105,8 +113,8 @@ void ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x) if(platform->comm.mpiRank == 0) printf("Unreasonable res00Norm!\n"); ABORT(EXIT_FAILURE); } - elliptic->residualProjection->pre(o_r); - platform->timer.toc(elliptic->name + " proj pre"); + elliptic->solutionProjection->pre(o_r); + platform->timer.toc(name + " proj pre"); } elliptic->res0Norm = @@ -147,9 +155,9 @@ void ellipticSolve(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x) if(options.compareArgs("INITIAL GUESS","PROJECTION") || options.compareArgs("INITIAL GUESS","PROJECTION-ACONJ")) { - platform->timer.tic(elliptic->name + " proj post",1); - elliptic->residualProjection->post(o_x); - platform->timer.toc(elliptic->name + " proj post"); + platform->timer.tic(name + " proj post",1); + elliptic->solutionProjection->post(o_x); + platform->timer.toc(name + " proj post"); } else { elliptic->res00Norm = elliptic->res0Norm; } diff --git a/src/elliptic/ellipticSolveSetup.cpp b/src/elliptic/ellipticSolveSetup.cpp deleted file mode 100644 index e7b261b5a..000000000 --- a/src/elliptic/ellipticSolveSetup.cpp +++ /dev/null @@ -1,382 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -#include "elliptic.h" -#include -#include "platform.hpp" -#include "linAlg.hpp" - -void ellipticSolveSetup(elliptic_t* elliptic) -{ - - mesh_t* mesh = elliptic->mesh; - - setupAide options = elliptic->options; - - MPI_Barrier(platform->comm.mpiComm); - const double tStart = MPI_Wtime(); - - const dlong Nlocal = mesh->Np * mesh->Nelements; - elliptic->resNormFactor = 1 / (elliptic->Nfields * mesh->volume); - - const int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; - - if (elliptic->blockSolver && elliptic->elementType != HEXAHEDRA && - !options.compareArgs("DISCRETIZATION", - "CONTINUOUS") && !options.compareArgs("PRECONDITIONER","JACOBI") ) { - if(platform->comm.mpiRank == 0) - printf("ERROR: Block solver is implemented for C0-HEXAHEDRA with Jacobi preconditioner only\n"); - - ABORT(EXIT_FAILURE); - } - - if (options.compareArgs("COEFFICIENT","VARIABLE") && elliptic->elementType != HEXAHEDRA && - !options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { - if(platform->comm.mpiRank == 0) - printf("ERROR: Varibale coefficient solver is implemented for C0-HEXAHEDRA only\n"); - - ABORT(EXIT_FAILURE); - } - - if (options.compareArgs("COEFFICIENT","VARIABLE")) { - if(options.compareArgs("PRECONDITIONER", - "MULTIGRID") && - !options.compareArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE")) { - if(platform->comm.mpiRank == 0) - printf( - "ERROR: Varibale coefficient solver is implemented for constant multigrid preconditioner only\n"); - - ABORT(EXIT_FAILURE); - } - } - - if(options.compareArgs("KRYLOV SOLVER", "PGMRES")){ - initializeGmresData(elliptic); - const std::string sectionIdentifier = std::to_string(elliptic->Nfields) + "-"; - elliptic->gramSchmidtOrthogonalizationKernel = - platform->kernels.getKernel(sectionIdentifier + "gramSchmidtOrthogonalization"); - elliptic->updatePGMRESSolutionKernel = - platform->kernels.getKernel(sectionIdentifier + "updatePGMRESSolution"); - elliptic->fusedResidualAndNormKernel = - platform->kernels.getKernel(sectionIdentifier + "fusedResidualAndNorm"); - } - - const size_t offsetBytes = elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat); - if(elliptic->o_wrk.size() < elliptic_t::NScratchFields * offsetBytes) { - if(platform->comm.mpiRank == 0) printf("ERROR: mempool assigned for elliptic too small!"); - ABORT(EXIT_FAILURE); - } - -#if 0 - elliptic->o_p = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); - elliptic->o_z = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); - elliptic->o_Ap = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); - elliptic->o_x0 = platform->device.malloc(elliptic->Ntotal * elliptic->Nfields * sizeof(dfloat)); -#else - elliptic->o_p = elliptic->o_wrk + 0*offsetBytes; - elliptic->o_z = elliptic->o_wrk + 1*offsetBytes; - elliptic->o_Ap = elliptic->o_wrk + 2*offsetBytes; - elliptic->o_x0 = elliptic->o_wrk + 3*offsetBytes; -#endif - - const dlong Nblocks = (Nlocal + BLOCKSIZE - 1) / BLOCKSIZE; - elliptic->tmpNormr = (dfloat*) calloc(Nblocks,sizeof(dfloat)); - elliptic->o_tmpNormr = platform->device.malloc(Nblocks * sizeof(dfloat), - elliptic->tmpNormr); - - int useFlexible = options.compareArgs("KRYLOV SOLVER", "FLEXIBLE"); - - elliptic->type = strdup(dfloatString); - - // count total number of elements - hlong NelementsLocal = mesh->Nelements; - hlong NelementsGlobal = 0; - - MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - - elliptic->NelementsGlobal = NelementsGlobal; - - elliptic->allNeumannPenalty = 1.; - hlong localElements = (hlong) mesh->Nelements; - hlong totalElements = 0; - MPI_Allreduce(&localElements, &totalElements, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); - elliptic->allNeumannScale = 1. / sqrt((dfloat)mesh->Np * totalElements); - - elliptic->allNeumannPenalty = 0; - elliptic->allNeumannScale = 0; - - elliptic->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces * elliptic->Nfields,sizeof(int)); - int* allNeumann = (int*)calloc(elliptic->Nfields, sizeof(int)); - // check based on the coefficient - for(int fld = 0; fld < elliptic->Nfields; fld++) { - if(elliptic->var_coeff) { - int allzero = 1; - for(int n = 0; n < Nlocal; n++) { // check any non-zero value for each field - const dfloat lambda = elliptic->lambda[n + elliptic->Ntotal + fld * elliptic->loffset]; - if(lambda) { - allzero = 0; - break; - } - } - allNeumann[fld] = allzero; - }else{ - allNeumann[fld] = (elliptic->lambda[fld] == 0) ? 1 : 0; - } - } - - // check based on BC - for(int fld = 0; fld < elliptic->Nfields; fld++) - for (dlong e = 0; e < mesh->Nelements; e++) - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = mesh->EToB[e * mesh->Nfaces + f]; - if (bc > 0) { - int BC = elliptic->BCType[bc + elliptic->NBCType * fld]; - elliptic->EToB[f + e * mesh->Nfaces + fld * mesh->Nelements * mesh->Nfaces] = BC; //record it - if (BC != 2) allNeumann[fld] = 0; //check if its a Dirchlet for each field - } - } - - elliptic->allNeumann = 0; - elliptic->allBlockNeumann = (int*)calloc(elliptic->Nfields, sizeof(int)); - for(int fld = 0; fld < elliptic->Nfields; fld++) { - int lallNeumann, gallNeumann; - lallNeumann = allNeumann[fld] ? 0:1; - MPI_Allreduce(&lallNeumann, &gallNeumann, 1, MPI_INT, MPI_SUM, platform->comm.mpiComm); - elliptic->allBlockNeumann[fld] = (gallNeumann > 0) ? 0: 1; - // even if there is a single allNeumann activate Null space correction - if(elliptic->allBlockNeumann[fld]) - elliptic->allNeumann = 1; - } - - if(platform->comm.mpiRank == 0) - printf("allNeumann = %d \n", elliptic->allNeumann); - - //copy boundary flags - elliptic->o_EToB = platform->device.malloc( - mesh->Nelements * mesh->Nfaces * elliptic->Nfields * sizeof(int), - elliptic->EToB); - - //setup an unmasked gs handle - int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; - if(mesh->ogs == NULL) meshParallelGatherScatterSetup(mesh, Nlocal, mesh->globalIds, platform->comm.mpiComm, verbose); - - //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) - const int mapSize = elliptic->blockSolver ? elliptic->Ntotal * elliptic->Nfields: Nlocal; - elliptic->mapB = (int*) calloc(mapSize,sizeof(int)); - const int largeNumber = 1 << 20; - for(int fld = 0; fld < elliptic->Nfields; fld++) - for (dlong e = 0; e < mesh->Nelements; e++) { - for (int n = 0; n < mesh->Np; n++) - elliptic->mapB[n + e * mesh->Np + fld * elliptic->Ntotal] = largeNumber; - for (int f = 0; f < mesh->Nfaces; f++) { - int bc = mesh->EToB[f + e * mesh->Nfaces]; - if (bc > 0) { - int BCFlag = elliptic->BCType[bc + elliptic->NBCType * fld]; - for (int n = 0; n < mesh->Nfp; n++) { - int fid = mesh->faceNodes[n + f * mesh->Nfp]; - elliptic->mapB[fid + e * mesh->Np + fld * elliptic->Ntotal] = - mymin(BCFlag, elliptic->mapB[fid + e * mesh->Np + fld * elliptic->Ntotal]); - } - } - } - } - ogsGatherScatterMany(elliptic->mapB, - elliptic->Nfields, - elliptic->Ntotal, - ogsInt, - ogsMin, - mesh->ogs); - - // Create mask Ids - elliptic->Nmasked = 0; - elliptic->fNmasked = (dlong*)calloc(elliptic->Nfields, sizeof(dlong)); - for(int fld = 0; fld < elliptic->Nfields; fld++) - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) { - if (elliptic->mapB[n + fld * elliptic->Ntotal] == largeNumber) { - elliptic->mapB[n + fld * elliptic->Ntotal] = 0.; - } else if (elliptic->mapB[n + fld * elliptic->Ntotal] == 1) { //Dirichlet boundary - elliptic->Nmasked++; // increase global accumulator - elliptic->fNmasked[fld]++; // increase local accumulator - } - } - elliptic->o_mapB = platform->device.malloc(mapSize * sizeof(int), elliptic->mapB); - - elliptic->maskIds = (dlong*) calloc(elliptic->Nmasked, sizeof(dlong)); - elliptic->Nmasked = 0; - for(int fld = 0; fld < elliptic->Nfields; fld++) - for (dlong n = 0; n < mesh->Nelements * mesh->Np; n++) - if (elliptic->mapB[n + fld * elliptic->Ntotal] == 1) - elliptic->maskIds[elliptic->Nmasked++] = n + fld * elliptic->Ntotal; - if (elliptic->Nmasked) - elliptic->o_maskIds = platform->device.malloc(elliptic->Nmasked * sizeof(dlong), elliptic->maskIds); - - if(elliptic->blockSolver) { - elliptic->ogs = mesh->ogs; // cannot use masked version as mixed BC's possible in each field - } else { - hlong* maskedGlobalIds = (hlong*) calloc(Nlocal,sizeof(hlong)); - memcpy(maskedGlobalIds, mesh->globalIds, Nlocal * sizeof(hlong)); - for (dlong n = 0; n < elliptic->Nmasked; n++) - maskedGlobalIds[elliptic->maskIds[n]] = 0; - - elliptic->ogs = ogsSetup(Nlocal, maskedGlobalIds, platform->comm.mpiComm, verbose, platform->device); - free(maskedGlobalIds); - } - elliptic->o_invDegree = elliptic->ogs->o_invDegree; - - elliptic->precon = new precon_t(); - - std::string suffix = "Hex3D"; - std::string kernelName; - - MPI_Barrier(platform->comm.mpiComm); - double tStartLoadKernel = MPI_Wtime(); - if(platform->comm.mpiRank == 0) printf("loading elliptic kernels ... "); - fflush(stdout); - - { - mesh->maskKernel = - platform->kernels.getKernel("mask"); - } - - { - const std::string sectionIdentifier = std::to_string(elliptic->Nfields) + "-"; - kernelName = "ellipticBlockBuildDiagonal" + suffix; - elliptic->updateDiagonalKernel = platform->kernels.getKernel(sectionIdentifier + kernelName); - if(elliptic->blockSolver) { - if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA) { - if(elliptic->stressForm) - kernelName = "ellipticStressAxVar" + suffix; - else - kernelName = "ellipticBlockAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields); - }else { - if(elliptic->stressForm) - kernelName = "ellipticStressAx" + suffix; - else - kernelName = "ellipticBlockAx", suffix + "_N" + std::to_string(elliptic->Nfields); - } - }else{ - if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA) - kernelName = "ellipticAxVar" + suffix; - else - kernelName = "ellipticAx" + suffix; - } - elliptic->AxStressKernel = platform->kernels.getKernel(kernelName); - if(elliptic->blockSolver) { - if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA) - kernelName = "ellipticBlockAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields); - else - kernelName = "ellipticBlockAx" + suffix + "_N" + std::to_string(elliptic->Nfields); - }else{ - if(elliptic->var_coeff && elliptic->elementType == HEXAHEDRA) - kernelName = "ellipticAxVar" + suffix; - else - kernelName = "ellipticAx" + suffix; - } - // Keep other kernel around - elliptic->AxKernel = platform->kernels.getKernel(kernelName); - - if(!serial) { - if(elliptic->elementType != HEXAHEDRA) { - kernelName = "ellipticPartialAx" + suffix; - }else { - if(elliptic->options.compareArgs("ELEMENT MAP", "TRILINEAR")) { - if(elliptic->var_coeff || elliptic->blockSolver) { - printf( - "ERROR: TRILINEAR form is not implemented for varibale coefficient and block solver yet \n"); - ABORT(EXIT_FAILURE); - } - kernelName = "ellipticPartialAxTrilinear" + suffix; - }else { - if(elliptic->blockSolver) { - if(elliptic->var_coeff) { - if(elliptic->stressForm) - kernelName = "ellipticStressPartialAxVar" + suffix; - else - kernelName = "ellipticBlockPartialAxVar" + suffix + "_N" + std::to_string(elliptic->Nfields); - }else { - if(elliptic->stressForm) - kernelName = "ellipticStessPartialAx" + suffix; - else - kernelName = "ellipticBlockPartialAx" + suffix + "_N" + std::to_string(elliptic->Nfields); - } - }else { - if(elliptic->var_coeff) - kernelName = "ellipticPartialAxVar" + suffix; - else - kernelName = "ellipticPartialAx" + suffix; - } - } - } - elliptic->partialAxKernel = platform->kernels.getKernel(kernelName); - elliptic->partialAxKernel2 = platform->kernels.getKernel(kernelName); - } - elliptic->updatePCGKernel = - platform->kernels.getKernel(sectionIdentifier + "ellipticBlockUpdatePCG"); - } - - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStartLoadKernel); - fflush(stdout); - - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - auto callback = [&]() // hardwired to FP64 variable coeff - { - ellipticAx(elliptic, mesh->NlocalGatherElements, mesh->o_localGatherElementList, - elliptic->o_p, elliptic->o_Ap, dfloatString); - }; - elliptic->oogs = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, NULL, oogsMode); - elliptic->oogsAx = oogs::setup(elliptic->ogs, elliptic->Nfields, elliptic->Ntotal, ogsDfloat, callback, oogsMode); - - long long int pre = platform->device.memoryAllocated(); - ellipticPreconditionerSetup(elliptic, elliptic->ogs); - - long long int usedBytes = platform->device.memoryAllocated() - pre; - - elliptic->precon->preconBytes = usedBytes; - - if(options.compareArgs("INITIAL GUESS","PROJECTION") || - options.compareArgs("INITIAL GUESS", "PROJECTION-ACONJ")) - { - dlong nVecsProject = 8; - options.getArgs("RESIDUAL PROJECTION VECTORS", nVecsProject); - - dlong nStepsStart = 5; - options.getArgs("RESIDUAL PROJECTION START", nStepsStart); - - ResidualProjection::ProjectionType type = ResidualProjection::ProjectionType::CLASSIC; - if(options.compareArgs("INITIAL GUESS", "PROJECTION-ACONJ")) - type = ResidualProjection::ProjectionType::ACONJ; - else if (options.compareArgs("INITIAL GUESS", "PROJECTION")) - type = ResidualProjection::ProjectionType::CLASSIC; - - elliptic->residualProjection = new ResidualProjection(*elliptic, type, nVecsProject, nStepsStart); - } - - MPI_Barrier(platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStart); - fflush(stdout); -} diff --git a/src/elliptic/ellipticUpdateJacobi.cpp b/src/elliptic/ellipticUpdateJacobi.cpp new file mode 100644 index 000000000..fbe3e4145 --- /dev/null +++ b/src/elliptic/ellipticUpdateJacobi.cpp @@ -0,0 +1,57 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +#include "elliptic.h" +#include "linAlg.hpp" + +void ellipticUpdateJacobi(elliptic_t *elliptic, occa::memory &o_invDiagA) +{ + dfloat flopCount = 0.0; + mesh_t *mesh = elliptic->mesh; + setupAide& options = elliptic->options; + + const dlong Nlocal = mesh->Np * mesh->Nelements; + + elliptic->ellipticBlockBuildDiagonalKernel(mesh->Nelements, + elliptic->Nfields, + elliptic->Ntotal, + elliptic->loffset, + mesh->o_ggeo, + mesh->o_D, + mesh->o_DT, + elliptic->o_lambda, + o_invDiagA); + + flopCount += 12 * mesh->Nq + 12; + flopCount += (elliptic->poisson) ? 0.0 : 2.0; + flopCount *= static_cast(mesh->Nlocal) * elliptic->Nfields; + + oogs::startFinish(o_invDiagA, elliptic->Nfields, elliptic->Ntotal, ogsPfloat, ogsAdd, elliptic->oogs); + + const pfloat one = 1.0; + elliptic->adyManyPfloatKernel(Nlocal, elliptic->Nfields, elliptic->Ntotal, one, o_invDiagA); + platform->flopCounter->add(elliptic->name + " ellipticUpdateJacobi", flopCount); +} diff --git a/src/elliptic/ellipticUpdatePCG.cpp b/src/elliptic/ellipticUpdatePCG.cpp index 046fcb188..542dd0da0 100644 --- a/src/elliptic/ellipticUpdatePCG.cpp +++ b/src/elliptic/ellipticUpdatePCG.cpp @@ -33,9 +33,8 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic, { mesh_t* mesh = elliptic->mesh; - int serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; + const bool serial = platform->serial; - // x <= x + alpha*p // r <= r - alpha*A*p // dot(r,r) elliptic->updatePCGKernel(mesh->Nlocal, @@ -44,7 +43,6 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic, o_p, o_Ap, alpha, - o_x, o_r, elliptic->o_tmpNormr); @@ -60,10 +58,24 @@ dfloat ellipticUpdatePCG(elliptic_t* elliptic, for(int n = 0; n < Nblock; ++n) rdotr1 += elliptic->tmpNormr[n]; } + + // x <= x + alpha*p + platform->linAlg->axpbyMany( + mesh->Nlocal, + elliptic->Nfields, + elliptic->Ntotal, + alpha, + o_p, + 1.0, + o_x); + MPI_Allreduce(MPI_IN_PLACE, &rdotr1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); #ifdef ELLIPTIC_ENABLE_TIMER //platform->timer.toc("dotp"); #endif + platform->flopCounter->add(elliptic->name + " ellipticUpdatePC", + elliptic->Nfields * static_cast(mesh->Nlocal) * 6 + mesh->Nlocal); + return rdotr1; } diff --git a/src/elliptic/linearSolver/PCG.cpp b/src/elliptic/linearSolver/PCG.cpp index a0521e2a4..6781a7dc3 100644 --- a/src/elliptic/linearSolver/PCG.cpp +++ b/src/elliptic/linearSolver/PCG.cpp @@ -33,7 +33,7 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, { mesh_t* mesh = elliptic->mesh; - setupAide options = elliptic->options; + setupAide& options = elliptic->options; const int flexible = options.compareArgs("KRYLOV SOLVER", "FLEXIBLE"); const int verbose = options.compareArgs("VERBOSE", "TRUE"); @@ -44,7 +44,7 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, /*aux variables */ occa::memory &o_p = elliptic->o_p; - occa::memory &o_z = elliptic->o_z; + occa::memory &o_z = (!options.compareArgs("PRECONDITIONER", "NONE")) ? elliptic->o_z : o_r; occa::memory &o_Ap = elliptic->o_Ap; occa::memory &o_weight = elliptic->o_invDegree; platform->linAlg->fill(elliptic->Nfields * elliptic->Ntotal, 0.0, o_p); @@ -60,17 +60,21 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, int iter = 0; do { iter++; - ellipticPreconditioner(elliptic, o_r, o_z); - const dfloat rdotz2 = rdotz1; - rdotz1 = platform->linAlg->weightedInnerProdMany( - mesh->Nlocal, - elliptic->Nfields, - elliptic->Ntotal, - o_weight, - o_r, - o_z, - platform->comm.mpiComm); + if(!options.compareArgs("PRECONDITIONER", "NONE")) { + ellipticPreconditioner(elliptic, o_r, o_z); + + rdotz1 = platform->linAlg->weightedInnerProdMany( + mesh->Nlocal, + elliptic->Nfields, + elliptic->Ntotal, + o_weight, + o_r, + o_z, + platform->comm.mpiComm); + } else { + rdotz1 = rdotr; + } //printf("norm rdotz1: %.15e\n", rdotz1); @@ -109,7 +113,7 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, o_p, o_Ap, platform->comm.mpiComm); - alpha = rdotz1 / pAp; + alpha = rdotz1 / (pAp + 1e-300); //printf("norm pAp: %.15e\n", pAp); @@ -117,7 +121,11 @@ int pcg(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, // r <= r - alpha*A*p // dot(r,r) rdotr = sqrt(ellipticUpdatePCG(elliptic, o_p, o_Ap, alpha, o_x, o_r) * elliptic->resNormFactor); - + if(std::isnan(rdotr)) { + if(platform->comm.mpiRank == 0) printf("Detected invalid resiual norm while running linear solver!\n"); + ABORT(1); + } + if (verbose && (platform->comm.mpiRank == 0)) printf("it %d r norm %.15e\n", iter, rdotr); } diff --git a/src/elliptic/linearSolver/PGMRES.cpp b/src/elliptic/linearSolver/PGMRES.cpp index 5e4cdb4d0..39c877dd0 100644 --- a/src/elliptic/linearSolver/PGMRES.cpp +++ b/src/elliptic/linearSolver/PGMRES.cpp @@ -29,11 +29,11 @@ #include "linAlg.hpp" GmresData::GmresData(elliptic_t* elliptic) -: restart( +: nRestartVectors( [&](){ - int _restart = 15; - elliptic->options.getArgs("PGMRES RESTART", _restart); - return _restart; + int _nRestartVectors = 15; + elliptic->options.getArgs("PGMRES RESTART", _nRestartVectors); + return _nRestartVectors; }() ), flexible( @@ -43,21 +43,23 @@ GmresData::GmresData(elliptic_t* elliptic) return 0; }() ), - o_V(elliptic->Nfields * elliptic->Ntotal, restart, sizeof(dfloat)), - o_Z(elliptic->Nfields * elliptic->Ntotal, flexible ? restart : 1, sizeof(dfloat)), - o_y(platform->device.malloc(restart, sizeof(dfloat))), - H((dfloat *) calloc((restart+1)*(restart+1), sizeof(dfloat))), - sn((dfloat *) calloc(restart, sizeof(dfloat))), - cs((dfloat *) calloc(restart, sizeof(dfloat))), - s((dfloat *) calloc(restart+1, sizeof(dfloat))), - y((dfloat *) calloc(restart, sizeof(dfloat))) + o_V(elliptic->Ntotal * elliptic->Nfields, nRestartVectors, sizeof(dfloat)), + o_Z(elliptic->Ntotal * elliptic->Nfields, flexible ? nRestartVectors : 1, sizeof(dfloat)), + o_y(platform->device.malloc(nRestartVectors, sizeof(dfloat))), + H((dfloat *) calloc((nRestartVectors+1)*(nRestartVectors+1), sizeof(dfloat))), + sn((dfloat *) calloc(nRestartVectors, sizeof(dfloat))), + cs((dfloat *) calloc(nRestartVectors, sizeof(dfloat))), + s((dfloat *) calloc(nRestartVectors+1, sizeof(dfloat))) { int Nblock = (elliptic->mesh->Nlocal+BLOCKSIZE-1)/BLOCKSIZE; - const dlong Nbytes = restart * Nblock * sizeof(dfloat); + const size_t Nbytes = nRestartVectors * Nblock * sizeof(dfloat); //pinned scratch buffer { h_scratch = platform->device.mallocHost(Nbytes); scratch = (dfloat*) h_scratch.ptr(); + + h_y = platform->device.mallocHost(nRestartVectors * sizeof(dfloat)); + y = (dfloat*) h_y.ptr(); } o_scratch = platform->device.malloc(Nbytes); } @@ -71,8 +73,8 @@ void initializeGmresData(elliptic_t* elliptic) namespace{ void gmresUpdate(elliptic_t* elliptic, occa::memory o_x, - int I){ - const int restart = elliptic->gmresData->restart; + int gmresUpdateSize){ + const int nRestartVectors = elliptic->gmresData->nRestartVectors; mesh_t* mesh = elliptic->mesh; dfloat* y = elliptic->gmresData->y; dfloat* H = elliptic->gmresData->H; @@ -83,22 +85,22 @@ void gmresUpdate(elliptic_t* elliptic, occa::memory& o_z = elliptic->o_z; occa::memory& o_tmp = elliptic->o_p; - for(int k=I-1; k>=0; --k){ + for(int k=gmresUpdateSize-1; k>=0; --k){ y[k] = s[k]; - for(int m=k+1; moptions.compareArgs("KRYLOV SOLVER", "FLEXIBLE")){ elliptic->updatePGMRESSolutionKernel( mesh->Nlocal, elliptic->Ntotal, - I, + gmresUpdateSize, o_y, o_Z, o_x @@ -108,7 +110,7 @@ void gmresUpdate(elliptic_t* elliptic, elliptic->updatePGMRESSolutionKernel( mesh->Nlocal, elliptic->Ntotal, - I, + gmresUpdateSize, o_y, o_V, o_z @@ -124,6 +126,9 @@ void gmresUpdate(elliptic_t* elliptic, 1.0, o_x ); + + double flopCount = 2 * gmresUpdateSize * elliptic->Nfields * static_cast(mesh->Nlocal); + platform->flopCounter->add("gmresUpdate", flopCount); } } } @@ -155,7 +160,7 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, dfloat* cs = elliptic->gmresData->cs; dfloat* s = elliptic->gmresData->s; - const int restart = elliptic->gmresData->restart; + const int nRestartVectors = elliptic->gmresData->nRestartVectors; const int flexible = elliptic->options.compareArgs("KRYLOV SOLVER", "FLEXIBLE"); @@ -194,7 +199,7 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, o_V); //Construct orthonormal basis via Gram-Schmidt - for(int i=0;igramSchmidtOrthogonalizationKernel( @@ -242,35 +247,38 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, MPI_Allreduce(MPI_IN_PLACE, &nw, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); nw = sqrt(nw); + { + double flopCount = 5 * (i + 1) * elliptic->Nfields * static_cast(mesh->Nlocal); + platform->flopCounter->add("gramSchmidt", flopCount); + } + // H(i+1,i) = ||w||_2 - H[i+1 + i*(restart+1)] = nw; + H[i+1 + i*(nRestartVectors+1)] = nw; // V(:,i+1) = w/nw - if (iNlocal, - elliptic->Nfields, - elliptic->Ntotal, - (1./nw), o_w, 0., o_V.at(i+1)); + if (i < nRestartVectors - 1) { + linAlg + .axpbyMany(mesh->Nlocal, elliptic->Nfields, elliptic->Ntotal, (1. / nw), o_w, 0., o_V.at(i + 1)); + } //apply Givens rotation for(int k=0; kresNormFactor); rdotr = error; + if(std::isnan(error)) { + if(platform->comm.mpiRank == 0) printf("Detected invalid resiual norm while running linear solver!\n"); + ABORT(1); + } + if (verbose && (platform->comm.mpiRank == 0)) printf("it %d r norm %.15e\n", iter, rdotr); @@ -294,9 +307,9 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, if(error < TOL || iter==MAXIT) break; //update approximation - gmresUpdate(elliptic, o_x, restart); + gmresUpdate(elliptic, o_x, nRestartVectors); - // restart GMRES + // nRestartVectors GMRES // compute A*x ellipticOperator(elliptic, o_x, o_Ax, dfloatString); @@ -323,6 +336,11 @@ int pgmres(elliptic_t* elliptic, occa::memory &o_r, occa::memory &o_x, MPI_Allreduce(MPI_IN_PLACE, &nr, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); nr = sqrt(nr); + { + double flopCount = 4 * elliptic->Nfields * static_cast(mesh->Nlocal); + platform->flopCounter->add("gmres evaluate residual and norm", flopCount); + } + error = nr * sqrt(elliptic->resNormFactor); rdotr = nr * sqrt(elliptic->resNormFactor); //exit if tolerance is reached diff --git a/src/elliptic/registerEllipticKernels.cpp b/src/elliptic/registerEllipticKernels.cpp new file mode 100644 index 000000000..16a79c868 --- /dev/null +++ b/src/elliptic/registerEllipticKernels.cpp @@ -0,0 +1,196 @@ +#include +#include "elliptic.h" +#include "re2Reader.hpp" +#include "benchmarkAx.hpp" + +namespace{ + +void registerGMRESKernels(const std::string §ion, int Nfields) { + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/elliptic/"; + std::string fileName; + const bool serial = platform->serial; + + const std::string fileNameExtension = (serial) ? ".c" : ".okl"; + const std::string sectionIdentifier = std::to_string(Nfields) + "-"; + + occa::properties gmresKernelInfo = platform->kernelInfo; + gmresKernelInfo["defines/p_Nfields"] = Nfields; + + std::string kernelName = "gramSchmidtOrthogonalization"; + fileName = oklpath + kernelName + fileNameExtension; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, gmresKernelInfo); + + kernelName = "updatePGMRESSolution"; + fileName = oklpath + kernelName + fileNameExtension; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, gmresKernelInfo); + + kernelName = "fusedResidualAndNorm"; + fileName = oklpath + kernelName + fileNameExtension; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, gmresKernelInfo); +} + +} + +void registerEllipticKernels(std::string section, int poissonEquation) { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const std::string optionsPrefix = createOptionsPrefix(section); + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + occa::properties kernelInfo = platform->kernelInfo; + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + kernelInfo["include_paths"].asArray(); + kernelInfo += meshKernelProperties(N); + + const bool blockSolver = [§ion]() { + if (section.find("velocity") == std::string::npos) + return false; + if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) + return true; + if (platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) + return true; + return false; + }(); + const int Nfields = (blockSolver) ? 3 : 1; + const bool stressForm = [§ion]() { + if (section.find("velocity") == std::string::npos) + return false; + if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) + return true; + return false; + }(); + + const bool serial = platform->serial; + + const std::string fileNameExtension = (serial) ? ".c" : ".okl"; + + const std::string sectionIdentifier = std::to_string(Nfields) + "-"; + + if (platform->options.compareArgs( + optionsPrefix + "KRYLOV SOLVER", "PGMRES")) { + registerGMRESKernels(section, Nfields); + } + + // solution projection kernels + { + const std::string oklpath = installDir + "/okl/elliptic/"; + std::string fileName, kernelName; + + { + const std::string extension = ".okl"; + occa::properties properties = platform->kernelInfo; + properties["defines/p_Nfields"] = Nfields; + + kernelName = "multiScaledAddwOffset"; + fileName = oklpath + kernelName + extension; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, properties); + kernelName = "accumulate"; + fileName = oklpath + kernelName + extension; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, properties); + } + } + + { + const std::string oklpath = installDir + "/okl/core/"; + std::string fileName; + + fileName = oklpath + "mask.okl"; + platform->kernels.add("mask", fileName, kernelInfo); + + occa::properties pfloatKernelInfo = kernelInfo; + pfloatKernelInfo["defines/dfloat"] = pfloatString; + platform->kernels.add("maskPfloat", fileName, pfloatKernelInfo); + } + + kernelInfo["defines/p_Nfields"] = Nfields; + + occa::properties dfloatKernelInfo = kernelInfo; + occa::properties floatKernelInfo = kernelInfo; + floatKernelInfo["defines/pfloat"] = pfloatString; + floatKernelInfo["defines/dfloat"] = pfloatString; + + constexpr int elementType{HEXAHEDRA}; + + const std::string suffix = "Hex3D"; + + occa::properties AxKernelInfo = dfloatKernelInfo; + const std::string oklpath = installDir + "/okl/elliptic/"; + std::string fileName; + std::string kernelName; + + kernelName = "ellipticBlockBuildDiagonal" + suffix; + fileName = oklpath + kernelName + ".okl"; + dfloatKernelInfo["defines/dfloat"] = dfloatString; + dfloatKernelInfo["defines/pfloat"] = pfloatString; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, dfloatKernelInfo); + + if(poissonEquation){ + AxKernelInfo["defines/p_poisson"] = 1; + } + + int nelgt, nelgv; + const std::string meshFile = platform->options.getArgs("MESH FILE"); + re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); + const int NelemBenchmark = nelgv/platform->comm.mpiCommSize; + bool verbose = platform->options.compareArgs("VERBOSE", "TRUE"); + const int verbosity = verbose ? 2 : 1; + + for(auto&& coeffField : {true, false}){ + std::string kernelNamePrefix = "elliptic"; + if (blockSolver) + kernelNamePrefix += (stressForm) ? "Stress" : "Block"; + + kernelName = "Ax"; + if (coeffField) kernelName += "Coeff"; + if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) kernelName += "Trilinear"; + kernelName += suffix; + if (blockSolver && !stressForm) kernelName += "_N" + std::to_string(Nfields); + + const std::string _kernelName = kernelNamePrefix + "Partial" + kernelName; + const std::string prefix = (poissonEquation) ? "poisson-" : ""; + fileName = oklpath + _kernelName + fileNameExtension; + + auto axKernel = benchmarkAx(NelemBenchmark, + N + 1, + N, + !coeffField, + poissonEquation, + false, + sizeof(dfloat), + Nfields, + stressForm, + verbosity, + elliptic_t::targetBenchmark, + false, + ""); + + platform->kernels.add( + prefix + _kernelName, axKernel); + } + + kernelName = "ellipticBlockBuildDiagonal" + suffix; + fileName = oklpath + kernelName + ".okl"; + dfloatKernelInfo["defines/dfloat"] = dfloatString; + dfloatKernelInfo["defines/pfloat"] = pfloatString; + platform->kernels.add( + sectionIdentifier + kernelName, fileName, dfloatKernelInfo); + dfloatKernelInfo["defines/pfloat"] = dfloatString; + + // PCG update + fileName = oklpath + "ellipticBlockUpdatePCG" + fileNameExtension; + platform->kernels.add(sectionIdentifier + "ellipticBlockUpdatePCG", + fileName, + kernelInfo); +} \ No newline at end of file diff --git a/src/elliptic/registerEllipticPreconditionerKernels.cpp b/src/elliptic/registerEllipticPreconditionerKernels.cpp new file mode 100644 index 000000000..6ffb8b1a0 --- /dev/null +++ b/src/elliptic/registerEllipticPreconditionerKernels.cpp @@ -0,0 +1,445 @@ +#include +#include "nrs.hpp" +#include "elliptic.h" +#include "benchmarkFDM.hpp" +#include "benchmarkAx.hpp" + +#include "re2Reader.hpp" + +namespace { + +void registerAxKernels(const std::string& section, int N, int poissonEquation) +{ + auto gen_suffix = [N](const char *floatString) { + const std::string precision = std::string(floatString); + if (precision.find(pfloatString) != std::string::npos) { + return std::string("_") + std::to_string(N) + std::string("pfloat"); + } else { + return std::string("_") + std::to_string(N); + } + }; + constexpr int Nfields{1}; + + auto kernelInfo = platform->kernelInfo + meshKernelProperties(N); + kernelInfo["defines/p_Nfields"] = Nfields; + + std::string fileName, kernelName; + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/elliptic/"; + const bool serial = platform->serial; + const std::string fileNameExtension = (serial) ? ".c" : ".okl"; + const std::string poissonPrefix = poissonEquation ? "poisson-" : ""; + { + int nelgt, nelgv; + const std::string meshFile = platform->options.getArgs("MESH FILE"); + re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); + const int NelemBenchmark = nelgv/platform->comm.mpiCommSize; + + occa::properties AxKernelInfo = kernelInfo; + const auto Nq = N+1; + for(auto&& coeffField : {true,false}){ + for(auto&& floatString : {std::string(dfloatString), std::string(pfloatString)}){ + + dlong wordSize = 8; + if(floatString.find("float") != std::string::npos){ + wordSize = 4; + } + + bool verbose = platform->options.compareArgs("VERBOSE", "TRUE"); + const int verbosity = verbose ? 2 : 1; + const std::string kernelSuffix = gen_suffix(floatString.c_str()); + auto axKernel = benchmarkAx(NelemBenchmark, + Nq, + Nq - 1, + !coeffField, + poissonEquation, + false, + wordSize, + Nfields, + false, // no stress formulation in preconditioner + verbosity, + elliptic_t::targetBenchmark, + false, + kernelSuffix); + + const std::string suffix = coeffField ? "CoeffHex3D" : "Hex3D"; + + if (platform->options.compareArgs("ELEMENT MAP", "TRILINEAR")) + kernelName = "ellipticPartialAxTrilinear" + suffix; + else + kernelName = "ellipticPartialAx" + suffix; + + fileName = oklpath + kernelName + fileNameExtension; + + platform->kernels.add(poissonPrefix + kernelName + kernelSuffix, + axKernel); + } + } + } +} + +void registerJacobiKernels(const std::string §ion, int poissonEquation) { + const bool serial = platform->serial; + const std::string extension = serial ? ".c" : ".okl"; + const std::string optionsPrefix = createOptionsPrefix(section); + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/"; + occa::properties pfloatProps = platform->kernelInfo; + pfloatProps["defines/dfloat"] = pfloatString; + + // This kernel is needed as it used for mixed-precision Jacobi preconditioning + std::string kernelName = "axmyzManyPfloat"; + std::string fileName = oklpath + "elliptic/" + kernelName + extension; + platform->kernels.add( + kernelName, fileName, platform->kernelInfo); + + kernelName = "adyManyPfloat"; + fileName = oklpath + "linAlg/adyMany.okl"; + platform->kernels.add( + kernelName, fileName, pfloatProps); +} + +void registerCommonMGPreconditionerKernels(int N, occa::properties kernelInfo, int poissonEquation) { + const std::string prefix = "Hex3D"; + std::string fileName, kernelName; + + kernelInfo["defines/pfloat"] = pfloatString; + + kernelInfo["defines/p_Nfields"] = 1; + + occa::properties pfloatKernelInfo = kernelInfo; + pfloatKernelInfo["defines/dfloat"] = pfloatString; + pfloatKernelInfo["defines/pfloat"] = pfloatString; + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + + const std::string orderSuffix = std::string("_") + std::to_string(N); + + const bool serial = platform->serial; + const std::string extension = serial ? ".c" : ".okl"; + + { + const std::string oklpath = installDir + "/okl/core/"; + std::string fileName; + + fileName = oklpath + "mask.okl"; + kernelName = "mask"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + fileName = oklpath + "mask.okl"; + platform->kernels.add(kernelName + orderSuffix + "pfloat", + fileName, + pfloatKernelInfo, + orderSuffix + "pfloat"); + kernelName = "fusedCopyDfloatToPfloat"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + kernelName = "copyDfloatToPfloat"; + fileName = installDir + "/okl/core/" + kernelName + extension; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + kernelName = "copyPfloatToDfloat"; + fileName = installDir + "/okl/core/" + kernelName + extension; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + kernelName = "scaledAdd"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + kernelName = "dotMultiply"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + kernelName = "updateSmoothedSolutionVec"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + kernelName = "updateChebyshevSolutionVec"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + kernelName = "updateIntermediateSolutionVec"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(kernelName + orderSuffix, + fileName, + kernelInfo, + orderSuffix); + + occa::properties buildDiagInfo = kernelInfo; + if(poissonEquation) buildDiagInfo["defines/p_poisson"] = 1; + const std::string poissonPrefix = poissonEquation ? "poisson-" : ""; + kernelName = "ellipticBlockBuildDiagonalHex3D"; + fileName = installDir + "/okl/elliptic/" + kernelName + ".okl"; + platform->kernels.add(poissonPrefix + kernelName + orderSuffix, fileName, buildDiagInfo, orderSuffix); + } +} + +void registerSchwarzKernels(const std::string §ion, int N) { + const std::string optionsPrefix = createOptionsPrefix(section); + const int Nq = N + 1; + const int Nq_e = Nq + 2; + const int Np = Nq * Nq * Nq; + const int Np_e = Nq_e * Nq_e * Nq_e; + + bool overlap = false; + const bool serial = platform->serial; + if (Nq >= (elliptic_t::minNFDMOverlap + 1) && !serial) + overlap = true; + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/elliptic/"; + std::string fileName, kernelName; + const std::string extension = serial ? ".c" : ".okl"; + + { + occa::properties properties = platform->kernelInfo; + properties["defines/p_Nq"] = Nq; + properties["defines/p_Nq_e"] = Nq_e; + properties["defines/p_restrict"] = 0; + bool useRAS = platform->options.compareArgs(optionsPrefix + "MULTIGRID SMOOTHER", "RAS"); + const std::string suffix = + std::string("_") + std::to_string(Nq_e - 1) + std::string("pfloat"); + properties["defines/p_overlap"] = (int)overlap; + if(useRAS){ + properties["defines/p_restrict"] = 1; + } + + fileName = oklpath + "preFDM" + extension; + platform->kernels.add( + "preFDM" + suffix, fileName, properties, suffix); + + int nelgt, nelgv; + const std::string meshFile = platform->options.getArgs("MESH FILE"); + re2::nelg(meshFile, nelgt, nelgv, platform->comm.mpiComm); + const int NelemBenchmark = nelgv/platform->comm.mpiCommSize; + + bool verbose = platform->options.compareArgs("VERBOSE", "TRUE"); + const int verbosity = verbose ? 2 : 1; + auto fdmKernel = benchmarkFDM(NelemBenchmark, + Nq_e, + sizeof(pfloat), + useRAS, + static_cast(overlap), + verbosity, + elliptic_t::targetBenchmark, + false, + suffix); + platform->kernels.add("fusedFDM" + suffix, fdmKernel); + + fileName = oklpath + "postFDM" + extension; + platform->kernels.add( + "postFDM" + suffix, fileName, properties, suffix); + } +} +void registerFineLevelKernels(const std::string §ion, int N, int poissonEquation) { + auto gen_suffix = [N](const char *floatString) { + const std::string precision = std::string(floatString); + if (precision.find(pfloatString) != std::string::npos) { + return std::string("_") + std::to_string(N) + std::string("pfloat"); + } else { + return std::string("_") + std::to_string(N); + } + }; + + auto kernelInfo = platform->kernelInfo + meshKernelProperties(N); + registerCommonMGPreconditionerKernels(N, kernelInfo, poissonEquation); + + registerAxKernels(section, N, poissonEquation); + registerSchwarzKernels(section, N); +} +void registerSEMFEMKernels(const std::string §ion, int N, int poissonEquation); + +void registerMultigridLevelKernels(const std::string §ion, int Nf, int N, int poissonEquation) { + const int Nc = N; + auto gen_suffix = [N](const char *floatString) { + const std::string precision = std::string(floatString); + if (precision.find(pfloatString) != std::string::npos) { + return std::string("_") + std::to_string(N) + std::string("pfloat"); + } else { + return std::string("_") + std::to_string(N); + } + }; + + occa::properties kernelInfo = platform->kernelInfo + meshKernelProperties(N); + + const std::string suffix = "Hex3D"; + + std::string fileName, kernelName; + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/elliptic/"; + registerCommonMGPreconditionerKernels(N, kernelInfo, poissonEquation); + + const bool serial = platform->serial; + + const std::string fileNameExtension = (serial) ? ".c" : ".okl"; + + constexpr int elementType = HEXAHEDRA; + + { + // sizes for the coarsen and prolongation kernels. degree NFine to degree N + int NqFine = (Nf + 1); + int NqCoarse = (Nc + 1); + occa::properties coarsenProlongateKernelInfo = kernelInfo; + coarsenProlongateKernelInfo["defines/p_NqFine"] = Nf + 1; + coarsenProlongateKernelInfo["defines/p_NqCoarse"] = Nc + 1; + + const int NpFine = (Nf + 1) * (Nf + 1) * (Nf + 1); + const int NpCoarse = (Nc + 1) * (Nc + 1) * (Nc + 1); + coarsenProlongateKernelInfo["defines/p_NpFine"] = NpFine; + coarsenProlongateKernelInfo["defines/p_NpCoarse"] = NpCoarse; + + const std::string orderSuffix = + std::string("_Nf_") + std::to_string(Nf) + std::string("_Nc_") + std::to_string(Nc); + const std::string fileExtension = serial ? ".c" : ".okl"; + + fileName = oklpath + "ellipticPreconCoarsen" + suffix + fileNameExtension; + kernelName = "ellipticPreconCoarsen" + suffix; + platform->kernels.add(kernelName + orderSuffix, + fileName, + coarsenProlongateKernelInfo, + orderSuffix); + fileName = oklpath + "ellipticPreconProlongate" + suffix + fileNameExtension; + kernelName = "ellipticPreconProlongate" + suffix; + platform->kernels.add(kernelName + orderSuffix, + fileName, + coarsenProlongateKernelInfo, + orderSuffix); + } + + const std::string optionsPrefix = createOptionsPrefix(section); + if (N == 1 && platform->options.compareArgs(optionsPrefix + "MULTIGRID COARSE SOLVE", "TRUE")) { + return; + } + + registerAxKernels(section, N, poissonEquation); + registerSchwarzKernels(section, N); +} +void registerMultiGridKernels(const std::string §ion, int poissonEquation) { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const std::string optionsPrefix = createOptionsPrefix(section); + + registerFineLevelKernels(section, N, poissonEquation); + + std::vector levels = determineMGLevels(section); + + if (levels.empty()) + return; + + for (unsigned levelIndex = 1U; levelIndex < levels.size(); ++levelIndex) { + const int levelFine = levels[levelIndex - 1]; + const int levelCoarse = levels[levelIndex]; + registerMultigridLevelKernels(section, levelFine, levelCoarse, poissonEquation); + } + const int coarseLevel = levels.back(); + if (platform->options.compareArgs( + optionsPrefix + "MULTIGRID COARSE SOLVE", "TRUE")) { + if (platform->options.compareArgs( + optionsPrefix + "MULTIGRID COARSE SEMFEM", "TRUE")) { + registerSEMFEMKernels(section, coarseLevel, poissonEquation); + } else { + { + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/"; + std::string fileName = oklpath + "parAlmond/convertFP64ToFP32.okl"; + std::string kernelName = "convertFP64ToFP32"; + platform->kernels.add( + kernelName, fileName, platform->kernelInfo); + + fileName = oklpath + "parAlmond/convertFP32ToFP64.okl"; + kernelName = "convertFP32ToFP64"; + platform->kernels.add( + kernelName, fileName, platform->kernelInfo); + fileName = oklpath + "parAlmond/vectorDotStar2.okl"; + kernelName = "vectorDotStar2"; + platform->kernels.add( + kernelName, fileName, platform->kernelInfo); + } + } + } +} +void registerSEMFEMKernels(const std::string §ion, int N, int poissonEquation) { + const int Nq = N + 1; + const int Np = Nq * Nq * Nq; + const std::string optionsPrefix = createOptionsPrefix(section); + const int useFP32 = platform->options.compareArgs( + optionsPrefix + "SEMFEM SOLVER PRECISION", "FP32"); + occa::properties SEMFEMKernelProps = platform->kernelInfo; + if (useFP32) { + SEMFEMKernelProps["defines/pfloat"] = "float"; + } else { + SEMFEMKernelProps["defines/pfloat"] = "double"; + } + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string oklpath = installDir + "/okl/elliptic/"; + std::string fileName = oklpath + "gather.okl"; + platform->kernels.add("gather", fileName, SEMFEMKernelProps); + fileName = oklpath + "scatter.okl"; + platform->kernels.add( + "scatter", fileName, SEMFEMKernelProps); + occa::properties stiffnessKernelInfo = platform->kernelInfo; + fileName = oklpath + "computeStiffnessMatrix.okl"; + stiffnessKernelInfo["defines/p_Nq"] = Nq; + stiffnessKernelInfo["defines/p_Np"] = Np; + stiffnessKernelInfo["defines/p_rows_sorted"] = 1; + stiffnessKernelInfo["defines/p_cols_sorted"] = 0; + + const bool constructOnHost = !platform->device.deviceAtomic; + + if (!constructOnHost) { + platform->kernels.add("computeStiffnessMatrix", + fileName, + stiffnessKernelInfo); + } +} + +} + +void registerEllipticPreconditionerKernels(std::string section, int poissonEquation) +{ + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const std::string optionsPrefix = createOptionsPrefix(section); + + if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "MULTIGRID")){ + registerMultiGridKernels(section, poissonEquation); + } + if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "SEMFEM")){ + registerSEMFEMKernels(section, N, poissonEquation); + } + if(platform->options.compareArgs(optionsPrefix + "PRECONDITIONER", "JACOBI")){ + registerJacobiKernels(section, poissonEquation); + } +} diff --git a/src/io/io.hpp b/src/io/fldFile.hpp similarity index 91% rename from src/io/io.hpp rename to src/io/fldFile.hpp index b9368c9d8..f30014762 100644 --- a/src/io/io.hpp +++ b/src/io/fldFile.hpp @@ -1,3 +1,6 @@ +#if !defined(nekrs_io_hpp_) +#define nekrs_io_hpp_ + #include "nrs.hpp" void fileSync(const char *file); @@ -12,3 +15,5 @@ void writeFld(nrs_t *nrs, dfloat t, int outXYZ, int FP64, std::string suffix); void writeFld(std::string suffix, dfloat t, int outXYZ, int FP64, void* o_u, void *o_p, void *o_s, int NSfields); + +#endif diff --git a/src/io/ioUtils.hpp b/src/io/ioUtils.hpp new file mode 100644 index 000000000..125f2c90d --- /dev/null +++ b/src/io/ioUtils.hpp @@ -0,0 +1,12 @@ +#if !defined(nekrs_ioutils_hpp_) +#define nekrs_ioutils_hpp_ + +#include "nrs.hpp" + +void fileSync(const char *file); +void copyFile(const char *srcName, const char* destName); +bool isFileEmpty(const char *file); +bool isFileNewer(const char *file1, const char* file2); +bool fileExists(const char *file); + +#endif diff --git a/src/core/parHelp.txt b/src/io/parHelp.txt similarity index 95% rename from src/core/parHelp.txt rename to src/io/parHelp.txt index 609b0e002..48942e797 100644 --- a/src/core/parHelp.txt +++ b/src/io/parHelp.txt @@ -82,7 +82,7 @@ initialGuess previous [D] +nVector= dimension of projection space preconditioner Jacobi [D] - [D for PRESSURE] polynomial multigrid + [D for PRESSURE] polynomial multigrid +coarse [D for PRESSURE] coarse grid correction for polynomial multigrid pMultigridCoarsening , , ... custom polynomial order for each pMG level @@ -96,13 +96,14 @@ smootherType smoother +minEigenvalueBoundFactor= +maxEigenvalueBoundFactor= -coarseSolver FEM [D] linear finite elment discretization - +Galerkin coarse grid matrix by Galerkin projection - SEMFEM linear FEM approx on high-order nodes - +BoomerAMG [D] HYPRE's AMG solver +coarseSolver BoomerAMG [D] HYPRE's AMG solver +AmgX NVIDIA's AMG solver +FP32 or FP64 [D] floating point precision +coarseGridDiscretization FEM [D] linear finite elment discretization + +Galerkin coarse grid matrix by Galerkin projection + SEMFEM linear FEM approx on high-order nodes + boundaryTypeMap <...>, <...>, ... boundary type for each boundary ID fixedValue user specified Dirichlet zeroValue zero Dirichlet @@ -111,6 +112,8 @@ boundaryTypeMap <...>, <...>, ... boundary zeroXValue/zeroGradient symmetry x-normal plane zeroYValue/zeroGradient symmetry y-normal plane zeroZValue/zeroGradient symmetry z-normal plane + zeroNValue/zeroGradient unaligned symmetry + zeroNValue/fixedGradient unaligned traction regularization hpfrt HPF stabilization @@ -135,6 +138,8 @@ connectivityTol 0.2 [D] file "" name of .re2 file + +writeToFieldFile true, false [D] output mesh in all field writes ---------------------------------------------------------------------------------------------------------------------- [VELOCITY] diff --git a/src/core/parReader.cpp b/src/io/parReader.cpp similarity index 75% rename from src/core/parReader.cpp rename to src/io/parReader.cpp index c6f5b8a92..ade538189 100644 --- a/src/core/parReader.cpp +++ b/src/io/parReader.cpp @@ -14,9 +14,58 @@ #include "nrs.hpp" #include +#include "amgx.h" + namespace{ static std::ostringstream errorLogger; static std::ostringstream valueErrorLogger; + +static std::string mapTemperatureToScalarString() +{ + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << 0; + std::string sid = ss.str(); + return "scalar" + sid; +} +int parseScalarIntegerFromString(const std::string &scalarString) +{ + if (scalarString.length() > std::string("scalar").length()) { + const auto numString = scalarString.substr(std::string("scalar").length()); + + try { + return std::stoi(numString); + } + catch (std::invalid_argument &e) { + std::cout << "Hit an invalid_argument error. It said\n" << e.what() << "\n"; + ABORT(EXIT_FAILURE); + return 0; + } + } + else { + ABORT(EXIT_FAILURE); + return 0; + } +} +std::string parPrefixFromParSection(const std::string &parSection) +{ + if (parSection.find("general") != std::string::npos) { + return std::string(""); + } + if (parSection.find("temperature") != std::string::npos) { + return mapTemperatureToScalarString() + " "; + } + if (parSection.find("scalar") != std::string::npos) { + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + const auto is = parseScalarIntegerFromString(parSection); + + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << is; + std::string sid = ss.str(); + return "scalar" + sid + " "; + } + return parSection + std::string(" "); +} } template @@ -33,12 +82,12 @@ void append_value_error(Printable message) #define UPPER(a) \ { \ transform(a.begin(), a.end(), a.begin(), \ - std::ptr_fun(std::toupper)); \ + [](int c){return std::toupper(c);}); \ } #define LOWER(a) \ { \ transform(a.begin(), a.end(), a.begin(), \ - std::ptr_fun(std::tolower)); \ + [](int c){return std::tolower(c);}); \ } namespace @@ -79,38 +128,36 @@ static std::vector problemTypeKeys = { // common keys static std::vector commonKeys = { - {"solver"}, - {"residualTol"}, - {"initialGuess"}, - {"preconditioner"}, - {"pMultigridCoarsening"}, - {"smootherType"}, - {"coarseSolver"}, - {"boundaryTypeMap"}, - {"maxIterations"}, - {"regularization"}, - - // deprecated filter params - {"filtering"}, - {"filterWeight"}, - {"filterModes"}, - {"filterCutoffRatio"}, - - // deprecated no-op extrapolation param - {"extrapolation"}, - - - // deprecated projection params - {"residualProj"}, - {"residualProjection"}, - {"residualProjectionVectors"}, - {"residualProjectionStart"}, + {"solver"}, + {"residualTol"}, + {"initialGuess"}, + {"preconditioner"}, + {"pMultigridCoarsening"}, + {"smootherType"}, + {"coarseSolver"}, + {"coarseGridDiscretization"}, + {"boundaryTypeMap"}, + {"maxIterations"}, + {"regularization"}, + + // deprecated filter params + {"filtering"}, + {"filterWeight"}, + {"filterModes"}, + {"filterCutoffRatio"}, + + // deprecated projection params + {"residualProj"}, + {"residualProjection"}, + {"residualProjectionVectors"}, + {"residualProjectionStart"}, }; static std::vector meshKeys = { {"partitioner"}, {"file"}, {"connectivitytol"}, + {"writetofieldfile"}, }; static std::vector velocityKeys = { @@ -150,20 +197,31 @@ static std::vector occaKeys = { static std::vector pressureKeys = {}; static std::vector deprecatedKeys = { - // deprecated filter params - {"filtering"}, - {"filterWeight"}, - {"filterModes"}, - {"filterCutoffRatio"}, - - // deprecated no-op extrapolation param - {"extrapolation"}, - - // deprecated projection params - {"residualProj"}, - {"residualProjection"}, - {"residualProjectionVectors"}, - {"residualProjectionStart"}, + // deprecated filter params + {"filtering"}, + {"filterWeight"}, + {"filterModes"}, + {"filterCutoffRatio"}, + + // deprecated projection params + {"residualProj"}, + {"residualProjection"}, + {"residualProjectionVectors"}, + {"residualProjectionStart"}, +}; + +static std::vector validSections = { + {"general"}, + {"temperature"}, + {"pressure"}, + {"velocity"}, + {"problemtype"}, + {"amgx"}, + {"boomeramg"}, + {"occa"}, + {"mesh"}, + {"scalar"}, + {"casedata"}, }; void convertToLowerCase(std::vector& stringVec) @@ -187,6 +245,7 @@ void makeStringsLowerCase() convertToLowerCase(boomeramgKeys); convertToLowerCase(pressureKeys); convertToLowerCase(occaKeys); + convertToLowerCase(validSections); } const std::vector& getValidKeys(const std::string& section) @@ -226,25 +285,50 @@ int validateKeys(const inipp::Ini::Sections& sections) int err = 0; bool generalExists = false; for (auto const & sec : sections) { - if(sec.first.find("general") != std::string::npos) generalExists = true; + if (sec.first.find("general") != std::string::npos) + generalExists = true; } + if(!generalExists){ std::ostringstream error; error << "mandatory section [GENERAL] not found!\n"; append_error(error.str()); err++; } + for (auto const & sec : sections) { + + bool isScalar = sec.first.find("scalar") != std::string::npos; + if (isScalar) { + const int scalarNumber = parseScalarIntegerFromString(sec.first); + if (scalarNumber >= NSCALAR_MAX) { + std::ostringstream error; + error << "ERROR: specified " << scalarNumber << " scalars, while the maximum allowed is " + << NSCALAR_MAX << "\n"; + append_error(error.str()); + err++; + } + } + if(sec.first.find("casedata") != std::string::npos) continue; - if(sec.first.find("general") != std::string::npos) generalExists = true; - const auto& validKeys = getValidKeys(sec.first); - for (auto const & val : sec.second) { - if (std::find(validKeys.begin(), validKeys.end(), val.first) == validKeys.end()) { - if (std::find(commonKeys.begin(), commonKeys.end(), val.first) == commonKeys.end()) { - std::ostringstream error; - error << "unknown key: " << sec.first << "::" << val.first << "\n"; - append_error(error.str()); - err++; + // check that section exists + if (std::find(validSections.begin(), validSections.end(), sec.first) == validSections.end() && + !isScalar) { + std::ostringstream error; + error << "ERROR: Invalid section name: " << sec.first << std::endl; + append_error(error.str()); + err++; + } + else { + const auto &validKeys = getValidKeys(sec.first); + for (auto const &val : sec.second) { + if (std::find(validKeys.begin(), validKeys.end(), val.first) == validKeys.end()) { + if (std::find(commonKeys.begin(), commonKeys.end(), val.first) == commonKeys.end()) { + std::ostringstream error; + error << "unknown key: " << sec.first << "::" << val.first << "\n"; + append_error(error.str()); + err++; + } } } } @@ -377,12 +461,10 @@ void parseConstFlowRate(const int rank, setupAide& options, inipp::Ini *par) } } } -void parseSolverTolerance(const int rank, setupAide &options, - inipp::Ini *par, std::string parScope) { - std::string parSectionName = (parScope.find("temperature") != std::string::npos) - ? "scalar00" - : parScope; +void parseSolverTolerance(const int rank, setupAide &options, inipp::Ini *par, std::string parScope) +{ + std::string parSectionName = parPrefixFromParSection(parScope); UPPER(parSectionName); const std::vector validValues = { @@ -395,7 +477,7 @@ void parseSolverTolerance(const int rank, setupAide &options, { if(residualTol.find("relative") != std::string::npos) { - options.setArgs(parSectionName + " LINEAR SOLVER STOPPING CRITERION", "RELATIVE"); + options.setArgs(parSectionName + "LINEAR SOLVER STOPPING CRITERION", "RELATIVE"); } std::vector entries = serializeString(residualTol, '+'); @@ -404,18 +486,55 @@ void parseSolverTolerance(const int rank, setupAide &options, double tolerance = std::strtod(entry.c_str(), nullptr); if(tolerance > 0.0) { - options.setArgs(parSectionName + " SOLVER TOLERANCE", to_string_f(tolerance)); + options.setArgs(parSectionName + "SOLVER TOLERANCE", to_string_f(tolerance)); } else { checkValidity(rank, validValues, entry); } } } } -void parseCoarseSolver(const int rank, setupAide &options, - inipp::Ini *par, std::string parScope) { - std::string parSectionName = (parScope.find("temperature") != std::string::npos) - ? "scalar00" - : parScope; + +void parseCoarseGridDiscretization(const int rank, setupAide &options, inipp::Ini *par, std::string parScope) +{ + std::string parSectionName = parPrefixFromParSection(parScope); + UPPER(parSectionName); + std::string p_coarseGridDiscretization; + const bool continueParsing = par->extract(parScope, "coarsegriddiscretization", p_coarseGridDiscretization); + if (!continueParsing) + return; + + const std::vector validValues = { + {"semfem"}, + {"fem"}, + {"galerkin"}, + }; + + const auto entries = serializeString(p_coarseGridDiscretization, '+'); + for (auto &&s : entries) { + checkValidity(rank, validValues, s); + } + + // exit early if not using multigrid as preconditioner + if (!options.compareArgs(parSectionName + "PRECONDITIONER", "MULTIGRID")) { + return; + } + + // coarse grid discretization + if (p_coarseGridDiscretization.find("semfem") != std::string::npos) { + options.setArgs(parSectionName + "MULTIGRID COARSE SEMFEM", "TRUE"); + } + else if (p_coarseGridDiscretization.find("fem") != std::string::npos) { + options.setArgs(parSectionName + "MULTIGRID COARSE SEMFEM", "FALSE"); + options.setArgs("GALERKIN COARSE OPERATOR", "FALSE"); + if (p_coarseGridDiscretization.find("galerkin") != std::string::npos) { + options.setArgs("GALERKIN COARSE OPERATOR", "TRUE"); + } + } +} + +void parseCoarseSolver(const int rank, setupAide &options, inipp::Ini *par, std::string parScope) +{ + std::string parSectionName = parPrefixFromParSection(parScope); UPPER(parSectionName); std::string p_coarseSolver; const bool continueParsing = par->extract(parScope, "coarsesolver", p_coarseSolver); @@ -423,55 +542,51 @@ void parseCoarseSolver(const int rank, setupAide &options, return; const std::vector validValues = { - {"boomeramg"}, - {"amgx"}, - {"semfem"}, - {"fem"}, - {"fp32"}, - {"fp64"}, - {"cpu"}, - {"gpu"}, + {"boomeramg"}, + {"amgx"}, + {"fp32"}, + {"fp64"}, + {"cpu"}, + {"gpu"}, }; + std::vector entries = serializeString(p_coarseSolver, '+'); + for (std::string entry : entries) { + checkValidity(rank, validValues, entry); + } + + // exit early if not using multigrid as preconditioner + if (!options.compareArgs(parSectionName + "PRECONDITIONER", "MULTIGRID")) { + return; + } + // solution methods if(p_coarseSolver.find("boomeramg") != std::string::npos){ options.setArgs("AMG SOLVER", "BOOMERAMG"); - options.setArgs(parSectionName + " SEMFEM SOLVER", options.getArgs("AMG SOLVER")); + options.setArgs(parSectionName + "SEMFEM SOLVER", options.getArgs("AMG SOLVER")); options.setArgs("AMG SOLVER PRECISION", "FP64"); - options.setArgs(parSectionName + " SEMFEM SOLVER PRECISION", "FP64"); + options.setArgs(parSectionName + "SEMFEM SOLVER PRECISION", "FP64"); options.setArgs("AMG SOLVER LOCATION", "CPU"); } else if(p_coarseSolver.find("amgx") != std::string::npos){ + + if(!AMGXenabled()){ + append_error("AMGX was requested but is not compiled!\n"); + } + options.setArgs("AMG SOLVER", "AMGX"); - options.setArgs(parSectionName + " SEMFEM SOLVER", options.getArgs("AMG SOLVER")); + options.setArgs(parSectionName + "SEMFEM SOLVER", options.getArgs("AMG SOLVER")); options.setArgs("AMG SOLVER PRECISION", "FP32"); - options.setArgs(parSectionName + " SEMFEM SOLVER PRECISION", "FP32"); + options.setArgs(parSectionName + "SEMFEM SOLVER PRECISION", "FP32"); options.setArgs("AMG SOLVER LOCATION", "GPU"); } - // coarse grid discretization - if(p_coarseSolver.find("semfem") != std::string::npos){ - options.setArgs(parSectionName + " MULTIGRID COARSE SEMFEM", "TRUE"); - } - else if(p_coarseSolver.find("fem") != std::string::npos){ - options.setArgs(parSectionName + " MULTIGRID COARSE SEMFEM", "FALSE"); - options.setArgs("GALERKIN COARSE OPERATOR", "FALSE"); - options.setArgs(parSectionName + " USER SPECIFIED FEM COARSE SOLVER", "TRUE"); - if(p_coarseSolver.find("galerkin") != std::string::npos){ - options.setArgs("GALERKIN COARSE OPERATOR", "TRUE"); - } - } - - // parse fp type + location - std::vector entries = serializeString(p_coarseSolver, '+'); - for(std::string entry : entries) - { - checkValidity(rank, validValues, entry); + for (std::string entry : entries) { if(entry.find("fp32") != std::string::npos) { options.setArgs("AMG SOLVER PRECISION", "FP32"); - options.setArgs(parSectionName + " SEMFEM SOLVER PRECISION", "FP32"); + options.setArgs(parSectionName + "SEMFEM SOLVER PRECISION", "FP32"); if(p_coarseSolver.find("boomeramg") != std::string::npos){ append_error("BoomerAMG+FP32 is not currently supported!\n"); } @@ -479,7 +594,7 @@ void parseCoarseSolver(const int rank, setupAide &options, else if(entry.find("fp64") != std::string::npos) { options.setArgs("AMG SOLVER PRECISION", "FP64"); - options.setArgs(parSectionName + " SEMFEM SOLVER PRECISION", "FP64"); + options.setArgs(parSectionName + "SEMFEM SOLVER PRECISION", "FP64"); } else if(entry.find("cpu") != std::string::npos) { @@ -676,26 +791,23 @@ void parseSmoother(const int rank, setupAide &options, inipp::Ini *par, void parsePreconditioner(const int rank, setupAide &options, inipp::Ini *par, std::string parScope) { const std::vector validValues = { - {"none"}, - {"jac"}, - {"semfem"}, - {"pmg"}, - {"multigrid"}, - {"semg"}, - {"semfem"}, - {"amgx"}, - {"fp32"}, - {"fp64"}, - {"additive"}, - {"multiplicative"}, - {"overlap"}, - {"coarse"}, + {"none"}, + {"jac"}, + {"semfem"}, + {"pmg"}, + {"multigrid"}, + {"semfem"}, + {"amgx"}, + {"fp32"}, + {"fp64"}, + {"additive"}, + {"multiplicative"}, + {"overlap"}, + {"coarse"}, }; - - std::string parSection = (parScope.find("temperature") != std::string::npos) - ? "scalar00" - : parScope; + std::string parSection = + (parScope.find("temperature") != std::string::npos) ? mapTemperatureToScalarString() : parScope; UPPER(parSection); std::string p_preconditioner; @@ -728,6 +840,9 @@ void parsePreconditioner(const int rank, setupAide &options, for (std::string s : list) { if (s.find("semfem") != std::string::npos) { } else if (s.find("amgx") != std::string::npos) { + if(!AMGXenabled()){ + append_error("AMGX was requested but is not compiled!\n"); + } options.setArgs(parSection + " SEMFEM SOLVER", "AMGX"); options.setArgs(parSection + " SEMFEM SOLVER PRECISION", "FP32"); } else if (s.find("fp32") != std::string::npos) { @@ -742,10 +857,9 @@ void parsePreconditioner(const int rank, setupAide &options, append_error(error.str()); } } - - } else if (p_preconditioner.find("semg") != std::string::npos || - p_preconditioner.find("multigrid") != std::string::npos || - p_preconditioner.find("pmg") != std::string::npos) { + } + else if (p_preconditioner.find("multigrid") != std::string::npos || + p_preconditioner.find("pmg") != std::string::npos) { options.setArgs(parSection + " PRECONDITIONER", "MULTIGRID"); std::string key = "VCYCLE"; if (p_preconditioner.find("additive") != std::string::npos) @@ -784,9 +898,7 @@ bool checkForFalse(const std::string& s) void parseInitialGuess(const int rank, setupAide &options, inipp::Ini *par, std::string parScope) { - std::string parSectionName = (parScope.find("temperature") != std::string::npos) - ? "scalar00" - : parScope; + std::string parSectionName = parPrefixFromParSection(parScope); UPPER(parSectionName); @@ -808,26 +920,30 @@ void parseInitialGuess(const int rank, setupAide &options, if (par->extract(parScope, "initialguess", initialGuess)) { const int defaultNumVectors = parScope == "pressure" ? 10 : 5; - options.setArgs(parSectionName + " RESIDUAL PROJECTION VECTORS", - std::to_string(defaultNumVectors)); - options.setArgs(parSectionName + " RESIDUAL PROJECTION START", "5"); + options.setArgs(parSectionName + "RESIDUAL PROJECTION VECTORS", std::to_string(defaultNumVectors)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION START", "5"); if (initialGuess.find("projectionaconj") != std::string::npos) { - options.setArgs(parSectionName + " INITIAL GUESS", "PROJECTION-ACONJ"); + options.setArgs(parSectionName + "INITIAL GUESS", "PROJECTION-ACONJ"); } else if (initialGuess.find("projection") != std::string::npos) { - options.setArgs(parSectionName + " INITIAL GUESS", - "PROJECTION"); + options.setArgs(parSectionName + "INITIAL GUESS", "PROJECTION"); } else if (initialGuess.find("previous") != std::string::npos) { - options.setArgs(parSectionName + " INITIAL GUESS", "PREVIOUS"); + options.setArgs(parSectionName + "INITIAL GUESS", "PREVIOUS"); + // removeArgs any default entries associated with projection initial guess + options.removeArgs(parSectionName + "RESIDUAL PROJECTION START"); + options.removeArgs(parSectionName + "RESIDUAL PROJECTION VECTORS"); } else if (checkForTrue(initialGuess)) { const int defaultNumVectors = parScope == "pressure" ? 10 : 5; - options.setArgs(parSectionName + " INITIAL GUESS", "PROJECTION-ACONJ"); - options.setArgs(parSectionName + " RESIDUAL PROJECTION START", "5"); + options.setArgs(parSectionName + "INITIAL GUESS", "PROJECTION-ACONJ"); + options.setArgs(parSectionName + "RESIDUAL PROJECTION START", "5"); } else if (checkForFalse(initialGuess)) { - options.setArgs(parSectionName + " INITIAL GUESS", "PREVIOUS"); + options.setArgs(parSectionName + "INITIAL GUESS", "PREVIOUS"); + // removeArgs any default entries associated with projection initial guess + options.removeArgs(parSectionName + "RESIDUAL PROJECTION START"); + options.removeArgs(parSectionName + "RESIDUAL PROJECTION VECTORS"); } else { std::ostringstream error; - error << "Could not parse initialGuess string" << initialGuess << "!\n"; + error << "Could not parse initialGuess = " << initialGuess << "!\n"; append_error(error.str()); } @@ -839,15 +955,13 @@ void parseInitialGuess(const int rank, setupAide &options, const std::vector items = serializeString(s, '='); assert(items.size() == 2); const int value = std::stoi(items[1]); - options.setArgs(parSectionName + " RESIDUAL PROJECTION VECTORS", - std::to_string(value)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION VECTORS", std::to_string(value)); } if (s.find("start") != std::string::npos) { const std::vector items = serializeString(s, '='); assert(items.size() == 2); const int value = std::stoi(items[1]); - options.setArgs(parSectionName + " RESIDUAL PROJECTION START", - std::to_string(value)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION START", std::to_string(value)); } } return; @@ -859,33 +973,34 @@ void parseInitialGuess(const int rank, setupAide &options, if (par->extract(parScope, "residualproj", solutionProjection) || par->extract(parScope, "residualprojection", solutionProjection)) { if (solutionProjection) { - options.setArgs(parSectionName + " INITIAL GUESS", "PROJECTION-ACONJ"); + options.setArgs(parSectionName + "INITIAL GUESS", "PROJECTION-ACONJ"); const int defaultNumVectors = parScope == "pressure" ? 10 : 5; // default parameters - options.setArgs(parSectionName + " RESIDUAL PROJECTION VECTORS", - std::to_string(defaultNumVectors)); - options.setArgs(parSectionName + " RESIDUAL PROJECTION START", "5"); - } + options.setArgs(parSectionName + "RESIDUAL PROJECTION VECTORS", std::to_string(defaultNumVectors)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION START", "5"); + } else { + options.setArgs(parSectionName + "INITIAL GUESS", "PREVIOUS"); - return; + // removeArgs any default entries associated with projection initial guess + options.removeArgs(parSectionName + "RESIDUAL PROJECTION START"); + options.removeArgs(parSectionName + "RESIDUAL PROJECTION VECTORS"); + } } int nVectors; if(par->extract(parScope, "residualprojectionvectors", nVectors)){ - options.setArgs(parSectionName + " RESIDUAL PROJECTION VECTORS", - std::to_string(nVectors)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION VECTORS", std::to_string(nVectors)); } int nStart; if(par->extract(parScope, "residualprojectionstart", nStart)){ - options.setArgs(parSectionName + " RESIDUAL PROJECTION START", - std::to_string(nStart)); + options.setArgs(parSectionName + "RESIDUAL PROJECTION START", std::to_string(nStart)); } } } -void parseRegularization(const int rank, setupAide &options, - inipp::Ini *par, std::string parSection){ +void parseRegularization(const int rank, setupAide &options, inipp::Ini *par, std::string parSection) +{ int N; options.getArgs("POLYNOMIAL DEGREE", N); const bool isScalar = (parSection.find("temperature") != std::string::npos) || @@ -893,14 +1008,7 @@ void parseRegularization(const int rank, setupAide &options, const bool isVelocity = parSection.find("velocity") != std::string::npos; std::string sbuf; - std::string parPrefix = [parSection](){ - if(parSection.find("general") != std::string::npos) - return std::string(""); - if(parSection.find("temperature") != std::string::npos) - return std::string("scalar00 "); - return parSection + std::string(" "); - }(); - + std::string parPrefix = parPrefixFromParSection(parSection); UPPER(parPrefix); options.setArgs(parPrefix + "REGULARIZATION METHOD", "NONE"); @@ -1096,6 +1204,7 @@ void parseRegularization(const int rank, setupAide &options, } } void setDefaultSettings(setupAide &options, std::string casename, int rank) { + options.setArgs("CHECKPOINT OUTPUT MESH", "FALSE"); options.setArgs("FORMAT", std::string("1.0")); options.setArgs("CONSTANT FLOW RATE", "FALSE"); @@ -1145,7 +1254,6 @@ void setDefaultSettings(setupAide &options, std::string casename, int rank) { options.setArgs("ELLIPTIC INTEGRATION", "NODAL"); options.setArgs("PRESSURE MAXIMUM ITERATIONS", "200"); - options.setArgs("GALERKIN COARSE MATRIX", "FALSE"); options.setArgs("PRESSURE KRYLOV SOLVER", "PGMRES+FLEXIBLE"); options.setArgs("PRESSURE PRECONDITIONER", "MULTIGRID"); options.setArgs("PRESSURE DISCRETIZATION", "CONTINUOUS"); @@ -1171,9 +1279,12 @@ void setDefaultSettings(setupAide &options, std::string casename, int rank) { options.setArgs("PARALMOND SMOOTH COARSEST", "FALSE"); options.setArgs("ENABLE FLOATCOMMHALF GS SUPPORT", "FALSE"); options.setArgs("MOVING MESH", "FALSE"); - options.setArgs("ENABLE OVERLAP", "TRUE"); + options.setArgs("GS OVERLAP", "TRUE"); options.setArgs("VARIABLE DT", "FALSE"); + + // coeff fields + options.setArgs("VELOCITY COEFF FIELD", "TRUE"); } setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { @@ -1230,9 +1341,9 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { std::string sbuf; // OCCA - std::string threadModel; - if (par->extract("occa", "backend", threadModel)) { - const std::vector validValues = { + std::string backendSpecification; + if (par->extract("occa", "backend", backendSpecification)) { + const std::vector validBackends = { {"serial"}, {"cpu"}, {"cuda"}, @@ -1240,11 +1351,59 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { {"opencl"}, {"openmp"}, }; + const std::vector validArchitectures = { + {"arch"}, // include the arch= specifier here + {"x86"}, + {"a64fx"}, + }; + + std::vector validValues = validBackends; + validValues.insert(validValues.end(), validArchitectures.begin(), validArchitectures.end()); + + const std::vector list = serializeString(backendSpecification, '+'); + for(const std::string entry : list){ + const std::vector arguments = serializeString(entry, '='); + for(const std::string argument : arguments){ + checkValidity(rank, validValues, argument); + } + } + + std::string threadModel = ""; + std::string architecture = ""; + for(const std::string entry : list){ + const std::vector arguments = serializeString(entry, '='); + if(arguments.size() == 1){ + for(const std::string backend : validBackends){ + if(backend == arguments.at(0)){ + threadModel = backend; + } + } + } else if (arguments.size() == 2){ + for(const std::string arch : validArchitectures){ + if(arch == arguments.at(1)){ + architecture = arch; + } + } + } else { + std::ostringstream error; + error << "Could not parse string \"" << entry << "\" while parsing OCCA:backend.\n"; + append_error(error.str()); + } + } - checkValidity(rank, validValues, threadModel); + if(threadModel.empty()){ + std::ostringstream error; + error << "Could not parse valid backend from \"" << backendSpecification << "\" while parsing OCCA:backend.\n"; + append_error(error.str()); + } UPPER(threadModel); options.setArgs("THREAD MODEL", threadModel); + + if(!architecture.empty()){ + UPPER(architecture); + options.setArgs("ARCHITECTURE", architecture); + } } std::string deviceNumber; @@ -1304,14 +1463,29 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { } } - // mesh file + std::string subCyclingString; + if(par->extract("general", "subcyclingsteps", subCyclingString)) { - std::string meshFile; - if(par->extract("mesh", "file", meshFile)){ - options.setArgs("MESH FILE", meshFile); + if(subCyclingString.find("auto") != std::string::npos) + { + std::string dtString; + if (par->extract("general", "dt", dtString)){ + if(dtString.find("targetcfl") == std::string::npos) + { + append_error("subCyclingSteps = auto requires the targetCFL to be set"); + } + } } } + { + int NSubCycles = 0; + if (par->extract("general", "subcyclingsteps", NSubCycles)){ + options.setArgs("SUBCYCLING STEPS", std::to_string(NSubCycles)); + } + } + + std::string dtString; if (par->extract("general", "dt", dtString)){ const std::vector validValues = { @@ -1319,13 +1493,22 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { {"max"}, {"initial"}, }; - if(dtString.find("targetcfl") != std::string::npos) + + bool useVariableDt = false; + for(auto&& variableDtEntry : validValues){ + if(dtString.find(variableDtEntry) != std::string::npos){ + useVariableDt = true; + } + } + + + if(useVariableDt) { bool userSuppliesInitialDt = false; + bool userSuppliesTargetCFL = false; options.setArgs("VARIABLE DT", "TRUE"); options.setArgs("TARGET CFL", "0.5"); const double bigNumber = std::numeric_limits::max(); - options.setArgs("MAX DT", to_string_f(bigNumber)); std::vector entries = serializeString(dtString, '+'); for(std::string entry : entries) { @@ -1355,7 +1538,22 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { int nSteps = std::ceil(targetCFL / 2.0); if (targetCFL <= 0.51) nSteps = 0; options.setArgs("SUBCYCLING STEPS", std::to_string(nSteps)); + + userSuppliesTargetCFL = true; + } + } + + // if targetCFL is not set, try to infer from subcyclingSteps + if(!userSuppliesTargetCFL){ + int NSubCycles = 0; + double targetCFL = 0.5; + options.getArgs("SUBCYCLING STEPS", NSubCycles); + if(NSubCycles == 0){ + targetCFL = 0.5; + } else { + targetCFL = 2 * NSubCycles; } + options.setArgs("TARGET CFL", to_string_f(targetCFL)); } // guard against using a higher initial dt than the max @@ -1365,7 +1563,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { double maxDt = 0.0; options.getArgs("DT", initialDt); options.getArgs("MAX DT", maxDt); - if(initialDt > maxDt) + if(maxDt > 0 && initialDt > maxDt) { std::ostringstream error; error << "Error: initial dt " << initialDt << " is larger than max dt " << maxDt << "\n"; @@ -1430,27 +1628,6 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { append_error(error.str()); } - std::string subCyclingString; - if(par->extract("general", "subcyclingsteps", subCyclingString)) - { - if(subCyclingString.find("auto") != std::string::npos) - { - if (par->extract("general", "dt", dtString)){ - if(dtString.find("targetcfl") == std::string::npos) - { - append_error("subCyclingSteps = auto requires the targetCFL to be set"); - } - } - } - } - - { - int NSubCycles = 0; - if (par->extract("general", "subcyclingsteps", NSubCycles)){ - options.setArgs("SUBCYCLING STEPS", std::to_string(NSubCycles)); - } - } - double writeInterval = 0; par->extract("general", "writeinterval", writeInterval); @@ -1458,6 +1635,9 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { std::string writeControl; if (par->extract("general", "writecontrol", writeControl)) { + + checkValidity(rank, {"steps", "runtime"}, writeControl); + if (writeControl == "steps") options.setArgs("SOLUTION OUTPUT CONTROL", "STEPS"); else if (writeControl == "runtime") @@ -1470,96 +1650,141 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { } bool dealiasing = true; - if (par->extract("general", "dealiasing", dealiasing)) + if (par->extract("general", "dealiasing", dealiasing)) { if (dealiasing) options.setArgs("ADVECTION TYPE", "CUBATURE+CONVECTIVE"); else options.setArgs("ADVECTION TYPE", "CONVECTIVE"); + } int cubN = round((3./2) * (N+1) - 1) - 1; if(!dealiasing) cubN = 0; par->extract("general", "cubaturepolynomialorder", cubN); options.setArgs("CUBATURE POLYNOMIAL DEGREE", std::to_string(cubN)); - { parseRegularization(rank, options, par, "general"); } - { - parseRegularization(rank, options, par, "velocity"); - } - - // MESH - std::string meshPartitioner; - if (par->extract("mesh", "partitioner", meshPartitioner)){ - if(meshPartitioner != "rcb" && meshPartitioner != "rcb+rsb"){ - std::ostringstream error; - error << "Could not parse mesh::partitioner = " << meshPartitioner; - append_error(error.str()); + // PROBLEMTYPE + bool stressFormulation; + if (par->extract("problemtype", "stressformulation", stressFormulation)){ + if (stressFormulation){ + options.setArgs("STRESSFORMULATION", "TRUE"); } - options.setArgs("MESH PARTITIONER", meshPartitioner); - } - - std::string meshConTol; - if (par->extract("mesh", "connectivitytol", meshConTol)){ - options.setArgs("MESH CONNECTIVITY TOL", meshConTol); } - std::string meshSolver; - if (par->extract("mesh", "solver", meshSolver)) { - options.setArgs("MESH KRYLOV SOLVER", "PCG"); - options.setArgs("MESH BASIS", "NODAL"); - options.setArgs("MESH PRECONDITIONER", "JACOBI"); - options.setArgs("MESH DISCRETIZATION", "CONTINUOUS"); - options.setArgs("MOVING MESH", "TRUE"); - if(meshSolver == "user") options.setArgs("MESH SOLVER", "USER"); - else if(meshSolver == "elasticity") { - options.setArgs("MESH SOLVER", "ELASTICITY"); - options.setArgs("MESH INITIAL GUESS", "PROJECTION-ACONJ"); - options.setArgs("MESH RESIDUAL PROJECTION VECTORS", "5"); - options.setArgs("MESH RESIDUAL PROJECTION START", "5"); + std::string eqn; + if (par->extract("problemtype", "equation", eqn)) { + const std::vector validValues = { + {"stokes"}, + }; + const std::vector list = serializeString(eqn, '+'); + for(std::string s : list) + { + checkValidity(rank, validValues, s); } - else if(meshSolver == "none") options.setArgs("MOVING MESH", "FALSE"); - else { - std::ostringstream error; - error << "Could not parse mesh::solver = " << meshSolver; - append_error(error.str()); + options.setArgs("ADVECTION", "TRUE"); + if (eqn == "stokes"){ + options.setArgs("ADVECTION", "FALSE"); } } - { - std::string keyValue; - if (par->extract("mesh", "maxiterations", keyValue)) - options.setArgs("MESH MAXIMUM ITERATIONS", keyValue); - } + int bcInPar = 1; - parseInitialGuess(rank, options, par, "mesh"); + // MESH + if (par->sections.count("mesh")) { + std::string meshFile; + if(par->extract("mesh", "file", meshFile)){ + options.setArgs("MESH FILE", meshFile); + } - parseSolverTolerance(rank, options, par, "mesh"); + std::string meshSolver; + if (par->extract("mesh", "solver", meshSolver)) { + options.setArgs("MESH KRYLOV SOLVER", "PCG"); + options.setArgs("MESH BASIS", "NODAL"); + options.setArgs("MESH PRECONDITIONER", "JACOBI"); + options.setArgs("MESH DISCRETIZATION", "CONTINUOUS"); + options.setArgs("MOVING MESH", "TRUE"); + if(meshSolver == "user") options.setArgs("MESH SOLVER", "USER"); + else if(meshSolver == "elasticity") { + options.setArgs("MESH COEFF FIELD", "TRUE"); + options.setArgs("MESH SOLVER", "ELASTICITY"); + options.setArgs("MESH INITIAL GUESS", "PROJECTION-ACONJ"); + options.setArgs("MESH RESIDUAL PROJECTION VECTORS", "5"); + options.setArgs("MESH RESIDUAL PROJECTION START", "5"); + } + else if(meshSolver == "none") options.setArgs("MOVING MESH", "FALSE"); + else { + std::ostringstream error; + error << "Could not parse mesh::solver = " << meshSolver; + append_error(error.str()); + } + } - int bcInPar = 1; - std::string m_bcMap; - if(par->extract("mesh", "boundarytypemap", m_bcMap)) { - std::vector sList; - sList = serializeString(m_bcMap,','); - bcMap::setup(sList, "mesh"); - bcInPar = 1; - } else { - bcInPar = 0; - } + std::string m_bcMap; + if(par->extract("mesh", "boundarytypemap", m_bcMap)) { + std::vector sList; + sList = serializeString(m_bcMap,','); + bcMap::setup(sList, "mesh"); + } else { + if(meshSolver == "elasticity"){ + // use derived mapping based on fluid boundary conditions + std::string v_bcMap; + if(par->extract("velocity", "boundarytypemap", v_bcMap)) { + std::vector sList; + sList = serializeString(v_bcMap,','); + bcMap::deriveMeshBoundaryConditions(sList); + } + } + } + + std::string meshPartitioner; + if (par->extract("mesh", "partitioner", meshPartitioner)){ + if(meshPartitioner != "rcb" && meshPartitioner != "rcb+rsb"){ + std::ostringstream error; + error << "Could not parse mesh::partitioner = " << meshPartitioner; + append_error(error.str()); + } + options.setArgs("MESH PARTITIONER", meshPartitioner); + } + + std::string meshConTol; + if (par->extract("mesh", "connectivitytol", meshConTol)){ + options.setArgs("MESH CONNECTIVITY TOL", meshConTol); + } + + { + const std::vector validValues = { + {"yes"}, + {"true"}, + {"1"}, + {"no"}, + {"false"}, + {"0"}, + }; + std::string checkpointOutputMesh; + if(par->extract("mesh", "writetofieldfile", checkpointOutputMesh)){ + + checkValidity(rank, validValues, checkpointOutputMesh); + if(checkForTrue(checkpointOutputMesh)){ + options.setArgs("CHECKPOINT OUTPUT MESH", "TRUE"); + } else { + options.setArgs("CHECKPOINT OUTPUT MESH", "FALSE"); + } + } + } + + { + std::string keyValue; + if (par->extract("mesh", "maxiterations", keyValue)) + options.setArgs("MESH MAXIMUM ITERATIONS", keyValue); + } - bool stressFormulation; - if (par->extract("problemtype", "stressformulation", stressFormulation)) - if (stressFormulation) - options.setArgs("STRESSFORMULATION", "TRUE"); + parseInitialGuess(rank, options, par, "mesh"); + parseSolverTolerance(rank, options, par, "mesh"); - std::string eqn; - if (par->extract("problemtype", "equation", eqn)) { - options.setArgs("ADVECTION", "TRUE"); - if (eqn == "stokes") - options.setArgs("ADVECTION", "FALSE"); } if (par->sections.count("velocity")) { @@ -1576,9 +1801,11 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { parsePreconditioner(rank, options, par, "pressure"); - std::string p_mglevels; - if (par->extract("pressure", "pmultigridcoarsening", p_mglevels)) - options.setArgs("PRESSURE MULTIGRID COARSENING", p_mglevels); + if (options.compareArgs("PRESSURE PRECONDITIONER", "MULTIGRID")) { + std::string p_mglevels; + if (par->extract("pressure", "pmultigridcoarsening", p_mglevels)) + options.setArgs("PRESSURE MULTIGRID COARSENING", p_mglevels); + } std::string p_solver; if (par->extract("pressure", "solver", p_solver)) { @@ -1636,6 +1863,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { parseSmoother(rank, options, par, "pressure"); + parseCoarseGridDiscretization(rank, options, par, "pressure"); parseCoarseSolver(rank, options, par, "pressure"); if (par->sections.count("boomeramg")) { @@ -1668,6 +1896,9 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { } if (par->sections.count("amgx")) { + if(!AMGXenabled()){ + append_error("AMGX was requested but is not compiled!\n"); + } std::string configFile; if (par->extract("amgx", "configfile", configFile)) options.setArgs("AMGX CONFIG FILE", configFile); @@ -1716,7 +1947,6 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { std::vector sList; sList = serializeString(v_bcMap, ','); bcMap::setup(sList, "velocity"); - bcInPar = 1; } else { bcInPar = 0; } @@ -1735,37 +1965,48 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { viscosity = fabs(1 / viscosity); options.setArgs("VISCOSITY", to_string_f(viscosity)); } + + parseRegularization(rank, options, par, "velocity"); } else { options.setArgs("VELOCITY", "FALSE"); } + // MESH + // SCALARS int nscal = 0; int isStart = 0; + + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + if (par->sections.count("temperature")) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << 0; + std::string sid = ss.str(); nscal++; isStart++; { std::string keyValue; if (par->extract("temperature", "maxiterations", keyValue)) - options.setArgs("SCALAR00 MAXIMUM ITERATIONS", keyValue); + options.setArgs("SCALAR" + sid + " MAXIMUM ITERATIONS", keyValue); } { parseRegularization(rank, options, par, "temperature"); } - options.setArgs("SCALAR00 IS TEMPERATURE", "TRUE"); + options.setArgs("SCALAR" + sid + " IS TEMPERATURE", "TRUE"); std::string solver; par->extract("temperature", "solver", solver); if (solver == "none") { - options.setArgs("SCALAR00 SOLVER", "NONE"); + options.setArgs("SCALAR" + sid + " SOLVER", "NONE"); } else { - options.setArgs("SCALAR00 KRYLOV SOLVER", "PCG"); - options.setArgs("SCALAR00 PRECONDITIONER", "JACOBI"); + options.setArgs("SCALAR" + sid + " KRYLOV SOLVER", "PCG"); + options.setArgs("SCALAR" + sid + " PRECONDITIONER", "JACOBI"); + options.setArgs("SCALAR" + sid + " COEFF FIELD", "TRUE"); parseInitialGuess(rank, options, par, "temperature"); @@ -1778,7 +2019,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { append_error("Invalid expression for conductivity"); if (diffusivity < 0) diffusivity = fabs(1 / diffusivity); - options.setArgs("SCALAR00 DIFFUSIVITY", to_string_f(diffusivity)); + options.setArgs("SCALAR" + sid + " DIFFUSIVITY", to_string_f(diffusivity)); } if (par->extract("temperature", "rhocp", sbuf)) { @@ -1786,7 +2027,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { double rhoCp = te_interp(sbuf.c_str(), &err); if (err) append_error("Invalid expression for rhoCp"); - options.setArgs("SCALAR00 DENSITY", to_string_f(rhoCp)); + options.setArgs("SCALAR" + sid + " DENSITY", to_string_f(rhoCp)); } std::string s_bcMap; @@ -1795,7 +2036,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { append_error("ERROR: boundaryTypeMap has to be defined for all fields"); std::vector sList; sList = serializeString(s_bcMap, ','); - bcMap::setup(sList, "scalar00"); + bcMap::setup(sList, "scalar" + sid); } else { if (bcInPar) append_error("ERROR: boundaryTypeMap has to be defined for all fields"); @@ -1804,35 +2045,45 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { } } + const auto sections = par->sections; + for (auto &sec : par->sections) { std::string key = sec.first; if (key.compare(0, 6, "scalar") == 0) nscal++; } options.setArgs("NUMBER OF SCALARS", std::to_string(nscal)); - for (int is = isStart; is < nscal; is++) { + for (auto &&sec : sections) { + const auto parScope = sec.first; + if (parScope.compare(0, 6, "scalar") != 0) + continue; + + const auto is = parseScalarIntegerFromString(parScope); + std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is; + ss << std::setfill('0') << std::setw(scalarWidth) << is; std::string sid = ss.str(); std::string sidPar = sid; if (isStart == 0) { std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is + 1; + ss << std::setfill('0') << std::setw(scalarWidth) << is + 1; sidPar = ss.str(); } { std::string keyValue; - if (par->extract("scalar" + sidPar, "maxiterations", keyValue)) + if (par->extract(parScope, "maxiterations", keyValue)) options.setArgs("SCALAR" + sid + " MAXIMUM ITERATIONS", keyValue); } - { - parseRegularization(rank, options, par, "scalar" + sidPar); + options.setArgs("SCALAR" + sid + " COEFF FIELD", "TRUE"); + + { + parseRegularization(rank, options, par, parScope); } std::string solver; - par->extract("scalar" + sidPar, "solver", solver); + par->extract(parScope, "solver", solver); if (solver == "none") { options.setArgs("SCALAR" + sid + " SOLVER", "NONE"); continue; @@ -1844,9 +2095,9 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { options.setArgs("SCALAR" + sid + " PRECONDITIONER", "JACOBI"); - parseSolverTolerance(rank, options, par, "scalar" + sidPar); + parseSolverTolerance(rank, options, par, parScope); - if (par->extract("scalar" + sidPar, "diffusivity", sbuf)) { + if (par->extract(parScope, "diffusivity", sbuf)) { int err = 0; double diffusivity = te_interp(sbuf.c_str(), &err); if (err) @@ -1857,7 +2108,7 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { to_string_f(diffusivity)); } - if (par->extract("scalar" + sidPar, "rho", sbuf)) { + if (par->extract(parScope, "rho", sbuf)) { int err = 0; double rho = te_interp(sbuf.c_str(), &err); if (err) @@ -1866,13 +2117,14 @@ setupAide parRead(void *ppar, std::string setupFile, MPI_Comm comm) { } std::string s_bcMap; - if (par->extract("scalar" + sidPar, "boundarytypemap", s_bcMap)) { + if (par->extract(parScope, "boundarytypemap", s_bcMap)) { if (!bcInPar) append_error("ERROR: boundaryTypeMap has to be defined for all fields"); std::vector sList; sList = serializeString(s_bcMap, ','); bcMap::setup(sList, "scalar" + sid); - } else { + } + else { if (bcInPar) append_error("ERROR: boundaryTypeMap has to be defined for all fields"); bcInPar = 0; diff --git a/src/core/parReader.hpp b/src/io/parReader.hpp similarity index 100% rename from src/core/parReader.hpp rename to src/io/parReader.hpp diff --git a/src/io/re2Reader.cpp b/src/io/re2Reader.cpp new file mode 100644 index 000000000..3f8e871e9 --- /dev/null +++ b/src/io/re2Reader.cpp @@ -0,0 +1,40 @@ +#include "nrs.hpp" +#include "re2Reader.hpp" + +void re2::nelg(const std::string& meshFile, int& nelgt, int& nelgv, MPI_Comm comm) +{ + int rank = 0; + if(comm != MPI_COMM_NULL) MPI_Comm_rank(comm, &rank); + + int err = 0; + if(rank == 0) { + char buf[FILENAME_MAX]; + strcpy(buf, meshFile.c_str()); + FILE *fp = fopen(buf, "r"); + if (!fp) { + if(rank == 0) printf("\nERROR: Cannot find %s!\n", buf); + ABORT(EXIT_FAILURE); + } + fgets(buf, 80, fp); + fclose(fp); + + char ver[6]; + int ndim; + // has to match header in re2 + sscanf(buf, "%5s %9d %1d %9d", ver, &nelgt, &ndim, &nelgv); + + if(ndim != 3) { + if(rank == 0) printf("\nERROR: Unsupported ndim=%d read from re2 header!\n", ndim); + err++; + } + if(nelgt <= 0 || nelgv <=0 || nelgv > nelgt) { + if(rank == 0) printf("\nERROR: Invalid nelgt=%d / nelgv=%d read from re2 header!\n", nelgt, nelgv); + err++; + } + } + if(comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_MAX, comm); + if(err) ABORT(EXIT_FAILURE); + + if(comm != MPI_COMM_NULL) MPI_Bcast(&nelgt, 1, MPI_INT, 0, comm); + if(comm != MPI_COMM_NULL) MPI_Bcast(&nelgv, 1, MPI_INT, 0, comm); +} diff --git a/src/io/re2Reader.hpp b/src/io/re2Reader.hpp new file mode 100644 index 000000000..a11f26a1d --- /dev/null +++ b/src/io/re2Reader.hpp @@ -0,0 +1,11 @@ +#if !defined(nekrs_re2reader_hpp_) +#define nekrs_re2reader_hpp_ + +#include "nrs.hpp" + +namespace re2 +{ +void nelg(const std::string& meshFile, int& nelgt, int& nelgv, MPI_Comm comm); +} + +#endif diff --git a/src/lib/nekrs.cpp b/src/lib/nekrs.cpp index 78717a5b4..2f5ffe90b 100644 --- a/src/lib/nekrs.cpp +++ b/src/lib/nekrs.cpp @@ -3,6 +3,7 @@ #include "meshSetup.hpp" #include "setup.hpp" #include "nekInterfaceAdapter.hpp" +#include "printHeader.hpp" #include "udf.hpp" #include "parReader.hpp" #include "configReader.hpp" @@ -17,27 +18,21 @@ platform_t* platform; static int rank, size; -static MPI_Comm comm; +static MPI_Comm commg, comm; static nrs_t* nrs; static setupAide options; static dfloat lastOutputTime = 0; +static int firstOutfld = 1; static int enforceLastStep = 0; static int enforceOutputStep = 0; static void setOccaVars(); -void printHeader() -{ - std::cout << R"( __ ____ _____)" << std::endl - << R"( ____ ___ / /__ / __ \/ ___/)" << std::endl - << R"( / __ \ / _ \ / //_// /_/ /\__ \ )" << std::endl - << R"( / / / // __// ,< / _, _/___/ / )" << std::endl - << R"(/_/ /_/ \___//_/|_|/_/ |_|/____/ )" - << "v" << NEKRS_VERSION << "." << NEKRS_SUBVERSION - << " (" << GITCOMMITHASH << ")" << std::endl - << std::endl - << "COPYRIGHT (c) 2019-2021 UCHICAGO ARGONNE, LLC" << std::endl - << std::endl; +bool useNodeLocalCache(){ + int buildNodeLocal = 0; + if (getenv("NEKRS_CACHE_LOCAL")) + buildNodeLocal = std::stoi(getenv("NEKRS_CACHE_LOCAL")); + return (buildNodeLocal > 0); } namespace nekrs @@ -49,10 +44,13 @@ double startTime(void) return val; } -void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, - int ciMode, std::string cacheDir, std::string _setupFile, - std::string _backend, std::string _deviceID) +void setup(MPI_Comm commg_in, MPI_Comm comm_in, + int buildOnly, int commSizeTarget, + int ciMode, std::string _setupFile, + std::string _backend, std::string _deviceID, + int debug) { + MPI_Comm_dup(commg_in, &commg); MPI_Comm_dup(comm_in, &comm); MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); @@ -76,12 +74,7 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, { char buf[FILENAME_MAX]; char * ret = getcwd(buf, sizeof(buf)); - if(!ret) ABORT(EXIT_FAILURE);; - std::string cwd; - cwd.assign(buf); - - std::string dir(cacheDir); - if (cacheDir.empty()) dir = cwd + "/.cache"; + std::string dir = std::string(buf) + "/.cache"; if(getenv("NEKRS_CACHE_DIR")) dir.assign(getenv("NEKRS_CACHE_DIR")); setenv("NEKRS_CACHE_DIR", dir.c_str(), 1); } @@ -89,10 +82,10 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, setOccaVars(); if (rank == 0) { - std::string install_dir; - install_dir.assign(getenv("NEKRS_HOME")); + std::string installDir; + installDir.assign(getenv("NEKRS_HOME")); std::cout << std::endl; - std::cout << "using NEKRS_HOME: " << install_dir << std::endl; + std::cout << "using NEKRS_HOME: " << installDir << std::endl; std:: string cache_dir; cache_dir.assign(getenv("NEKRS_CACHE_DIR")); @@ -113,22 +106,20 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, fflush(stdout); } - if (options.getArgs("THREAD MODEL").length() == 0) + if(options.getArgs("THREAD MODEL").length() == 0) options.setArgs("THREAD MODEL", getenv("NEKRS_OCCA_MODE_DEFAULT")); if(!_backend.empty()) options.setArgs("THREAD MODEL", _backend); if(!_deviceID.empty()) options.setArgs("DEVICE NUMBER", _deviceID); // setup device - platform_t* _platform = platform_t::getInstance(options, comm); + platform_t* _platform = platform_t::getInstance(options, commg, comm); platform = _platform; platform->par = par; - platform->timer.tic("setup", 1); + if(debug) platform->options.setArgs("VERBOSE","TRUE"); int buildRank = rank; - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); + const bool buildNodeLocal = useNodeLocalCache(); if(buildNodeLocal) MPI_Comm_rank(platform->comm.mpiCommLocal, &buildRank); @@ -147,13 +138,7 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, options.getArgs("UDF FILE", udfFile); if (!udfFile.empty()) { udfBuild(udfFile.c_str(), options); - - if(buildOnly) { - *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",1); - *(void**)(&udf.setup0) = udfLoadFunction("UDF_Setup0",0); - } else { - udfLoad(); - } + udfLoad(); } options.setArgs("CI-MODE", std::to_string(ciMode)); @@ -164,6 +149,12 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, if(udf.setup0) udf.setup0(comm, options); + { + int overlap = 1; + if(options.compareArgs("GS OVERLAP", "FALSE")) overlap = 0; + oogs::overlap(overlap); + } + compileKernels(); if(buildOnly) { @@ -185,32 +176,24 @@ void setup(MPI_Comm comm_in, int buildOnly, int commSizeTarget, nrs = new nrs_t(); - nrsSetup(comm, options, nrs); + { + int result = 0; + MPI_Comm_compare(commg, comm, &result); - nrs->o_U.copyFrom(nrs->U); - nrs->o_P.copyFrom(nrs->P); - nrs->o_prop.copyFrom(nrs->prop); - if(nrs->Nscalar) { - nrs->cds->o_S.copyFrom(nrs->cds->S); - nrs->cds->o_prop.copyFrom(nrs->cds->prop); + nrs->multiSession = (result == MPI_UNEQUAL); } - evaluateProperties(nrs, startTime()); - nrs->o_prop.copyTo(nrs->prop); - if(nrs->Nscalar) nrs->cds->o_prop.copyTo(nrs->cds->prop); - - nek::ocopyToNek(startTime(), 0); + nrsSetup(comm, options, nrs); - platform->timer.toc("setup"); const double setupTime = platform->timer.query("setup", "DEVICE:MAX"); if(rank == 0) { std::cout << "\nsettings:\n" << std::endl << options << std::endl; - std::cout << "occa memory usage: " << platform->device.memoryAllocated()/1e9 << " GB" << std::endl; - std::cout << "initialization took " << setupTime << " s" << std::endl; + std::cout << "occa memory usage: " << platform->device.occaDevice().memoryAllocated() / 1e9 << " GB" + << std::endl; } fflush(stdout); - platform->timer.set("setup", setupTime); + platform->flopCounter->clear(); } void runStep(double time, double dt, int tstep) @@ -256,16 +239,16 @@ double dt(int tstep) } const double dtOld = nrs->dt[0]; timeStepper::adjustDt(nrs, tstep); - // limit relative change to control introduced error - if(tstep > 1) nrs->dt[0] = (nrs->dt[0] < 1.25*dtOld) ? nrs->dt[0] : 1.25*dtOld; + + // limit dt to control introduced error + if(tstep > 1) nrs->dt[0] = std::min(nrs->dt[0], 1.25*dtOld); + double maxDt = 0; + platform->options.getArgs("MAX DT", maxDt); + if(maxDt > 0) nrs->dt[0] = std::min(nrs->dt[0], maxDt); } - double maxDt = std::numeric_limits::max(); - platform->options.getArgs("MAX DT", maxDt); - nrs->dt[0] = (nrs->dt[0] < maxDt) ? nrs->dt[0] : maxDt; - if(nrs->dt[0] < 1e-10 || std::isnan(nrs->dt[0]) || std::isinf(nrs->dt[0])) { - if(platform->comm.mpiRank == 0) std::cout << "Invalid time step size!\n"; + if(platform->comm.mpiRank == 0) printf("Invalid time step size %.2e\n", nrs->dt[0]); ABORT(EXIT_FAILURE); } @@ -313,11 +296,15 @@ void outfld(double time, std::string suffix) std::string oldValue; platform->options.getArgs("CHECKPOINT OUTPUT MESH", oldValue); - if(lastOutputTime == 0) + if(firstOutfld) + platform->options.setArgs("CHECKPOINT OUTPUT MESH", "TRUE"); + + if(platform->options.compareArgs("MOVING MESH", "TRUE")) platform->options.setArgs("CHECKPOINT OUTPUT MESH", "TRUE"); writeFld(nrs, time, suffix); lastOutputTime = time; + firstOutfld = 0; platform->options.setArgs("CHECKPOINT OUTPUT MESH", oldValue); } @@ -370,15 +357,21 @@ void* nrsPtr(void) void finalize(void) { - AMGXfree(); + if(options.compareArgs("BUILD ONLY", "FALSE")) { + AMGXfree(); + nek::end(); + } } -void printRuntimeStatistics(int step) +int runTimeStatFreq() { - platform_t* platform = platform_t::getInstance(options, comm); - platform->timer.printRunStat(step); + int freq = 500; + platform->options.getArgs("RUNTIME STATISTICS FREQUENCY", freq); + return freq; } +void printRuntimeStatistics(int step) { platform->timer.printRunStat(step); } + void processUpdFile() { char* rbuf = nullptr; @@ -445,6 +438,16 @@ void processUpdFile() } } +void printInfo(double time, int tstep) { timeStepper::printInfo(nrs, time, tstep); } + +void verboseInfo(bool enabled) +{ + platform->options.setArgs("VERBOSE SOLVER INFO", "FALSE"); + if(enabled) platform->options.setArgs("VERBOSE SOLVER INFO", "TRUE"); +} + +void updateTimer(const std::string &key, double time) { platform->timer.set(key, time); } + } // namespace static void setOccaVars() @@ -452,14 +455,17 @@ static void setOccaVars() std::string cache_dir; cache_dir.assign(getenv("NEKRS_CACHE_DIR")); - if (!getenv("OCCA_CACHE_DIR")) - occa::env::OCCA_CACHE_DIR = cache_dir + "/occa/"; + if (!getenv("OCCA_CACHE_DIR")) { + const std::string path= cache_dir + "/occa/"; + occa::env::OCCA_CACHE_DIR = path; + setenv("OCCA_CACHE_DIR", path.c_str(), 1); + } - std::string install_dir; - install_dir.assign(getenv("NEKRS_HOME")); + std::string installDir; + installDir.assign(getenv("NEKRS_HOME")); if (!getenv("OCCA_DIR")) - occa::env::OCCA_DIR = install_dir + "/"; + occa::env::OCCA_DIR = installDir + "/"; occa::env::OCCA_INSTALL_DIR = occa::env::OCCA_DIR; } diff --git a/src/lib/nekrs.hpp b/src/lib/nekrs.hpp index f2befa1b2..5c7c62358 100644 --- a/src/lib/nekrs.hpp +++ b/src/lib/nekrs.hpp @@ -1,15 +1,16 @@ #if !defined(nekrs_nrs_hpp_) #define nekrs_nrs_hpp_ -#include #include +#include namespace nekrs { -void setup(MPI_Comm comm, int buildOnly, int targetSize, - int ciMode, std::string cacheDir, std::string setupFile, - std::string backend, std::string deviceID); - +void setup(MPI_Comm commg_in, MPI_Comm comm_in, + int buildOnly, int commSizeTarget, + int ciMode, std::string _setupFile, + std::string _backend, std::string _deviceID, + int debug); void runStep(double time, double dt, int tstep); void copyFromNek(double time, int tstep); void udfExecuteStep(double time, int tstep, int isOutputStep); @@ -19,6 +20,7 @@ int outputStep(double time, int tStep); void outputStep(int val); void finalize(); void nekUserchk(void); +int runTimeStatFreq(); void printRuntimeStatistics(int step); double writeInterval(void); double dt(int tStep); @@ -28,6 +30,9 @@ int numSteps(void); int lastStep(double time, int tstep, double elapsedTime); int writeControlRunTime(void); void processUpdFile(); +void printInfo(double time, int tstep); +void verboseInfo(bool enabled); +void updateTimer(const std::string &key, double time); void* nrsPtr(void); void* nekPtr(const char* id); diff --git a/src/linAlg/linAlg.cpp b/src/linAlg/linAlg.cpp index 043a2d0b3..b4ca4250c 100644 --- a/src/linAlg/linAlg.cpp +++ b/src/linAlg/linAlg.cpp @@ -27,36 +27,39 @@ SOFTWARE. #include "linAlg.hpp" #include "platform.hpp" -linAlg_t* linAlg_t::singleton = nullptr; +linAlg_t *linAlg_t::singleton = nullptr; -linAlg_t* -linAlg_t::getInstance() +linAlg_t *linAlg_t::getInstance() { - if(!singleton) + if (!singleton) singleton = new linAlg_t(); return singleton; } -linAlg_t::linAlg_t() { +linAlg_t::linAlg_t() +{ blocksize = BLOCKSIZE; - serial = platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP"; + serial = platform->serial; comm = platform->comm.mpiComm; setup(); } -void linAlg_t::reallocScratch(const dlong Nbytes) +void linAlg_t::reallocScratch(const size_t Nbytes) { - device_t& device = platform->device; - if(h_scratch.size()) h_scratch.free(); - if(o_scratch.size()) o_scratch.free(); - //pinned scratch buffer + device_t &device = platform->device; + if (h_scratch.size()) + h_scratch.free(); + if (o_scratch.size()) + o_scratch.free(); + // pinned scratch buffer { h_scratch = device.mallocHost(Nbytes); - scratch = (dfloat*) h_scratch.ptr(); + scratch = (dfloat *)h_scratch.ptr(); } o_scratch = device.malloc(Nbytes); } -void linAlg_t::setup() { +void linAlg_t::setup() +{ - auto& kernels = platform->kernels; + auto &kernels = platform->kernels; int rank; MPI_Comm_rank(comm, &rank); @@ -71,46 +74,49 @@ void linAlg_t::setup() { MPI_Barrier(platform->comm.mpiComm); double tStartLoadKernel = MPI_Wtime(); { - fillKernel = kernels.getKernel("fill"); - absKernel = kernels.getKernel("vabs"); - addKernel = kernels.getKernel("add"); - scaleKernel = kernels.getKernel("scale"); - scaleManyKernel = kernels.getKernel("scaleMany"); - axpbyKernel = kernels.getKernel("axpby"); - axpbyManyKernel = kernels.getKernel("axpbyMany"); - axpbyzKernel = kernels.getKernel("axpbyz"); - axpbyzManyKernel = kernels.getKernel("axpbyzMany"); - axmyKernel = kernels.getKernel("axmy"); - axmyManyKernel = kernels.getKernel("axmyMany"); - axmyVectorKernel = kernels.getKernel("axmyVector"); - axmyzKernel = kernels.getKernel("axmyz"); - axmyzManyKernel = kernels.getKernel("axmyzMany"); - adyKernel = kernels.getKernel("ady"); - adyManyKernel = kernels.getKernel("adyMany"); - axdyKernel = kernels.getKernel("axdy"); - aydxKernel = kernels.getKernel("aydx"); - aydxManyKernel = kernels.getKernel("aydxMany"); - axdyzKernel = kernels.getKernel("axdyz"); - sumKernel = kernels.getKernel("sum"); - sumManyKernel = kernels.getKernel("sumMany"); - minKernel = kernels.getKernel("min"); - maxKernel = kernels.getKernel("max"); - norm2Kernel = kernels.getKernel("norm2"); - norm2ManyKernel = kernels.getKernel("norm2Many"); - norm1Kernel = kernels.getKernel("norm1"); - norm1ManyKernel = kernels.getKernel("norm1Many"); - weightedNorm1Kernel = kernels.getKernel("weightedNorm1"); - weightedNorm1ManyKernel = kernels.getKernel("weightedNorm1Many"); - weightedNorm2Kernel = kernels.getKernel("weightedNorm2"); - weightedNorm2ManyKernel = kernels.getKernel("weightedNorm2Many"); - innerProdKernel = kernels.getKernel("innerProd"); - weightedInnerProdKernel = kernels.getKernel("weightedInnerProd"); - weightedInnerProdManyKernel = kernels.getKernel("weightedInnerProdMany"); - weightedInnerProdMultiKernel = kernels.getKernel("weightedInnerProdMulti"); - } -} - -linAlg_t::~linAlg_t() { + fillKernel = kernels.get("fill"); + absKernel = kernels.get("vabs"); + addKernel = kernels.get("add"); + scaleKernel = kernels.get("scale"); + scaleManyKernel = kernels.get("scaleMany"); + axpbyKernel = kernels.get("axpby"); + axpbyManyKernel = kernels.get("axpbyMany"); + axpbyzKernel = kernels.get("axpbyz"); + axpbyzManyKernel = kernels.get("axpbyzMany"); + axmyKernel = kernels.get("axmy"); + axmyManyKernel = kernels.get("axmyMany"); + axmyVectorKernel = kernels.get("axmyVector"); + axmyzKernel = kernels.get("axmyz"); + axmyzManyKernel = kernels.get("axmyzMany"); + adyKernel = kernels.get("ady"); + adyManyKernel = kernels.get("adyMany"); + axdyKernel = kernels.get("axdy"); + aydxKernel = kernels.get("aydx"); + aydxManyKernel = kernels.get("aydxMany"); + axdyzKernel = kernels.get("axdyz"); + sumKernel = kernels.get("sum"); + sumManyKernel = kernels.get("sumMany"); + minKernel = kernels.get("min"); + maxKernel = kernels.get("max"); + norm2Kernel = kernels.get("norm2"); + norm2ManyKernel = kernels.get("norm2Many"); + norm1Kernel = kernels.get("norm1"); + norm1ManyKernel = kernels.get("norm1Many"); + weightedNorm1Kernel = kernels.get("weightedNorm1"); + weightedNorm1ManyKernel = kernels.get("weightedNorm1Many"); + weightedNorm2Kernel = kernels.get("weightedNorm2"); + weightedNorm2ManyKernel = kernels.get("weightedNorm2Many"); + innerProdKernel = kernels.get("innerProd"); + weightedInnerProdKernel = kernels.get("weightedInnerProd"); + weightedInnerProdManyKernel = kernels.get("weightedInnerProdMany"); + weightedInnerProdMultiKernel = kernels.get("weightedInnerProdMulti"); + crossProductKernel = kernels.get("crossProduct"); + unitVectorKernel = kernels.get("unitVector"); + } +} + +linAlg_t::~linAlg_t() +{ fillKernel.free(); absKernel.free(); addKernel.free(); @@ -153,157 +159,239 @@ linAlg_t::~linAlg_t() { /*********************/ // o_a[n] = alpha -void linAlg_t::fill(const dlong N, const dfloat alpha, occa::memory& o_a) { - fillKernel(N, alpha, o_a); -} +void linAlg_t::fill(const dlong N, const dfloat alpha, occa::memory &o_a) { fillKernel(N, alpha, o_a); } // o_a[n] = abs(o_a[n]) -void linAlg_t::abs(const dlong N, occa::memory& o_a) { - absKernel(N, o_a); -} +void linAlg_t::abs(const dlong N, occa::memory &o_a) { absKernel(N, o_a); } // o_a[n] += alpha -void linAlg_t::add(const dlong N, const dfloat alpha, occa::memory& o_a, const dlong offset) { +void linAlg_t::add(const dlong N, const dfloat alpha, occa::memory &o_a, const dlong offset) +{ addKernel(N, offset, alpha, o_a); } // o_a[n] *= alpha -void linAlg_t::scale(const dlong N, const dfloat alpha, occa::memory& o_a) { - scaleKernel(N, alpha, o_a); -} -void linAlg_t::scaleMany(const dlong N, const dlong Nfields, const dlong fieldOffset, const dfloat alpha, occa::memory& o_a, const dlong offset) { +void linAlg_t::scale(const dlong N, const dfloat alpha, occa::memory &o_a) { scaleKernel(N, alpha, o_a); } +void linAlg_t::scaleMany(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + const dfloat alpha, + occa::memory &o_a, + const dlong offset) +{ scaleManyKernel(N, Nfields, fieldOffset, offset, alpha, o_a); } // o_y[n] = beta*o_y[n] + alpha*o_x[n] -void linAlg_t::axpby(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, const dlong xOffset, const dlong yOffset) { +void linAlg_t::axpby(const dlong N, + const dfloat alpha, + occa::memory &o_x, + const dfloat beta, + occa::memory &o_y, + const dlong xOffset, + const dlong yOffset) +{ axpbyKernel(N, xOffset, yOffset, alpha, o_x, beta, o_y); -} -void linAlg_t::axpbyMany(const dlong N, const dlong Nfields, const dlong offset, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y) { + platform->flopCounter->add("axpby", 3 * static_cast(N)); +} +void linAlg_t::axpbyMany(const dlong N, + const dlong Nfields, + const dlong offset, + const dfloat alpha, + occa::memory &o_x, + const dfloat beta, + occa::memory &o_y) +{ axpbyManyKernel(N, Nfields, offset, alpha, o_x, beta, o_y); + platform->flopCounter->add("axpbyMany", 3 * static_cast(N) * Nfields); } // o_z[n] = beta*o_y[n] + alpha*o_x[n] -void linAlg_t::axpbyz(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z) { +void linAlg_t::axpbyz(const dlong N, + const dfloat alpha, + occa::memory &o_x, + const dfloat beta, + occa::memory &o_y, + occa::memory &o_z) +{ axpbyzKernel(N, alpha, o_x, beta, o_y, o_z); -} -void linAlg_t::axpbyzMany(const dlong N, const dlong Nfields, const dlong fieldOffset, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y, occa::memory& o_z) { + platform->flopCounter->add("axpbyz", 3 * static_cast(N)); +} +void linAlg_t::axpbyzMany(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + const dfloat alpha, + occa::memory &o_x, + const dfloat beta, + occa::memory &o_y, + occa::memory &o_z) +{ axpbyzManyKernel(N, Nfields, fieldOffset, alpha, o_x, beta, o_y, o_z); + platform->flopCounter->add("axpbyzMany", 3 * static_cast(N) * Nfields); } // o_y[n] = alpha*o_x[n]*o_y[n] -void linAlg_t::axmy(const dlong N, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::axmy(const dlong N, const dfloat alpha, occa::memory &o_x, occa::memory &o_y) +{ axmyKernel(N, alpha, o_x, o_y); } -void linAlg_t::axmyMany(const dlong N, const dlong Nfields, const dlong offset, - const dlong mode, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::axmyMany(const dlong N, + const dlong Nfields, + const dlong offset, + const dlong mode, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y) +{ axmyManyKernel(N, Nfields, offset, mode, alpha, o_x, o_y); } -void linAlg_t::axmyVector(const dlong N, const dlong offset, - const dlong mode, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::axmyVector(const dlong N, + const dlong offset, + const dlong mode, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y) +{ axmyVectorKernel(N, offset, mode, alpha, o_x, o_y); } // o_z[n] = alpha*o_x[n]*o_y[n] -void linAlg_t::axmyz(const dlong N, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y, occa::memory& o_z) { +void linAlg_t::axmyz(const dlong N, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y, + occa::memory &o_z) +{ axmyzKernel(N, alpha, o_x, o_y, o_z); } -void linAlg_t::axmyzMany(const dlong N, const dlong Nfields, const dlong offset, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y, occa::memory& o_z) { +void linAlg_t::axmyzMany(const dlong N, + const dlong Nfields, + const dlong offset, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y, + occa::memory &o_z) +{ axmyzManyKernel(N, Nfields, offset, alpha, o_x, o_y, o_z); } // o_y[n] = alpha*o_x[n]/o_y[n] -void linAlg_t::axdy(const dlong N, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::axdy(const dlong N, const dfloat alpha, occa::memory &o_x, occa::memory &o_y) +{ axdyKernel(N, alpha, o_x, o_y); } -void linAlg_t::aydx(const dlong N, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::aydx(const dlong N, const dfloat alpha, occa::memory &o_x, occa::memory &o_y) +{ aydxKernel(N, alpha, o_x, o_y); } -void linAlg_t::aydxMany(const dlong N, const dlong Nfields, const dlong fieldOffset, - const dlong mode, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y) { +void linAlg_t::aydxMany(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + const dlong mode, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y) +{ aydxManyKernel(N, Nfields, fieldOffset, mode, alpha, o_x, o_y); } // o_y[n] = alpha/o_y[n] -void linAlg_t::ady(const dlong N, const dfloat alpha, - occa::memory& o_y) { - adyKernel(N, alpha, o_y); -} -void linAlg_t::adyMany(const dlong N, const dlong Nfields, const dlong offset, const dfloat alpha, - occa::memory& o_y) { +void linAlg_t::ady(const dlong N, const dfloat alpha, occa::memory &o_y) { adyKernel(N, alpha, o_y); } +void linAlg_t::adyMany(const dlong N, + const dlong Nfields, + const dlong offset, + const dfloat alpha, + occa::memory &o_y) +{ adyManyKernel(N, Nfields, offset, alpha, o_y); } // o_z[n] = alpha*o_x[n]/o_y[n] -void linAlg_t::axdyz(const dlong N, const dfloat alpha, - occa::memory& o_x, occa::memory& o_y, occa::memory& o_z) { +void linAlg_t::axdyz(const dlong N, + const dfloat alpha, + occa::memory &o_x, + occa::memory &o_y, + occa::memory &o_z) +{ axdyzKernel(N, alpha, o_x, o_y, o_z); } // \sum o_a -dfloat linAlg_t::sum(const dlong N, occa::memory& o_a, MPI_Comm _comm, const dlong offset) { - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - sumKernel(Nblock, N, offset, o_a, o_scratch); +dfloat linAlg_t::sum(const dlong N, occa::memory &o_a, MPI_Comm _comm, const dlong offset) +{ + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); - o_scratch.copyTo(scratch, Nbytes); + if (N > 1) { + sumKernel(Nblock, N, offset, o_a, o_scratch); + o_scratch.copyTo(scratch, Nbytes); + } + else { + o_a.copyTo(scratch, Nbytes); + } dfloat sum = 0; - for(dlong n=0;n 1 || Nfields > 1) { + sumManyKernel(Nblock, N, Nfields, fieldOffset, o_a, o_scratch); - o_scratch.copyTo(scratch, Nbytes); + o_scratch.copyTo(scratch, Nbytes); + } + else { + o_a.copyTo(scratch, Nbytes); + } dfloat sum = 0; - for(dlong n=0;n 1) { + minKernel(Nblock, N, o_a, o_scratch); - o_scratch.copyTo(scratch, Nbytes); + o_scratch.copyTo(scratch, Nbytes); + } + else { + o_a.copyTo(scratch, Nbytes); + } dfloat min = scratch[0]; - for(dlong n=1;n 1) { + maxKernel(Nblock, N, o_a, o_scratch); - o_scratch.copyTo(scratch, Nbytes); + o_scratch.copyTo(scratch, Nbytes); + } + else { + o_a.copyTo(scratch, Nbytes); + } dfloat max = scratch[0]; - for(dlong n=1;n max) ? scratch[n]:max; + for (dlong n = 1; n < Nblock; ++n) { + max = (scratch[n] > max) ? scratch[n] : max; } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &max, 1, MPI_DFLOAT, MPI_MAX, _comm); return max; } // ||o_a||_2 -dfloat linAlg_t::norm2(const dlong N, occa::memory& o_x, MPI_Comm _comm) { +dfloat linAlg_t::norm2(const dlong N, occa::memory &o_x, MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - norm2Kernel(Nblock, N, o_x, o_scratch); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1) { + norm2Kernel(Nblock, N, o_x, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat x; + o_x.copyTo(&x, Nbytes); + norm = x * x; + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); @@ -362,26 +466,40 @@ dfloat linAlg_t::norm2(const dlong N, occa::memory& o_x, MPI_Comm _comm) { return sqrt(norm); } -dfloat linAlg_t::norm2Many(const dlong N, const dlong Nfields, const dlong fieldOffset, occa::memory& o_x, MPI_Comm _comm) { +dfloat linAlg_t::norm2Many(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + occa::memory &o_x, + MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); - norm2ManyKernel(Nblock, N, Nfields, fieldOffset, o_x, o_scratch); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1 || Nfields > 1) { + norm2ManyKernel(Nblock, N, Nfields, fieldOffset, o_x, o_scratch); + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat x; + o_x.copyTo(&x, Nbytes); + norm = x * x; + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); @@ -390,26 +508,36 @@ dfloat linAlg_t::norm2Many(const dlong N, const dlong Nfields, const dlong field return sqrt(norm); } // ||o_a||_1 -dfloat linAlg_t::norm1(const dlong N, occa::memory& o_x, MPI_Comm _comm) { +dfloat linAlg_t::norm1(const dlong N, occa::memory &o_x, MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); - norm1Kernel(Nblock, N, o_x, o_scratch); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1) { + norm1Kernel(Nblock, N, o_x, o_scratch); + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat x; + o_x.copyTo(&x, Nbytes); + norm = std::abs(x); + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); @@ -417,27 +545,41 @@ dfloat linAlg_t::norm1(const dlong N, occa::memory& o_x, MPI_Comm _comm) { return norm; } -dfloat linAlg_t::norm1Many(const dlong N, const dlong Nfields, const dlong fieldOffset, occa::memory& o_x, MPI_Comm _comm) { +dfloat linAlg_t::norm1Many(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + occa::memory &o_x, + MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - norm1ManyKernel(Nblock, N, Nfields, fieldOffset, o_x, o_scratch); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1 || Nfields > 1) { + norm1ManyKernel(Nblock, N, Nfields, fieldOffset, o_x, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat x; + o_x.copyTo(&x, Nbytes); + norm = std::abs(x); + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER @@ -447,25 +589,41 @@ dfloat linAlg_t::norm1Many(const dlong N, const dlong Nfields, const dlong field } // o_x.o_y -dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, - MPI_Comm _comm, const dlong offset) { +dfloat +linAlg_t::innerProd(const dlong N, occa::memory &o_x, occa::memory &o_y, MPI_Comm _comm, const dlong offset) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); + + dfloat dot = 0; + if (N > 1) { + innerProdKernel(Nblock, N, offset, o_x, o_y, o_scratch); - innerProdKernel(Nblock, N, offset, o_x, o_y, o_scratch); + if (serial) { + dot = *((dfloat *)o_scratch.ptr()); + } + else { - o_scratch.copyTo(scratch, Nbytes); + o_scratch.copyTo(scratch, Nbytes); - dfloat dot = 0; - for(dlong n=0;ntimer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch); - + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat dot = 0; + if (N > 1) { + weightedInnerProdKernel(Nblock, N, o_w, o_x, o_y, o_scratch); - if(serial){ - dot = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;ntimer.toc("dotp"); #endif + platform->flopCounter->add("weightedInnerProd", 3 * static_cast(N)); return dot; } -void linAlg_t::weightedInnerProdMulti(const dlong N, - const dlong NVec, - const dlong Nfields, - const dlong fieldOffset, - occa::memory& o_w, - occa::memory& o_x, occa::memory& o_y, - MPI_Comm _comm, dfloat* result, const dlong offset) { +void linAlg_t::weightedInnerProdMulti(const dlong N, + const dlong NVec, + const dlong Nfields, + const dlong fieldOffset, + occa::memory &o_w, + occa::memory &o_x, + occa::memory &o_y, + MPI_Comm _comm, + dfloat *result, + const dlong offset) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = NVec * Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = NVec * Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); - weightedInnerProdMultiKernel(Nblock, N, Nfields, fieldOffset, NVec, offset, o_w, o_x, o_y, o_scratch); + if (N > 1 || NVec > 1 || Nfields > 1) { + weightedInnerProdMultiKernel(Nblock, N, Nfields, fieldOffset, NVec, offset, o_w, o_x, o_y, o_scratch); - o_scratch.copyTo(scratch, Nbytes); + o_scratch.copyTo(scratch, Nbytes); - for(int field = 0; field < NVec; ++field){ - dfloat dot = 0; - for(dlong n=0;ntimer.toc("dotp"); #endif + + platform->flopCounter->add("weightedInnerProdMulti", NVec * static_cast(N) * (2 * Nfields + 1)); } -dfloat linAlg_t::weightedInnerProdMany(const dlong N, - const dlong Nfields, - const dlong fieldOffset, - occa::memory& o_w, - occa::memory& o_x, occa::memory& o_y, - MPI_Comm _comm) { +dfloat linAlg_t::weightedInnerProdMany(const dlong N, + const dlong Nfields, + const dlong fieldOffset, + occa::memory &o_w, + occa::memory &o_x, + occa::memory &o_y, + MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedInnerProdManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_x, o_y, o_scratch); + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat dot = 0; + if (N > 1 || Nfields > 1) { + weightedInnerProdManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_x, o_y, o_scratch); - if(serial){ - dot = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;ntimer.toc("dotp"); #endif + platform->flopCounter->add("weightedInnerProdMany", 3 * static_cast(N) * Nfields); return dot; } // ||o_a||_w2 -dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory& o_w, - occa::memory& o_a, MPI_Comm _comm) { +dfloat linAlg_t::weightedNorm2(const dlong N, occa::memory &o_w, occa::memory &o_a, MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch); - - + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1) { + weightedNorm2Kernel(Nblock, N, o_w, o_a, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat w, a; + o_w.copyTo(&w, Nbytes); + o_a.copyTo(&a, Nbytes); + norm = w * a * a; + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); #endif + platform->flopCounter->add("weightedNorm2", 3 * static_cast(N)); + return sqrt(norm); } dfloat linAlg_t::weightedNorm2Many(const dlong N, const dlong Nfields, const dlong fieldOffset, - occa::memory& o_w, - occa::memory& o_a, MPI_Comm _comm) { + occa::memory &o_w, + occa::memory &o_a, + MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedNorm2ManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_a, o_scratch); - + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1 || Nfields > 1) { + weightedNorm2ManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_a, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat w, a; + o_w.copyTo(&w, Nbytes); + o_a.copyTo(&a, Nbytes); + norm = w * a * a; + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); #endif + platform->flopCounter->add("weightedNorm2Many", 3 * static_cast(N) * Nfields); return sqrt(norm); } // ||o_a||_w1 -dfloat linAlg_t::weightedNorm1(const dlong N, occa::memory& o_w, - occa::memory& o_a, MPI_Comm _comm) { +dfloat linAlg_t::weightedNorm1(const dlong N, occa::memory &o_w, occa::memory &o_a, MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedNorm1Kernel(Nblock, N, o_w, o_a, o_scratch); - - + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1) { + weightedNorm1Kernel(Nblock, N, o_w, o_a, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); + } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } } } + else { + dfloat w, a; + o_w.copyTo(&w, Nbytes); + o_a.copyTo(&a, Nbytes); + norm = std::abs(w * a); + } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); @@ -675,33 +905,58 @@ dfloat linAlg_t::weightedNorm1(const dlong N, occa::memory& o_w, dfloat linAlg_t::weightedNorm1Many(const dlong N, const dlong Nfields, const dlong fieldOffset, - occa::memory& o_w, - occa::memory& o_a, MPI_Comm _comm) { + occa::memory &o_w, + occa::memory &o_a, + MPI_Comm _comm) +{ #ifdef ENABLE_TIMER - platform->timer.tic("dotp",1); + platform->timer.tic("dotp", 1); #endif - int Nblock = (N+blocksize-1)/blocksize; - const dlong Nbytes = Nblock * sizeof(dfloat); - if(o_scratch.size() < Nbytes) reallocScratch(Nbytes); - - weightedNorm1ManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_a, o_scratch); - + int Nblock = (N + blocksize - 1) / blocksize; + const size_t Nbytes = Nblock * sizeof(dfloat); + if (o_scratch.size() < Nbytes) + reallocScratch(Nbytes); dfloat norm = 0; - if(serial){ - norm = *((dfloat*) o_scratch.ptr()); - } else { - o_scratch.copyTo(scratch, Nbytes); - for(dlong n=0;n 1 || Nfields > 1) { + weightedNorm1ManyKernel(Nblock, N, Nfields, fieldOffset, o_w, o_a, o_scratch); + + if (serial) { + norm = *((dfloat *)o_scratch.ptr()); } + else { + o_scratch.copyTo(scratch, Nbytes); + for (dlong n = 0; n < Nblock; ++n) { + norm += scratch[n]; + } + } + } + else { + dfloat w, a; + o_w.copyTo(&w, Nbytes); + o_a.copyTo(&a, Nbytes); + norm = std::abs(w * a); } - if (_comm != MPI_COMM_NULL) + if (_comm != MPI_COMM_NULL) MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DFLOAT, MPI_SUM, _comm); #ifdef ENABLE_TIMER platform->timer.toc("dotp"); #endif return norm; +} + +void linAlg_t::crossProduct(const dlong N, + const dlong fieldOffset, + occa::memory &o_x, + occa::memory &o_y, + occa::memory &o_z) +{ + crossProductKernel(N, fieldOffset, o_x, o_y, o_z); +} + +void linAlg_t::unitVector(const dlong N, const dlong fieldOffset, occa::memory &o_v) +{ + unitVectorKernel(N, fieldOffset, o_v); } \ No newline at end of file diff --git a/src/linAlg/linAlg.hpp b/src/linAlg/linAlg.hpp index e997710c5..d6cf40384 100644 --- a/src/linAlg/linAlg.hpp +++ b/src/linAlg/linAlg.hpp @@ -44,7 +44,7 @@ class linAlg_t { occa::memory o_scratch; void setup(); - void reallocScratch(const dlong Nbytes); + void reallocScratch(const size_t Nbytes); ~linAlg_t(); linAlg_t(); @@ -178,6 +178,15 @@ class linAlg_t { const dlong Nfields, const dlong fieldOffset, occa::memory& o_w, occa::memory& o_x, occa::memory& o_y, MPI_Comm _comm); + // z = x \cross y + void crossProduct(const dlong N, + const dlong fieldOffset, + occa::memory &o_x, + occa::memory &o_y, + occa::memory &o_z); + + void unitVector(const dlong N, const dlong fieldOffset, occa::memory &o_v); + occa::kernel fillKernel; occa::kernel absKernel; occa::kernel addKernel; @@ -215,6 +224,8 @@ class linAlg_t { occa::kernel weightedInnerProdKernel; occa::kernel weightedInnerProdManyKernel; occa::kernel weightedInnerProdMultiKernel; + occa::kernel crossProductKernel; + occa::kernel unitVectorKernel; }; #endif diff --git a/src/linAlg/registerLinAlgKernels.cpp b/src/linAlg/registerLinAlgKernels.cpp new file mode 100644 index 000000000..a10df3e43 --- /dev/null +++ b/src/linAlg/registerLinAlgKernels.cpp @@ -0,0 +1,66 @@ +#include +#include +#include + +void registerLinAlgKernels() +{ + occa::properties kernelInfo = platform->kernelInfo; + + std::string oklDir; + oklDir.assign(getenv("NEKRS_INSTALL_DIR")); + oklDir += "/okl/linAlg/"; + std::string fileName; + const bool serial = platform->serial; + + const std::string extension = serial ? ".c" : ".okl"; + const std::vector> allKernels{ + {"fill", false}, + {"vabs", false}, + {"add", false}, + {"scale", false}, + {"scaleMany", false}, + {"axpby", true}, + {"axpbyMany", true}, + {"axpbyz", false}, + {"axpbyzMany", false}, + {"axmy", true}, + {"axmyMany", true}, + {"axmyVector", true}, + {"axmyz", false}, + {"axmyzMany", false}, + {"ady", false}, + {"adyMany", false}, + {"axdy", false}, + {"aydx", false}, + {"aydxMany", false}, + {"axdyz", false}, + {"sum", false}, + {"sumMany", false}, + {"min", false}, + {"max", false}, + {"norm2", true}, + {"norm2Many", true}, + {"norm1", true}, + {"norm1Many", true}, + {"weightedNorm1", true}, + {"weightedNorm1Many", true}, + {"weightedNorm2", true}, + {"weightedNorm2Many", true}, + {"innerProd", true}, + {"weightedInnerProd", true}, + {"weightedInnerProdMany", true}, + {"weightedInnerProdMulti", false}, + {"crossProduct", false}, + {"unitVector", false}, + }; + + std::string kernelName; + bool nativeSerialImplementation; + for(auto&& nameAndSerialImpl : allKernels){ + std::tie(kernelName, nativeSerialImplementation) = nameAndSerialImpl; + const std::string extension = (serial && nativeSerialImplementation) ? ".c" : ".okl"; + platform->kernels.add( + kernelName, oklDir + kernelName + extension, kernelInfo); + } + +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 438fac868..6cfddc5ea 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,5 @@ /*---------------------------------------------------------------------------*\ - Copyright (c) 2019-2021, UCHICAGO ARGONNE, LLC. + Copyright (c) 2019-2022, UCHICAGO ARGONNE, LLC. The UChicago Argonne, LLC as Operator of Argonne National Laboratory holds copyright in the Software. The copyright holder @@ -71,12 +71,32 @@ #include #include #include +#include +#include +#include +#include +#include #include "nekrs.hpp" #define DEBUG -static MPI_Comm comm; +namespace { + +std::vector serializeString(const std::string sin, char dlim) +{ + std::vector slist; + std::string s(sin); + s.erase(std::remove_if(s.begin(), s.end(), ::isspace), s.end()); + std::stringstream ss; + ss.str(s); + while( ss.good() ) { + std::string substr; + std::getline(ss, substr, dlim); + if(!substr.empty()) slist.push_back(substr); + } + return slist; +} struct cmdOptions { @@ -84,146 +104,24 @@ struct cmdOptions int ciMode = 0; int debug = 0; int sizeTarget = 0; + std::string multiSessionFile; std::string setupFile; std::string deviceID; std::string backend; }; -static cmdOptions* processCmdLineOptions(int argc, char** argv); - -int main(int argc, char** argv) +struct session { - { - int request = MPI_THREAD_SINGLE; - const char* env_val = std::getenv ("NEKRS_MPI_THREAD_MULTIPLE"); - if (env_val) - if (std::stoi(env_val)) request = MPI_THREAD_MULTIPLE; - - int provided; - int retval = MPI_Init_thread(&argc, &argv, request, &provided); - if (retval != MPI_SUCCESS) { - std::cout << "FATAL ERROR: Cannot initialize MPI!" << "\n"; - exit(EXIT_FAILURE); - } - } + int size; + std::string setupFile; +}; +cmdOptions* processCmdLineOptions(int argc, char** argv, const MPI_Comm &comm) +{ int rank, size; - MPI_Comm_dup(MPI_COMM_WORLD, &comm); MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); - { - if(!getenv("NEKRS_HOME")) { - std::cout << "FATAL ERROR: Cannot find env variable NEKRS_HOME!" << "\n"; - MPI_Finalize(); - exit(EXIT_FAILURE); - } - - std::string bin(getenv("NEKRS_HOME")); - bin += "/bin/nekrs"; - const char* ptr = realpath(bin.c_str(), NULL); - if(!ptr) { - std::cout << "FATAL ERROR: Cannot find " << bin << "!\n"; - MPI_Finalize(); - exit(EXIT_FAILURE); - } - } - - cmdOptions* cmdOpt = processCmdLineOptions(argc, argv); - - if (cmdOpt->debug) { - for(int currRank = 0; currRank < size; ++currRank) - if(rank == currRank) printf("rank %d: pid<%d>\n", rank, ::getpid()); - fflush(stdout); - MPI_Barrier(comm); - if (rank == 0) std::cout << "Attach debugger, then press enter to continue\n"; - if (rank == 0) std::cin.get(); - MPI_Barrier(comm); - } - if (cmdOpt->debug) feraiseexcept(FE_ALL_EXCEPT); - - MPI_Barrier(comm); - const double time0 = MPI_Wtime(); - - std::string cacheDir; - nekrs::setup(comm, cmdOpt->buildOnly, cmdOpt->sizeTarget, - cmdOpt->ciMode, cacheDir, cmdOpt->setupFile, - cmdOpt->backend, cmdOpt->deviceID); - - if (cmdOpt->buildOnly) { - nekrs::finalize(); - MPI_Finalize(); - return EXIT_SUCCESS; - } - - MPI_Barrier(comm); - double elapsedTime = (MPI_Wtime() - time0); - - const int runTimeStatFreq = 500; - const int updCheckFreq = 20; - - int tStep = 0; - double time = nekrs::startTime(); - int lastStep = nekrs::lastStep(time, tStep, elapsedTime); - - nekrs::udfExecuteStep(time, tStep, /* outputStep */ 0); - - if (rank == 0 && !lastStep) { - if (nekrs::endTime() > nekrs::startTime()) - std::cout << "\ntimestepping to time " << nekrs::endTime() << " ...\n"; - else - std::cout << "\ntimestepping for " << nekrs::numSteps() << " steps ...\n"; - } - MPI_Pcontrol(1); - while (!lastStep) { - MPI_Barrier(comm); - const double timeStart = MPI_Wtime(); - - ++tStep; - lastStep = nekrs::lastStep(time, tStep, elapsedTime); - - double dt; - if (lastStep && nekrs::endTime() > 0) - dt = nekrs::endTime() - time; - else - dt = nekrs::dt(tStep); - - int outputStep = nekrs::outputStep(time+dt, tStep); - if (nekrs::writeInterval() == 0) outputStep = 0; - if (lastStep) outputStep = 1; - if (nekrs::writeInterval() < 0) outputStep = 0; - nekrs::outputStep(outputStep); - - nekrs::runStep(time, dt, tStep); - time += dt; - - if (outputStep) nekrs::outfld(time); - - if (tStep%runTimeStatFreq == 0 || lastStep) nekrs::printRuntimeStatistics(tStep); - - MPI_Barrier(comm); - elapsedTime += (MPI_Wtime() - timeStart); - - if(tStep%updCheckFreq) nekrs::processUpdFile(); - } - MPI_Pcontrol(0); - - if (rank == 0) { - std::cout << "elapsedTime: " << elapsedTime << " s\n"; - std::cout << "End\n"; - } - fflush(stdout); - - nekrs::finalize(); - MPI_Finalize(); - return EXIT_SUCCESS; -} - -static cmdOptions* processCmdLineOptions(int argc, char** argv) -{ - int rank; - MPI_Comm_rank(comm, &rank); - cmdOptions* cmdOpt = new cmdOptions(); int err = 0; @@ -236,7 +134,7 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) { {"setup", required_argument, 0, 's'}, {"cimode", required_argument, 0, 'c'}, - {"build-only", required_argument, 0, 'b'}, + {"build-only", optional_argument, 0, 'b'}, {"debug", no_argument, 0, 'd'}, {"backend", required_argument, 0, 't'}, {"device-id", required_argument, 0, 'i'}, @@ -252,10 +150,18 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) switch(c) { case 's': cmdOpt->setupFile.assign(optarg); + if (cmdOpt->setupFile.find(".par") != std::string::npos) + cmdOpt->setupFile.erase(cmdOpt->setupFile.find(".par"), std::string::npos); + if (cmdOpt->setupFile.substr(cmdOpt->setupFile.find_last_of(".") + 1) == "sess") { + cmdOpt->multiSessionFile = cmdOpt->setupFile; + cmdOpt->setupFile.clear(); + } break; case 'b': cmdOpt->buildOnly = 1; - cmdOpt->sizeTarget = atoi(optarg); + cmdOpt->sizeTarget = size; + if(!optarg && argv[optind] != NULL && argv[optind][0] != '-') + cmdOpt->sizeTarget = std::stoi(argv[optind++]); break; case 'c': cmdOpt->ciMode = atoi(optarg); @@ -274,9 +180,9 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) cmdOpt->backend.assign(optarg); break; case 'h': - if(!optarg && argv[optind] != NULL && argv[optind][0] != '-') { + printHelp++; + if(!optarg && argv[optind] != NULL && argv[optind][0] != '-') helpCat.assign(argv[optind++]); - } break; default: err = 1; @@ -285,6 +191,10 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) } char buf[FILENAME_MAX]; + strcpy(buf, cmdOpt->multiSessionFile.c_str()); + MPI_Bcast(buf, sizeof(buf), MPI_BYTE, 0, comm); + cmdOpt->multiSessionFile.assign(buf); + strcpy(buf, cmdOpt->setupFile.c_str()); MPI_Bcast(buf, sizeof(buf), MPI_BYTE, 0, comm); cmdOpt->setupFile.assign(buf); @@ -302,34 +212,25 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) MPI_Bcast(&cmdOpt->ciMode, sizeof(cmdOpt->ciMode), MPI_BYTE, 0, comm); MPI_Bcast(&cmdOpt->debug, sizeof(cmdOpt->debug), MPI_BYTE, 0, comm); - if(cmdOpt->setupFile.empty()){ + if(cmdOpt->setupFile.empty() && cmdOpt->multiSessionFile.empty()) printHelp++; - } else { - std::string casepath, casename; - size_t last_slash = cmdOpt->setupFile.rfind('/') + 1; - casepath = cmdOpt->setupFile.substr(0,last_slash); - chdir(casepath.c_str()); - casename = cmdOpt->setupFile.substr(last_slash, cmdOpt->setupFile.length() - last_slash); - if(casepath.length() > 0) chdir(casepath.c_str()); - cmdOpt->setupFile.assign(casename); - } MPI_Bcast(&printHelp, sizeof(printHelp), MPI_BYTE, 0, comm); MPI_Bcast(&err, sizeof(err), MPI_BYTE, 0, comm); if (err | printHelp) { if (rank == 0) { if (helpCat == "par") { - std::string install_dir; - install_dir.assign(getenv("NEKRS_HOME")); - std::ifstream f(install_dir + "/include/parHelp.txt"); + std::string installDir; + installDir.assign(getenv("NEKRS_HOME")); + std::ifstream f(installDir + "/include/parHelp.txt"); if (f.is_open()) std::cout << f.rdbuf(); f.close(); } else { std::cout << "usage: ./nekrs [--help ] " - << "--setup " + << "--setup " << "[ --build-only <#procs> ] [ --cimode ] [ --debug ] " << "[ --backend ] [ --device-id ]" - << "\n"; + << "\n"; } } MPI_Finalize(); @@ -338,3 +239,273 @@ static cmdOptions* processCmdLineOptions(int argc, char** argv) return cmdOpt; } + +MPI_Comm setupSession(cmdOptions* cmdOpt, const MPI_Comm &comm) +{ + int rank, size; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + MPI_Comm newComm = comm; + + if(cmdOpt->multiSessionFile.size()) { + std::string multiSessionFileContent; + + if(rank == 0) { + std::ifstream f(cmdOpt->multiSessionFile); + if (!f) { + std::cout << "FATAL ERROR: Cannot find sess file " + << cmdOpt->multiSessionFile << "!\n"; + fflush(stdout); + MPI_Abort(comm, EXIT_FAILURE); + } + std::ostringstream ss; + if (f.is_open()) ss << f.rdbuf(); + f.close(); + multiSessionFileContent = ss.str(); + } + int bufSize = multiSessionFileContent.size() + 1; + MPI_Bcast(&bufSize, sizeof(bufSize), MPI_BYTE, 0, comm); + char* buf = (char*) malloc(bufSize * sizeof(char)); + strcpy(buf, multiSessionFileContent.c_str()); + MPI_Bcast(buf, bufSize * sizeof(char), MPI_BYTE, 0, comm); + multiSessionFileContent = std::string(buf); + free(buf); + + auto list = serializeString(multiSessionFileContent, ';'); + auto sessionList = new session[list.size()]; + + int nSessions = 0; + int rankSum = 0; + for(std::string s : list) { + auto items = serializeString(s,':'); + if(items.size() != 2) { + if(rank == 0) std::cout << "FATAL ERROR: invalid sess file entry!\n"; + fflush(stdout); + MPI_Abort(comm, EXIT_FAILURE); + } + sessionList[nSessions].setupFile = items[0]; + sessionList[nSessions].size = std::stoi(items[1]); + rankSum += sessionList[nSessions].size; + nSessions++; + } + + int err = 0; + if(rankSum != size) err = 1; + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_SUM, comm); + if(err) { + if(rank == 0) std::cout << "FATAL ERROR: size of sub-communicators does not match parent!\n"; + fflush(stdout); + MPI_Abort(comm, EXIT_FAILURE); + } + + int color = MPI_UNDEFINED; + int rankOffsetSession = 0; + for(int i = 0; i < nSessions; i++) { + if(rank - rankOffsetSession < sessionList[i].size) { + color = i; + break; + } + rankOffsetSession += sessionList[i].size; + } + + int rankGlobal, sizeGlobal; + MPI_Comm_rank(comm, &rankGlobal); + MPI_Comm_size(comm, &sizeGlobal); + + MPI_Comm_split(comm, color, rankGlobal, &newComm); + + MPI_Comm_rank(newComm, &rank); + MPI_Comm_size(newComm, &size); + + cmdOpt->setupFile = sessionList[color].setupFile; + cmdOpt->sizeTarget = size; + + if(cmdOpt->debug) { + std::cout << "globalRank:" << rankGlobal + << " localRank: " << rank + << " commSize: " << size + << " setupFile:" << cmdOpt->setupFile + << "\n"; + } + fflush(stdout); + MPI_Barrier(comm); + + if(rank == 0) { + const std::string outputFile = cmdOpt->setupFile + ".log"; + std::cout << "redirecting output to " << outputFile << " ...\n"; + const int fd = open(outputFile.c_str(), O_WRONLY|O_CREAT|O_APPEND, S_IWUSR|S_IRUSR); + dup2(fd, fileno(stderr)); + dup2(fd, fileno(stdout)); + } + } + return newComm; +} + +} + + +int main(int argc, char** argv) +{ + const auto timeStart = std::chrono::high_resolution_clock::now(); + { + int request = MPI_THREAD_SINGLE; + const char* env_val = std::getenv ("NEKRS_MPI_THREAD_MULTIPLE"); + if (env_val) + if (std::stoi(env_val)) request = MPI_THREAD_MULTIPLE; + + int provided; + int retval = MPI_Init_thread(&argc, &argv, request, &provided); + if (retval != MPI_SUCCESS) { + std::cout << "FATAL ERROR: Cannot initialize MPI!" << "\n"; + exit(EXIT_FAILURE); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + const double time0 = MPI_Wtime(); + + MPI_Comm commGlobal; + MPI_Comm_dup(MPI_COMM_WORLD, &commGlobal); + + { + if(!getenv("NEKRS_HOME")) { + std::cout << "FATAL ERROR: Cannot find env variable NEKRS_HOME!" << "\n"; + fflush(stdout); + MPI_Abort(commGlobal, EXIT_FAILURE); + } + + std::string bin(getenv("NEKRS_HOME")); + bin += "/bin/nekrs"; + const char* ptr = realpath(bin.c_str(), NULL); + if(!ptr) { + std::cout << "FATAL ERROR: Cannot find " << bin << "!\n"; + fflush(stdout); + MPI_Abort(commGlobal, EXIT_FAILURE); + } + } + + cmdOptions* cmdOpt = processCmdLineOptions(argc, argv, commGlobal); + MPI_Comm comm = setupSession(cmdOpt, commGlobal); + + int rank, size; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + + if (cmdOpt->debug) { + for(int currRank = 0; currRank < size; ++currRank) + if(rank == currRank) printf("rank %d: pid<%d>\n", rank, ::getpid()); + fflush(stdout); + MPI_Barrier(comm); + if (rank == 0) std::cout << "Attach debugger, then press enter to continue\n"; + if (rank == 0) std::cin.get(); + MPI_Barrier(comm); + } + + if (cmdOpt->debug) feraiseexcept(FE_ALL_EXCEPT); + + { // change working dir + const size_t last_slash = cmdOpt->setupFile.rfind('/') + 1; + const std::string casepath = cmdOpt->setupFile.substr(0,last_slash); + chdir(casepath.c_str()); + const std::string casename = cmdOpt->setupFile.substr(last_slash, cmdOpt->setupFile.length() - last_slash); + if(casepath.length() > 0) chdir(casepath.c_str()); + cmdOpt->setupFile.assign(casename); + } + + nekrs::setup(commGlobal, comm, + cmdOpt->buildOnly, cmdOpt->sizeTarget, + cmdOpt->ciMode, cmdOpt->setupFile, + cmdOpt->backend, cmdOpt->deviceID, + cmdOpt->debug); + + if (cmdOpt->buildOnly) { + nekrs::finalize(); + MPI_Finalize(); + return EXIT_SUCCESS; + } + + const int updCheckFreq = 20; + + int tStep = 0; + double time = nekrs::startTime(); + + double elapsedTime = 0; + { + MPI_Barrier(comm); + const auto timeStop = std::chrono::high_resolution_clock::now(); + elapsedTime += std::chrono::duration(timeStop - timeStart).count() / 1e3; + MPI_Allreduce(MPI_IN_PLACE, &elapsedTime, 1, MPI_DOUBLE, MPI_MAX, comm); + nekrs::updateTimer("setup", elapsedTime); + if (rank == 0) + std::cout << "initialization took " << elapsedTime << " s" << std::endl; + } + + nekrs::udfExecuteStep(time, tStep, /* outputStep */ 0); + + int lastStep = nekrs::lastStep(time, tStep, elapsedTime); + double elapsedStepSum = 0; + + if (rank == 0 && !lastStep) { + if (nekrs::endTime() > nekrs::startTime()) + std::cout << "\ntimestepping to time " << nekrs::endTime() << " ...\n"; + else + std::cout << "\ntimestepping for " << nekrs::numSteps() << " steps ...\n"; + } + + fflush(stdout); + MPI_Pcontrol(1); + while (!lastStep) { + MPI_Barrier(comm); + const double timeStartStep = MPI_Wtime(); + + ++tStep; + lastStep = nekrs::lastStep(time, tStep, elapsedTime); + + double dt; + if (lastStep && nekrs::endTime() > 0) + dt = nekrs::endTime() - time; + else + dt = nekrs::dt(tStep); + + int outputStep = nekrs::outputStep(time + dt, tStep); + if (nekrs::writeInterval() == 0) outputStep = 0; + if (lastStep) outputStep = 1; + if (nekrs::writeInterval() < 0) outputStep = 0; + nekrs::outputStep(outputStep); + + if (tStep <= 1000) nekrs::verboseInfo(true); + + nekrs::runStep(time, dt, tStep); + time += dt; + + if (outputStep) nekrs::outfld(time); + + if(tStep % updCheckFreq) nekrs::processUpdFile(); + + MPI_Barrier(comm); + const double elapsedStep = MPI_Wtime() - timeStartStep; + elapsedStepSum += elapsedStep; + elapsedTime += elapsedStep; + nekrs::updateTimer("elapsedStep", elapsedStep); + nekrs::updateTimer("elapsedStepSum", elapsedStepSum); + nekrs::updateTimer("elapsed", elapsedTime); + + nekrs::printInfo(time, tStep); + + if (tStep % nekrs::runTimeStatFreq() == 0 || lastStep) + nekrs::printRuntimeStatistics(tStep); + + if (tStep % 10 == 0) fflush(stdout); + } + MPI_Pcontrol(0); + + nekrs::finalize(); + + MPI_Barrier(commGlobal); + if (rank == 0) + std::cout << "End\n"; + + MPI_Finalize(); + + return EXIT_SUCCESS; +} diff --git a/src/mesh/mesh.h b/src/mesh/mesh.h index 86dd1695d..4174a983d 100644 --- a/src/mesh/mesh.h +++ b/src/mesh/mesh.h @@ -42,8 +42,12 @@ #define TETRAHEDRA 6 #define HEXAHEDRA 12 +struct nrs_t; + struct mesh_t { + dfloat avgBoundaryValue(int BID, occa::memory o_fld); + void avgBoundaryValue(int BID, int Nfields, int offsetFld, occa::memory o_flds, dfloat *avgs); void move(); void update(); void computeInvLMM(); @@ -66,6 +70,7 @@ struct mesh_t dlong Nelements; dlong fieldOffset; dlong Nlocal; + hlong NboundaryFaces; hlong* EToV; // element-to-vertex connectivity dlong* EToE; // element-to-element connectivity int* EToF; // element-to-(local)face connectivity @@ -75,10 +80,6 @@ struct mesh_t dlong* elementInfo; //type of element occa::memory o_elementInfo; - // boundary faces - hlong NboundaryFaces; // number of boundary faces - hlong* boundaryInfo; // list of all boundary faces (type, vertex-1, vertex-2, vertex-3) in the mesh - // MPI halo exchange info dlong totalHaloPairs; // number of elements to be sent in halo exchange dlong* haloElementList; // sorted list of elements to be sent in halo exchange @@ -97,6 +98,11 @@ struct mesh_t ogs_t* ogs; //occa gs pointer oogs_t* oogs; //occa gs pointer + // list of all elements + // elementList[e] = e + dlong *elementList; + occa::memory o_elementList; + // list of elements that are needed for global gather-scatter dlong NglobalGatherElements; dlong* globalGatherElementList; @@ -209,6 +215,8 @@ struct mesh_t occa::memory o_ggeo; // second order geometric factors occa::memory o_ggeoPfloat; // second order geometric factors + occa::memory o_gllzw; + occa::memory o_gllw; occa::memory o_cubw; occa::memory o_faceNodes; @@ -220,11 +228,17 @@ struct mesh_t occa::kernel geometricFactorsKernel; occa::kernel surfaceGeometricFactorsKernel; + occa::kernel cubatureGeometricFactorsKernel; occa::kernel nStagesSumVectorKernel; occa::kernel velocityDirichletKernel; + + occa::kernel avgBIDValueKernel; }; -occa::properties populateMeshProperties(int N); +mesh_t *createMeshMG(mesh_t* _mesh, + int Nc); + +occa::properties meshKernelProperties(int N); // serial sort void mysort(hlong* data, int N, const char* order); @@ -238,6 +252,8 @@ void parallelSort(int size, int rank, MPI_Comm comm, #define mymax(a,b) (((a) > (b))?(a):(b)) #define mymin(a,b) (((a) < (b))?(a):(b)) +void meshSolve(nrs_t* nrs, dfloat time, occa::memory o_U, int stage); + /* dimension independent mesh operations */ void meshConnect(mesh_t* mesh); @@ -274,13 +290,11 @@ void meshHaloExchangeBlocking(mesh_t* mesh, // print out parallel partition i void meshPartitionStatistics(mesh_t* mesh); -// build element-boundary connectivity -void meshConnectBoundary(mesh_t* mesh); - void meshParallelGatherScatterSetup(mesh_t* mesh, dlong N, hlong* globalIds, MPI_Comm &comm, + oogs_mode gsMode, int verbose); // generic mesh setup diff --git a/src/mesh/mesh3D.h b/src/mesh/mesh3D.h index c5699e66c..89f653253 100644 --- a/src/mesh/mesh3D.h +++ b/src/mesh/mesh3D.h @@ -30,195 +30,97 @@ // generic mesh structure #include "mesh.h" -extern "C" { // Begin C Linkage -#define mesh3D mesh_t - -// mesh readers -mesh3D* meshParallelReaderTri3D(char* fileName); -mesh3D* meshParallelReaderQuad3D(char* fileName); -mesh3D* meshParallelReaderTet3D(char* fileName); -mesh3D* meshParallelReaderHex3D(char* fileName); - // build connectivity in serial -void meshConnect3D(mesh3D* mesh); - -// build element-boundary connectivity -void meshConnectBoundary3D(mesh3D* mesh); +void meshConnect3D(mesh_t *mesh); // build connectivity in parallel -void meshParallelConnect3D(mesh3D* mesh); - -// repartition elements in parallel -void meshGeometricPartition3D(mesh3D* mesh); - -// print out mesh -void meshPrint3D(mesh3D* mesh); - -// print out mesh in parallel from the root process -void meshParallelPrint3D(mesh3D* mesh); - -// print out mesh partition in parallel -void meshVTU3D(mesh3D* mesh, char* fileName); - -// print out mesh field -void meshPlotVTU3D(mesh3D* mesh, char* fileNameBase, int fld); -void meshPlotContour3D(mesh_t* mesh, char* fname, dfloat* u, int Nlevels, dfloat* levels); -void meshPlotAdaptiveContour3D(mesh_t* mesh, char* fname, dfloat* u, int Nlevels, dfloat* levels, dfloat tol); +void meshParallelConnect3D(mesh_t *mesh); // compute geometric factors for local to physical map -void meshGeometricFactorsTri3D(mesh3D* mesh); -void meshGeometricFactorsQuad3D(mesh3D* mesh); -void meshGeometricFactorsTet3D(mesh3D* mesh); -void meshGeometricFactorsHex3D(mesh3D* mesh); - -void meshSurfaceGeometricFactorsTri3D(mesh3D* mesh); -void meshSurfaceGeometricFactorsQuad3D(mesh3D* mesh); -void meshSurfaceGeometricFactorsTet3D(mesh3D* mesh); -void meshSurfaceGeometricFactorsHex3D(mesh3D* mesh); - -void meshPhysicalNodesTri3D(mesh3D* mesh); -void meshPhysicalNodesQuad3D(mesh3D* mesh); -void meshPhysicalNodesTet3D(mesh3D* mesh); -void meshPhysicalNodesHex3D(mesh3D* mesh); +void meshGeometricFactorsHex3D(mesh_t *mesh); -void meshLoadReferenceNodesTet3D(mesh3D* mesh, int N); -void meshLoadReferenceNodesHex3D(mesh3D* mesh, int N, int cubN); +void meshSurfaceGeometricFactorsHex3D(mesh_t *mesh); -void meshGradientTet3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz); -void meshGradientHex3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz); +void meshPhysicalNodesHex3D(mesh_t *mesh); -// print out parallel partition i -void meshPartitionStatistics3D(mesh3D* mesh); +void meshLoadReferenceNodesHex3D(mesh_t *mesh, int N, int cubN); // default occa set up -void meshOccaSetup3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo); -void meshOccaSetupQuad3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo); -void meshOccaSetupTri3D(mesh_t* mesh, setupAide &newOptions, occa::properties &kernelInfo); +void meshOccaSetup3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo); -void meshOccaPopulateDeviceHex3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo); +void meshOccaPopulateDeviceHex3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo); void meshOccaCloneDevice(mesh_t* donorMesh, mesh_t* mesh); -// functions that call OCCA kernels -void occaTest3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz); - -// -void occaOptimizeGradientTet3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz); -void occaOptimizeGradientHex3D(mesh3D* mesh, dfloat* q, dfloat* dqdx, dfloat* dqdy, dfloat* dqdz); - // serial face-node to face-node connection -void meshConnectFaceNodes3D(mesh3D* mesh); - -// -mesh3D* meshSetupTri3D(char* filename, int N, dfloat sphereRadius); -mesh3D* meshSetupQuad3D(char* filename, int N, dfloat sphereRadius); -mesh3D* meshSetupTet3D(char* filename, int N); -mesh3D* meshSetupHex3D(char* filename, int N); +void meshConnectFaceNodes3D(mesh_t *mesh); -void meshParallelConnectNodesHex3D(mesh3D* mesh); +void meshParallelConnectNodesHex3D(mesh_t *mesh); // halo connectivity information -void meshHaloSetup3D(mesh3D* mesh); +void meshHaloSetup3D(mesh_t *mesh); // perform halo exchange -void meshHaloExchange3D(mesh3D* mesh, +void meshHaloExchange3D(mesh_t *mesh, size_t Nbytes, // number of bytes per element - void* sourceBuffer, - void* sendBuffer, - void* recvBuffer); + void *sourceBuffer, + void *sendBuffer, + void *recvBuffer); -void meshHaloExchangeStart3D(mesh3D* mesh, - size_t Nbytes, // message size per element - void* sendBuffer, // temporary buffer - void* recvBuffer); +void meshHaloExchangeStart3D(mesh_t *mesh, + size_t Nbytes, // message size per element + void *sendBuffer, // temporary buffer + void *recvBuffer); -void meshHaloExchangeFinish3D(mesh3D* mesh); +void meshHaloExchangeFinish3D(mesh_t *mesh); // build list of nodes on each face of the reference element -void meshBuildFaceNodes3D(mesh3D* mesh); -void meshBuildFaceNodesHex3D(mesh3D* mesh); - -dfloat meshMRABSetup3D(mesh3D* mesh, dfloat* EToDT, int maxLevels, dfloat finalTime); - -//MRAB weighted mesh partitioning -void meshMRABWeightedPartition3D(mesh3D* mesh, dfloat* weights, - int numLevels, int* levels); - +void meshBuildFaceNodes3D(mesh_t *mesh); +void meshBuildFaceNodesHex3D(mesh_t *mesh); void interpolateHex3D(dfloat* Inter, dfloat* x, int N, dfloat* Ix, int M); -#define norm3(a,b,c) ( sqrt((a) * (a) + (b) * (b) + (c) * (c)) ) - /* offsets for geometric factors */ #define RXID 0 #define RYID 1 -#define SXID 2 -#define SYID 3 -#define JID 4 -#define JWID 5 -#define IJWID 6 -#define RZID 7 -#define SZID 8 -#define TXID 9 -#define TYID 10 -#define TZID 11 +#define RZID 2 +#define SXID 3 +#define SYID 4 +#define SZID 5 +#define TXID 6 +#define TYID 7 +#define TZID 8 +#define JID 9 +#define JWID 10 +#define IJWID 11 /* offsets for second order geometric factors */ #define G00ID 0 #define G01ID 1 #define G11ID 2 -#define GWJID 3 -#define G12ID 4 -#define G02ID 5 -#define G22ID 6 +#define G12ID 3 +#define G02ID 4 +#define G22ID 5 +#define GWJID 6 /* offsets for nx, ny, sJ, 1/J */ #define NXID 0 #define NYID 1 -#define SJID 2 -#define IJID 3 -#define IHID 4 -#define WSJID 5 -#define WIJID 6 -#define NZID 7 -#define STXID 8 -#define STYID 9 -#define STZID 10 -#define SBXID 11 -#define SBYID 12 -#define SBZID 13 -#define SURXID 14 -#define SURYID 15 -#define SURZID 16 -// -//offsets for boltzmann PML variables -#define QXID1 0 -#define QXID2 1 -#define QXID3 2 -#define QXID4 3 -#define QXID5 4 -#define QXID6 5 -#define QXID8 6 -// -#define QYID1 7 -#define QYID2 8 -#define QYID3 9 -#define QYID4 10 -#define QYID5 11 -#define QYID7 12 -#define QYID9 13 -// -#define QZID1 14 -#define QZID2 15 -#define QZID3 16 -#define QZID4 17 -#define QZID6 18 -#define QZID7 19 -#define QZID10 20 - -mesh3D* meshSetupBoxHex3D(int N, setupAide &options); -void meshConnectPeriodicFaceNodes3D(mesh3D* mesh, dfloat xper, dfloat yper, dfloat zper); +#define NZID 2 + +// tangentails +#define T1XID 3 +#define T1YID 4 +#define T1ZID 5 + +#define T2XID 6 +#define T2YID 7 +#define T2ZID 8 + +#define SJID 9 +#define IJID 10 +#define WIJID 11 +#define WSJID 12 // Mesh generation void NodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t); -void FaceNodesHex3D(int _N, dfloat* _r, dfloat* _s, dfloat* _t, int* _faceNodes); -} // end C Linkage +void FaceNodesHex3D(int _N, dfloat *_r, dfloat *_s, dfloat *_t, int *_faceNodes); #endif diff --git a/src/mesh/meshAvgBoundaryValue.cpp b/src/mesh/meshAvgBoundaryValue.cpp new file mode 100644 index 000000000..bbfdda91d --- /dev/null +++ b/src/mesh/meshAvgBoundaryValue.cpp @@ -0,0 +1,56 @@ +#include +#include "platform.hpp" + +static dfloat *sum; +static dfloat *sumFace; +static occa::memory o_sumFace; +static occa::memory h_sumFace; + +dfloat mesh_t::avgBoundaryValue(int BID, occa::memory o_fld) +{ + dfloat avg = 0.0; + avgBoundaryValue(BID, 1, fieldOffset, o_fld, &avg); + return avg; +} + +void mesh_t::avgBoundaryValue(int BID, int Nfields, int offsetFld, occa::memory o_fld, dfloat *avgs) +{ + const auto offset = Nfaces * Nelements; + const auto Nbytes = (Nfields + 1) * offset * sizeof(dfloat); + + if (o_sumFace.size() < Nbytes) { + if (o_sumFace.size()) + o_sumFace.free(); + if (h_sumFace.size()) + h_sumFace.free(); + + // pinned scratch buffer + { + h_sumFace = platform->device.mallocHost(Nbytes); + sumFace = (dfloat *)h_sumFace.ptr(); + } + + o_sumFace = platform->device.malloc(Nbytes); + + if (sum) + free(sum); + sum = (dfloat *)calloc(Nfields + 1, sizeof(dfloat)); + } + + avgBIDValueKernel(Nelements, BID, Nfields, offsetFld, offset, o_sgeo, o_EToB, o_vmapM, o_fld, o_sumFace); + + o_sumFace.copyTo(sumFace, Nbytes); + + for (int j = 0; j < Nfields + 1; ++j) { + sum[j] = 0; + for (int i = 0; i < offset; ++i) { + sum[j] += sumFace[i + j * offset]; + } + } + + MPI_Allreduce(MPI_IN_PLACE, sum, Nfields + 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + + const auto invArea = 1 / sum[Nfields]; + for (int i = 0; i < Nfields; ++i) + avgs[i] = sum[i] * invArea; +} \ No newline at end of file diff --git a/src/mesh/meshBasis1D.cpp b/src/mesh/meshBasis1D.cpp index 13ecdf112..06baeaff6 100644 --- a/src/mesh/meshBasis1D.cpp +++ b/src/mesh/meshBasis1D.cpp @@ -129,31 +129,6 @@ void DWmatrix1D(int _N, dfloat* _D, dfloat* _DT) _DT[n * _Nq + m] = 0.0; for(int k = 0; k < _Nq; ++k) _DT[n * _Nq + m] += _D[m * _Nq + k]; } - -/* - dfloat *r1D = (dfloat *) calloc(_Nq, sizeof(dfloat)); - dfloat *w1D = (dfloat *) calloc(_Nq, sizeof(dfloat)); - JacobiGLL(_N, r1D, w1D); // i.e. 1D gll points and correspondin weights from mass lumping - - dfloat *V1D = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat)); - dfloat *V1Dr = (dfloat *) calloc(_Nq*_Nq, sizeof(dfloat)); - Vandermonde1D(_N, _Nq, r1D, V1D); - GradVandermonde1D(_N, _Nq, r1D, V1Dr); - - // DW1D = V*Vr'*diag(w) - for(int n=0;n<_Nq;++n){ - for(int m=0;m<_Nq;++m){ - dfloat dw = 0; - for(int i=0; i<_Nq; i++) dw += V1D[n*_Nq + i]*V1Dr[m*_Nq + i]; - _DT[n*_Nq+m] = dw; //*w1D[m]; // scale by w - } - } - - free(r1D); - free(w1D); - free(V1D); - free(V1Dr); - */ } void InterpolationMatrix1D(int _N, diff --git a/src/mesh/meshComputeInvLMM.cpp b/src/mesh/meshComputeInvLMM.cpp new file mode 100644 index 000000000..ccc1806b2 --- /dev/null +++ b/src/mesh/meshComputeInvLMM.cpp @@ -0,0 +1,9 @@ +#include "mesh.h" +#include "linAlg.hpp" +#include "platform.hpp" +void mesh_t::computeInvLMM() +{ + o_invLMM.copyFrom(o_LMM, Nelements * Np * sizeof(dfloat)); + oogs::startFinish(o_invLMM, 1, 0, ogsDfloat, ogsAdd, oogs); + platform->linAlg->ady(Nelements * Np, 1.0, o_invLMM); +} \ No newline at end of file diff --git a/src/mesh/meshConnectBoundary.cpp b/src/mesh/meshConnectBoundary.cpp deleted file mode 100644 index 7760ae6a3..000000000 --- a/src/mesh/meshConnectBoundary.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - - The MIT License (MIT) - - Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - - */ - -#include -#include -#include "mesh.h" - -// structure used to encode vertices that make -// each face, the element/face indices, and -// the neighbor element/face indices (if any) -struct boundaryFace_t -{ - dlong element; - int face; - - int NfaceVertices; - - hlong v[4]; // max number of face vertices - - int bctype; -}; - -// comparison function that orders vertices -// based on their combined vertex indices -int compareBoundaryFaces(const void* a, - const void* b) -{ - boundaryFace_t* fa = (boundaryFace_t*) a; - boundaryFace_t* fb = (boundaryFace_t*) b; - - for(int n = 0; n < fa->NfaceVertices; ++n) { - if(fa->v[n] < fb->v[n]) return -1; - if(fa->v[n] > fb->v[n]) return +1; - } - - return 0; -} - -/* routine to find EToB (Element To Boundary)*/ -void meshConnectBoundary(mesh_t* mesh) -{ - /* count number of boundary faces (i.e. not yet connected) */ - hlong bcnt = 0; - for(dlong e = 0; e < mesh->Nelements; ++e) - for(int f = 0; f < mesh->Nfaces; ++f) - if(mesh->EToE[e * mesh->Nfaces + f] == -1) // || mesh->EToE[e*mesh->Nfaces+f]==e) - ++bcnt; - -#if 0 - printf("Nbf = %d\n", mesh->NboundaryFaces); - printf("Nfv = %d\n", mesh->NfaceVertices); - printf("bcnt = %d\n", bcnt); - printf("Nelements = %d\n", mesh->Nelements); -#endif - - /* build list of boundary faces */ - boundaryFace_t* boundaryFaces = (boundaryFace_t*) calloc(bcnt + mesh->NboundaryFaces, - sizeof(boundaryFace_t)); - - bcnt = 0; // reset counter - for(dlong e = 0; e < mesh->Nelements; ++e) - for(int f = 0; f < mesh->Nfaces; ++f) - if(mesh->EToE[e * mesh->Nfaces + f] == -1) { - for(int n = 0; n < mesh->NfaceVertices; ++n) { - dlong vid = e * mesh->Nverts + mesh->faceVertices[f * mesh->NfaceVertices + n]; - boundaryFaces[bcnt].v[n] = mesh->EToV[vid]; - } - - mysort(boundaryFaces[bcnt].v,mesh->NfaceVertices, "descending"); - - boundaryFaces[bcnt].NfaceVertices = mesh->NfaceVertices; - boundaryFaces[bcnt].element = e; - boundaryFaces[bcnt].face = f; - boundaryFaces[bcnt].bctype = -1; - ++bcnt; - } - - /* add boundary info */ - for(hlong b = 0; b < mesh->NboundaryFaces; ++b) { - for(int n = 0; n < mesh->NfaceVertices; ++n) - boundaryFaces[bcnt].v[n] = mesh->boundaryInfo[b * (mesh->NfaceVertices + 1) + n + 1]; - - mysort(boundaryFaces[bcnt].v,mesh->NfaceVertices, "descending"); - - boundaryFaces[bcnt].NfaceVertices = mesh->NfaceVertices; - boundaryFaces[bcnt].element = -1; - boundaryFaces[bcnt].face = -1; - boundaryFaces[bcnt].bctype = mesh->boundaryInfo[b * (mesh->NfaceVertices + 1)]; - - ++bcnt; - } - -#if 0 - for(int b = 0; b < bcnt; ++b) { - printf("%d: e=%d, f=%d, bc=%d, v=", - b, - boundaryFaces[b].element, - boundaryFaces[b].face, - boundaryFaces[b].bctype); - for(int n = 0; n < mesh->NfaceVertices; ++n) - printf("%d ", boundaryFaces[b].v[n]); - printf("\n"); - } -#endif - - /* sort boundaryFaces by their vertex number pairs */ - qsort(boundaryFaces, bcnt, sizeof(boundaryFace_t), compareBoundaryFaces); - - /* scan through sorted face lists looking for element-boundary matches */ - mesh->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); - for(dlong n = 0; n < mesh->Nelements * mesh->Nfaces; ++n) mesh->EToB[n] = -1; - - for(hlong cnt = 0; cnt < bcnt - 1; ++cnt) - if(!compareBoundaryFaces(boundaryFaces + cnt, boundaryFaces + cnt + 1)) { - dlong e = mymax(boundaryFaces[cnt].element, boundaryFaces[cnt + 1].element); - int f = mymax(boundaryFaces[cnt].face, boundaryFaces[cnt + 1].face); - - mesh->EToB[e * mesh->Nfaces + f] = - mymax(boundaryFaces[cnt].bctype, boundaryFaces[cnt + 1].bctype); - } - -#if 0 - int cnt = 0; - for(int e = 0; e < mesh->Nelements; ++e) - for(int f = 0; f < mesh->Nfaces; ++f) { - printf("EToE(%d,%d) = %d \n", e,f, mesh->EToE[cnt]); - ++cnt; - } - -#endif - - free(boundaryFaces); -} diff --git a/src/mesh/meshConnectFaceNodes3D.cpp b/src/mesh/meshConnectFaceNodes3D.cpp index 82cd2f647..2bf16f0c6 100644 --- a/src/mesh/meshConnectFaceNodes3D.cpp +++ b/src/mesh/meshConnectFaceNodes3D.cpp @@ -56,7 +56,7 @@ int findBestMatch(dfloat x1, dfloat y1, dfloat z1, } // serial face-node to face-node connection -void meshConnectFaceNodes3D(mesh3D* mesh) +void meshConnectFaceNodes3D(mesh_t *mesh) { /* volume indices of the interior and exterior face nodes for each element */ mesh->vmapM = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong)); diff --git a/src/mesh/meshConnectPeriodicFaceNodes3D.cpp b/src/mesh/meshConnectPeriodicFaceNodes3D.cpp index 63a047ce6..58a23ff5a 100644 --- a/src/mesh/meshConnectPeriodicFaceNodes3D.cpp +++ b/src/mesh/meshConnectPeriodicFaceNodes3D.cpp @@ -65,7 +65,7 @@ int findBestPeriodicMatch(dfloat xper, dfloat yper, dfloat zper, } // serial face-node to face-node connection -void meshConnectPeriodicFaceNodes3D(mesh3D* mesh, dfloat xper, dfloat yper, dfloat zper) +void meshConnectPeriodicFaceNodes3D(mesh_t *mesh, dfloat xper, dfloat yper, dfloat zper) { /* volume indices of the interior and exterior face nodes for each element */ mesh->vmapM = (dlong*) calloc(mesh->Nfp * mesh->Nfaces * mesh->Nelements, sizeof(dlong)); diff --git a/src/mesh/meshFree.cpp b/src/mesh/meshFree.cpp index 323417bfa..dbef48ce0 100644 --- a/src/mesh/meshFree.cpp +++ b/src/mesh/meshFree.cpp @@ -41,9 +41,6 @@ void meshFree(mesh_t* mesh) if(mesh->elementInfo) free(mesh->elementInfo); //type of element - // boundary faces - if(mesh->boundaryInfo) free(mesh->boundaryInfo); // list of boundary faces (type, vertex-1, vertex-2, vertex-3) - // MPI halo exchange info if(mesh->haloElementList) free(mesh->haloElementList); // sorted list of elements to be sent in halo exchange if(mesh->NhaloPairs) free(mesh->NhaloPairs); // number of elements worth of data to send/recv diff --git a/src/mesh/meshGeometricFactorsHex3D.cpp b/src/mesh/meshGeometricFactorsHex3D.cpp index 2c21b8dbb..b27cede2d 100644 --- a/src/mesh/meshGeometricFactorsHex3D.cpp +++ b/src/mesh/meshGeometricFactorsHex3D.cpp @@ -66,14 +66,14 @@ void interpolateHex3D(dfloat* I, dfloat* x, int N, dfloat* Ix, int M) free(Ix2); } -void meshGeometricFactorsHex3D(mesh3D* mesh) +void meshGeometricFactorsHex3D(mesh_t *mesh) { double tStart = MPI_Wtime(); if(platform->comm.mpiRank == 0) printf("computing geometric factors ... "); fflush(stdout); /* note that we have volume geometric factors for each node */ mesh->vgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->Np, sizeof(dfloat)); - mesh->cubvgeo = (dfloat*) calloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp, sizeof(dfloat)); + mesh->cubvgeo = (dfloat *)calloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp, sizeof(dfloat)); mesh->ggeo = (dfloat*) calloc(mesh->Nelements * mesh->Nggeo * mesh->Np, sizeof(dfloat)); dfloat minJ = 1e9, maxJ = -1e9, maxSkew = 0; @@ -210,7 +210,7 @@ void meshGeometricFactorsHex3D(mesh3D* mesh) mesh->ggeo[mesh->Nggeo * mesh->Np * e + n + mesh->Np * GWJID] = JW; } -#if 0 +#if 1 interpolateHex3D(mesh->cubInterp, xre, mesh->Nq, cubxre, mesh->cubNq); interpolateHex3D(mesh->cubInterp, xse, mesh->Nq, cubxse, mesh->cubNq); interpolateHex3D(mesh->cubInterp, xte, mesh->Nq, cubxte, mesh->cubNq); @@ -241,8 +241,6 @@ void meshGeometricFactorsHex3D(mesh3D* mesh) /* compute geometric factors for affine coordinate transform*/ dfloat J = xr * (ys * zt - zs * yt) - yr * (xs * zt - zs * xt) + zr * (xs * yt - ys * xt); - //if(J<1e-12) printf("CUBATURE J = %g !!!!!!!!!!!!!\n", J); - dfloat rx = (ys * zt - zs * yt) / J, ry = -(xs * zt - zs * xt) / J, rz = (xs * yt - ys * xt) / J; dfloat sx = -(yr * zt - zr * yt) / J, sy = (xr * zt - zr * xt) / J, @@ -266,7 +264,7 @@ void meshGeometricFactorsHex3D(mesh3D* mesh) mesh->cubvgeo[base + mesh->cubNp * TYID] = ty; mesh->cubvgeo[base + mesh->cubNp * TZID] = tz; - mesh->cubvgeo[base + mesh->cubNp * JID] = J; + mesh->cubvgeo[base + mesh->cubNp * JID] = J; mesh->cubvgeo[base + mesh->cubNp * JWID] = JW; mesh->cubvgeo[base + mesh->cubNp * IJWID] = 1. / JW; } @@ -274,11 +272,11 @@ void meshGeometricFactorsHex3D(mesh3D* mesh) } { - dfloat globalMinJ, globalMaxJ, globalMaxSkew; + dfloat globalMinJ = 0, globalMaxJ = 0, globalMaxSkew = 0; - MPI_Reduce(&minJ, &globalMinJ, 1, MPI_DFLOAT, MPI_MIN, 0, platform->comm.mpiComm); - MPI_Reduce(&maxJ, &globalMaxJ, 1, MPI_DFLOAT, MPI_MAX, 0, platform->comm.mpiComm); - MPI_Reduce(&maxSkew, &globalMaxSkew, 1, MPI_DFLOAT, MPI_MAX, 0, platform->comm.mpiComm); + MPI_Allreduce(&minJ, &globalMinJ, 1, MPI_DFLOAT, MPI_MIN, platform->comm.mpiComm); + MPI_Allreduce(&maxJ, &globalMaxJ, 1, MPI_DFLOAT, MPI_MAX, platform->comm.mpiComm); + MPI_Allreduce(&maxSkew, &globalMaxSkew, 1, MPI_DFLOAT, MPI_MAX, platform->comm.mpiComm); if(platform->comm.mpiRank == 0) printf("J [%g,%g] ", globalMinJ, globalMaxJ); @@ -286,9 +284,10 @@ void meshGeometricFactorsHex3D(mesh3D* mesh) if(globalMinJ < 0 || globalMaxJ < 0) { if(platform->options.compareArgs("GALERKIN COARSE OPERATOR","FALSE") || - (platform->options.compareArgs("GALERKIN COARSE OPERATOR","TRUE") && mesh->N > 1)) { - if(platform->comm.mpiRank == 0) printf("Jacobian < 0!"); - ABORT(EXIT_FAILURE); + (platform->options.compareArgs("GALERKIN COARSE OPERATOR","TRUE") && mesh->N > 1)) { + if (platform->comm.mpiRank == 0) + printf("Jacobian < 0!"); + EXIT_AND_FINALIZE(EXIT_FAILURE); } } diff --git a/src/mesh/meshLoadReferenceNodesHex3D.cpp b/src/mesh/meshLoadReferenceNodesHex3D.cpp index b6882f420..0e44e5c0f 100644 --- a/src/mesh/meshLoadReferenceNodesHex3D.cpp +++ b/src/mesh/meshLoadReferenceNodesHex3D.cpp @@ -30,7 +30,7 @@ #include #define NODE_GEN -void meshLoadReferenceNodesHex3D(mesh3D* mesh, int N, int cubN) +void meshLoadReferenceNodesHex3D(mesh_t *mesh, int N, int cubN) { mesh->N = N; mesh->Nq = N + 1; @@ -41,7 +41,9 @@ void meshLoadReferenceNodesHex3D(mesh3D* mesh, int N, int cubN) mesh->Nvgeo = 12; mesh->Nggeo = 7; - mesh->Nsgeo = 17; + mesh->Nsgeo = 13; + + mesh->Nlocal = mesh->Nelements * mesh->Np; int Nrows, Ncols; diff --git a/src/mesh/meshMove.cpp b/src/mesh/meshMove.cpp index 740a8d55b..fb28b1ad9 100644 --- a/src/mesh/meshMove.cpp +++ b/src/mesh/meshMove.cpp @@ -1,13 +1,8 @@ -#include -#include -#include -void mesh_t::computeInvLMM() -{ - o_invLMM.copyFrom(o_LMM, Nelements * Np * sizeof(dfloat)); - oogs::startFinish(o_invLMM, 1, 0, ogsDfloat, ogsAdd, oogs); - platform->linAlg->ady(Nelements * Np, 1.0, o_invLMM); -} +#include "mesh.h" +#include "linAlg.hpp" +#include "platform.hpp" void mesh_t::move(){ + platform->timer.tic("meshUpdate", 1); // update o_x, o_y and o_z based on mesh->o_U using AB formula nStagesSumVectorKernel( Nelements * Np, @@ -20,44 +15,58 @@ void mesh_t::move(){ o_z ); update(); + + double flops = 6 * static_cast(Nlocal) * nAB; + platform->flopCounter->add("mesh_t::move", flops); + platform->timer.toc("meshUpdate"); } -void mesh_t::update(){ - geometricFactorsKernel( - Nelements, - o_D, - o_gllw, - o_x, - o_y, - o_z, - o_LMM, - o_vgeo, - o_ggeo, - platform->o_mempool.slice0 - ); - - // do add check if negative - const dfloat minJ = platform->linAlg->min(Nelements * Np, platform->o_mempool.slice0, platform->comm.mpiComm); - const dfloat maxJ = platform->linAlg->max(Nelements * Np, platform->o_mempool.slice0, platform->comm.mpiComm); - - if(minJ < 0 || maxJ < 0) { - if(platform->options.compareArgs("GALERKIN COARSE OPERATOR","FALSE") || - (platform->options.compareArgs("GALERKIN COARSE OPERATOR","TRUE") && N > 1)) { - if(platform->comm.mpiRank == 0) printf("Jacobian < 0!"); - ABORT(EXIT_FAILURE); - } +void mesh_t::update() +{ + geometricFactorsKernel(Nelements, + o_D, + o_gllw, + o_x, + o_y, + o_z, + o_LMM, + o_vgeo, + o_ggeo, + platform->o_mempool.slice0); + + double flopsGeometricFactors = 18 * Np * Nq + 91 * Np; + flopsGeometricFactors *= static_cast(Nelements); + + cubatureGeometricFactorsKernel(Nelements, o_D, o_x, o_y, o_z, o_cubInterpT, o_cubw, o_cubvgeo); + + double flopsCubatureGeometricFactors = 0.0; + flopsCubatureGeometricFactors += 18 * Np * Nq; // deriv + flopsCubatureGeometricFactors += 18 * (cubNq * Np + cubNq * cubNq * Nq * Nq + cubNp * Nq); // c->f interp + flopsCubatureGeometricFactors += 55 * cubNp; // geometric factor computation + flopsCubatureGeometricFactors *= static_cast(Nelements); + + // do add check if negative + const dfloat minJ = + platform->linAlg->min(Nelements * Np, platform->o_mempool.slice0, platform->comm.mpiComm); + const dfloat maxJ = + platform->linAlg->max(Nelements * Np, platform->o_mempool.slice0, platform->comm.mpiComm); + + if (minJ < 0 || maxJ < 0) { + if (platform->options.compareArgs("GALERKIN COARSE OPERATOR", "FALSE") || + (platform->options.compareArgs("GALERKIN COARSE OPERATOR", "TRUE") && N > 1)) { + if (platform->comm.mpiRank == 0) + printf("Jacobian < 0!"); + ABORT(EXIT_FAILURE); + } } volume = platform->linAlg->sum(Nelements * Np, o_LMM, platform->comm.mpiComm); computeInvLMM(); - surfaceGeometricFactorsKernel( - Nelements, - o_D, - o_gllw, - o_faceNodes, - o_x, - o_y, - o_z, - o_sgeo - ); + surfaceGeometricFactorsKernel(Nelements, o_gllw, o_faceNodes, o_vgeo, o_sgeo); + + double flopsSurfaceGeometricFactors = 32 * Nq * Nq; + flopsSurfaceGeometricFactors *= static_cast(Nelements); + + double flops = flopsGeometricFactors + flopsCubatureGeometricFactors + flopsSurfaceGeometricFactors; + platform->flopCounter->add("mesh_t::update", flops); } \ No newline at end of file diff --git a/src/mesh/meshNekReader.cpp b/src/mesh/meshNekReader.cpp index fb52aa385..a482fc6cf 100644 --- a/src/mesh/meshNekReader.cpp +++ b/src/mesh/meshNekReader.cpp @@ -26,84 +26,75 @@ void meshNekReaderHex3D(int N, mesh_t* mesh) (int*) calloc(mesh->NfaceVertices * mesh->Nfaces, sizeof(int)); memcpy(mesh->faceVertices, faceVertices[0], mesh->NfaceVertices * mesh->Nfaces * sizeof(int)); - const int vtxmap[8] = {0, 1, 3, 2, 4, 5, 7, 6}; + // pre-processor maps + const int vtxMap[] = {0,1,3,2,4,5,7,6}; + const int faceMap[] = {1,2,3,4,0,5}; - // build vertex numbering + // generate element vertex numbering mesh->Nnodes = nek::set_glo_num(2, mesh->cht); mesh->EToV = (hlong*) calloc(mesh->Nelements * mesh->Nverts, sizeof(hlong)); for(int e = 0; e < mesh->Nelements; ++e) for(int j = 0; j < mesh->Nverts; j++) - mesh->EToV[e * mesh->Nverts + j] = nekData.glo_num[e * mesh->Nverts + vtxmap[j]]; + mesh->EToV[e * mesh->Nverts + j] = nekData.glo_num[e * mesh->Nverts + vtxMap[j]]; // find number of boundary faces - int nbc = 0; + hlong NboundaryFaces = 0; int* bid = nekData.boundaryIDt; if(!mesh->cht) bid = nekData.boundaryID; for(int e = 0; e < mesh->Nelements; e++) for(int iface = 0; iface < mesh->Nfaces; iface++) { - if(*bid) nbc++; + if(*bid) NboundaryFaces++; bid++; } - int* recvCounts = (int*) calloc(platform->comm.mpiCommSize, sizeof(int)); - MPI_Allgather(&nbc, 1, MPI_INT, recvCounts, 1, MPI_INT, platform->comm.mpiComm); - int* displacement = (int*) calloc(platform->comm.mpiCommSize, sizeof(int)); - displacement[0] = 0; - for(int i = 1; i < platform->comm.mpiCommSize; i++) - displacement[i] = displacement[i - 1] + recvCounts[i - 1]; + int Nbid = nekData.NboundaryIDt; + if (!mesh->cht) + Nbid = nekData.NboundaryID; + MPI_Allreduce(MPI_IN_PLACE, &NboundaryFaces, 1, MPI_HLONG, MPI_SUM, platform->comm.mpiComm); + if (platform->comm.mpiRank == 0) + printf("NboundaryIDs: %d, NboundaryFaces: %lld ", Nbid, NboundaryFaces); + mesh->NboundaryFaces = NboundaryFaces; - // build boundary info (for now every rank has all) - mesh->NboundaryFaces = nbc; - MPI_Allreduce(MPI_IN_PLACE, &mesh->NboundaryFaces, 1, MPI_HLONG, - MPI_SUM, platform->comm.mpiComm); - if(platform->comm.mpiRank == 0) { - int n = nekData.NboundaryIDt; - if(!mesh->cht) n = nekData.NboundaryID; - printf("NboundaryIDs: %d, NboundaryFaces: %lld ", n, mesh->NboundaryFaces); - } + // boundary face tags (face numbering is in pre-processor notation) + mesh->EToB = (int*) calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for(int i = 0; i < mesh->Nelements * mesh->Nfaces; i++) mesh->EToB[i] = -1; - int cnt = 0; bid = nekData.boundaryIDt; if(!mesh->cht) bid = nekData.boundaryID; - int* eface1 = nekData.eface1; - int* icface = nekData.icface; - mesh->boundaryInfo = (hlong*) calloc(mesh->NboundaryFaces * (mesh->NfaceVertices + 1), - sizeof(hlong)); - for(int e = 0; e < mesh->Nelements; e++) - for(int iface = 0; iface < mesh->Nfaces; iface++) { - int ibc = *bid; - if(ibc > 0) { - hlong offset = (hlong)displacement[platform->comm.mpiRank] * (mesh->NfaceVertices + 1) - + (hlong)cnt * (mesh->NfaceVertices + 1); - mesh->boundaryInfo[offset] = ibc; - for(int j = 0; j < mesh->NfaceVertices; j++) { - const int vertex = icface[j + mesh->NfaceVertices * (eface1[iface] - 1)] - 1; - mesh->boundaryInfo[offset + (j+1)] = - mesh->EToV[e * mesh->Nverts + vtxmap[vertex]]; - } - cnt++; + int minEToB = std::numeric_limits::max(); + int maxEToB = std::numeric_limits::min(); + for(int e = 0; e < mesh->Nelements; e++) { + for(int i = 0; i < mesh->Nfaces; i++) { + const int ibc = bid[e * mesh->Nfaces + i]; + if (ibc > 0) { // only valid ids + mesh->EToB[e * mesh->Nfaces + faceMap[i]] = ibc; + minEToB = std::min(ibc, minEToB); + maxEToB = std::max(ibc, maxEToB); } - bid++; } - - // hack to avoid missing large-count in MPI - MPI_Datatype bInfoType; - MPI_Type_contiguous(mesh->NfaceVertices + 1, MPI_HLONG, &bInfoType); - MPI_Type_commit(&bInfoType); - - MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, mesh->boundaryInfo, - (const int*)recvCounts, (const int*)displacement, bInfoType, platform->comm.mpiComm); - - free(recvCounts); - free(displacement); + } + if (Nbid > 0) { + MPI_Allreduce(MPI_IN_PLACE, &minEToB, 1, MPI_INT, MPI_MIN, platform->comm.mpiComm); + if (minEToB != 1) { + if (platform->comm.mpiRank == 0) + printf("\nboundary IDs are not one-based, min(ID): %d!\n", minEToB); + EXIT_AND_FINALIZE(EXIT_FAILURE); + } +#if 0 + MPI_Allreduce(MPI_IN_PLACE, &maxEToB, 1, MPI_INT, MPI_MAX, platform->comm.mpiComm); + if (maxEToB - minEToB != Nbid - 1) { + if (platform->comm.mpiRank == 0) + printf("\nboundary IDs are not contiguous!\n"); + EXIT_AND_FINALIZE(EXIT_FAILURE); + } +#endif + } // assign vertex coords - mesh->elementInfo - = (dlong*) calloc(mesh->Nelements, sizeof(dlong)); - + mesh->elementInfo = (dlong *)calloc(mesh->Nelements, sizeof(dlong)); double* VX = nekData.xc; double* VY = nekData.yc; double* VZ = nekData.zc; diff --git a/src/mesh/meshOccaSetup3D.cpp b/src/mesh/meshOccaSetup3D.cpp index 87fc1ec79..28cda9db3 100644 --- a/src/mesh/meshOccaSetup3D.cpp +++ b/src/mesh/meshOccaSetup3D.cpp @@ -31,6 +31,7 @@ #include "mesh3D.h" #include "platform.hpp" +#include "bcMap.hpp" void reportMemoryUsage(occa::device &device, const char* mess) { @@ -39,7 +40,7 @@ void reportMemoryUsage(occa::device &device, const char* mess) printf("%s: bytes allocated = %lu\n", mess, bytes); } -void meshOccaPopulateDeviceHex3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo) +void meshOccaPopulateDeviceHex3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo) { mesh->o_elementInfo = platform->device.malloc(mesh->Nelements * sizeof(dlong), mesh->elementInfo); @@ -97,15 +98,16 @@ void meshOccaPopulateDeviceHex3D(mesh3D* mesh, setupAide &newOptions, occa::prop platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nvgeo * sizeof(dfloat), mesh->vgeo); mesh->o_sgeo = - platform->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat), - mesh->sgeo); + platform->device.malloc(mesh->Nelements * mesh->Nfaces * mesh->Nfp * mesh->Nsgeo * sizeof(dfloat), + mesh->sgeo); + mesh->o_ggeo = platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat), mesh->ggeo); - //if(mesh->cubNq - 1) - // mesh->o_cubvgeo = - // platform->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat), - // mesh->cubvgeo); + if (mesh->cubNq - 1) { + mesh->o_cubvgeo = + platform->device.malloc(mesh->Nelements * mesh->Nvgeo * mesh->cubNp * sizeof(dfloat), mesh->cubvgeo); + } mesh->o_vmapM = platform->device.malloc(mesh->Nelements * mesh->Nfp * mesh->Nfaces * sizeof(dlong), @@ -144,10 +146,10 @@ void meshOccaPopulateDeviceHex3D(mesh3D* mesh, setupAide &newOptions, occa::prop platform->device.malloc(mesh->Nfp * mesh->totalHaloPairs * sizeof(dlong), mesh->haloPutNodeIds); } - kernelInfo += populateMeshProperties(mesh->N); + kernelInfo += meshKernelProperties(mesh->N); } -void meshOccaSetup3D(mesh3D* mesh, setupAide &newOptions, occa::properties &kernelInfo) +void meshOccaSetup3D(mesh_t *mesh, setupAide &newOptions, occa::properties &kernelInfo) { meshOccaPopulateDeviceHex3D(mesh, newOptions, kernelInfo); } diff --git a/src/mesh/meshParallelGatherScatterSetup.cpp b/src/mesh/meshParallelGatherScatterSetup.cpp index d54c227f1..60462a756 100644 --- a/src/mesh/meshParallelGatherScatterSetup.cpp +++ b/src/mesh/meshParallelGatherScatterSetup.cpp @@ -30,11 +30,13 @@ #include "mesh.h" #include "platform.hpp" +#include "nekInterfaceAdapter.hpp" void meshParallelGatherScatterSetup(mesh_t* mesh, dlong N, hlong* globalIds, MPI_Comm &comm, + oogs_mode gsMode, int verbose) { @@ -42,7 +44,9 @@ void meshParallelGatherScatterSetup(mesh_t* mesh, MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); - mesh->ogs = ogsSetup(N, globalIds, comm, verbose, platform->device); + if(platform->comm.mpiRank == 0) + std::cout << "meshParallelGatherScatterSetup N=" << mesh->N << "\n"; + mesh->ogs = ogsSetup(N, globalIds, comm, verbose, platform->device.occaDevice()); //use the gs to find what nodes are local to this rank int* minRank = (int*) calloc(N,sizeof(int)); @@ -55,8 +59,8 @@ void meshParallelGatherScatterSetup(mesh_t* mesh, ogsGatherScatter(minRank, ogsInt, ogsMin, mesh->ogs); //minRank[n] contains the smallest rank taking part in the gather of node n ogsGatherScatter(maxRank, ogsInt, ogsMax, mesh->ogs); //maxRank[n] contains the largest rank taking part in the gather of node n - int overlap = 0; - platform->options.compareArgs("ENABLE OVERLAP", "TRUE"); overlap = 1; + int overlap = 1; + if(platform->options.compareArgs("GS OVERLAP", "FALSE")) overlap = 0; // count elements that contribute to global C0 gather-scatter dlong globalCount = 0; @@ -77,9 +81,14 @@ void meshParallelGatherScatterSetup(mesh_t* mesh, localCount += 1 - isHalo; } + mesh->elementList = (dlong *)calloc(mesh->Nelements, sizeof(dlong)); mesh->globalGatherElementList = (dlong*) calloc(globalCount, sizeof(dlong)); mesh->localGatherElementList = (dlong*) calloc(localCount, sizeof(dlong)); + for (dlong e = 0; e < mesh->Nelements; ++e) { + mesh->elementList[e] = e; + } + globalCount = 0; localCount = 0; @@ -100,11 +109,15 @@ void meshParallelGatherScatterSetup(mesh_t* mesh, else mesh->localGatherElementList[localCount++] = e; } - //printf("local = %d, global = %d\n", localCount, globalCount); + + free(minRank); + free(maxRank); mesh->NglobalGatherElements = globalCount; mesh->NlocalGatherElements = localCount; + mesh->o_elementList = platform->device.malloc(mesh->Nelements * sizeof(dlong), mesh->elementList); + if(globalCount) mesh->o_globalGatherElementList = platform->device.malloc(globalCount * sizeof(dlong), mesh->globalGatherElementList); @@ -112,4 +125,33 @@ void meshParallelGatherScatterSetup(mesh_t* mesh, if(localCount) mesh->o_localGatherElementList = platform->device.malloc(localCount * sizeof(dlong), mesh->localGatherElementList); + + { // sanity check + int err = 0; + dlong gNelements = mesh->Nelements; + MPI_Allreduce(MPI_IN_PLACE, &gNelements, 1, MPI_DLONG, MPI_SUM, platform->comm.mpiComm); + const dfloat sum2 = (dfloat)gNelements * mesh->Np; + + occa::memory o_tmp = platform->device.malloc(mesh->Nlocal , sizeof(dfloat)); + platform->linAlg->fillKernel(mesh->Nlocal, 1.0, o_tmp); + + ogsGatherScatter(o_tmp, ogsDfloat, ogsAdd, mesh->ogs); + + platform->linAlg->axmyKernel(mesh->Nlocal, 1.0, mesh->ogs->o_invDegree, o_tmp); + dfloat* tmp = (dfloat*) calloc(mesh->Nlocal, sizeof(dfloat)); + o_tmp.copyTo(tmp, mesh->Nlocal * sizeof(dfloat)); + dfloat sum1 = 0; + for(int i = 0; i < mesh->Nlocal; i++) sum1 += tmp[i]; + MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + sum1 = abs(sum1 - sum2) / sum2; + if(sum1 > 1e-15) { + if(platform->comm.mpiRank == 0) printf("ogsGatherScatter test err=%g!\n", sum1); + fflush(stdout); + err++; + } + o_tmp.free(); + + if(err) ABORT(1); + free(tmp); + } } diff --git a/src/mesh/meshPhysicalNodesHex3D.cpp b/src/mesh/meshPhysicalNodesHex3D.cpp index f615f6f9e..6222e27dc 100644 --- a/src/mesh/meshPhysicalNodesHex3D.cpp +++ b/src/mesh/meshPhysicalNodesHex3D.cpp @@ -30,7 +30,7 @@ #include "nrs.hpp" #include "nekInterfaceAdapter.hpp" -void meshPhysicalNodesHex3D(mesh3D* mesh) +void meshPhysicalNodesHex3D(mesh_t *mesh) { mesh->x = (dfloat*) calloc((mesh->Nelements+mesh->totalHaloPairs) * mesh->Np,sizeof(dfloat)); mesh->y = (dfloat*) calloc((mesh->Nelements+mesh->totalHaloPairs) * mesh->Np,sizeof(dfloat)); diff --git a/src/mesh/meshSetup.cpp b/src/mesh/meshSetup.cpp index 5170ed5ba..5945b1f02 100644 --- a/src/mesh/meshSetup.cpp +++ b/src/mesh/meshSetup.cpp @@ -10,45 +10,58 @@ mesh_t *createMeshV(MPI_Comm comm, mesh_t* meshT, occa::properties& kernelInfo); -occa::properties populateMeshProperties(int N) +occa::properties meshKernelProperties(int N) { - occa::properties meshProperties = platform->kernelInfo; + occa::properties meshProperties; const int Nq = N+1; const int Np = Nq * Nq * Nq; const int Nfp = Nq * Nq; constexpr int Nfaces {6}; - constexpr int Nvgeo {12}; - constexpr int Nggeo {7}; - constexpr int Nsgeo {17}; + constexpr int Nvgeo{12}; + constexpr int Nggeo{7}; + constexpr int Nsgeo{13}; meshProperties["defines/" "p_dim"] = 3; + meshProperties["defines/" "p_Nverts"] = 8; meshProperties["defines/" "p_Nfields"] = 1; meshProperties["defines/" "p_N"] = N; meshProperties["defines/" "p_Nq"] = Nq; + meshProperties["defines/" "p_Nq_g"] = Nq; meshProperties["defines/" "p_Np"] = Np; + meshProperties["defines/" "p_Np_g"] = Np; meshProperties["defines/" "p_Nfp"] = Nfp; meshProperties["defines/" "p_Nfaces"] = Nfaces; meshProperties["defines/" "p_NfacesNfp"] = Nfp * Nfaces; meshProperties["defines/" "p_Nvgeo"] = Nvgeo; meshProperties["defines/" "p_Nsgeo"] = Nsgeo; - meshProperties["defines/" "p_Nggeo"] = Nggeo; + meshProperties["defines/" + "p_Nggeo"] = Nggeo; meshProperties["defines/" "p_NXID"] = NXID; meshProperties["defines/" "p_NYID"] = NYID; meshProperties["defines/" "p_NZID"] = NZID; - meshProperties["defines/" "p_SJID"] = SJID; - meshProperties["defines/" "p_IJID"] = IJID; - meshProperties["defines/" "p_IHID"] = IHID; - meshProperties["defines/" "p_WSJID"] = WSJID; - meshProperties["defines/" "p_WIJID"] = WIJID; - meshProperties["defines/" "p_STXID"] = STXID; - meshProperties["defines/" "p_STYID"] = STYID; - meshProperties["defines/" "p_STZID"] = STZID; - meshProperties["defines/" "p_SBXID"] = SBXID; - meshProperties["defines/" "p_SBYID"] = SBYID; - meshProperties["defines/" "p_SBZID"] = SBZID; + meshProperties["defines/" + "p_SJID"] = SJID; + meshProperties["defines/" + "p_IJID"] = IJID; + meshProperties["defines/" + "p_WIJID"] = WIJID; + meshProperties["defines/" + "p_WSJID"] = WSJID; + meshProperties["defines/" + "p_T1XID"] = T1XID; + meshProperties["defines/" + "p_T1YID"] = T1YID; + meshProperties["defines/" + "p_T1ZID"] = T1ZID; + meshProperties["defines/" + "p_T2XID"] = T2XID; + meshProperties["defines/" + "p_T2YID"] = T2YID; + meshProperties["defines/" + "p_T2ZID"] = T2ZID; meshProperties["defines/" "p_G00ID"] = G00ID; meshProperties["defines/" "p_G01ID"] = G01ID; @@ -91,12 +104,12 @@ mesh_t *createMesh(MPI_Comm comm, mesh->cht = cht; + if (platform->comm.mpiRank == 0) + printf("generating t-mesh ...\n"); + // get mesh from nek meshNekReaderHex3D(N, mesh); - if (platform->comm.mpiRank == 0) - printf("generating mesh ... "); - if (mesh->Nelements * mesh->Nvgeo * cubN > std::numeric_limits::max()) { if (platform->comm.mpiRank == 0) printf("FATAL ERROR: Local element count too large!"); ABORT(EXIT_FAILURE); @@ -107,20 +120,14 @@ mesh_t *createMesh(MPI_Comm comm, // connect elements using parallel sort meshParallelConnect(mesh); - // connect elements to boundary faces - meshConnectBoundary(mesh); - // load reference (r,s,t) element nodes meshLoadReferenceNodesHex3D(mesh, N, cubN); if (platform->comm.mpiRank == 0) { - if (cubN) - printf("Nq: %d cubNq: %d\n", mesh->Nq, mesh->cubNq); - else - printf("Nq: %d\n", mesh->Nq); + printf("N: %d, Nq: %d", mesh->N, mesh->Nq); + if (cubN) printf(", cubNq: %d", mesh->cubNq); + printf("\n"); } - mesh->Nlocal = mesh->Nelements * mesh->Np; - loadKernels(mesh); // set up halo exchange info for MPI (do before connect face nodes) @@ -143,13 +150,38 @@ mesh_t *createMesh(MPI_Comm comm, // global nodes meshGlobalIds(mesh); bcMap::check(mesh); + bcMap::checkBoundaryAlignment(mesh); + bcMap::remapUnalignedBoundaries(mesh); meshOccaSetup3D(mesh, platform->options, kernelInfo); - meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, 0); - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, oogsMode); + meshParallelGatherScatterSetup(mesh, + mesh->Nelements * mesh->Np, + mesh->globalIds, + platform->comm.mpiComm, + OOGS_AUTO, + 0); + + int err = 0; + int Nfine; + platform->options.getArgs("POLYNOMIAL DEGREE", Nfine); + if(mesh->N == Nfine) { + dfloat* tmp = (dfloat*) calloc(mesh->Nlocal, sizeof(dfloat)); + mesh->ogs->o_invDegree.copyTo(tmp, mesh->Nlocal * sizeof(dfloat)); + double* mult = (cht) ? (double*) nek::ptr("tmult") : (double*) nek::ptr("vmult"); + dfloat sum1 = 0; + for(int i = 0; i < mesh->Nlocal; i++) sum1 += std::abs(tmp[i] - mult[i]); + MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + if(sum1 > 1e-14) { + if(platform->comm.mpiRank == 0) printf("multiplicity test err=%g!\n", sum1); + fflush(stdout); + err++; + } + free(tmp); + } + if(err) ABORT(1); + + mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, OOGS_AUTO); // build mass + inverse mass matrix for(dlong e = 0; e < mesh->Nelements; ++e) @@ -201,12 +233,13 @@ mesh_t* duplicateMesh(MPI_Comm comm, meshGlobalIds(mesh); bcMap::check(mesh); + bcMap::checkBoundaryAlignment(mesh); + bcMap::remapUnalignedBoundaries(mesh); + meshOccaSetup3D(mesh, platform->options, kernelInfo); - meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, 0); - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, oogsMode); + meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, OOGS_AUTO, 0); + mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, OOGS_AUTO); // build mass + inverse mass matrix for(dlong e = 0; e < mesh->Nelements; ++e) @@ -215,15 +248,70 @@ mesh_t* duplicateMesh(MPI_Comm comm, mesh->o_LMM.copyFrom(mesh->LMM, mesh->Nelements * mesh->Np * sizeof(dfloat)); mesh->computeInvLMM(); - if(platform->options.compareArgs("MOVING MESH", "TRUE")){ - const int maxTemporalOrder = 3; - mesh->coeffAB = (dfloat*) calloc(maxTemporalOrder, sizeof(dfloat)); - mesh->o_coeffAB = platform->device.malloc(maxTemporalOrder * sizeof(dfloat), mesh->coeffAB); + return mesh; +} +*/ + +mesh_t *createMeshMG(mesh_t* _mesh, + int Nc) +{ + mesh_t* mesh = new mesh_t(); + memcpy(mesh, _mesh, sizeof(mesh_t)); + + meshLoadReferenceNodesHex3D(mesh, Nc, 1); + meshHaloSetup(mesh); + meshPhysicalNodesHex3D(mesh); + meshHaloPhysicalNodes(mesh); + meshGeometricFactorsHex3D(mesh); + + meshConnectFaceNodes3D(mesh); + meshSurfaceGeometricFactorsHex3D(mesh); + + meshGlobalIds(mesh); + meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, OOGS_AUTO, 0); + + mesh->o_x = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->x); + mesh->o_y = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->y); + mesh->o_z = platform->device.malloc(mesh->Np * mesh->Nelements * sizeof(dfloat), mesh->z); + + free(mesh->x); + free(mesh->y); + free(mesh->z); + + mesh->o_D = platform->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), mesh->D); + + dfloat* DT = (dfloat*) calloc(mesh->Nq * mesh->Nq, sizeof(dfloat)); + for (int j = 0; j < mesh->Nq; j++) + for (int i = 0; i < mesh->Nq; i++) + DT[j * mesh->Nq + i] = mesh->D[i * mesh->Nq + j]; + mesh->o_DT = platform->device.malloc(mesh->Nq * mesh->Nq * sizeof(dfloat), DT); + free(DT); + + mesh->o_ggeo = platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo * sizeof(dfloat), + mesh->ggeo); + + if(!strstr(pfloatString,dfloatString)) { + mesh->o_ggeoPfloat = platform->device.malloc(mesh->Nelements * mesh->Np * mesh->Nggeo, sizeof(pfloat)); + mesh->o_DPfloat = platform->device.malloc(mesh->Nq * mesh->Nq, sizeof(pfloat)); + mesh->o_DTPfloat = platform->device.malloc(mesh->Nq * mesh->Nq, sizeof(pfloat)); + platform->copyDfloatToPfloatKernel(mesh->Nelements * mesh->Np * mesh->Nggeo, + mesh->o_ggeo, + mesh->o_ggeoPfloat); + platform->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, + mesh->o_D, + mesh->o_DPfloat); + platform->copyDfloatToPfloatKernel(mesh->Nq * mesh->Nq, + mesh->o_DT, + mesh->o_DTPfloat); + + // TODO: once full preconditioner is in FP32, uncomment below + //mesh->o_D.free(); + //mesh->o_DT.free(); + //mesh->o_ggeo.free(); } return mesh; } -*/ mesh_t *createMeshV( MPI_Comm comm, @@ -234,6 +322,9 @@ mesh_t *createMeshV( { mesh_t *mesh = new mesh_t(); + if (platform->comm.mpiRank == 0) + printf("generating v-mesh ...\n"); + // shallow copy memcpy(mesh, meshT, sizeof(*meshT)); mesh->cht = 0; @@ -250,9 +341,6 @@ mesh_t *createMeshV( // find mesh->EToP, mesh->EToE and mesh->EToF, required mesh->EToV meshParallelConnect(mesh); - // find mesh->EToB, required mesh->EToV and mesh->boundaryInfo - meshConnectBoundary(mesh); - // set up halo exchange info for MPI (do before connect face nodes) meshHaloSetup(mesh); @@ -276,16 +364,50 @@ mesh_t *createMeshV( mesh->globalIds = meshT->globalIds; bcMap::check(mesh); + bcMap::checkBoundaryAlignment(mesh); + bcMap::remapUnalignedBoundaries(mesh); meshVOccaSetup3D(mesh, kernelInfo); - meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, 0); - oogs_mode oogsMode = OOGS_AUTO; - //if(platform->device.mode() == "Serial" || platform->device.mode() == "OpenMP") oogsMode = OOGS_DEFAULT; - mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, oogsMode); + meshParallelGatherScatterSetup(mesh, mesh->Nelements * mesh->Np, mesh->globalIds, platform->comm.mpiComm, OOGS_AUTO, 0); + + int err = 0; + int Nfine; + platform->options.getArgs("POLYNOMIAL DEGREE", Nfine); + if(mesh->N == Nfine) { + dfloat* tmp = (dfloat*) calloc(mesh->Nlocal, sizeof(dfloat)); + mesh->ogs->o_invDegree.copyTo(tmp, mesh->Nlocal * sizeof(dfloat)); + double* mult = (double*) nek::ptr("vmult"); + dfloat sum1 = 0; + for(int i = 0; i < mesh->Nlocal; i++) sum1 += std::abs(tmp[i] - mult[i]); + MPI_Allreduce(MPI_IN_PLACE, &sum1, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + if(sum1 > 1e-14) { + if(platform->comm.mpiRank == 0) printf("multiplicity test err=%g!\n", sum1); + fflush(stdout); + err++; + } + free(tmp); + } + if(err) ABORT(1); + + mesh->oogs = oogs::setup(mesh->ogs, 1, mesh->Nelements * mesh->Np, ogsDfloat, NULL, OOGS_AUTO); mesh->computeInvLMM(); + // compute V mesh volume + dfloat volume = 0.0; + const auto Np = mesh->Np; + const auto Nggeo = mesh->Nggeo; + for(dlong e = 0; e < mesh->Nelements; ++e) { + for(dlong n = 0; n < Np; ++n){ + volume += mesh->ggeo[Nggeo * Np * e + n + Np * GWJID]; + } + } + + MPI_Allreduce(MPI_IN_PLACE, &volume, 1, MPI_DFLOAT, MPI_SUM, platform->comm.mpiComm); + mesh->volume = volume; + + return mesh; } @@ -324,16 +446,10 @@ void meshVOccaSetup3D(mesh_t* mesh, occa::properties &kernelInfo) void loadKernels(mesh_t* mesh) { const std::string meshPrefix = "mesh-"; - if(platform->options.compareArgs("MOVING MESH", "TRUE")){ - { - mesh->velocityDirichletKernel = - platform->kernels.getKernel(meshPrefix + "velocityDirichletBCHex3D"); - mesh->geometricFactorsKernel = - platform->kernels.getKernel(meshPrefix + "geometricFactorsHex3D"); - mesh->surfaceGeometricFactorsKernel = - platform->kernels.getKernel(meshPrefix + "surfaceGeometricFactorsHex3D"); - mesh->nStagesSumVectorKernel = - platform->kernels.getKernel(meshPrefix + "nStagesSumVector"); - } - } + mesh->avgBIDValueKernel = platform->kernels.get(meshPrefix + "avgBIDValue"); + mesh->velocityDirichletKernel = platform->kernels.get(meshPrefix + "velocityDirichletBCHex3D"); + mesh->geometricFactorsKernel = platform->kernels.get(meshPrefix + "geometricFactorsHex3D"); + mesh->surfaceGeometricFactorsKernel = platform->kernels.get(meshPrefix + "surfaceGeometricFactorsHex3D"); + mesh->cubatureGeometricFactorsKernel = platform->kernels.get(meshPrefix + "cubatureGeometricFactorsHex3D"); + mesh->nStagesSumVectorKernel = platform->kernels.get(meshPrefix + "nStagesSumVector"); } diff --git a/src/mesh/meshSolve.cpp b/src/mesh/meshSolve.cpp new file mode 100644 index 000000000..1d4a8fe73 --- /dev/null +++ b/src/mesh/meshSolve.cpp @@ -0,0 +1,42 @@ +#include "nrs.hpp" +#include "mesh.h" +void meshSolve(nrs_t* nrs, dfloat time, occa::memory o_U, int stage) +{ + mesh_t* mesh = nrs->meshV; + linAlg_t* linAlg = platform->linAlg; + + platform->timer.tic("meshSolve", 1); + nrs->setEllipticCoeffKernel( + mesh->Nlocal, + 1.0, + 0 * nrs->fieldOffset, + nrs->fieldOffset, + nrs->o_meshMue, + nrs->o_meshRho, + nrs->o_ellipticCoeff); + + occa::memory o_Unew = [&](nrs_t* nrs, dfloat time, int stage){ + mesh_t* mesh = nrs->meshV; + oogs_t* gsh = nrs->gsh; + + platform->linAlg->fill(nrs->NVfields*nrs->fieldOffset, 0, platform->o_mempool.slice3); + platform->o_mempool.slice0.copyFrom(mesh->o_U, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); + ellipticSolve(nrs->meshSolver, platform->o_mempool.slice3, platform->o_mempool.slice0); + + // enforce C0 + oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); + platform->linAlg->axmyMany( + mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + 0, + 1.0, + nrs->meshSolver->o_invDegree, + platform->o_mempool.slice0 + ); + + return platform->o_mempool.slice0; + }(nrs, time, stage); + o_U.copyFrom(o_Unew, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); + platform->timer.toc("meshSolve"); +} diff --git a/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp b/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp index 5efadc717..daed9f566 100644 --- a/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp +++ b/src/mesh/meshSurfaceGeometricFactorsHex3D.cpp @@ -59,12 +59,12 @@ void interpolateFaceHex3D(int* faceNodes, dfloat* I, dfloat* x, int N, dfloat* I } /* compute outwards facing normals, surface Jacobian, and volume Jacobian for all face nodes */ -void meshSurfaceGeometricFactorsHex3D(mesh3D* mesh) +void meshSurfaceGeometricFactorsHex3D(mesh_t *mesh) { /* unified storage array for geometric factors */ - mesh->sgeo = (dfloat*) calloc((mesh->Nelements + mesh->totalHaloPairs) * - mesh->Nsgeo * mesh->Nfp * mesh->Nfaces, - sizeof(dfloat)); + mesh->sgeo = + (dfloat *)calloc((mesh->Nelements + mesh->totalHaloPairs) * mesh->Nsgeo * mesh->Nfp * mesh->Nfaces, + sizeof(dfloat)); dfloat* xre = (dfloat*) calloc(mesh->Np, sizeof(dfloat)); dfloat* xse = (dfloat*) calloc(mesh->Np, sizeof(dfloat)); @@ -196,26 +196,46 @@ void meshSurfaceGeometricFactorsHex3D(mesh3D* mesh) mesh->sgeo[base + NZID] = nz; mesh->sgeo[base + SJID] = sJ; mesh->sgeo[base + IJID] = 1. / J; - mesh->sgeo[base + WIJID] = 1. / (J * mesh->gllw[0]); mesh->sgeo[base + WSJID] = sJ * mesh->gllw[i % mesh->Nq] * mesh->gllw[i / mesh->Nq]; + + const dfloat tol = 1e-4; + dfloat vt1x = 0, vt1y = 0, vt1z = 0; + dfloat vt2x = 0, vt2y = 0, vt2z = 0; + if (std::abs(std::abs(nz) - 1.0) < tol) { + vt1x = 1.0; + vt1y = 0.0; + vt1z = 0.0; + } + else { + const dfloat mag = std::sqrt(nx * nx + ny * ny); + vt1x = -ny / mag; + vt1y = nx / mag; + vt1z = 0.0; + } + + mesh->sgeo[base + T1XID] = vt1x; + mesh->sgeo[base + T1YID] = vt1y; + mesh->sgeo[base + T1ZID] = vt1z; + + // vt2 = n \cross vt1 + vt2x = ny * vt1z - nz * vt1y; + vt2y = nz * vt1x - nx * vt1z; + vt2z = nx * vt1y - ny * vt1x; + + // normalize vt2 + const dfloat invMag = 1.0 / std::sqrt(vt2x * vt2x + vt2y * vt2y + vt2z * vt2z); + vt2x *= invMag; + vt2y *= invMag; + vt2z *= invMag; + + mesh->sgeo[base + T2XID] = vt2x; + mesh->sgeo[base + T2YID] = vt2y; + mesh->sgeo[base + T2ZID] = vt2z; } } } - for(dlong e = 0; e < mesh->Nelements; ++e) /* for each non-halo element */ - for(int n = 0; n < mesh->Nfp * mesh->Nfaces; ++n) { - dlong baseM = e * mesh->Nfp * mesh->Nfaces + n; - dlong baseP = mesh->mapP[baseM]; - // rescaling - missing factor of 2 ? (only impacts penalty and thus stiffness) - dfloat hinvM = mesh->sgeo[baseM * mesh->Nsgeo + SJID] * - mesh->sgeo[baseM * mesh->Nsgeo + IJID]; - dfloat hinvP = mesh->sgeo[baseP * mesh->Nsgeo + SJID] * - mesh->sgeo[baseP * mesh->Nsgeo + IJID]; - mesh->sgeo[baseM * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP); - mesh->sgeo[baseP * mesh->Nsgeo + IHID] = mymax(hinvM,hinvP); - } - free(xre); free(xse); free(xte); diff --git a/src/mesh/registerMeshKernels.cpp b/src/mesh/registerMeshKernels.cpp new file mode 100644 index 000000000..8eedf4ebc --- /dev/null +++ b/src/mesh/registerMeshKernels.cpp @@ -0,0 +1,72 @@ +#include "nrs.hpp" +#include +#include "mesh.h" + +void registerMeshKernels(occa::properties kernelInfoBC) { + int N, cubN; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); + const int Nq = N + 1; + const int cubNq = cubN + 1; + const int Np = Nq * Nq * Nq; + const int cubNp = cubNq * cubNq * cubNq; + + int nAB; + platform->options.getArgs("MESH INTEGRATION ORDER", nAB); + + auto kernelInfo = platform->kernelInfo + meshKernelProperties(N); + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + std::string oklpath = installDir + "/okl/"; + std::string kernelName; + + const std::string meshPrefix = "mesh-"; + std::string fileName; + { + + kernelName = "velocityDirichletBCHex3D"; + fileName = oklpath + "mesh/" + kernelName + ".okl"; + platform->kernels.add(meshPrefix + kernelName, fileName, kernelInfoBC); + + { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const int Nq = N + 1; + if (BLOCKSIZE < Nq * Nq) { + if (platform->comm.mpiRank == 0) + printf("ERROR: avgBIDValue kernel requires BLOCKSIZE >= Nq * Nq." + "BLOCKSIZE = %d, Nq*Nq = %d\n", + BLOCKSIZE, + Nq * Nq); + ABORT(EXIT_FAILURE); + } + } + + kernelName = "avgBIDValue"; + fileName = oklpath + "mesh/" + kernelName + ".okl"; + platform->kernels.add(meshPrefix + kernelName, fileName, kernelInfo); + + occa::properties meshKernelInfo = kernelInfo; + meshKernelInfo["defines/p_cubNq"] = cubNq; + meshKernelInfo["defines/p_cubNp"] = cubNp; + + kernelName = "geometricFactorsHex3D"; + fileName = oklpath + "mesh/" + kernelName + ".okl"; + platform->kernels.add( + meshPrefix + kernelName, fileName, meshKernelInfo); + kernelName = "surfaceGeometricFactorsHex3D"; + fileName = oklpath + "mesh/" + kernelName + ".okl"; + platform->kernels.add( + meshPrefix + kernelName, fileName, meshKernelInfo); + + kernelName = "cubatureGeometricFactorsHex3D"; + fileName = oklpath + "mesh/" + kernelName + ".okl"; + platform->kernels.add(meshPrefix + kernelName, fileName, meshKernelInfo); + + meshKernelInfo = kernelInfo; + meshKernelInfo["defines/p_nAB"] = nAB; + kernelName = "nStagesSumVector"; + fileName = oklpath + "core/" + kernelName + ".okl"; + platform->kernels.add(meshPrefix + kernelName, fileName, meshKernelInfo); + } +} diff --git a/src/lns/constantFlowRate.cpp b/src/navierStokes/constantFlowRate.cpp similarity index 84% rename from src/lns/constantFlowRate.cpp rename to src/navierStokes/constantFlowRate.cpp index bc67bebb6..5f2597b2f 100644 --- a/src/lns/constantFlowRate.cpp +++ b/src/navierStokes/constantFlowRate.cpp @@ -3,6 +3,9 @@ #include "nrs.hpp" #include "udf.hpp" #include +#include "alignment.hpp" +#include "bcMap.hpp" +#include "bdry.hpp" namespace { static dfloat constantFlowScale = 0.0; @@ -33,6 +36,13 @@ inline void computeDirection(dfloat x1, static dfloat lengthScale; static dfloat baseFlowRate; +static dfloat currentFlowRate; +static dfloat postCorrectionFlowRate; +static dfloat flowRate; + +static int fromBID; +static int toBID; +static dfloat flowDirection[3]; } // namespace @@ -80,10 +90,11 @@ bool checkIfRecomputeDirection(nrs_t *nrs, int tstep) { bool apply(nrs_t *nrs, int tstep, dfloat time) { + double flops = 0.0; + constexpr int ndim = 3; mesh_t *mesh = nrs->meshV; - dfloat *flowDirection = nrs->flowDirection; - const dfloat flowRate = nrs->flowRate; + platform->options.getArgs("FLOW RATE", flowRate); const bool movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); @@ -138,13 +149,16 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { lengthScale = maxCoord - minCoord; } else { + platform->options.getArgs("CONSTANT FLOW FROM BID", fromBID); + platform->options.getArgs("CONSTANT FLOW TO BID", toBID); + occa::memory o_centroid = platform->o_mempool.slice0; occa::memory o_counts = platform->o_mempool.slice3; platform->linAlg->fill( mesh->Nelements * mesh->Nfaces * 3, 0.0, o_centroid); platform->linAlg->fill(mesh->Nelements * mesh->Nfaces, 0.0, o_counts); nrs->computeFaceCentroidKernel(mesh->Nelements, - nrs->fromBID, + fromBID, mesh->o_EToB, mesh->o_vmapM, mesh->o_x, @@ -152,6 +166,7 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { mesh->o_z, o_centroid, o_counts); + flops += 3 * mesh->Nlocal; dfloat NfacesContrib = platform->linAlg->sum( mesh->Nelements * mesh->Nfaces, o_counts, platform->comm.mpiComm); @@ -179,7 +194,7 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { mesh->Nelements * mesh->Nfaces * 3, 0.0, o_centroid); platform->linAlg->fill(mesh->Nelements * mesh->Nfaces, 0.0, o_counts); nrs->computeFaceCentroidKernel(mesh->Nelements, - nrs->toBID, + toBID, mesh->o_EToB, mesh->o_vmapM, mesh->o_x, @@ -188,6 +203,8 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { o_centroid, o_counts); + flops += 3 * mesh->Nlocal; + NfacesContrib = platform->linAlg->sum( mesh->Nelements * mesh->Nfaces, o_counts, platform->comm.mpiComm); sumFaceAverages_x = platform->linAlg->sum(mesh->Nelements * mesh->Nfaces, @@ -293,16 +310,18 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { nrs->computeFieldDotNormalKernel(mesh->Nlocal, nrs->fieldOffset, - nrs->flowDirection[0], - nrs->flowDirection[1], - nrs->flowDirection[2], + flowDirection[0], + flowDirection[1], + flowDirection[2], nrs->o_U, o_currentFlowRate); + flops += 5 * mesh->Nlocal; + // scale by mass matrix platform->linAlg->axmy(mesh->Nlocal, 1.0, mesh->o_LMM, o_currentFlowRate); - const dfloat currentFlowRate = + currentFlowRate = platform->linAlg->sum( mesh->Nlocal, o_currentFlowRate, platform->comm.mpiComm) / lengthScale; @@ -310,11 +329,12 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { if (recomputeBaseFlowRate) { nrs->computeFieldDotNormalKernel(mesh->Nlocal, nrs->fieldOffset, - nrs->flowDirection[0], - nrs->flowDirection[1], - nrs->flowDirection[2], + flowDirection[0], + flowDirection[1], + flowDirection[2], nrs->o_Uc, o_baseFlowRate); + flops += 5 * mesh->Nlocal; // scale by mass matrix platform->linAlg->axmy(mesh->Nlocal, 1.0, mesh->o_LMM, o_baseFlowRate); @@ -343,6 +363,27 @@ bool apply(nrs_t *nrs, int tstep, dfloat time) { nrs->o_U); platform->linAlg->axpby(mesh->Nlocal, constantFlowScale, nrs->o_Pc, 1.0, nrs->o_P); + // compute flow rate after correction as diagnostic + nrs->computeFieldDotNormalKernel(mesh->Nlocal, + nrs->fieldOffset, + flowDirection[0], + flowDirection[1], + flowDirection[2], + nrs->o_U, + o_currentFlowRate); + + flops += 5 * mesh->Nlocal; + + // scale by mass matrix + platform->linAlg->axmy(mesh->Nlocal, 1.0, mesh->o_LMM, o_currentFlowRate); + + postCorrectionFlowRate = + platform->linAlg->sum( + mesh->Nlocal, o_currentFlowRate, platform->comm.mpiComm) / + lengthScale; + + platform->flopCounter->add("ConstantFlowRate::apply", flops); + return recomputeBaseFlowRate; } @@ -354,10 +395,12 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { constexpr int ndim = 3; mesh_t *mesh = nrs->meshV; - dfloat *flowDirection = nrs->flowDirection; + + double flops = 0.0; platform->timer.tic("pressureSolve", 1); { + platform->timer.tic("pressure rhs", 1); occa::memory &o_gradPCoeff = platform->o_mempool.slice0; occa::memory &o_Prhs = platform->o_mempool.slice3; @@ -370,6 +413,11 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { nrs->fieldOffset, nrs->o_ellipticCoeff, o_gradPCoeff); + + double flopsGrad = 6 * mesh->Np * mesh->Nq + 18 * mesh->Np; + flopsGrad *= static_cast(mesh->Nelements); + flops += flopsGrad; + nrs->computeFieldDotNormalKernel(mesh->Nlocal, nrs->fieldOffset, flowDirection[0], @@ -378,6 +426,8 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { o_gradPCoeff, o_Prhs); + flops += 5 * mesh->Nlocal; + // enforce Dirichlet BCs platform->linAlg->fill(nrs->fieldOffset, -1.0*std::numeric_limits::max(), @@ -421,12 +471,14 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { platform->o_mempool.slice6, nrs->o_Pc); + platform->timer.toc("pressure rhs"); ellipticSolve(nrs->pSolver, o_Prhs, nrs->o_Pc); } platform->timer.toc("pressureSolve"); platform->timer.tic("velocitySolve", 1); { + platform->timer.tic("velocity rhs", 1); nrs->setEllipticCoeffKernel(mesh->Nlocal, nrs->g0 * nrs->idt, 0 * nrs->fieldOffset, @@ -456,13 +508,17 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { o_RhsVel // <- rhs = -\grad{P_c} ); + double flopsGrad = 6 * mesh->Np * mesh->Nq + 18 * mesh->Np; + flopsGrad *= static_cast(mesh->Nelements); + flops += flopsGrad; + // rhs = -\grad{P_c} + BF n_i platform->linAlg->scaleMany( mesh->Nlocal, nrs->NVfields, nrs->fieldOffset, -1.0, o_RhsVel); for (int dim = 0; dim < ndim; ++dim) { const dlong offset = dim * nrs->fieldOffset; - const dfloat n_dim = nrs->flowDirection[dim]; + const dfloat n_dim = flowDirection[dim]; platform->linAlg->axpby( mesh->Nlocal, n_dim, o_BF, 1.0, o_RhsVel, offset, offset); } @@ -495,6 +551,7 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { nrs->fieldOffset, time, mesh->o_sgeo, + nrs->o_zeroNormalMaskVelocity, mesh->o_x, mesh->o_y, mesh->o_z, @@ -536,12 +593,16 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { nrs->gsh); } if (nrs->uvwSolver) { + if (nrs->uvwSolver->Nmasked) nrs->maskCopyKernel(nrs->uvwSolver->Nmasked, 0 * nrs->fieldOffset, nrs->uvwSolver->o_maskIds, platform->o_mempool.slice3, nrs->o_Uc); + if (bcMap::unalignedBoundary(mesh->cht, "velocity")) { + applyZeroNormalMask(nrs, nrs->uvwSolver->o_EToB, nrs->o_zeroNormalMaskVelocity, nrs->o_Uc); + } } else { if (nrs->uSolver->Nmasked) nrs->maskCopyKernel(nrs->uSolver->Nmasked, @@ -562,6 +623,7 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { platform->o_mempool.slice3, nrs->o_Uc); } + platform->timer.toc("velocity rhs"); if (nrs->uvwSolver) { ellipticSolve(nrs->uvwSolver, o_RhsVel, nrs->o_Uc); @@ -575,6 +637,38 @@ void compute(nrs_t *nrs, double lengthScale, dfloat time) { } } platform->timer.toc("velocitySolve"); + + platform->flopCounter->add("ConstantFlowRate::compute", flops); + +} + +void printInfo(mesh_t* mesh, bool verboseInfo) +{ + if(platform->comm.mpiRank != 0) return; + + std::string flowRateType = "flowRate"; + + dfloat currentRate = currentFlowRate; + dfloat finalFlowRate = postCorrectionFlowRate; + dfloat userSpecifiedFlowRate = flowRate * mesh->volume / lengthScale; + + dfloat err = std::abs(userSpecifiedFlowRate - finalFlowRate); + + // scale is invariant to uBulk/volumetric flow rate, since it's a unitless ratio + dfloat scale = constantFlowScale; + + if(!platform->options.compareArgs("CONSTANT FLOW RATE TYPE", "VOLUMETRIC")){ + flowRateType = "uBulk"; + + // put in bulk terms, instead of volumetric + currentRate *= lengthScale / mesh->volume; + finalFlowRate *= lengthScale / mesh->volume; + userSpecifiedFlowRate = flowRate; + err = std::abs(userSpecifiedFlowRate - finalFlowRate); + } + if(verboseInfo) + printf(" flowRate : %s0 %.2e %s %.2e err %.2e scale %.2e\n", + flowRateType.c_str(), currentRate, flowRateType.c_str(), finalFlowRate, err, scale); } } // namespace ConstantFlowRate diff --git a/src/lns/constantFlowRate.hpp b/src/navierStokes/constantFlowRate.hpp similarity index 86% rename from src/lns/constantFlowRate.hpp rename to src/navierStokes/constantFlowRate.hpp index c415c9372..1397cdd68 100644 --- a/src/lns/constantFlowRate.hpp +++ b/src/navierStokes/constantFlowRate.hpp @@ -7,6 +7,7 @@ namespace ConstantFlowRate{ bool apply(nrs_t *nrs, int tstep, dfloat time); void compute(nrs_t *nrs, dfloat lengthScale, dfloat time); bool checkIfRecompute(nrs_t* nrs, int tstep); +void printInfo(mesh_t* mesh, bool verboseInfo); dfloat scaleFactor(); } diff --git a/src/lns/tombo.cpp b/src/navierStokes/tombo.cpp similarity index 54% rename from src/lns/tombo.cpp rename to src/navierStokes/tombo.cpp index 509de27dd..631e9a8c1 100644 --- a/src/lns/tombo.cpp +++ b/src/navierStokes/tombo.cpp @@ -7,65 +7,18 @@ namespace tombo { occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) { + platform->timer.tic("pressure rhs", 1); + double flopCount = 0.0; mesh_t* mesh = nrs->meshV; - //enforce Dirichlet BCs - platform->linAlg->fill((1+nrs->NVfields)*nrs->fieldOffset, -1.0*std::numeric_limits::max(), platform->o_mempool.slice6); - for (int sweep = 0; sweep < 2; sweep++) { - nrs->pressureDirichletBCKernel(mesh->Nelements, - time, - nrs->fieldOffset, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - nrs->o_EToB, - nrs->o_usrwrk, - nrs->o_U, - platform->o_mempool.slice6); - - nrs->velocityDirichletBCKernel(mesh->Nelements, - nrs->fieldOffset, - time, - mesh->o_sgeo, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_vmapM, - mesh->o_EToB, - nrs->o_EToB, - nrs->o_usrwrk, - nrs->o_U, - platform->o_mempool.slice7); - - //take care of Neumann-Dirichlet shared edges across elements - if (sweep == 0) oogs::startFinish(platform->o_mempool.slice6, 1+nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMax, nrs->gsh); - if (sweep == 1) oogs::startFinish(platform->o_mempool.slice6, 1+nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, nrs->gsh); - } - - if (nrs->pSolver->Nmasked) nrs->maskCopyKernel(nrs->pSolver->Nmasked, 0, nrs->pSolver->o_maskIds, - platform->o_mempool.slice6, nrs->o_P); - - if (nrs->uvwSolver) { - if (nrs->uvwSolver->Nmasked) nrs->maskCopyKernel(nrs->uvwSolver->Nmasked, 0*nrs->fieldOffset, nrs->uvwSolver->o_maskIds, - platform->o_mempool.slice7, nrs->o_U); - } else { - if (nrs->uSolver->Nmasked) nrs->maskCopyKernel(nrs->uSolver->Nmasked, 0*nrs->fieldOffset, nrs->uSolver->o_maskIds, - platform->o_mempool.slice7, nrs->o_U); - if (nrs->vSolver->Nmasked) nrs->maskCopyKernel(nrs->vSolver->Nmasked, 1*nrs->fieldOffset, nrs->vSolver->o_maskIds, - platform->o_mempool.slice7, nrs->o_U); - if (nrs->wSolver->Nmasked) nrs->maskCopyKernel(nrs->wSolver->Nmasked, 2*nrs->fieldOffset, nrs->wSolver->o_maskIds, - platform->o_mempool.slice7, nrs->o_U); - } - nrs->curlKernel(mesh->Nelements, + 1, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, nrs->o_Ue, platform->o_mempool.slice0); + flopCount += static_cast(mesh->Nelements) * (18 * mesh->Np * mesh->Nq + 36 * mesh->Np); oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh); @@ -77,14 +30,17 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) nrs->meshV->o_invLMM, platform->o_mempool.slice0 ); + flopCount += mesh->Nlocal; nrs->curlKernel( mesh->Nelements, + 1, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, platform->o_mempool.slice0, platform->o_mempool.slice3); + flopCount += static_cast(mesh->Nelements) * (18 * mesh->Np * mesh->Nq + 36 * mesh->Np); nrs->gradientVolumeKernel( mesh->Nelements, @@ -93,8 +49,9 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) nrs->fieldOffset, nrs->o_div, platform->o_mempool.slice0); + flopCount += static_cast(mesh->Nelements) * (6 * mesh->Np * mesh->Nq + 18 * mesh->Np); - if(platform->options.compareArgs("STRESSFORMULATION", "TRUE")) + if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) { nrs->pressureStressKernel( mesh->Nelements, mesh->o_vgeo, @@ -104,6 +61,8 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) nrs->o_Ue, nrs->o_div, platform->o_mempool.slice3); + flopCount += static_cast(mesh->Nelements) * (18 * mesh->Nq * mesh->Np + 100 * mesh->Np); + } occa::memory o_irho = nrs->o_ellipticCoeff; nrs->pressureRhsKernel( @@ -115,7 +74,7 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) platform->o_mempool.slice3, platform->o_mempool.slice0, platform->o_mempool.slice6); - + flopCount += 12 * static_cast(mesh->Nlocal); oogs::startFinish(platform->o_mempool.slice6, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh); @@ -135,7 +94,7 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) nrs->fieldOffset, platform->o_mempool.slice6, platform->o_mempool.slice3); - + flopCount += static_cast(mesh->Nelements) * (6 * mesh->Np * mesh->Nq + 18 * mesh->Np); nrs->pressureAddQtlKernel( mesh->Nlocal, @@ -143,6 +102,7 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) nrs->g0 * nrs->idt, nrs->o_div, platform->o_mempool.slice3); + flopCount += 3 * mesh->Nlocal; nrs->divergenceSurfaceKernel( mesh->Nelements, @@ -154,15 +114,22 @@ occa::memory pressureSolve(nrs_t* nrs, dfloat time, int stage) platform->o_mempool.slice6, nrs->o_U, platform->o_mempool.slice3); + flopCount += 25 * static_cast(mesh->Nelements) * mesh->Nq * mesh->Nq; + + platform->timer.toc("pressure rhs"); platform->o_mempool.slice1.copyFrom(nrs->o_P, mesh->Nlocal * sizeof(dfloat)); ellipticSolve(nrs->pSolver, platform->o_mempool.slice3, platform->o_mempool.slice1); + platform->flopCounter->add("pressure RHS", flopCount); + return platform->o_mempool.slice1; } occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) { + platform->timer.tic("velocity rhs", 1); + double flopCount = 0.0; mesh_t* mesh = nrs->meshV; dfloat scale = -1./3; @@ -173,7 +140,7 @@ occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) scale, nrs->o_mue, nrs->o_div, - platform->o_mempool.slice3); + platform->o_mempool.slice3); nrs->gradientVolumeKernel( mesh->Nelements, @@ -183,13 +150,17 @@ occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) platform->o_mempool.slice3, platform->o_mempool.slice0); + flopCount += static_cast(mesh->Nelements) * (6 * mesh->Np * mesh->Nq + 18 * mesh->Np); + nrs->wgradientVolumeKernel( mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, nrs->o_P, - platform->o_mempool.slice3); + platform->o_mempool.slice3); + + flopCount += static_cast(mesh->Nelements) * 18 * (mesh->Np * mesh->Nq + mesh->Np); platform->linAlg->axpby( nrs->NVfields*nrs->fieldOffset, @@ -198,20 +169,21 @@ occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) -1.0, platform->o_mempool.slice0); - nrs->velocityNeumannBCKernel( - mesh->Nelements, - nrs->fieldOffset, - mesh->o_sgeo, - mesh->o_vmapM, - mesh->o_EToB, - nrs->o_EToB, - time, - mesh->o_x, - mesh->o_y, - mesh->o_z, - nrs->o_usrwrk, - nrs->o_U, - platform->o_mempool.slice0); + nrs->velocityNeumannBCKernel(mesh->Nelements, + nrs->fieldOffset, + mesh->o_sgeo, + mesh->o_vmapM, + mesh->o_EToB, + nrs->o_EToB, + time, + mesh->o_x, + mesh->o_y, + mesh->o_z, + nrs->o_usrwrk, + nrs->o_U, + platform->o_mempool.slice0); + + flopCount += static_cast(mesh->Nelements) * (3 * mesh->Np + 36 * mesh->Nq * mesh->Nq); nrs->velocityRhsKernel( mesh->Nlocal, @@ -221,6 +193,9 @@ occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) nrs->o_rho, platform->o_mempool.slice3); + flopCount += 6 * mesh->Nlocal; + + platform->timer.toc("velocity rhs"); platform->o_mempool.slice0.copyFrom(nrs->o_U, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); if(nrs->uvwSolver) { @@ -231,6 +206,8 @@ occa::memory velocitySolve(nrs_t* nrs, dfloat time, int stage) ellipticSolve(nrs->wSolver, platform->o_mempool.slice5, platform->o_mempool.slice2); } + platform->flopCounter->add("velocity RHS", flopCount); + return platform->o_mempool.slice0; } diff --git a/src/lns/tombo.hpp b/src/navierStokes/tombo.hpp similarity index 100% rename from src/lns/tombo.hpp rename to src/navierStokes/tombo.hpp diff --git a/src/nekInterface/nekInterface.f b/src/nekInterface/nekInterface.f index a6d12edc1..945fb38c2 100644 --- a/src/nekInterface/nekInterface.f +++ b/src/nekInterface/nekInterface.f @@ -105,6 +105,8 @@ subroutine nekf_ptr(ptr,id,len) ptr = loc(getps) elseif (id .eq. 'vmult') then ptr = loc(vmult) + elseif (id .eq. 'tmult') then + ptr = loc(tmult) elseif (id .eq. 'cb_scnrs') then ptr = loc(sc_nrs(1)) elseif (id .eq. 'p0th') then @@ -182,7 +184,7 @@ subroutine nekf_bootstrap(comm_in,path_in,session_in,mesh_in) c----------------------------------------------------------------------- subroutine nekf_setup(ifflow_in, $ npscal_in, p32, mpart, contol, - $ rho, mue, rhoCp, lambda) + $ rho, mue, rhoCp, lambda, stsform) include 'SIZE' include 'TOTAL' @@ -191,9 +193,13 @@ subroutine nekf_setup(ifflow_in, integer iftmsh_in, ifflow_in, mpart, p32 real rho, mue, rhoCp, lambda, contol + integer stsform common /rdump/ ntdump + common /ivrtx/ vertex ((2**ldim)*lelt) + integer*8 vertex + etimes = dnekclock_sync() call read_re2_hdr(ifbswap, .true.) @@ -220,6 +226,7 @@ subroutine nekf_setup(ifflow_in, ifheat = .false. ifvo = .true. ifpo = .true. + if(stsform.eq.1) ifstrs = .true. if (npscal_in .gt. 0) then ifheat = .true. @@ -241,19 +248,21 @@ subroutine nekf_setup(ifflow_in, call mapelpr call read_re2_data(ifbswap, .true., .true., .true.) - call izero(boundaryID, size(boundaryID)) ifld_bId = 2 if(ifflow) ifld_bId = 1 do iel = 1,nelv do ifc = 1,2*ndim - boundaryID(ifc,iel) = bc(5,ifc,iel,ifld_bId) + boundaryID(ifc,iel) = -1 + if(bc(5,ifc,iel,ifld_bId).gt.0) + $ boundaryID(ifc,iel) = bc(5,ifc,iel,ifld_bId) enddo enddo - call izero(boundaryIDt, size(boundaryIDt)) if(nelgt.ne.nelgv) then do iel = 1,nelt do ifc = 1,2*ndim - boundaryIDt(ifc,iel) = bc(5,ifc,iel,2) + boundaryIDt(ifc,iel) = -1 + if(bc(5,ifc,iel,2).gt.0) + $ boundaryIDt(ifc,iel) = bc(5,ifc,iel,2) enddo enddo endif @@ -261,7 +270,15 @@ subroutine nekf_setup(ifflow_in, call setvar ! Initialize most variables igeom = 2 - call setup_topo ! Setup domain topology + call setup_topo ! Setup domain topology + + if(.not. ifflow) then + call rone(vmult,lx1*ly1*lz1*nelv) + ifield = 1 + call dssum(vmult,lx1,ly1,lz1) + call invcol1(vmult,lx1*ly1*lz1*nelv) + endif + call genwz ! Compute GLL points, weights, etc. if(nio.eq.0) write(6,*) 'call usrdat' @@ -284,8 +301,6 @@ subroutine nekf_setup(ifflow_in, call bcmask ! Set BC masks for Dirichlet boundaries. -c call findSYMOrient - ifield = 1 if(nio.eq.0) write(6,*) 'call usrdat3' @@ -469,7 +484,15 @@ subroutine nekf_restart(rfile,l) c----------------------------------------------------------------------- subroutine nekf_end() - call nek_end() + include 'SIZE' + include 'DPROCMAP' + +#ifdef DPROCMAP +#ifdef MPI + call MPI_Win_free(dProcmapH, ierr) +#endif +#endif + !call nek_end() return end @@ -564,12 +587,24 @@ integer function nekf_bcmap(bID, ifld, ismesh) endif else if (c.eq.'o ' .or. c.eq.'O ') then ibc = 3 + if(ismesh.eq.1) then + ! outflow remaps to SYM bounds for mesh solver + ibc = 7 + endif else if (c.eq.'SYX') then ibc = 4 else if (c.eq.'SYY') then ibc = 5 else if (c.eq.'SYZ') then ibc = 6 + else if (c.eq.'SYM') then + ibc = 7 + else if (c.eq.'SHL'.or.c.eq.'shl') then + ibc = 8 + if(ismesh.eq.1) then + ! outflow remaps to SYM bounds for mesh solver + ibc = 7 + endif else if (c.eq.'mv ') then ibc = 2 endif @@ -624,22 +659,31 @@ integer function nekf_nbid(isTmsh) include 'SIZE' include 'TOTAL' - n = 0 + common /nekmpi/ mid,mp,nekcomm,nekgroup,nekreal + + integer sum + integer*8 bid8(2*ldim*lelt) + if(isTmsh.eq.1) then - do iel = 1,nelt - do ifc = 1,2*ndim - n = max(n,boundaryIDt(ifc,iel)) - enddo + n = 2*ndim*nelt + do i = 1,n + bid8(i) = boundaryIDt(i,1) enddo else - do iel = 1,nelv - do ifc = 1,2*ndim - n = max(n,boundaryID(ifc,iel)) - enddo + n = 2*ndim*nelv + do i = 1,n + bid8(i) = boundaryID(i,1) enddo endif - nekf_nbid = iglmax(n,1) + call fgslib_gs_unique(bid8, n, nekcomm, np) + + sum = 0 + do i = 1,n + if(bid8(i).gt.0) sum = sum + 1 + enddo + + nekf_nbid = iglsum(sum,1) return end @@ -688,35 +732,42 @@ subroutine nekf_gen_bcmap() include 'SIZE' include 'TOTAL' + parameter (NBID_TYPES=9) + parameter (NSBID_TYPES=3) ! scalar boundary types + integer bID, bcID - integer map(7) + integer map(NBID_TYPES) integer ibc_bmap(lbid, ldimt1) logical ifalg,ifnorx,ifnory,ifnorz character*3 cb - call izero(boundaryID, size(boundaryID)) - call izero(boundaryIDt, size(boundaryIDt)) + call ifill(boundaryID, -1, size(boundaryID)) + call ifill(boundaryIDt,-1, size(boundaryIDt)) call izero(map, size(map)) - + + if(.not.ifflow .and. .not.ifheat) return + if(ifflow) then do iel = 1,nelv do ifc = 1,2*ndim cb = cbc(ifc,iel,1) if(cb.eq.'W ') map(1) = 1 if(cb.eq.'v ') map(2) = 1 - if(cb.eq.'mv ') map(7) = 1 + if(cb.eq.'mv ') map(NBID_TYPES) = 1 ! last is reserved for mv if(cb.eq.'o ' .or. cb.eq.'O ') map(3) = 1 if(cb.eq.'SYM') then call chknord(ifalg,ifnorx,ifnory,ifnorz,ifc,iel) if (ifnorx) map(4) = 1 if (ifnory) map(5) = 1 if (ifnorz) map(6) = 1 + if (.not.ifalg) map(7) = 1 ! unaligned SYM boundary endif + if(cb.eq.'SHL'.or.cb.eq.'shl') map(8) = 1 enddo enddo - else + elseif(ifheat) then do iel = 1,nelv do ifc = 1,2*ndim cb = cbc(ifc,iel,1) @@ -728,7 +779,7 @@ subroutine nekf_gen_bcmap() endif bID = 1 - do i = 1,7 + do i = 1,NBID_TYPES map(i) = iglmax(map(i),1) if(map(i).gt.0) then map(i) = bID @@ -746,7 +797,7 @@ subroutine nekf_gen_bcmap() else if(cb.eq.'v ') then boundaryID(ifc,iel) = map(2) else if(cb.eq.'mv ') then - boundaryID(ifc,iel) = map(7) + boundaryID(ifc,iel) = map(NBID_TYPES) else if(cb.eq.'o ' .or. cb.eq.'O ') then boundaryID(ifc,iel) = map(3) else if(cb.eq.'SYM') then @@ -754,6 +805,9 @@ subroutine nekf_gen_bcmap() if (ifnorx) boundaryID(ifc,iel) = map(4) if (ifnory) boundaryID(ifc,iel) = map(5) if (ifnorz) boundaryID(ifc,iel) = map(6) + if (.not.ifalg) boundaryID(ifc,iel) = map(7) + else if(cb.eq.'SHL'.or.cb.eq.'shl') then + boundaryID(ifc,iel) = map(8) else if(cb.ne.'E ' .and. cb.ne.'P ') then ierr = 1 @@ -764,15 +818,17 @@ subroutine nekf_gen_bcmap() enddo 99 call err_chk(ierr, 'Invalid boundary condition type!$') - if(map(1).gt.0) cbc_bmap(map(1), ifld) = 'W ' - if(map(2).gt.0) cbc_bmap(map(2), ifld) = 'v ' - if(map(7).gt.0) cbc_bmap(map(7), ifld) = 'mv ' - if(map(3).gt.0) cbc_bmap(map(3), ifld) = 'o ' - if(map(4).gt.0) cbc_bmap(map(4), ifld) = 'SYX' - if(map(5).gt.0) cbc_bmap(map(5), ifld) = 'SYY' - if(map(6).gt.0) cbc_bmap(map(6), ifld) = 'SYZ' - -c write(6,*) 'vel cbc_bmap: ', (cbc_bmap(i,1), i=1,6) + if(map(1).gt.0) cbc_bmap(map(1), ifld) = 'W ' + if(map(2).gt.0) cbc_bmap(map(2), ifld) = 'v ' + if(map(NBID_TYPES).gt.0) cbc_bmap(map(NBID_TYPES), ifld) = 'mv ' + if(map(3).gt.0) cbc_bmap(map(3), ifld) = 'o ' + if(map(4).gt.0) cbc_bmap(map(4), ifld) = 'SYX' + if(map(5).gt.0) cbc_bmap(map(5), ifld) = 'SYY' + if(map(6).gt.0) cbc_bmap(map(6), ifld) = 'SYZ' + if(map(7).gt.0) cbc_bmap(map(7), ifld) = 'SYM' + if(map(8).gt.0) cbc_bmap(map(8), ifld) = 'shl' + +c write(6,*) 'vel cbc_bmap: ', (cbc_bmap(i,1), i=1,NBID_TYPES) do ifld = 2,nfield ierr = 0 @@ -798,7 +854,7 @@ subroutine nekf_gen_bcmap() enddo call err_chk(ierr, 'Invalid boundary condition type!$') - do bID = 1,7 + do bID = 1,NBID_TYPES bcID = iglmax(ibc_bmap(bID, ifld),1) if(bcID.eq.1) cbc_bmap(bID, ifld) = 't ' if(bcID.eq.2) cbc_bmap(bID, ifld) = 'I ' @@ -832,7 +888,7 @@ subroutine nekf_gen_bcmap() call err_chk(ierr, 'Invalid boundary condition type!$') bid = 1 - do i = 1,3 + do i = 1,NSBID_TYPES map(i) = iglmax(map(i),1) if(map(i).gt.0) then map(i) = bid diff --git a/src/nekInterface/nekInterfaceAdapter.cpp b/src/nekInterface/nekInterfaceAdapter.cpp index 8854e26aa..f6142fc0f 100644 --- a/src/nekInterface/nekInterfaceAdapter.cpp +++ b/src/nekInterface/nekInterfaceAdapter.cpp @@ -3,12 +3,13 @@ #include "nrs.hpp" #include "nekInterfaceAdapter.hpp" #include "bcMap.hpp" -#include "io.hpp" +#include "ioUtils.hpp" +#include "re2Reader.hpp" nekdata_private nekData; static int rank; static setupAide* options; -static nrs_t* nrs; +static nrs_t *nrs; static void (* usrdat_ptr)(void); static void (* usrdat2_ptr)(void); @@ -37,7 +38,8 @@ static void (* nek_outpost_ptr)(double* v1, double* v2, double* v3, double* vp, static void (* nek_uf_ptr)(double*, double*, double*); static int (* nek_lglel_ptr)(int*); static void (* nek_bootstrap_ptr)(int*, char*, char*, char*, int, int, int); -static void (* nek_setup_ptr)(int*, int*, int*, int*, double*, double*, double*, double*, double*); +static void ( + *nek_setup_ptr)(int *, int *, int *, int *, double *, double *, double *, double *, double *, int *); static void (* nek_ifoutfld_ptr)(int*); static void (* nek_setics_ptr)(void); static int (* nek_bcmap_ptr)(int*, int*,int*); @@ -270,7 +272,8 @@ void set_usr_handles(const char* session_in,int verbose) (void (*)(int*, char*, char*, char*, int, int, int))dlsym(handle, fname("nekf_bootstrap")); check_error(dlerror()); nek_setup_ptr = - (void (*)(int*, int*, int*, int*, double*, double*, double*, double*, double*))dlsym(handle, fname("nekf_setup")); + (void (*)(int *, int *, int *, int *, double *, double *, double *, double *, double *, int *)) + dlsym(handle, fname("nekf_setup")); check_error(dlerror()); nek_uic_ptr = (void (*)(int*))dlsym(handle, fname("nekf_uic")); check_error(dlerror()); @@ -349,8 +352,8 @@ void mkSIZE(int lx1, int lxd, int lelt, hlong lelg, int ldim, int lpmin, int ldi char line[BUFSIZ]; const char *cache_dir = getenv("NEKRS_CACHE_DIR"); - const std::string install_dir(getenv("NEKRS_HOME")); - const std::string nek5000_dir = install_dir + "/nek5000"; + const std::string installDir(getenv("NEKRS_HOME")); + const std::string nek5000_dir = installDir + "/nek5000"; const int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; @@ -374,7 +377,10 @@ void mkSIZE(int lx1, int lxd, int lelt, hlong lelg, int ldim, int lpmin, int ldi } } - const int lx1m = options.compareArgs("MOVING MESH", "TRUE") ? lx1 : 1; + int lx1m = (options.compareArgs("MOVING MESH", "TRUE")) ? lx1 : 1; + lx1m = (options.compareArgs("STRESSFORMULATION", "TRUE")) ? lx1 : lx1m; + + constexpr int nMaxObj = 20; int count = 0; while(fgets(line, BUFSIZ, fp) != NULL) { @@ -404,6 +410,8 @@ void mkSIZE(int lx1, int lxd, int lelt, hlong lelg, int ldim, int lpmin, int ldi sprintf(line, " parameter (lelr=%d)\n", 128 * lelt); else if(strstr(line, "parameter (lx1m=") != NULL) sprintf(line, " parameter (lx1m=%d)\n", lx1m); + else if(strstr(line, "parameter (maxobj=") != NULL) + sprintf(line, " parameter (maxobj=%d)\n", nMaxObj); strcpy(sizeFile + count, line); count += strlen(line); @@ -461,12 +469,11 @@ void mkSIZE(int lx1, int lxd, int lelt, hlong lelg, int ldim, int lpmin, int ldi fflush(stdout); } + void buildNekInterface(int ldimt, int N, int np, setupAide& options) { int buildRank = rank; - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); + const bool buildNodeLocal = useNodeLocalCache(); if(buildNodeLocal) MPI_Comm_rank(platform->comm.mpiCommLocal, &buildRank); @@ -476,9 +483,9 @@ void buildNekInterface(int ldimt, int N, int np, setupAide& options) const std::string cache_dir = std::string(getenv("NEKRS_CACHE_DIR")) + "/nek5000"; mkdir(cache_dir.c_str(), S_IRWXU); - const std::string install_dir(getenv("NEKRS_HOME")); - const std::string nekInterface_dir = install_dir + "/nekInterface"; - const std::string nek5000_dir = install_dir + "/nek5000"; + const std::string installDir(getenv("NEKRS_HOME")); + const std::string nekInterface_dir = installDir + "/nekInterface"; + const std::string nek5000_dir = installDir + "/nek5000"; char buf[10*BUFSIZ]; char *ret = getcwd(buf, sizeof(buf)); @@ -489,33 +496,13 @@ void buildNekInterface(int ldimt, int N, int np, setupAide& options) const std::string usrname = options.getArgs("CASENAME"); const std::string meshFile = options.getArgs("MESH FILE"); - // create SIZE - strcpy(buf, meshFile.c_str()); - FILE *fp = fopen(buf, "r"); - if (!fp) { - if(rank == 0) printf("\nERROR: Cannot find %s!\n", buf); - ABORT(EXIT_FAILURE); - } - fgets(buf, 80, fp); - fclose(fp); - - char ver[10]; - int ndim; - hlong nelgv, nelgt; - // has to match header in re2 - sscanf(buf, "%5s %9lld %1d %9lld", ver, &nelgt, &ndim, &nelgv); - if(ndim != 3) { - if(rank == 0) printf("\nERROR: Unsupported ndim=%d read from re2 header!\n", ndim); - ABORT(EXIT_FAILURE); - } - if(nelgt <= 0 || nelgv <=0 || nelgv > nelgt) { - if(rank == 0) printf("\nERROR: Invalid nelgt=%lld / nelgv=%lld read from re2 header!\n", nelgt, nelgv); - ABORT(EXIT_FAILURE); - } + int nelgt, nelgv; + const int ndim = 3; + re2::nelg(meshFile, nelgt, nelgv, MPI_COMM_NULL); int lelt = (int)(nelgt/np) + 3; if(lelt > nelgt) lelt = (int)nelgt; - sprintf(buf,"%s/SIZE",cache_dir.c_str()); + sprintf(buf,"%s/SIZE",cache_dir.c_str()); mkSIZE(N + 1, 1, lelt, nelgt, ndim, np, ldimt, options, buf); // generate usr @@ -544,36 +531,45 @@ void buildNekInterface(int ldimt, int N, int np, setupAide& options) if(recompile) { const double tStart = MPI_Wtime(); const std::string pipeToNull = (rank == 0) ? std::string("") : std::string(">/dev/null 2>&1"); - const std::string include_dirs = "./ " + case_dir; + const std::string include_dirs = "./ " + case_dir; + std::string make_args = "-j8 "; + if(!verbose) make_args += "-s "; if(rank == 0) - printf("building nek for lx1=%d, lelt=%d and lelg=%d ...", N+1, lelt, nelgt); fflush(stdout); + printf("building nekInterface for lx1=%d, lelt=%d and lelg=%d ...", N+1, lelt, nelgt); fflush(stdout); - sprintf(buf, "cd %s && cp -f %s/makefile.template makefile && " - "make -s -j8 " + sprintf(buf, + "cd %s" + " && cp -f %s/makefile.template makefile" + " && make %s" "S=%s " "OPT_INCDIR=\"%s\" " "CASENAME=%s " "CASEDIR=%s " "-f %s/Makefile lib usr libnekInterface " "%s", - cache_dir.c_str(), nek5000_dir.c_str(), + cache_dir.c_str(), + nek5000_dir.c_str(), + make_args.c_str(), nek5000_dir.c_str(), include_dirs.c_str(), usrname.c_str(), cache_dir.c_str(), - nekInterface_dir.c_str(), + nekInterface_dir.c_str(), pipeToNull.c_str()); - if(verbose && rank == 0) printf("%s\n", buf); + if(verbose && rank == 0) printf("\n%s\n", buf); if(system(buf)) return EXIT_FAILURE; fileSync(libFile); - if(rank == 0) printf("done (%gs)\n\n", MPI_Wtime() - tStart); + if(rank == 0) printf("done (%gs)\n", MPI_Wtime() - tStart); + fflush(stdout); + } else { + if(rank == 0) printf("skip building nekInterface (SIZE requires no update)\n"); fflush(stdout); } - } - + } return 0; }(); + MPI_Allreduce(MPI_IN_PLACE, &err, 1, MPI_INT, MPI_SUM, platform->comm.mpiComm); if(err) ABORT(EXIT_FAILURE); } @@ -598,9 +594,7 @@ void bootstrap() MPI_Comm_size(platform->comm.mpiComm,&size); int buildRank = rank; - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); + const bool buildNodeLocal = useNodeLocalCache(); if(buildNodeLocal) MPI_Comm_rank(platform->comm.mpiCommLocal, &buildRank); @@ -654,6 +648,8 @@ int setup(nrs_t* nrs_in) nrs = nrs_in; MPI_Comm_rank(platform->comm.mpiComm, &rank); + bool meshSolver = options->compareArgs("MESH SOLVER", "ELASTICITY"); + std::string casename; options->getArgs("CASENAME", casename); @@ -672,11 +668,8 @@ int setup(nrs_t* nrs_in) options->getArgs("MESH CONNECTIVITY TOL", meshConTol); int nBcRead = 1; - int bcInPar = 1; - if(bcMap::size(0) == 0 && bcMap::size(1) == 0) { - bcInPar = 0; + if (bcMap::useNekBCs()) nBcRead = flow + nscal; - } dfloat rho; options->getArgs("DENSITY", rho); @@ -690,8 +683,18 @@ int setup(nrs_t* nrs_in) dfloat lambda; options->getArgs("SCALAR00 DIFFUSIVITY", lambda); - (*nek_setup_ptr)(&flow, &nscal, &nBcRead, &meshPartType, &meshConTol, - &rho, &mue, &rhoCp, &lambda); + int stressForm = options->compareArgs("STRESSFORMULATION", "TRUE"); + + (*nek_setup_ptr)(&flow, + &nscal, + &nBcRead, + &meshPartType, + &meshConTol, + &rho, + &mue, + &rhoCp, + &lambda, + &stressForm); nekData.param = (double*) ptr("param"); nekData.ifield = (int*) ptr("ifield"); @@ -750,8 +753,9 @@ int setup(nrs_t* nrs_in) int cht = 0; if (nekData.nelv != nekData.nelt && nscal) cht = 1; - // import BCs from nek if not specified in par - if(!bcInPar) { + if (bcMap::useNekBCs()) { + if (rank == 0) + printf("import BCs from nek\n"); gen_bcmap(); if(flow) { int isTMesh = 0; @@ -760,14 +764,16 @@ int setup(nrs_t* nrs_in) for(int id = 0; id < nIDs; id++) map[id] = bcmap(id + 1, 1, 0); bcMap::setBcMap("velocity", map, nIDs); - for(int id = 0; id < nIDs; id++) map[id] = bcmap(id + 1, 1, 1); - bcMap::setBcMap("mesh", map, nIDs); + if(meshSolver){ + for(int id = 0; id < nIDs; id++) map[id] = bcmap(id + 1, 1, 1); + bcMap::setBcMap("mesh", map, nIDs); + } free(map); } for(int is = 0; is < nscal; is++) { std::stringstream ss; - ss << std::setfill('0') << std::setw(2) << is; + ss << std::setfill('0') << std::setw(getDigitsRepresentation(NSCALAR_MAX - 1)) << is; std::string sid = ss.str(); int isTMesh = 0; @@ -980,8 +986,5 @@ void coeffAB(double *coeff, double *dt, int order) (*nek_setabbd_ptr)(coeff, dt, &order, &one); } -void recomputeGeometry() -{ - (*nek_updggeom_ptr)(); -} +void recomputeGeometry() { (*nek_updggeom_ptr)(); } } diff --git a/src/nekInterface/nekInterfaceAdapter.hpp b/src/nekInterface/nekInterfaceAdapter.hpp index 1d353edd4..9aa0a4a68 100644 --- a/src/nekInterface/nekInterfaceAdapter.hpp +++ b/src/nekInterface/nekInterfaceAdapter.hpp @@ -118,7 +118,7 @@ void bootstrap(); void ifoutfld(int i); void setic(void); void userchk(void); -int bcmap(int bid, int ifld); +int bcmap(int bid, int ifld); void copyToNek(dfloat time, int tstep); void ocopyToNek(void); diff --git a/src/core/nrs.hpp b/src/nrs.hpp similarity index 67% rename from src/core/nrs.hpp rename to src/nrs.hpp index dcc47a41f..9916a46f1 100644 --- a/src/core/nrs.hpp +++ b/src/nrs.hpp @@ -17,141 +17,134 @@ #include "linAlg.hpp" #include "timer.hpp" #include "platform.hpp" +#include "fldFile.hpp" + +struct nrs_t { + + static constexpr double targetBenchmark {0.1}; + + bool multiSession; -struct nrs_t -{ int dim, elementType; - mesh_t* _mesh; - mesh_t* meshV; + mesh_t *_mesh; + mesh_t *meshV; - elliptic_t* uSolver; - elliptic_t* vSolver; - elliptic_t* wSolver; - elliptic_t* uvwSolver; - elliptic_t* pSolver; - elliptic_t* meshSolver; + elliptic_t *uSolver; + elliptic_t *vSolver; + elliptic_t *wSolver; + elliptic_t *uvwSolver; + elliptic_t *pSolver; + elliptic_t *meshSolver; - cds_t* cds; + cds_t *cds; - oogs_t* gsh; + oogs_t *gsh; dlong ellipticWrkOffset; int flow; - + int cht; int Nscalar; + int NVfields, NTfields; dlong fieldOffset; + dlong cubatureOffset; setupAide vOptions, pOptions, mOptions; - int NVfields, NTfields; - - int converged; + int timeStepConverged; dfloat dt[3], idt; - dfloat p0th[3] = {0.0, 0.0, 0.0}; - dfloat CFL; - dfloat unitTimeCFL; - - dfloat dp0thdt; - int tstep; - int lastStep; dfloat g0, ig0; + dfloat CFL, unitTimeCFL; - int cht; + dfloat p0th[3] = {0.0, 0.0, 0.0}; + dfloat dp0thdt; int nEXT; int nBDF; + int lastStep; int isOutputStep; int outputForceStep; - dfloat* U, * P; - dfloat* BF, * FU; - - // unit normal flow direction for constant flow rate - dfloat flowDirection[3]; - int fromBID; - int toBID; - dfloat flowRate; - - //RK Subcycle Data - int nRK; - dfloat* coeffsfRK, * weightsRK, * nodesRK; + int nRK, Nsubsteps; + dfloat *coeffsfRK, *weightsRK, *nodesRK; occa::memory o_coeffsfRK, o_weightsRK; - //ARK data - int Nrk; - dfloat* rkC; - - //EXTBDF data - dfloat* coeffEXT, * coeffBDF, * coeffSubEXT; + dfloat *U, *P; + occa::memory o_U, o_P; - int Nsubsteps; - dfloat* Ue, sdt; + dfloat *Ue; occa::memory o_Ue; - dfloat* div; + dfloat *div; occa::memory o_div; dfloat rho, mue; occa::memory o_rho, o_mue; occa::memory o_meshRho, o_meshMue; - dfloat* usrwrk; + dfloat *usrwrk; occa::memory o_usrwrk; - occa::memory o_idH; // i.e. inverse of 1D Gll Spacing for quad and Hex + occa::memory o_idH; - int filterNc; // filter cut modes i.e. below is not touched - dfloat* filterM, filterS; - occa::memory o_filterMT; // transpose of filter matrix - occa::kernel filterRTKernel; // Relaxation-Term based filtering - occa::kernel advectMeshVelocityKernel; + dfloat *BF, *FU; + occa::memory o_BF; + occa::memory o_FU; + + dfloat *prop; + occa::memory o_prop, o_ellipticCoeff; + + dfloat *coeffEXT, *coeffBDF, *coeffSubEXT; + occa::memory o_coeffEXT, o_coeffBDF, o_coeffSubEXT; + + int *EToB; + int *EToBMeshVelocity; + occa::memory o_EToB; + occa::memory o_EToBMeshVelocity; + + occa::memory o_EToBVVelocity; + occa::memory o_EToBVMeshVelocity; + + occa::memory o_Uc, o_Pc; + occa::memory o_prevProp; + + occa::memory o_relUrst; + occa::memory o_Urst; + + occa::properties *kernelInfo; + int filterNc; + dfloat *filterM, filterS; + occa::memory o_filterMT; + + occa::kernel filterRTKernel; + occa::kernel advectMeshVelocityKernel; occa::kernel pressureAddQtlKernel; occa::kernel pressureStressKernel; - - occa::kernel subCycleVolumeKernel, subCycleCubatureVolumeKernel; - occa::kernel subCycleSurfaceKernel, subCycleCubatureSurfaceKernel; - occa::kernel subCycleRKUpdateKernel; occa::kernel extrapolateKernel; occa::kernel subCycleRKKernel; occa::kernel subCycleInitU0Kernel; occa::kernel nStagesSum3Kernel; - occa::kernel wgradientVolumeKernel; + occa::kernel subCycleVolumeKernel, subCycleCubatureVolumeKernel; + occa::kernel subCycleSurfaceKernel, subCycleCubatureSurfaceKernel; + occa::kernel subCycleRKUpdateKernel; occa::kernel subCycleStrongCubatureVolumeKernel; occa::kernel subCycleStrongVolumeKernel; occa::kernel computeFaceCentroidKernel; occa::kernel computeFieldDotNormalKernel; - occa::memory o_U, o_P; - - occa::memory o_Uc, o_Pc; - occa::memory o_prevProp; - - occa::memory o_relUrst; - occa::memory o_Urst; occa::kernel UrstCubatureKernel; occa::kernel UrstKernel; - occa::memory o_BF; - occa::memory o_FU; - - int var_coeff; - dfloat* prop, * ellipticCoeff; - occa::memory o_prop, o_ellipticCoeff; - - //EXTBDF data - occa::memory o_coeffEXT, o_coeffBDF, o_coeffSubEXT; - occa::kernel advectionVolumeKernel; occa::kernel advectionCubatureVolumeKernel; - occa::kernel advectionStrongVolumeKernel; - occa::kernel advectionStrongCubatureVolumeKernel; + occa::kernel strongAdvectionVolumeKernel; + occa::kernel strongAdvectionCubatureVolumeKernel; occa::kernel gradientVolumeKernel; @@ -173,22 +166,19 @@ struct nrs_t occa::kernel setEllipticCoeffKernel; occa::kernel setEllipticCoeffPressureKernel; - occa::kernel pressureAxKernel; occa::kernel curlKernel; occa::kernel maskCopyKernel; occa::kernel maskKernel; - int* EToB; - int* EToBMesh; - occa::memory o_EToB; - occa::memory o_EToBMesh; + occa::memory o_zeroNormalMaskVelocity; + occa::memory o_zeroNormalMaskMeshVelocity; + occa::kernel averageNormalBcTypeKernel; + occa::kernel fixZeroNormalMaskKernel; + occa::kernel initializeZeroNormalMaskKernel; - occa::properties* kernelInfo; + occa::kernel applyZeroNormalMaskKernel; }; - -#include "io.hpp" - // std::to_string might be not accurate enough static std::string to_string_f(double a) { @@ -204,7 +194,7 @@ static std::vector serializeString(const std::string sin, char dlim s.erase(std::remove_if(s.begin(), s.end(), ::isspace), s.end()); std::stringstream ss; ss.str(s); - while( ss.good() ) { + while (ss.good()) { std::string substr; std::getline(ss, substr, dlim); slist.push_back(substr); @@ -212,11 +202,12 @@ static std::vector serializeString(const std::string sin, char dlim return slist; } -void evaluateProperties(nrs_t* nrs, const double timeNew); +void evaluateProperties(nrs_t *nrs, const double timeNew); void compileKernels(); -std::vector -determineMGLevels(std::string section); +std::vector determineMGLevels(std::string section); + +int numberActiveFields(nrs_t *nrs); #endif diff --git a/src/plugins/RANSktau.cpp b/src/plugins/RANSktau.cpp index 2158aa5a7..453ffa549 100644 --- a/src/plugins/RANSktau.cpp +++ b/src/plugins/RANSktau.cpp @@ -41,44 +41,83 @@ static dfloat coeff[] = { 85.0, // fb_c1 100.0, // fb_c2 0.52, // alp_inf - 1e-8 // TINY + 1e-8, // TINY + 0 // Pope correction }; } -void RANSktau::buildKernel(occa::properties kernelInfo) +void RANSktau::buildKernel(occa::properties _kernelInfo) { - kernelInfo["defines/p_sigma_k"] = coeff[0]; - kernelInfo["defines/p_sigma_tau"] = coeff[1]; - kernelInfo["defines/p_alpinf_str"] = coeff[2]; - kernelInfo["defines/p_beta0"] = coeff[3]; - kernelInfo["defines/p_kappa"] = coeff[4]; - kernelInfo["defines/p_betainf_str"] = coeff[5]; - kernelInfo["defines/p_ibetainf_str3"] = 1 / pow(coeff[5],3); - kernelInfo["defines/p_sigd_min"] = coeff[6]; - kernelInfo["defines/p_sigd_max"] = coeff[7]; - kernelInfo["defines/p_fb_c1st"] = coeff[8]; - kernelInfo["defines/p_fb_c2st"] = coeff[9]; - kernelInfo["defines/p_fb_c1"] = coeff[10]; - kernelInfo["defines/p_fb_c2"] = coeff[11]; - kernelInfo["defines/p_alp_inf"] = coeff[12]; - kernelInfo["defines/p_tiny"] = coeff[13]; - - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - - kernelInfo += populateMeshProperties(N); - - std::string fileName; + occa::properties kernelInfo; + if(!kernelInfo.get("defines/p_sigma_k").size()) + kernelInfo["defines/p_sigma_k"] = coeff[0]; + if(!kernelInfo.get("defines/p_sigma_tau").size()) + kernelInfo["defines/p_sigma_tau"] = coeff[1]; + if(!kernelInfo.get("defines/p_alpinf_str").size()) + kernelInfo["defines/p_alpinf_str"] = coeff[2]; + if(!kernelInfo.get("defines/p_beta0").size()) + kernelInfo["defines/p_beta0"] = coeff[3]; + if(!kernelInfo.get("defines/p_kappa").size()) + kernelInfo["defines/p_kappa"] = coeff[4]; + if(!kernelInfo.get("defines/p_betainf_str").size()) + kernelInfo["defines/p_betainf_str"] = coeff[5]; + if(!kernelInfo.get("defines/p_ibetainf_str3").size()) + kernelInfo["defines/p_ibetainf_str3"] = 1 / pow(coeff[5],3); + if(!kernelInfo.get("defines/p_sigd_min").size()) + kernelInfo["defines/p_sigd_min"] = coeff[6]; + if(!kernelInfo.get("defines/p_sigd_max").size()) + kernelInfo["defines/p_sigd_max"] = coeff[7]; + if(!kernelInfo.get("defines/p_fb_c1st").size()) + kernelInfo["defines/p_fb_c1st"] = coeff[8]; + if(!kernelInfo.get("defines/p_fb_c2st").size()) + kernelInfo["defines/p_fb_c2st"] = coeff[9]; + if(!kernelInfo.get("defines/p_fb_c1").size()) + kernelInfo["defines/p_fb_c1"] = coeff[10]; + if(!kernelInfo.get("defines/p_fb_c2").size()) + kernelInfo["defines/p_fb_c2"] = coeff[11]; + if(!kernelInfo.get("defines/p_alp_inf").size()) + kernelInfo["defines/p_alp_inf"] = coeff[12]; + if(!kernelInfo.get("defines/p_tiny").size()) + kernelInfo["defines/p_tiny"] = coeff[13]; + if(!kernelInfo.get("defines/p_pope").size()) + kernelInfo["defines/p_pope"] = coeff[14]; + + const int verbose = platform->options.compareArgs("VERBOSE","TRUE") ? 1:0; + + if(platform->comm.mpiRank == 0 && verbose) { + std::cout << "\nRANSktau settings\n"; + std::cout << kernelInfo << std::endl; + } + + kernelInfo += _kernelInfo; + + std::string path; int rank = platform->comm.mpiRank; - fileName.assign(getenv("NEKRS_INSTALL_DIR")); - fileName += "/okl/plugins/RANSktau.okl"; + path.assign(getenv("NEKRS_INSTALL_DIR")); + path += "/okl/plugins/"; + std::string fileName, kernelName; + const std::string extension = ".okl"; { - computeKernel = platform->device.buildKernel(fileName, "computeHex3D", kernelInfo); - SijOijKernel = platform->device.buildKernel(fileName, "SijOijHex3D", kernelInfo); - SijOijMag2Kernel = platform->device.buildKernel(fileName, "SijOijMag2", kernelInfo); - limitKernel = platform->device.buildKernel(fileName, "limit", kernelInfo); - mueKernel = platform->device.buildKernel(fileName, "mue", kernelInfo); + kernelName = "RANSktauComputeHex3D"; + fileName = path + kernelName + extension; + computeKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "SijOijHex3D"; + fileName = path + kernelName + extension; + SijOijKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "SijOijMag2"; + fileName = path + kernelName + extension; + SijOijMag2Kernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "limit"; + fileName = path + kernelName + extension; + limitKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "mue"; + fileName = path + kernelName + extension; + mueKernel = platform->device.buildKernel(fileName, kernelInfo, true); } int Nscalar; @@ -185,8 +224,6 @@ void RANSktau::setup(nrs_t* nrsIn, dfloat mueIn, dfloat rhoIn, { if(setupCalled) return; - - nrs = nrsIn; mueLam = mueIn; rho = rhoIn; diff --git a/src/plugins/lowMach.cpp b/src/plugins/lowMach.cpp index d3271e89c..69907bfa8 100644 --- a/src/plugins/lowMach.cpp +++ b/src/plugins/lowMach.cpp @@ -23,19 +23,38 @@ static occa::kernel surfaceFluxKernel; void lowMach::buildKernel(occa::properties kernelInfo) { - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - - kernelInfo += populateMeshProperties(N); - - std::string fileName; + std::string path; int rank = platform->comm.mpiRank; - fileName.assign(getenv("NEKRS_INSTALL_DIR")); - fileName += "/okl/plugins/lowMach.okl"; + path.assign(getenv("NEKRS_INSTALL_DIR")); + path += "/okl/plugins/"; + std::string kernelName, fileName; + const std::string extension = ".okl"; { - qtlKernel = platform->device.buildKernel(fileName, "qtlHex3D" , kernelInfo); - p0thHelperKernel = platform->device.buildKernel(fileName, "p0thHelper", kernelInfo); - surfaceFluxKernel = platform->device.buildKernel(fileName, "surfaceFlux", kernelInfo); + kernelName = "qtlHex3D"; + fileName = path + kernelName + extension; + qtlKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "p0thHelper"; + fileName = path + kernelName + extension; + p0thHelperKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const int Nq = N + 1; + if (BLOCKSIZE < Nq * Nq) { + if (platform->comm.mpiRank == 0) + printf("ERROR: surfaceFlux kernel requires BLOCKSIZE >= Nq * Nq." + "BLOCKSIZE = %d, Nq*Nq = %d\n", + BLOCKSIZE, + Nq * Nq); + ABORT(EXIT_FAILURE); + } + } + + kernelName = "surfaceFlux"; + fileName = path + kernelName + extension; + surfaceFluxKernel = platform->device.buildKernel(fileName, kernelInfo, true); } } @@ -71,6 +90,9 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) cds->o_S, platform->o_mempool.slice0); + double flopsGrad = 6 * mesh->Np * mesh->Nq + 18 * mesh->Np; + flopsGrad *= static_cast(mesh->Nelements); + oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset,ogsDfloat, ogsAdd, nrs->gsh); platform->linAlg->axmyVector( @@ -81,12 +103,11 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) nrs->meshV->o_invLMM, platform->o_mempool.slice0); + platform->linAlg->fill(mesh->Nelements * mesh->Np, 0.0, platform->o_mempool.slice3); if(udf.sEqnSource) { platform->timer.tic("udfSEqnSource", 1); udf.sEqnSource(nrs, time, cds->o_S, platform->o_mempool.slice3); platform->timer.toc("udfSEqnSource"); - } else { - platform->linAlg->fill(mesh->Nelements * mesh->Np, 0.0, platform->o_mempool.slice3); } qtlKernel( @@ -101,6 +122,9 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) platform->o_mempool.slice3, o_div); + double flopsQTL = 18 * mesh->Np * mesh->Nq + 23 * mesh->Np; + flopsQTL *= static_cast(mesh->Nelements); + oogs::startFinish(o_div, 1, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); platform->linAlg->axmy( @@ -108,7 +132,9 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) 1.0, nrs->meshV->o_invLMM, o_div); - + + double surfaceFlops = 0.0; + if(nrs->pSolver->allNeumann){ const dfloat dd = (1.0 - gamma0) / gamma0; const dlong Nlocal = mesh->Nlocal; @@ -125,6 +151,10 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) nrs->o_Ue, platform->o_mempool.slice0 ); + + double surfaceFluxFlops = 13 * mesh->Nq * mesh->Nq; + surfaceFluxFlops *= static_cast(mesh->Nelements); + platform->o_mempool.slice0.copyTo(platform->mempool.slice0, mesh->Nelements * sizeof(dfloat)); dfloat termV = 0.0; for(int i = 0 ; i < mesh->Nelements; ++i) termV += platform->mempool.slice0[i]; @@ -138,6 +168,9 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) platform->o_mempool.slice0, platform->o_mempool.slice1 ); + + double p0thHelperFlops = 4 * mesh->Nlocal; + const dfloat prhs = (termQ - termV)/linAlg->sum(Nlocal, platform->o_mempool.slice0, platform->comm.mpiComm); linAlg->axpby(Nlocal, -prhs, platform->o_mempool.slice1, 1.0, o_div); @@ -150,8 +183,13 @@ void lowMach::qThermalIdealGasSingleComponent(dfloat time, occa::memory o_div) nrs->p0th[0] = Saqpq / (nrs->g0 - nrs->dt[0] * prhs); nrs->dp0thdt = prhs * nrs->p0th[0]; + + surfaceFlops += surfaceFluxFlops + p0thHelperFlops; } qThermal = 0; + + double flops = surfaceFlops + flopsGrad + flopsQTL; + platform->flopCounter->add("lowMach::qThermalIdealGasSingleComponent", flops); } void lowMach::dpdt(occa::memory o_FU) diff --git a/src/plugins/avg.cpp b/src/plugins/tavg.cpp similarity index 77% rename from src/plugins/avg.cpp rename to src/plugins/tavg.cpp index 0100a09a4..3b989a99b 100644 --- a/src/plugins/avg.cpp +++ b/src/plugins/tavg.cpp @@ -15,7 +15,7 @@ #include "nrs.hpp" #include "nekInterfaceAdapter.hpp" -#include "avg.hpp" +#include "tavg.hpp" #include "platform.hpp" #include "linAlg.hpp" @@ -45,38 +45,48 @@ static dfloat timel; static int outfldCounter = 0; } -void avg::buildKernel(occa::properties kernelInfo) +void tavg::buildKernel(occa::properties kernelInfo) { - std::string fileName; + std::string path; int rank = platform->comm.mpiRank; - fileName.assign(getenv("NEKRS_INSTALL_DIR")); - fileName += "/okl/plugins/avg.okl"; + path.assign(getenv("NEKRS_INSTALL_DIR")); + path += "/okl/plugins/"; + std::string kernelName, fileName; + const std::string extension = ".okl"; { - EXKernel = platform->device.buildKernel(fileName, "EX", kernelInfo); - EXXKernel = platform->device.buildKernel(fileName, "EXX", kernelInfo); - EXYKernel = platform->device.buildKernel(fileName, "EXY", kernelInfo); + kernelName = "EX"; + fileName = path + kernelName + extension; + EXKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "EXX"; + fileName = path + kernelName + extension; + EXXKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "EXY"; + fileName = path + kernelName + extension; + EXYKernel = platform->device.buildKernel(fileName, kernelInfo, true); } buildKernelCalled = 1; } -void avg::reset() +void tavg::reset() { counter = 0; atime = 0; } -void avg::EX (dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EX) +void tavg::EX (dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EX) { EXKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_EX); } -void avg::EXX(dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EXX) +void tavg::EXX(dlong N, dfloat a, dfloat b, int nflds, occa::memory o_x, occa::memory o_EXX) { EXXKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_EXX); } -void avg::EXY(dlong N, +void tavg::EXY(dlong N, dfloat a, dfloat b, int nflds, @@ -87,12 +97,12 @@ void avg::EXY(dlong N, EXYKernel(N, nrs->fieldOffset, nflds, a, b, o_x, o_y, o_EXY); } -void avg::run(dfloat time) +void tavg::run(dfloat time) { - if(!nrs->converged) return; + if(!nrs->timeStepConverged) return; if(!setupCalled || !buildKernelCalled) { - std::cout << "avg::run() was called prior to avg::setup()!\n"; + std::cout << "tavg::run() was called prior to tavg::setup()!\n"; ABORT(1); } @@ -141,10 +151,10 @@ void avg::run(dfloat time) timel = time; } -void avg::setup(nrs_t* nrs_) +void tavg::setup(nrs_t* nrs_) { if(!buildKernelCalled) { - std::cout << "avg::setup() was called prior avg::buildKernel()!\n"; + std::cout << "tavg::setup() was called prior tavg::buildKernel()!\n"; ABORT(1); } @@ -176,9 +186,9 @@ void avg::setup(nrs_t* nrs_) setupCalled = 1; } -void avg::outfld(int _outXYZ, int FP64) +void tavg::outfld(int _outXYZ, int FP64) { - if(!nrs->converged) return; + if(!nrs->timeStepConverged) return; cds_t* cds = nrs->cds; mesh_t* mesh = nrs->meshV; @@ -218,7 +228,7 @@ void avg::outfld(int _outXYZ, int FP64) } -void avg::outfld() +void tavg::outfld() { - avg::outfld(/* outXYZ */ 0, /* FP64 */ 1); + tavg::outfld(/* outXYZ */ 0, /* FP64 */ 1); } diff --git a/src/plugins/avg.hpp b/src/plugins/tavg.hpp similarity index 97% rename from src/plugins/avg.hpp rename to src/plugins/tavg.hpp index 7fa140a83..84f0fd3cc 100644 --- a/src/plugins/avg.hpp +++ b/src/plugins/tavg.hpp @@ -1,7 +1,7 @@ #include "nrs.hpp" #include "nekInterfaceAdapter.hpp" -namespace avg +namespace tavg { void buildKernel(occa::properties kernelInfo); void run(dfloat time); diff --git a/src/plugins/velRecycling.cpp b/src/plugins/velRecycling.cpp index a5f373e42..503ba9dcd 100644 --- a/src/plugins/velRecycling.cpp +++ b/src/plugins/velRecycling.cpp @@ -28,7 +28,6 @@ static occa::memory o_tmp1, o_tmp2; static occa::kernel setBCVectorValueKernel; static occa::kernel getBCFluxKernel; static occa::kernel sumReductionKernel; -static occa::kernel scalarMultiplyKernel; static bool buildKernelCalled = 0; static bool setupCalled = 0; @@ -41,25 +40,25 @@ static int Nblock; void velRecycling::buildKernel(occa::properties kernelInfo) { + std::string path; + path.assign(getenv("NEKRS_INSTALL_DIR")); + path += "/okl/plugins/"; - int N; - platform->options.getArgs("POLYNOMIAL DEGREE", N); - kernelInfo += populateMeshProperties(N); - - std::string fileName; - fileName.assign(getenv("NEKRS_INSTALL_DIR")); - fileName += "/okl/plugins/velRecycling.okl"; + std::string fileName, kernelName; + const std::string extension = ".okl"; { - setBCVectorValueKernel = platform->device.buildKernel(fileName, - "setBCVectorValue", - kernelInfo); - getBCFluxKernel = platform->device.buildKernel(fileName, "getBCFlux", kernelInfo); - sumReductionKernel = platform->device.buildKernel(fileName, - "sumReduction", - kernelInfo); - scalarMultiplyKernel = platform->device.buildKernel(fileName, - "scalarMultiply", - kernelInfo); + kernelName = "setBCVectorValue"; + fileName = path + kernelName + extension; + setBCVectorValueKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + kernelName = "getBCFlux"; + fileName = path + kernelName + extension; + getBCFluxKernel = platform->device.buildKernel(fileName, kernelInfo, true); + + + kernelName = "sumReduction"; + fileName = path + kernelName + extension; + sumReductionKernel = platform->device.buildKernel(fileName, kernelInfo, true); } } @@ -101,7 +100,7 @@ void velRecycling::copy() const dfloat scale = -wbar * sbuf[0] / sbuf[1]; //printf("rescaling inflow: %f\n", scale); - scalarMultiplyKernel(nrs->NVfields * nrs->fieldOffset, scale, o_wrk); + platform->linAlg->scale(nrs->NVfields * nrs->fieldOffset, scale, o_wrk); } void velRecycling::setup(nrs_t* nrs_, occa::memory o_wrk_, const hlong eOffset, const int bID_, @@ -133,7 +132,7 @@ void velRecycling::setup(nrs_t* nrs_, occa::memory o_wrk_, const hlong eOffset, } } - ogs = ogsSetup(Ntotal, ids, platform->comm.mpiComm, 1, platform->device); + ogs = ogsSetup(Ntotal, ids, platform->comm.mpiComm, 1, platform->device.occaDevice()); free(ids); const int NfpTotal = mesh->Nelements * mesh->Nfaces * mesh->Nfp; diff --git a/src/postProcessing/planarAvg.cpp b/src/postProcessing/planarAvg.cpp new file mode 100644 index 000000000..3ccb39dce --- /dev/null +++ b/src/postProcessing/planarAvg.cpp @@ -0,0 +1,151 @@ +#include "nrs.hpp" +#include "platform.hpp" +#include "linAlg.hpp" +#include "nekInterfaceAdapter.hpp" +#include "postProcessing.hpp" + +namespace { +inline int mod1(int i, int n) { + if(!i) return 0; + return (i+n-1)%n + 1; +} + +void get_exyz(int& ex, int& ey, int& ez,int eg, int nelx, int nely) +{ + ex = mod1(eg, nelx); + ey = 1 + (mod1(eg, nelx*nely) - 1)/nelx; + ez = 1 + (eg-1)/(nelx*nely); +} + +oogs_t *gtpp_gs_setup(nrs_t *nrs, int nelgx, int nelgy, int nelgz, std::string dir) +{ + mesh_t* mesh = nrs->meshV; + const auto nelgxy = nelgx*nelgy; + const auto nelgyz = nelgy*nelgz; + const auto nelgzx = nelgz*nelgx; + + auto *ids = (hlong *) calloc(mesh->Nlocal, sizeof(hlong)); + + for(int iel = 0; iel < mesh->Nelements; iel++) { + const auto eg = nek::lglel(iel) + 1; + int ex, ey, ez; + const auto nx1 = mesh->Nq; + const auto ny1 = mesh->Nq; + const auto nz1 = mesh->Nq; + + // Enumerate points in the y-z plane + if(dir == "x") { + get_exyz(ex,ey,ez,eg,nelgx,nelgyz); + const auto ex_g = ey; + for(int k = 0; k < mesh->Nq; k++) { + for(int j = 0; j < mesh->Nq; j++) { + for(int i = 0; i < mesh->Nq; i++) { + const auto id = iel*mesh->Np + k*mesh->Nq*mesh->Nq + j*mesh->Nq + i; + ids[id] = (j+1) + ny1*k + ny1*nz1*(ex_g-1); + } + } + } + } + + // Enumerate points in the x-z plane + if(dir == "y") { + get_exyz(ex,ey,ez,eg,nelgx,nelgy); + const auto ex_g = (ez-1)*nelgx+ex; + for(int k = 0; k < mesh->Nq; k++) { + for(int j = 0; j < mesh->Nq; j++) { + for(int i = 0; i < mesh->Nq; i++) { + const auto id = iel*mesh->Np + k*mesh->Nq*mesh->Nq + j*mesh->Nq + i; + ids[id] = (k+1) + nz1*i + nx1*nz1*(ex_g-1); + } + } + } + } + + // Enumerate points in the x-y plane + if(dir == "z") { + get_exyz(ex,ey,ez,eg,nelgxy,1); + const auto ex_g = ex; + for(int k = 0; k < mesh->Nq; k++) { + for(int j = 0; j < mesh->Nq; j++) { + for(int i = 0; i < mesh->Nq; i++) { + const auto id = iel*mesh->Np + k*mesh->Nq*mesh->Nq + j*mesh->Nq + i; + ids[id] = (i+1) + nx1*j + nx1*ny1*(ex_g-1) + 1; + } + } + } + } + } + +#if 0 + dfloat *idsDfloat = (dfloat *) calloc(mesh->Nlocal, sizeof(dfloat)); + for(int i = 0; i < mesh->Nlocal; i++) idsDfloat[i] = ids[i]; + occa::memory o_idsDfloat = platform->device.malloc(1*mesh->Nlocal * sizeof(dfloat), idsDfloat); + writeFld("ids", 0.0, 1, 1, &o_NULL, &o_NULL, &o_idsDfloat, 1); +#endif + + auto ogsh = ogsSetup(mesh->Nlocal, ids, platform->comm.mpiComm, 1, platform->device.occaDevice()); + free(ids); + auto oogsh = oogs::setup(ogsh, 6, nrs->fieldOffset, ogsDfloat, NULL, OOGS_AUTO); + return oogsh; +} + +} // namespace + +void postProcessing::planarAvg(nrs_t *nrs, const std::string& dir, int NELGX, int NELGY, int NELGZ, int nflds, occa::memory o_avg) +{ + mesh_t* mesh = nrs->meshV; + static auto firstTime = 1; + const auto fieldOffsetByte = nrs->fieldOffset * sizeof(dfloat); + static occa::memory o_avgWeights; + static oogs_t *oogs_x; + static oogs_t *oogs_y; + static oogs_t *oogs_z; + + if(firstTime) { + o_avgWeights = platform->device.malloc(3*fieldOffsetByte); + + oogs_x = gtpp_gs_setup(nrs, NELGX, NELGY, NELGZ, "x"); + auto o_avgWeights_x = o_avgWeights.slice(0*fieldOffsetByte, fieldOffsetByte); + o_avgWeights_x.copyFrom(mesh->o_LMM, mesh->Nlocal*sizeof(dfloat)); + oogs::startFinish(o_avgWeights_x, 1, mesh->Nlocal, ogsDfloat, ogsAdd, oogs_x); + platform->linAlg->ady(mesh->Nlocal, 1, o_avgWeights_x); + platform->linAlg->axmy(mesh->Nlocal, 1, mesh->o_LMM, o_avgWeights_x); + + oogs_y = gtpp_gs_setup(nrs, NELGX*NELGY, 1, NELGZ, "y"); + auto o_avgWeights_y = o_avgWeights.slice(1*fieldOffsetByte, fieldOffsetByte); + o_avgWeights_y.copyFrom(mesh->o_LMM, mesh->Nlocal*sizeof(dfloat)); + oogs::startFinish(o_avgWeights_y, 1, mesh->Nlocal, ogsDfloat, ogsAdd, oogs_y); + platform->linAlg->ady(mesh->Nlocal, 1, o_avgWeights_y); + platform->linAlg->axmy(mesh->Nlocal, 1, mesh->o_LMM, o_avgWeights_y); + + oogs_z = gtpp_gs_setup(nrs, NELGX*NELGY, 1, NELGZ, "z"); + auto o_avgWeights_z = o_avgWeights.slice(2*fieldOffsetByte, fieldOffsetByte); + o_avgWeights_z.copyFrom(mesh->o_LMM, mesh->Nlocal*sizeof(dfloat)); + oogs::startFinish(o_avgWeights_z, 1, mesh->Nlocal, ogsDfloat, ogsAdd, oogs_z); + platform->linAlg->ady(mesh->Nlocal, 1, o_avgWeights_z); + platform->linAlg->axmy(mesh->Nlocal, 1, mesh->o_LMM, o_avgWeights_z); + + firstTime = 0; + } + + occa::memory o_wghts; + oogs_t *gsh; + if(dir == "x") { + o_wghts = o_avgWeights.slice(0*fieldOffsetByte, fieldOffsetByte); + gsh = oogs_x; + } else if(dir == "y") { + o_wghts = o_avgWeights.slice(1*fieldOffsetByte, fieldOffsetByte); + gsh = oogs_y; + } else if(dir == "z") { + o_wghts = o_avgWeights.slice(2*fieldOffsetByte, fieldOffsetByte); + gsh = oogs_z; + } else { + if (platform->comm.mpiRank == 0) printf("ERROR in planarAvg: Unknown direction!"); + ABORT(EXIT_FAILURE); + } + for(int ifld = 0; ifld < nflds; ifld++) { + auto o_wrk = o_avg.slice(ifld*fieldOffsetByte, fieldOffsetByte); + platform->linAlg->axmy(mesh->Nlocal, 1, o_wghts, o_wrk); + } + oogs::startFinish(o_avg, nflds, nrs->fieldOffset, ogsDfloat, ogsAdd, gsh); +} diff --git a/src/postProcessing/postProcessing.hpp b/src/postProcessing/postProcessing.hpp new file mode 100644 index 000000000..415958b11 --- /dev/null +++ b/src/postProcessing/postProcessing.hpp @@ -0,0 +1,11 @@ +#if !defined(nekrs_post_hpp_) +#define nekrs_post_hpp_ + +#include "nrs.hpp" + +namespace postProcessing +{ +void planarAvg(nrs_t *nrs, const std::string& dir, int NELGX, int NELGY, int NELGZ, int nflds, occa::memory o_avg); +} + +#endif diff --git a/src/regularization/avm.cpp b/src/regularization/avm.cpp index a216ad0e2..2ecdf5018 100644 --- a/src/regularization/avm.cpp +++ b/src/regularization/avm.cpp @@ -38,29 +38,19 @@ void allocateMemory(cds_t* cds) void compileKernels(cds_t* cds) { - mesh_t* mesh = cds->mesh[0]; - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string oklpath = install_dir + "/okl/cds/regularization/"; - std::string filename = oklpath + "relativeMassHighestMode.okl"; - occa::properties info = platform->kernelInfo; - info["defines/" "p_Nq"] = cds->mesh[0]->Nq; - info["defines/" "p_Np"] = cds->mesh[0]->Np; + std::string kernelName; + + kernelName = "relativeMassHighestMode"; relativeMassHighestModeKernel = - platform->device.buildKernel(filename, - "relativeMassHighestMode", - info); + platform->kernels.get(kernelName); - filename = oklpath + "computeMaxVisc.okl"; + kernelName = "computeMaxVisc"; computeMaxViscKernel = - platform->device.buildKernel(filename, - "computeMaxVisc", - info); - filename = oklpath + "interpolateP1.okl"; + platform->kernels.get(kernelName); + + kernelName = "interpolateP1"; interpolateP1Kernel = - platform->device.buildKernel(filename, - "interpolateP1", - info); + platform->kernels.get(kernelName); } } @@ -124,16 +114,16 @@ occa::memory computeEps(nrs_t* nrs, const dfloat time, const dlong scalarIndex, recomputeUrst = false; } - cds->advectionStrongVolumeKernel( - cds->meshV->Nelements, - mesh->o_D, - cds->vFieldOffset, - 0, - o_filteredField, - o_aliasedUrst, - o_rhoField, - o_hpfResidual); - + cds->strongAdvectionVolumeKernel(cds->meshV->Nelements, + cds->meshV->o_vgeo, + mesh->o_D, + cds->vFieldOffset, + 0, + o_filteredField, + o_aliasedUrst, + o_rhoField, + o_hpfResidual); + occa::memory o_S_field = o_S + cds->fieldOffsetScan[scalarIndex] * sizeof(dfloat); const dfloat Uavg = platform->linAlg->weightedNorm2( diff --git a/src/setup/cdsSetup.cpp b/src/setup/cdsSetup.cpp new file mode 100644 index 000000000..75b8c00df --- /dev/null +++ b/src/setup/cdsSetup.cpp @@ -0,0 +1,250 @@ +cds_t *cdsSetup(nrs_t *nrs, setupAide options) +{ + const std::string section = "cds-"; + cds_t *cds = new cds_t(); + platform_t *platform = platform_t::getInstance(); + device_t &device = platform->device; + + cds->mesh[0] = nrs->_mesh; + mesh_t *mesh = cds->mesh[0]; + cds->meshV = nrs->_mesh->fluid; + cds->elementType = nrs->elementType; + cds->dim = nrs->dim; + cds->NVfields = nrs->NVfields; + cds->NSfields = nrs->Nscalar; + + cds->coeffEXT = nrs->coeffEXT; + cds->coeffBDF = nrs->coeffBDF; + cds->coeffSubEXT = nrs->coeffSubEXT; + cds->nBDF = nrs->nBDF; + cds->nEXT = nrs->nEXT; + cds->o_coeffEXT = nrs->o_coeffEXT; + cds->o_coeffBDF = nrs->o_coeffBDF; + cds->o_coeffSubEXT = nrs->o_coeffSubEXT; + + cds->o_usrwrk = &(nrs->o_usrwrk); + + cds->vFieldOffset = nrs->fieldOffset; + cds->vCubatureOffset = nrs->cubatureOffset; + cds->fieldOffset[0] = nrs->fieldOffset; + cds->fieldOffsetScan[0] = 0; + dlong sum = cds->fieldOffset[0]; + for (int s = 1; s < cds->NSfields; ++s) { + cds->fieldOffset[s] = cds->fieldOffset[0]; + cds->fieldOffsetScan[s] = sum; + sum += cds->fieldOffset[s]; + cds->mesh[s] = cds->mesh[0]; + } + cds->fieldOffsetSum = sum; + + cds->gsh = nrs->gsh; + cds->gshT = (nrs->cht) ? oogs::setup(mesh->ogs, 1, nrs->fieldOffset, ogsDfloat, NULL, OOGS_AUTO) : cds->gsh; + + cds->U = nrs->U; + cds->S = (dfloat *)calloc(std::max(cds->nBDF, cds->nEXT) * cds->fieldOffsetSum, sizeof(dfloat)); + cds->BF = (dfloat *)calloc(cds->fieldOffsetSum, sizeof(dfloat)); + cds->FS = (dfloat *)calloc(cds->nEXT * cds->fieldOffsetSum, sizeof(dfloat)); + + cds->Nsubsteps = nrs->Nsubsteps; + if (cds->Nsubsteps) { + cds->nRK = nrs->nRK; + cds->coeffsfRK = nrs->coeffsfRK; + cds->weightsRK = nrs->weightsRK; + cds->nodesRK = nrs->nodesRK; + cds->o_coeffsfRK = nrs->o_coeffsfRK; + cds->o_weightsRK = nrs->o_weightsRK; + } + + cds->dt = nrs->dt; + + cds->prop = (dfloat *)calloc(2 * cds->fieldOffsetSum, sizeof(dfloat)); + + for (int is = 0; is < cds->NSfields; is++) { + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << is; + std::string sid = ss.str(); + + if (options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) + continue; + + dfloat diff = 1; + dfloat rho = 1; + options.getArgs("SCALAR" + sid + " DIFFUSIVITY", diff); + options.getArgs("SCALAR" + sid + " DENSITY", rho); + + const dlong off = cds->fieldOffsetSum; + for (int e = 0; e < mesh->Nelements; e++) + for (int n = 0; n < mesh->Np; n++) { + cds->prop[0 * off + cds->fieldOffsetScan[is] + e * mesh->Np + n] = diff; + cds->prop[1 * off + cds->fieldOffsetScan[is] + e * mesh->Np + n] = rho; + } + } + + cds->o_prop = device.malloc(2 * cds->fieldOffsetSum * sizeof(dfloat), cds->prop); + cds->o_diff = cds->o_prop.slice(0 * cds->fieldOffsetSum * sizeof(dfloat)); + cds->o_rho = cds->o_prop.slice(1 * cds->fieldOffsetSum * sizeof(dfloat)); + + cds->o_ellipticCoeff = nrs->o_ellipticCoeff; + + cds->o_U = nrs->o_U; + cds->o_Ue = nrs->o_Ue; + cds->o_S = + platform->device.malloc(std::max(cds->nBDF, cds->nEXT) * cds->fieldOffsetSum * sizeof(dfloat), cds->S); + cds->o_Se = platform->device.malloc(cds->fieldOffsetSum, sizeof(dfloat)); + cds->o_BF = platform->device.malloc(cds->fieldOffsetSum * sizeof(dfloat), cds->BF); + cds->o_FS = platform->device.malloc(cds->nEXT * cds->fieldOffsetSum * sizeof(dfloat), cds->FS); + + cds->o_relUrst = nrs->o_relUrst; + cds->o_Urst = nrs->o_Urst; + + for (int is = 0; is < cds->NSfields; is++) { + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << is; + std::string sid = ss.str(); + + cds->compute[is] = 1; + if (options.compareArgs("SCALAR" + sid + " SOLVER", "NONE")) { + cds->compute[is] = 0; + continue; + } + + mesh_t *mesh; + (is) ? mesh = cds->meshV : mesh = cds->mesh[0]; // only first scalar can be a CHT mesh + + cds->options[is] = options; + + cds->options[is].setArgs("REGULARIZATION RAMP CONSTANT", + options.getArgs("SCALAR" + sid + " REGULARIZATION RAMP CONSTANT")); + cds->options[is].setArgs("REGULARIZATION AVM C0", + options.getArgs("SCALAR" + sid + " REGULARIZATION AVM C0")); + cds->options[is].setArgs("REGULARIZATION METHOD", + options.getArgs("SCALAR" + sid + " REGULARIZATION METHOD")); + cds->options[is].setArgs("REGULARIZATION VISMAX COEFF", + options.getArgs("SCALAR" + sid + " REGULARIZATION VISMAX COEFF")); + cds->options[is].setArgs("REGULARIZATION SCALING COEFF", + options.getArgs("SCALAR" + sid + " REGULARIZATION SCALING COEFF")); + cds->options[is].setArgs("HPFRT STRENGTH", options.getArgs("SCALAR" + sid + " HPFRT STRENGTH")); + cds->options[is].setArgs("HPFRT MODES", options.getArgs("SCALAR" + sid + " HPFRT MODES")); + cds->options[is].setArgs("KRYLOV SOLVER", options.getArgs("SCALAR" + sid + " KRYLOV SOLVER")); + cds->options[is].setArgs("PGMRES RESTART", options.getArgs("SCALAR" + sid + " PGMRES RESTART")); + cds->options[is].setArgs("DISCRETIZATION", options.getArgs("SCALAR DISCRETIZATION")); + cds->options[is].setArgs("BASIS", options.getArgs("SCALAR BASIS")); + cds->options[is].setArgs("PRECONDITIONER", options.getArgs("SCALAR" + sid + " PRECONDITIONER")); + cds->options[is].setArgs("SOLVER TOLERANCE", options.getArgs("SCALAR" + sid + " SOLVER TOLERANCE")); + cds->options[is].setArgs("LINEAR SOLVER STOPPING CRITERION", + options.getArgs("SCALAR" + sid + " LINEAR SOLVER STOPPING CRITERION")); + cds->options[is].setArgs("INITIAL GUESS", options.getArgs("SCALAR" + sid + " INITIAL GUESS")); + cds->options[is].setArgs("RESIDUAL PROJECTION VECTORS", + options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION VECTORS")); + cds->options[is].setArgs("RESIDUAL PROJECTION START", + options.getArgs("SCALAR" + sid + " RESIDUAL PROJECTION START")); + cds->options[is].setArgs("MAXIMUM ITERATIONS", options.getArgs("SCALAR" + sid + " MAXIMUM ITERATIONS")); + + dfloat largeNumber = 1 << 20; + cds->EToB[is] = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + int *EToB = cds->EToB[is]; + int cnt = 0; + for (int e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + EToB[cnt] = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "scalar" + sid); + cnt++; + } + } + cds->o_EToB[is] = device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), EToB); + } + + bool scalarFilteringEnabled = false; + bool avmEnabled = false; + { + for (int is = 0; is < cds->NSfields; is++) { + if (!cds->options[is].compareArgs("REGULARIZATION METHOD", "NONE")) + scalarFilteringEnabled = true; + if (cds->options[is].compareArgs("REGULARIZATION METHOD", "HPF_RESIDUAL")) + avmEnabled = true; + if (cds->options[is].compareArgs("REGULARIZATION METHOD", "HIGHEST_MODAL_DECAY")) + avmEnabled = true; + } + } + + if (scalarFilteringEnabled) { + const dlong Nmodes = cds->mesh[0]->N + 1; + cds->o_filterMT = platform->device.malloc(cds->NSfields * Nmodes * Nmodes, sizeof(dfloat)); + for (int is = 0; is < cds->NSfields; is++) { + if (cds->options[is].compareArgs("REGULARIZATION METHOD", "NONE")) + continue; + if (!cds->compute[is]) + continue; + int filterNc = -1; + cds->options[is].getArgs("HPFRT MODES", filterNc); + dfloat filterS; + cds->options[is].getArgs("HPFRT STRENGTH", filterS); + filterS = -1.0 * fabs(filterS); + cds->filterS[is] = filterS; + + dfloat *A = filterSetup(cds->mesh[is], filterNc); + + const dlong Nmodes = cds->mesh[is]->N + 1; + cds->o_filterMT.copyFrom(A, Nmodes * Nmodes * sizeof(dfloat), is * Nmodes * Nmodes * sizeof(dfloat)); + + free(A); + } + } + + if (avmEnabled) + avm::setup(cds); + + std::string kernelName; + const std::string suffix = "Hex3D"; + { + kernelName = "strongAdvectionVolume" + suffix; + cds->strongAdvectionVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "strongAdvectionCubatureVolume" + suffix; + cds->strongAdvectionCubatureVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "advectMeshVelocity" + suffix; + cds->advectMeshVelocityKernel = platform->kernels.get(section + kernelName); + + kernelName = "maskCopy"; + cds->maskCopyKernel = platform->kernels.get(section + kernelName); + + kernelName = "sumMakef"; + cds->sumMakefKernel = platform->kernels.get(section + kernelName); + + kernelName = "helmholtzBC" + suffix; + cds->helmholtzRhsBCKernel = platform->kernels.get(section + kernelName); + kernelName = "dirichletBC"; + cds->dirichletBCKernel = platform->kernels.get(section + kernelName); + + kernelName = "setEllipticCoeff"; + cds->setEllipticCoeffKernel = platform->kernels.get(section + kernelName); + + kernelName = "filterRT" + suffix; + cds->filterRTKernel = platform->kernels.get(section + kernelName); + + kernelName = "nStagesSum3"; + cds->nStagesSum3Kernel = platform->kernels.get(section + kernelName); + + if (cds->Nsubsteps) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { + kernelName = "subCycleStrongCubatureVolume" + suffix; + cds->subCycleStrongCubatureVolumeKernel = platform->kernels.get(section + kernelName); + } + kernelName = "subCycleStrongVolume" + suffix; + cds->subCycleStrongVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "subCycleRKUpdate"; + cds->subCycleRKUpdateKernel = platform->kernels.get(section + kernelName); + kernelName = "subCycleRK"; + cds->subCycleRKKernel = platform->kernels.get(section + kernelName); + + kernelName = "subCycleInitU0"; + cds->subCycleInitU0Kernel = platform->kernels.get(section + kernelName); + } + } + + return cds; +} + diff --git a/src/setup/setup.cpp b/src/setup/setup.cpp new file mode 100644 index 000000000..d84fe2d48 --- /dev/null +++ b/src/setup/setup.cpp @@ -0,0 +1,1022 @@ +#include +#include +#include + +#include "nrs.hpp" +#include "meshSetup.hpp" +#include "bdry.hpp" +#include "bcMap.hpp" +#include "nekInterfaceAdapter.hpp" +#include "udf.hpp" +#include "filter.hpp" +#include "avm.hpp" + +#include "cdsSetup.cpp" + +std::vector determineMGLevels(std::string section) +{ + const std::string optionsPrefix = [section]() { + std::string prefix = section + std::string(" "); + if (section.find("temperature") != std::string::npos) { + prefix = std::string("scalar00 "); + } + std::transform(prefix.begin(), prefix.end(), prefix.begin(), [](unsigned char c) { + return std::toupper(c); + }); + return prefix; + }(); + + std::vector levels; + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + + std::string p_mglevels; + if (platform->options.getArgs(optionsPrefix + "MULTIGRID COARSENING", p_mglevels)) { + const std::vector mgLevelList = serializeString(p_mglevels, ','); + for (auto &&s : mgLevelList) { + levels.push_back(std::stoi(s)); + } + + bool invalid = false; + invalid |= (levels[0] != N); // top level order must match + for (unsigned i = 0U; i < levels.size(); ++i) { + invalid |= (levels[i] < 0); // each level must be positive + if (i > 0) + invalid |= (levels[i] >= levels[i - 1]); // each successive level must be smaller + } + + if (invalid) { + if (platform->comm.mpiRank == 0) + printf("ERROR: Invalid multigrid coarsening!\n"); + ABORT(EXIT_FAILURE); + ; + } + if (levels.back() > 1) { + if (platform->options.compareArgs(optionsPrefix + "MULTIGRID COARSE SOLVE", "TRUE")) { + // if the coarse level has p > 1 and requires solving the coarsest level, + // rather than just smoothing, SEMFEM must be used for the discretization + const auto usesSEMFEM = + platform->options.compareArgs(optionsPrefix + "MULTIGRID COARSE SEMFEM", "TRUE"); + + if (!usesSEMFEM) { + if (platform->comm.mpiRank == 0) { + printf("Error! FEM coarse discretization only supports p=1 for the coarsest level!\n"); + } + ABORT(1); + } + } + } + + return levels; + } + else if (platform->options.compareArgs(optionsPrefix + "MULTIGRID DOWNWARD SMOOTHER", "ASM") || + platform->options.compareArgs(optionsPrefix + "MULTIGRID DOWNWARD SMOOTHER", "RAS")) { + std::map> mg_level_lookup = { + {1, {1}}, + {2, {2, 1}}, + {3, {3, 1}}, + {4, {4, 2, 1}}, + {5, {5, 3, 1}}, + {6, {6, 3, 1}}, + {7, {7, 3, 1}}, + {8, {8, 5, 1}}, + {9, {9, 5, 1}}, + {10, {10, 6, 1}}, + {11, {11, 6, 1}}, + {12, {12, 7, 1}}, + {13, {13, 7, 1}}, + {14, {14, 8, 1}}, + {15, {15, 9, 1}}, + }; + + return mg_level_lookup.at(N); + } + else { + std::map> mg_level_lookup = { + {1, {1}}, + {2, {2, 1}}, + {3, {3, 1}}, + {4, {4, 2, 1}}, + {5, {5, 3, 1}}, + {6, {6, 4, 2, 1}}, + {7, {7, 5, 3, 1}}, + {8, {8, 6, 4, 1}}, + {9, {9, 7, 5, 1}}, + {10, {10, 8, 5, 1}}, + {11, {11, 9, 5, 1}}, + {12, {12, 10, 5, 1}}, + {13, {13, 11, 5, 1}}, + {14, {14, 12, 5, 1}}, + {15, {15, 13, 5, 1}}, + }; + + return mg_level_lookup.at(N); + } +} + +void nrsSetup(MPI_Comm comm, setupAide &options, nrs_t *nrs) +{ + { + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + const int Nq = N + 1; + if (BLOCKSIZE < Nq * Nq) { + if (platform->comm.mpiRank == 0) + printf("ERROR: several kernels requires BLOCKSIZE >= Nq * Nq." + "BLOCKSIZE = %d, Nq*Nq = %d\n", + BLOCKSIZE, + Nq * Nq); + ABORT(EXIT_FAILURE); + } + } + platform_t *platform = platform_t::getInstance(); + device_t &device = platform->device; + nrs->kernelInfo = new occa::properties(); + *(nrs->kernelInfo) = platform->kernelInfo; + occa::properties &kernelInfo = *nrs->kernelInfo; + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + kernelInfo["include_paths"].asArray(); + + int N, cubN; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + platform->options.getArgs("CUBATURE POLYNOMIAL DEGREE", cubN); + platform->options.getArgs("NUMBER OF SCALARS", nrs->Nscalar); + platform->options.getArgs("MESH DIMENSION", nrs->dim); + platform->options.getArgs("ELEMENT TYPE", nrs->elementType); + if (platform->device.mode() == "Serial") + platform->options.setArgs("GS OVERLAP", "FALSE"); + + nrs->flow = 1; + if (platform->options.compareArgs("VELOCITY", "FALSE")) + nrs->flow = 0; + if (platform->options.compareArgs("VELOCITY SOLVER", "NONE")) + nrs->flow = 0; + + if (nrs->flow) { + if (platform->options.compareArgs("STRESSFORMULATION", "TRUE")) + platform->options.setArgs("VELOCITY BLOCK SOLVER", "TRUE"); + } + + // init nek + { + int rank, size; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + std::string casename; + platform->options.getArgs("CASENAME", casename); + + nek::setup(nrs); + nek::setic(); + nek::userchk(); + if (platform->comm.mpiRank == 0) + std::cout << "\n"; + } + + nrs->cht = 0; + if (nekData.nelv != nekData.nelt && nrs->Nscalar) + nrs->cht = 1; + if (nrs->cht && !platform->options.compareArgs("SCALAR00 IS TEMPERATURE", "TRUE")) { + if (platform->comm.mpiRank == 0) + std::cout << "Conjugate heat transfer requires solving for temperature!\n"; + ABORT(EXIT_FAILURE); + ; + } + if (nrs->cht && options.compareArgs("MOVING MESH", "TRUE")) { + if (platform->comm.mpiRank == 0){ + std::cout << "Conjugate heat transfer + moving mesh is not supported\n"; + } + ABORT(EXIT_FAILURE); + } + + nrs->_mesh = createMesh(comm, N, cubN, nrs->cht, kernelInfo); + nrs->meshV = (mesh_t *)nrs->_mesh->fluid; + mesh_t *mesh = nrs->meshV; + + { + double val = (double)mesh->NlocalGatherElements / mesh->Nelements; + MPI_Allreduce(MPI_IN_PLACE, &val, 1, MPI_DOUBLE, MPI_MIN, platform->comm.mpiComm); + if (platform->comm.mpiRank == 0) + printf("min %2.0f%% of the local elements are internal\n", 100 * val); + } + + nrs->NVfields = 3; + nrs->NTfields = nrs->NVfields + 1; // Total Velocity + Pressure + mesh->Nfields = 1; + + platform->options.getArgs("SUBCYCLING STEPS", nrs->Nsubsteps); + platform->options.getArgs("DT", nrs->dt[0]); + + if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO1")) { + nrs->nBDF = 1; + } + else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO2")) { + nrs->nBDF = 2; + } + else if (platform->options.compareArgs("TIME INTEGRATOR", "TOMBO3")) { + nrs->nBDF = 3; + } + nrs->nEXT = 3; + if (nrs->Nsubsteps) + nrs->nEXT = nrs->nBDF; + nrs->coeffEXT = (dfloat *)calloc(nrs->nEXT, sizeof(dfloat)); + nrs->coeffBDF = (dfloat *)calloc(nrs->nBDF, sizeof(dfloat)); + + nrs->nRK = 4; + nrs->coeffSubEXT = (dfloat *)calloc(3, sizeof(dfloat)); + + dfloat mue = 1; + dfloat rho = 1; + platform->options.getArgs("VISCOSITY", mue); + platform->options.getArgs("DENSITY", rho); + + const dlong Nlocal = mesh->Nlocal; + + { // setup fieldOffset + nrs->fieldOffset = mesh->Np * (mesh->Nelements + mesh->totalHaloPairs); + mesh_t *meshT = nrs->_mesh; + nrs->fieldOffset = mymax(nrs->fieldOffset, meshT->Np * (meshT->Nelements + meshT->totalHaloPairs)); + + const int pageW = ALIGN_SIZE / sizeof(dfloat); + if (nrs->fieldOffset % pageW) + nrs->fieldOffset = (nrs->fieldOffset / pageW + 1) * pageW; + } + + nrs->_mesh->fieldOffset = nrs->fieldOffset; + + { // setup cubatureOffset + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { + nrs->cubatureOffset = std::max(nrs->fieldOffset, mesh->Nelements * mesh->cubNp); + } + else { + nrs->cubatureOffset = nrs->fieldOffset; + } + const int pageW = ALIGN_SIZE / sizeof(dfloat); + if (nrs->cubatureOffset % pageW) + nrs->cubatureOffset = (nrs->cubatureOffset / pageW + 1) * pageW; + } + + if (nrs->Nsubsteps) { + int Sorder; + platform->options.getArgs("SUBCYCLING TIME ORDER", Sorder); + if (Sorder == 4 && nrs->nRK == 4) { // ERK(4,4) + dfloat rka[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0}; + dfloat rkb[4] = {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}; + dfloat rkc[4] = {0.0, 1.0 / 2.0, 1.0 / 2.0, 1.0}; + nrs->coeffsfRK = (dfloat *)calloc(nrs->nRK, sizeof(dfloat)); + nrs->weightsRK = (dfloat *)calloc(nrs->nRK, sizeof(dfloat)); + nrs->nodesRK = (dfloat *)calloc(nrs->nRK, sizeof(dfloat)); + memcpy(nrs->coeffsfRK, rka, nrs->nRK * sizeof(dfloat)); + memcpy(nrs->weightsRK, rkb, nrs->nRK * sizeof(dfloat)); + memcpy(nrs->nodesRK, rkc, nrs->nRK * sizeof(dfloat)); + } + else { + if (platform->comm.mpiRank == 0) + std::cout << "Unsupported subcycling scheme!\n"; + ABORT(1); + } + nrs->o_coeffsfRK = device.malloc(nrs->nRK * sizeof(dfloat), nrs->coeffsfRK); + nrs->o_weightsRK = device.malloc(nrs->nRK * sizeof(dfloat), nrs->weightsRK); + } + + // setup mempool + int ellipticMaxFields = 1; + if (platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) + ellipticMaxFields = nrs->NVfields; + const int ellipticWrkFields = elliptic_t::NScratchFields * ellipticMaxFields; + + int wrkFields = 10; + if (nrs->Nsubsteps) + wrkFields = 9 + 3 * nrs->NVfields; + if (options.compareArgs("MOVING MESH", "TRUE")) + wrkFields += nrs->NVfields; + + const int mempoolNflds = std::max(wrkFields, 2 * nrs->NVfields + ellipticWrkFields); + platform->create_mempool(nrs->fieldOffset, mempoolNflds); + + // offset mempool available for elliptic because also used it for ellipticSolve input/output + auto const o_mempoolElliptic = + platform->o_mempool.o_ptr.slice(2 * nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); + + if (options.compareArgs("MOVING MESH", "TRUE")) { + const int nBDF = std::max(nrs->nBDF, nrs->nEXT); + platform->o_mempool.slice0.copyFrom(mesh->o_LMM, mesh->Nlocal * sizeof(dfloat)); + mesh->o_LMM.free(); + mesh->o_LMM = platform->device.malloc(nrs->fieldOffset * nBDF, sizeof(dfloat)); + mesh->o_LMM.copyFrom(platform->o_mempool.slice0, mesh->Nlocal * sizeof(dfloat)); + platform->o_mempool.slice0.copyFrom(mesh->o_invLMM, mesh->Nlocal * sizeof(dfloat)); + mesh->o_invLMM.free(); + mesh->o_invLMM = platform->device.malloc(nrs->fieldOffset * nBDF, sizeof(dfloat)); + mesh->o_invLMM.copyFrom(platform->o_mempool.slice0, mesh->Nlocal * sizeof(dfloat)); + + const int nAB = std::max(nrs->nEXT, mesh->nAB); + mesh->U = (dfloat *)calloc(nrs->NVfields * nrs->fieldOffset * nAB, sizeof(dfloat)); + mesh->o_U = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * nAB * sizeof(dfloat), mesh->U); + if (nrs->Nsubsteps) + mesh->o_divU = platform->device.malloc(nrs->fieldOffset * nAB, sizeof(dfloat)); + } + + { + const dlong Nstates = nrs->Nsubsteps ? std::max(nrs->nBDF, nrs->nEXT) : 1; + if (nrs->Nsubsteps && platform->options.compareArgs("MOVING MESH", "TRUE")) + nrs->o_relUrst = platform->device.malloc(Nstates * nrs->NVfields * nrs->cubatureOffset, sizeof(dfloat)); + else + nrs->o_Urst = platform->device.malloc(Nstates * nrs->NVfields * nrs->cubatureOffset, sizeof(dfloat)); + } + + nrs->U = + (dfloat *)calloc(nrs->NVfields * std::max(nrs->nBDF, nrs->nEXT) * nrs->fieldOffset, sizeof(dfloat)); + nrs->Ue = (dfloat *)calloc(nrs->NVfields * nrs->fieldOffset, sizeof(dfloat)); + nrs->P = (dfloat *)calloc(nrs->fieldOffset, sizeof(dfloat)); + nrs->BF = (dfloat *)calloc(nrs->NVfields * nrs->fieldOffset, sizeof(dfloat)); + nrs->FU = (dfloat *)calloc(nrs->NVfields * nrs->nEXT * nrs->fieldOffset, sizeof(dfloat)); + + nrs->o_U = platform->device.malloc(nrs->NVfields * std::max(nrs->nBDF, nrs->nEXT) * nrs->fieldOffset * + sizeof(dfloat), + nrs->U); + nrs->o_Ue = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->Ue); + nrs->o_P = platform->device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->P); + nrs->o_BF = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), nrs->BF); + nrs->o_FU = platform->device.malloc(nrs->NVfields * nrs->nEXT * nrs->fieldOffset * sizeof(dfloat), nrs->FU); + + nrs->o_ellipticCoeff = device.malloc(2 * nrs->fieldOffset * sizeof(dfloat)); + + int nProperties = 2; + if (options.compareArgs("MESH SOLVER", "ELASTICITY")) + nProperties = 4; + nrs->prop = (dfloat *)calloc(nProperties * nrs->fieldOffset, sizeof(dfloat)); + for (int e = 0; e < mesh->Nelements; e++) + for (int n = 0; n < mesh->Np; n++) { + nrs->prop[0 * nrs->fieldOffset + e * mesh->Np + n] = mue; + nrs->prop[1 * nrs->fieldOffset + e * mesh->Np + n] = rho; + } + + nrs->o_prop = device.malloc(nProperties * nrs->fieldOffset * sizeof(dfloat), nrs->prop); + nrs->o_mue = nrs->o_prop.slice(0 * nrs->fieldOffset * sizeof(dfloat)); + nrs->o_rho = nrs->o_prop.slice(1 * nrs->fieldOffset * sizeof(dfloat)); + if (options.compareArgs("MESH SOLVER", "ELASTICITY")) { + nrs->o_meshMue = nrs->o_prop.slice(2 * nrs->fieldOffset * sizeof(dfloat)); + nrs->o_meshRho = nrs->o_prop.slice(3 * nrs->fieldOffset * sizeof(dfloat)); + } + + if (platform->options.compareArgs("CONSTANT FLOW RATE", "TRUE")) { + nrs->o_Uc = platform->device.malloc(nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); + nrs->o_Pc = platform->device.malloc(nrs->fieldOffset * sizeof(dfloat)); + nrs->o_prevProp = device.malloc(2 * nrs->fieldOffset * sizeof(dfloat), nrs->prop); + } + + nrs->div = (dfloat *)calloc(nrs->fieldOffset, sizeof(dfloat)); + nrs->o_div = device.malloc(nrs->fieldOffset * sizeof(dfloat), nrs->div); + + nrs->o_coeffEXT = platform->device.malloc(nrs->nEXT * sizeof(dfloat), nrs->coeffEXT); + nrs->o_coeffBDF = platform->device.malloc(nrs->nBDF * sizeof(dfloat), nrs->coeffBDF); + nrs->o_coeffSubEXT = platform->device.malloc(nrs->nEXT * sizeof(dfloat), nrs->coeffEXT); + + // meshParallelGatherScatterSetup(mesh, mesh->Nlocal, mesh->globalIds, platform->comm.mpiComm, OOGS_AUTO, + // 0); + nrs->gsh = oogs::setup(mesh->ogs, nrs->NVfields, nrs->fieldOffset, ogsDfloat, NULL, OOGS_AUTO); + + nrs->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + int cnt = 0; + for (int e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + nrs->EToB[cnt] = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "velocity"); + cnt++; + } + } + nrs->o_EToB = device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), nrs->EToB); + + if (platform->options.compareArgs("MESH SOLVER", "ELASTICITY")) { + + nrs->EToBMeshVelocity = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + int cnt = 0; + for (int e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + int bc = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "mesh"); + nrs->EToBMeshVelocity[cnt] = bcMap::id(mesh->EToB[f + e * mesh->Nfaces], "mesh"); + cnt++; + } + } + nrs->o_EToBMeshVelocity = + device.malloc(mesh->Nelements * mesh->Nfaces * sizeof(int), nrs->EToBMeshVelocity); + } + + if (platform->options.compareArgs("VELOCITY REGULARIZATION METHOD", "RELAXATION")) { + + nrs->filterNc = -1; + dfloat filterS; + platform->options.getArgs("VELOCITY HPFRT STRENGTH", filterS); + platform->options.getArgs("VELOCITY HPFRT MODES", nrs->filterNc); + filterS = -1.0 * fabs(filterS); + nrs->filterS = filterS; + + dfloat *A = filterSetup(nrs->meshV, nrs->filterNc); + + const dlong Nmodes = nrs->meshV->N + 1; + + nrs->o_filterMT = platform->device.malloc(Nmodes * Nmodes * sizeof(dfloat), A); + + free(A); + } + + // build kernels + std::string kernelName; + const std::string suffix = "Hex3D"; + { + const std::string section = "nrs-"; + kernelName = "nStagesSum3"; + nrs->nStagesSum3Kernel = platform->kernels.get(section + kernelName); + + kernelName = "computeFieldDotNormal"; + nrs->computeFieldDotNormalKernel = platform->kernels.get(section + kernelName); + + kernelName = "computeFaceCentroid"; + nrs->computeFaceCentroidKernel = platform->kernels.get(section + kernelName); + + { + kernelName = "strongAdvectionVolume" + suffix; + nrs->strongAdvectionVolumeKernel = platform->kernels.get(section + kernelName); + kernelName = "strongAdvectionCubatureVolume" + suffix; + nrs->strongAdvectionCubatureVolumeKernel = platform->kernels.get(section + kernelName); + } + + kernelName = "curl" + suffix; + nrs->curlKernel = platform->kernels.get(section + kernelName); + + kernelName = "gradientVolume" + suffix; + nrs->gradientVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "wGradientVolume" + suffix; + nrs->wgradientVolumeKernel = platform->kernels.get(section + kernelName); + + { + kernelName = "sumMakef"; + nrs->sumMakefKernel = platform->kernels.get(section + kernelName); + } + + kernelName = "wDivergenceVolume" + suffix; + nrs->wDivergenceVolumeKernel = platform->kernels.get(section + kernelName); + kernelName = "divergenceVolume" + suffix; + nrs->divergenceVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "divergenceSurface" + suffix; + nrs->divergenceSurfaceKernel = platform->kernels.get(section + kernelName); + + kernelName = "advectMeshVelocity" + suffix; + nrs->advectMeshVelocityKernel = platform->kernels.get(section + kernelName); + + kernelName = "pressureRhs" + suffix; + nrs->pressureRhsKernel = platform->kernels.get(section + kernelName); + + kernelName = "pressureStress" + suffix; + nrs->pressureStressKernel = platform->kernels.get(section + kernelName); + + kernelName = "pressureDirichletBC" + suffix; + nrs->pressureDirichletBCKernel = platform->kernels.get(section + kernelName); + + kernelName = "velocityRhs" + suffix; + nrs->velocityRhsKernel = platform->kernels.get(section + kernelName); + + kernelName = "averageNormalBcType"; + nrs->averageNormalBcTypeKernel = platform->kernels.get(section + kernelName); + + kernelName = "fixZeroNormalMask"; + nrs->fixZeroNormalMaskKernel = platform->kernels.get(section + kernelName); + + kernelName = "applyZeroNormalMask"; + nrs->applyZeroNormalMaskKernel = platform->kernels.get(section + kernelName); + + kernelName = "initializeZeroNormalMask"; + nrs->initializeZeroNormalMaskKernel = platform->kernels.get(section + kernelName); + + kernelName = "velocityDirichletBC" + suffix; + nrs->velocityDirichletBCKernel = platform->kernels.get(section + kernelName); + + kernelName = "velocityNeumannBC" + suffix; + nrs->velocityNeumannBCKernel = platform->kernels.get(section + kernelName); + + kernelName = "UrstCubature" + suffix; + nrs->UrstCubatureKernel = platform->kernels.get(section + kernelName); + + kernelName = "Urst" + suffix; + nrs->UrstKernel = platform->kernels.get(section + kernelName); + + if (nrs->Nsubsteps) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { + kernelName = "subCycleStrongCubatureVolume" + suffix; + nrs->subCycleStrongCubatureVolumeKernel = platform->kernels.get(section + kernelName); + } + kernelName = "subCycleStrongVolume" + suffix; + nrs->subCycleStrongVolumeKernel = platform->kernels.get(section + kernelName); + + kernelName = "subCycleRKUpdate"; + nrs->subCycleRKUpdateKernel = platform->kernels.get(section + kernelName); + kernelName = "subCycleRK"; + nrs->subCycleRKKernel = platform->kernels.get(section + kernelName); + + kernelName = "subCycleInitU0"; + nrs->subCycleInitU0Kernel = platform->kernels.get(section + kernelName); + } + + kernelName = "extrapolate"; + nrs->extrapolateKernel = platform->kernels.get(section + kernelName); + + kernelName = "maskCopy"; + nrs->maskCopyKernel = platform->kernels.get(section + kernelName); + kernelName = "mask"; + nrs->maskKernel = platform->kernels.get(section + kernelName); + + kernelName = "filterRT" + suffix; + nrs->filterRTKernel = platform->kernels.get(section + kernelName); + + kernelName = "cfl" + suffix; + nrs->cflKernel = platform->kernels.get(section + kernelName); + + kernelName = "pressureAddQtl"; + nrs->pressureAddQtlKernel = platform->kernels.get(section + kernelName); + + kernelName = "setEllipticCoeff"; + nrs->setEllipticCoeffKernel = platform->kernels.get(section + kernelName); + kernelName = "setEllipticCoeffPressure"; + nrs->setEllipticCoeffPressureKernel = platform->kernels.get(section + kernelName); + } + + if (nrs->Nscalar) { + nrs->cds = cdsSetup(nrs, platform->options); + } + + // get IC + t0 from nek + double startTime; + nek::copyFromNek(startTime); + platform->options.setArgs("START TIME", to_string_f(startTime)); + + if (platform->comm.mpiRank == 0) + printf("calling udf_setup ... "); + fflush(stdout); + udf.setup(nrs); + if (platform->comm.mpiRank == 0) + printf("done\n"); + fflush(stdout); + + nrs->o_U.copyFrom(nrs->U); + nrs->o_P.copyFrom(nrs->P); + nrs->o_prop.copyFrom(nrs->prop); + if (nrs->Nscalar) { + nrs->cds->o_S.copyFrom(nrs->cds->S); + nrs->cds->o_prop.copyFrom(nrs->cds->prop); + } + + evaluateProperties(nrs, startTime); + nrs->o_prop.copyTo(nrs->prop); + if (nrs->Nscalar) + nrs->cds->o_prop.copyTo(nrs->cds->prop); + + nek::ocopyToNek(startTime, 0); + + // setup elliptic solvers + + if (nrs->Nscalar) { + cds_t *cds = nrs->cds; + + const int scalarWidth = getDigitsRepresentation(NSCALAR_MAX - 1); + + for (int is = 0; is < cds->NSfields; is++) { + std::stringstream ss; + ss << std::setfill('0') << std::setw(scalarWidth) << is; + std::string sid = ss.str(); + + if (!cds->compute[is]) + continue; + + mesh_t *mesh; + (is) ? mesh = cds->meshV : mesh = cds->mesh[0]; // only first scalar can be a CHT mesh + + if (platform->comm.mpiRank == 0) + std::cout << "================= ELLIPTIC SETUP SCALAR" << sid << " ===============\n"; + + int nbrBIDs = bcMap::size(0); + if (nrs->cht && is == 0) + nbrBIDs = bcMap::size(1); + for (int bID = 1; bID <= nbrBIDs; bID++) { + std::string bcTypeText(bcMap::text(bID, "scalar" + sid)); + if (platform->comm.mpiRank == 0) + printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); + } + + cds->solver[is] = new elliptic_t(); + cds->solver[is]->name = "scalar" + sid; + cds->solver[is]->blockSolver = 0; + cds->solver[is]->Nfields = 1; + cds->solver[is]->Ntotal = nrs->fieldOffset; + cds->solver[is]->o_wrk = o_mempoolElliptic; + cds->solver[is]->mesh = mesh; + cds->solver[is]->dim = cds->dim; + cds->solver[is]->elementType = cds->elementType; + + const int coeffField = platform->options.compareArgs("SCALAR" + sid + " COEFF FIELD", "TRUE"); + cds->solver[is]->coeffField = coeffField; + cds->solver[is]->coeffFieldPreco = coeffField; + cds->solver[is]->poisson = 0; + + platform->linAlg->fill(2 * nrs->fieldOffset, 1.0, nrs->o_ellipticCoeff); + cds->solver[is]->o_lambda = cds->o_ellipticCoeff; + cds->solver[is]->loffset = 0; + cds->solver[is]->options = cds->options[is]; + + cds->solver[is]->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + cds->solver[is]->EToB[f + e * mesh->Nfaces] = bcMap::type(bID, "scalar" + sid); + } + } + + ellipticSolveSetup(cds->solver[is]); + } + } + + if (nrs->flow) { + + if (platform->comm.mpiRank == 0) + printf("================ ELLIPTIC SETUP VELOCITY ================\n"); + + nrs->uvwSolver = NULL; + + bool unalignedBoundary = bcMap::unalignedBoundary(mesh->cht, "velocity"); + if (unalignedBoundary) { + if (!options.compareArgs("STRESSFORMULATION", "TRUE")) { + if (platform->comm.mpiRank == 0) + printf("ERROR: unaligned SHL/SYM boundaries require STRESSFORMULATION = TRUE\n"); + ABORT(EXIT_FAILURE); + } + } + + if (platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) + nrs->uvwSolver = new elliptic_t(); + + for (int bID = 1; bID <= bcMap::size(0); bID++) { + std::string bcTypeText(bcMap::text(bID, "velocity")); + if (platform->comm.mpiRank == 0) + printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); + } + + nrs->vOptions = options; + nrs->vOptions.setArgs("PGMRES RESTART", options.getArgs("VELOCITY PGMRES RESTART")); + nrs->vOptions.setArgs("KRYLOV SOLVER", options.getArgs("VELOCITY KRYLOV SOLVER")); + nrs->vOptions.setArgs("SOLVER TOLERANCE", options.getArgs("VELOCITY SOLVER TOLERANCE")); + nrs->vOptions.setArgs("LINEAR SOLVER STOPPING CRITERION", + options.getArgs("VELOCITY LINEAR SOLVER STOPPING CRITERION")); + nrs->vOptions.setArgs("DISCRETIZATION", options.getArgs("VELOCITY DISCRETIZATION")); + nrs->vOptions.setArgs("BASIS", options.getArgs("VELOCITY BASIS")); + nrs->vOptions.setArgs("PRECONDITIONER", options.getArgs("VELOCITY PRECONDITIONER")); + nrs->vOptions.setArgs("INITIAL GUESS", options.getArgs("VELOCITY INITIAL GUESS")); + nrs->vOptions.setArgs("RESIDUAL PROJECTION VECTORS", + options.getArgs("VELOCITY RESIDUAL PROJECTION VECTORS")); + nrs->vOptions.setArgs("RESIDUAL PROJECTION START", options.getArgs("VELOCITY RESIDUAL PROJECTION START")); + nrs->vOptions.setArgs("MULTIGRID COARSENING", options.getArgs("VELOCITY MULTIGRID COARSENING")); + nrs->vOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("VELOCITY MULTIGRID SMOOTHER")); + nrs->vOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", + options.getArgs("VELOCITY MULTIGRID CHEBYSHEV DEGREE")); + nrs->vOptions.setArgs("PARALMOND CYCLE", options.getArgs("VELOCITY PARALMOND CYCLE")); + nrs->vOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("VELOCITY PARALMOND SMOOTHER")); + nrs->vOptions.setArgs("PARALMOND PARTITION", options.getArgs("VELOCITY PARALMOND PARTITION")); + nrs->vOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", + options.getArgs("VELOCITY PARALMOND CHEBYSHEV DEGREE")); + nrs->vOptions.setArgs("PARALMOND AGGREGATION STRATEGY", + options.getArgs("VELOCITY PARALMOND AGGREGATION STRATEGY")); + nrs->vOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("VELOCITY MAXIMUM ITERATIONS")); + nrs->vOptions.setArgs("STABILIZATION METHOD", options.getArgs("VELOCITY STABILIZATION METHOD")); + nrs->vOptions.setArgs("HPFRT STRENGTH", options.getArgs("VELOCITY HPFRT STRENGTH")); + nrs->vOptions.setArgs("HPFRT MODES", options.getArgs("VELOCITY HPFRT MODES")); + + nrs->mOptions = options; + nrs->mOptions.setArgs("PGMRES RESTART", options.getArgs("MESH PGMRES RESTART")); + nrs->mOptions.setArgs("KRYLOV SOLVER", options.getArgs("MESH KRYLOV SOLVER")); + nrs->mOptions.setArgs("SOLVER TOLERANCE", options.getArgs("MESH SOLVER TOLERANCE")); + nrs->mOptions.setArgs("DISCRETIZATION", options.getArgs("MESH DISCRETIZATION")); + nrs->mOptions.setArgs("BASIS", options.getArgs("MESH BASIS")); + nrs->mOptions.setArgs("PRECONDITIONER", options.getArgs("MESH PRECONDITIONER")); + nrs->mOptions.setArgs("INITIAL GUESS", options.getArgs("MESH INITIAL GUESS")); + nrs->mOptions.setArgs("RESIDUAL PROJECTION VECTORS", options.getArgs("MESH RESIDUAL PROJECTION VECTORS")); + nrs->mOptions.setArgs("RESIDUAL PROJECTION START", options.getArgs("MESH RESIDUAL PROJECTION START")); + nrs->mOptions.setArgs("MULTIGRID COARSENING", options.getArgs("MESH MULTIGRID COARSENING")); + nrs->mOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("MESH MULTIGRID SMOOTHER")); + nrs->mOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", options.getArgs("MESH MULTIGRID CHEBYSHEV DEGREE")); + nrs->mOptions.setArgs("PARALMOND CYCLE", options.getArgs("MESH PARALMOND CYCLE")); + nrs->mOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("MESH PARALMOND SMOOTHER")); + nrs->mOptions.setArgs("PARALMOND PARTITION", options.getArgs("MESH PARALMOND PARTITION")); + nrs->mOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", options.getArgs("MESH PARALMOND CHEBYSHEV DEGREE")); + nrs->mOptions.setArgs("PARALMOND AGGREGATION STRATEGY", + options.getArgs("MESH PARALMOND AGGREGATION STRATEGY")); + nrs->mOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("MESH MAXIMUM ITERATIONS")); + + // coeff used by ellipticSetup to detect allNeumann + platform->linAlg->fill(2 * nrs->fieldOffset, 1.0, nrs->o_ellipticCoeff); + + const int velCoeffField = platform->options.compareArgs("VELOCITY COEFF FIELD", "TRUE"); + + if (nrs->uvwSolver) { + nrs->uvwSolver->blockSolver = 1; + nrs->uvwSolver->stressForm = 0; + if (options.compareArgs("STRESSFORMULATION", "TRUE")) + nrs->uvwSolver->stressForm = 1; + nrs->uvwSolver->Nfields = nrs->NVfields; + nrs->uvwSolver->Ntotal = nrs->fieldOffset; + nrs->uvwSolver->o_wrk = o_mempoolElliptic; + nrs->uvwSolver->mesh = mesh; + nrs->uvwSolver->options = nrs->vOptions; + nrs->uvwSolver->dim = nrs->dim; + nrs->uvwSolver->elementType = nrs->elementType; + nrs->uvwSolver->coeffField = velCoeffField; + nrs->uvwSolver->coeffFieldPreco = velCoeffField; + nrs->uvwSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->uvwSolver->loffset = 0; // use same ellipticCoeff for u,v and w + nrs->uvwSolver->poisson = 0; + nrs->uvwSolver->EToB = + (int *)calloc(mesh->Nelements * mesh->Nfaces * nrs->uvwSolver->Nfields, sizeof(int)); + for (int fld = 0; fld < nrs->uvwSolver->Nfields; fld++) { + std::string key; + if (fld == 0) + key = "x-velocity"; + if (fld == 1) + key = "y-velocity"; + if (fld == 2) + key = "z-velocity"; + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int offset = fld * mesh->Nelements * mesh->Nfaces; + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->uvwSolver->EToB[f + e * mesh->Nfaces + offset] = bcMap::type(bID, key); + } + } + } + + ellipticSolveSetup(nrs->uvwSolver); + if (unalignedBoundary) { + nrs->o_zeroNormalMaskVelocity = platform->device.malloc(3 * nrs->fieldOffset * sizeof(dfloat)); + nrs->o_EToBVVelocity = platform->device.malloc(nrs->meshV->Nlocal * sizeof(dlong)); + createEToBV(nrs->meshV, nrs->uvwSolver->EToB, nrs->o_EToBVVelocity); + createZeroNormalMask(nrs, nrs->uvwSolver->o_EToB, nrs->o_EToBVVelocity, nrs->o_zeroNormalMaskVelocity); + + nrs->uvwSolver->applyZeroNormalMask = + [nrs](dlong Nelements, occa::memory &o_elementList, occa::memory &o_x) { + applyZeroNormalMask(nrs, + Nelements, + o_elementList, + nrs->uvwSolver->o_EToB, + nrs->o_zeroNormalMaskVelocity, + o_x); + }; + } + } + else { + nrs->uSolver = new elliptic_t(); + nrs->uSolver->blockSolver = 0; + nrs->uSolver->Nfields = 1; + nrs->uSolver->Ntotal = nrs->fieldOffset; + nrs->uSolver->o_wrk = o_mempoolElliptic; + nrs->uSolver->mesh = mesh; + nrs->uSolver->options = nrs->vOptions; + nrs->uSolver->dim = nrs->dim; + nrs->uSolver->elementType = nrs->elementType; + nrs->uSolver->coeffField = velCoeffField; + nrs->uSolver->coeffFieldPreco = velCoeffField; + nrs->uSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->uSolver->loffset = 0; + nrs->uSolver->poisson = 0; + nrs->uSolver->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->uSolver->EToB[f + e * mesh->Nfaces] = bcMap::type(bID, "x-velocity"); + } + } + + ellipticSolveSetup(nrs->uSolver); + + nrs->vSolver = new elliptic_t(); + nrs->vSolver->blockSolver = 0; + nrs->vSolver->Nfields = 1; + nrs->vSolver->Ntotal = nrs->fieldOffset; + nrs->vSolver->o_wrk = o_mempoolElliptic; + nrs->vSolver->mesh = mesh; + nrs->vSolver->options = nrs->vOptions; + nrs->vSolver->dim = nrs->dim; + nrs->vSolver->elementType = nrs->elementType; + nrs->vSolver->coeffField = velCoeffField; + nrs->vSolver->coeffFieldPreco = velCoeffField; + nrs->vSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->vSolver->loffset = 0; + nrs->vSolver->poisson = 0; + nrs->vSolver->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->vSolver->EToB[f + e * mesh->Nfaces] = bcMap::type(bID, "y-velocity"); + } + } + + ellipticSolveSetup(nrs->vSolver); + + nrs->wSolver = new elliptic_t(); + nrs->wSolver->blockSolver = 0; + nrs->wSolver->Nfields = 1; + nrs->wSolver->Ntotal = nrs->fieldOffset; + nrs->wSolver->o_wrk = o_mempoolElliptic; + nrs->wSolver->mesh = mesh; + nrs->wSolver->options = nrs->vOptions; + nrs->wSolver->dim = nrs->dim; + nrs->wSolver->elementType = nrs->elementType; + nrs->wSolver->coeffField = velCoeffField; + nrs->wSolver->coeffFieldPreco = velCoeffField; + nrs->wSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->wSolver->loffset = 0; + nrs->wSolver->poisson = 0; + nrs->wSolver->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->wSolver->EToB[f + e * mesh->Nfaces] = bcMap::type(bID, "z-velocity"); + } + } + + ellipticSolveSetup(nrs->wSolver); + } + + if (platform->options.compareArgs("VELOCITY BLOCK SOLVER", "TRUE")) { + nrs->uvwSolver->name = "velocity"; + } + else { + nrs->uSolver->name = "x-velocity"; + nrs->vSolver->name = "y-velocity"; + nrs->wSolver->name = "z-velocity"; + } + } // flow + + if (nrs->flow) { + if (platform->comm.mpiRank == 0) + printf("================ ELLIPTIC SETUP PRESSURE ================\n"); + + nrs->pOptions = options; + nrs->pOptions.setArgs("PGMRES RESTART", options.getArgs("PRESSURE PGMRES RESTART")); + nrs->pOptions.setArgs("KRYLOV SOLVER", options.getArgs("PRESSURE KRYLOV SOLVER")); + nrs->pOptions.setArgs("SOLVER TOLERANCE", options.getArgs("PRESSURE SOLVER TOLERANCE")); + nrs->pOptions.setArgs("LINEAR SOLVER STOPPING CRITERION", + options.getArgs("PRESSURE LINEAR SOLVER STOPPING CRITERION")); + nrs->pOptions.setArgs("DISCRETIZATION", options.getArgs("PRESSURE DISCRETIZATION")); + nrs->pOptions.setArgs("BASIS", options.getArgs("PRESSURE BASIS")); + nrs->pOptions.setArgs("PRECONDITIONER", options.getArgs("PRESSURE PRECONDITIONER")); + nrs->pOptions.setArgs("SEMFEM SOLVER", options.getArgs("PRESSURE SEMFEM SOLVER")); + nrs->pOptions.setArgs("SEMFEM SOLVER PRECISION", options.getArgs("PRESSURE SEMFEM SOLVER PRECISION")); + nrs->pOptions.setArgs("MULTIGRID COARSENING", options.getArgs("PRESSURE MULTIGRID COARSENING")); + nrs->pOptions.setArgs("MULTIGRID SMOOTHER", options.getArgs("PRESSURE MULTIGRID SMOOTHER")); + nrs->pOptions.setArgs("MULTIGRID COARSE SOLVE", options.getArgs("PRESSURE MULTIGRID COARSE SOLVE")); + nrs->pOptions.setArgs("MULTIGRID COARSE SEMFEM", options.getArgs("PRESSURE MULTIGRID COARSE SEMFEM")); + nrs->pOptions.setArgs("MULTIGRID DOWNWARD SMOOTHER", + options.getArgs("PRESSURE MULTIGRID DOWNWARD SMOOTHER")); + nrs->pOptions.setArgs("MULTIGRID UPWARD SMOOTHER", options.getArgs("PRESSURE MULTIGRID UPWARD SMOOTHER")); + nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV DEGREE", + options.getArgs("PRESSURE MULTIGRID CHEBYSHEV DEGREE")); + nrs->pOptions.setArgs("PARALMOND CYCLE", options.getArgs("PRESSURE PARALMOND CYCLE")); + nrs->pOptions.setArgs("PARALMOND SMOOTHER", options.getArgs("PRESSURE MULTIGRID SMOOTHER")); + nrs->pOptions.setArgs("PARALMOND PARTITION", options.getArgs("PRESSURE PARALMOND PARTITION")); + nrs->pOptions.setArgs("PARALMOND CHEBYSHEV DEGREE", + options.getArgs("PRESSURE PARALMOND CHEBYSHEV DEGREE")); + nrs->pOptions.setArgs("PARALMOND AGGREGATION STRATEGY", + options.getArgs("PRESSURE PARALMOND AGGREGATION STRATEGY")); + nrs->pOptions.setArgs("INITIAL GUESS", options.getArgs("PRESSURE INITIAL GUESS")); + nrs->pOptions.setArgs("RESIDUAL PROJECTION VECTORS", + options.getArgs("PRESSURE RESIDUAL PROJECTION VECTORS")); + nrs->pOptions.setArgs("RESIDUAL PROJECTION START", options.getArgs("PRESSURE RESIDUAL PROJECTION START")); + nrs->pOptions.setArgs("MULTIGRID VARIABLE COEFFICIENT", "FALSE"); + nrs->pOptions.setArgs("MAXIMUM ITERATIONS", options.getArgs("PRESSURE MAXIMUM ITERATIONS")); + nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV MAX EIGENVALUE BOUND FACTOR", + options.getArgs("PRESSURE MULTIGRID CHEBYSHEV MAX EIGENVALUE BOUND FACTOR")); + nrs->pOptions.setArgs("MULTIGRID CHEBYSHEV MIN EIGENVALUE BOUND FACTOR", + options.getArgs("PRESSURE MULTIGRID CHEBYSHEV MIN EIGENVALUE BOUND FACTOR")); + + nrs->pSolver = new elliptic_t(); + nrs->pSolver->name = "pressure"; + nrs->pSolver->blockSolver = 0; + nrs->pSolver->Nfields = 1; + nrs->pSolver->Ntotal = nrs->fieldOffset; + nrs->pSolver->o_wrk = o_mempoolElliptic; + nrs->pSolver->mesh = mesh; + nrs->pSolver->dim = nrs->dim; + nrs->pSolver->elementType = nrs->elementType; + + int pCoeffField = 0; + if (platform->options.compareArgs("LOWMACH", "TRUE")) + pCoeffField = 1; + + nrs->pSolver->coeffField = pCoeffField; + nrs->pSolver->coeffFieldPreco = pCoeffField; + nrs->pSolver->poisson = 1; + + // lambda0 = 1/rho lambda1 = 0 + platform->linAlg->fill(2 * nrs->fieldOffset, 0.0, nrs->o_ellipticCoeff); + nrs->o_ellipticCoeff.copyFrom(nrs->o_rho, nrs->fieldOffset * sizeof(dfloat)); + platform->linAlg->ady(mesh->Nlocal, 1.0, nrs->o_ellipticCoeff); + nrs->pSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->pSolver->loffset = 0; // Poisson + nrs->pSolver->options = nrs->pOptions; + { + const std::vector levels = determineMGLevels("pressure"); + nrs->pSolver->nLevels = levels.size(); + nrs->pSolver->levels = (int *)calloc(nrs->pSolver->nLevels, sizeof(int)); + for (int i = 0; i < nrs->pSolver->nLevels; ++i) + nrs->pSolver->levels[i] = levels.at(i); + } + nrs->pSolver->EToB = (int *)calloc(mesh->Nelements * mesh->Nfaces, sizeof(int)); + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->pSolver->EToB[f + e * mesh->Nfaces] = bcMap::type(bID, "pressure"); + } + } + + ellipticSolveSetup(nrs->pSolver); + + } // flow + if (nrs->flow) { + + if (options.compareArgs("MESH SOLVER", "ELASTICITY")) { + + bool unalignedBoundary = bcMap::unalignedBoundary(mesh->cht, "mesh"); + if (unalignedBoundary) { + if (platform->comm.mpiRank == 0) { + printf("ERROR: unaligned SYM/SHL boundary condition are currently not supported with the mesh " + "solver.\n"); + } + ABORT(EXIT_FAILURE); + } + + if (platform->comm.mpiRank == 0) + printf("================ ELLIPTIC SETUP MESH ================\n"); + + for (int bID = 1; bID <= bcMap::size(0); bID++) { + std::string bcTypeText(bcMap::text(bID, "mesh")); + if (platform->comm.mpiRank == 0) + printf("bID %d -> bcType %s\n", bID, bcTypeText.c_str()); + } + + const int meshCoeffField = platform->options.compareArgs("MESH COEFF FIELD", "TRUE"); + platform->linAlg->fill(2 * nrs->fieldOffset, 1.0, nrs->o_ellipticCoeff); + + nrs->meshSolver = new elliptic_t(); + nrs->meshSolver->name = "mesh"; + nrs->meshSolver->blockSolver = 1; + nrs->meshSolver->stressForm = 1; + nrs->meshSolver->Nfields = nrs->NVfields; + nrs->meshSolver->Ntotal = nrs->fieldOffset; + nrs->meshSolver->o_wrk = o_mempoolElliptic; + nrs->meshSolver->mesh = mesh; + nrs->meshSolver->options = nrs->mOptions; + nrs->meshSolver->dim = nrs->dim; + nrs->meshSolver->elementType = nrs->elementType; + nrs->meshSolver->coeffField = meshCoeffField; + nrs->meshSolver->coeffFieldPreco = meshCoeffField; + nrs->meshSolver->o_lambda = nrs->o_ellipticCoeff; + nrs->meshSolver->loffset = 0; // use same ellipticCoeff for u,v and w + nrs->meshSolver->poisson = 0; + + nrs->meshSolver->EToB = + (int *)calloc(mesh->Nelements * mesh->Nfaces * nrs->meshSolver->Nfields, sizeof(int)); + for (int fld = 0; fld < nrs->meshSolver->Nfields; fld++) { + std::string key; + if (fld == 0) + key = "x-mesh"; + if (fld == 1) + key = "y-mesh"; + if (fld == 2) + key = "z-mesh"; + for (dlong e = 0; e < mesh->Nelements; e++) { + for (int f = 0; f < mesh->Nfaces; f++) { + const int offset = fld * mesh->Nelements * mesh->Nfaces; + const int bID = mesh->EToB[f + e * mesh->Nfaces]; + nrs->meshSolver->EToB[f + e * mesh->Nfaces + offset] = bcMap::type(bID, key); + } + } + } + + ellipticSolveSetup(nrs->meshSolver); + if (unalignedBoundary) { + nrs->o_zeroNormalMaskMeshVelocity = platform->device.malloc(3 * nrs->fieldOffset * sizeof(dfloat)); + nrs->o_EToBVMeshVelocity = platform->device.malloc(nrs->meshV->Nlocal * sizeof(dlong)); + createEToBV(nrs->meshV, nrs->meshSolver->EToB, nrs->o_EToBVMeshVelocity); + createZeroNormalMask(nrs, nrs->meshSolver->o_EToB, nrs->o_EToBVMeshVelocity, nrs->o_zeroNormalMaskMeshVelocity); + nrs->meshSolver->applyZeroNormalMask = + [nrs](dlong Nelements, occa::memory &o_elementList, occa::memory &o_x) { + applyZeroNormalMask(nrs, + Nelements, + o_elementList, + nrs->meshSolver->o_EToB, + nrs->o_zeroNormalMaskMeshVelocity, + o_x); + }; + } + } + } +} diff --git a/src/core/setup.hpp b/src/setup/setup.hpp similarity index 100% rename from src/core/setup.hpp rename to src/setup/setup.hpp diff --git a/src/core/cfl.cpp b/src/timeStepper/cfl.cpp similarity index 61% rename from src/core/cfl.cpp rename to src/timeStepper/cfl.cpp index 3515049b7..653a2f660 100644 --- a/src/core/cfl.cpp +++ b/src/timeStepper/cfl.cpp @@ -4,24 +4,23 @@ static int firstTime = 1; -void setup(nrs_t* nrs) +void setup(nrs_t *nrs) { - mesh_t* mesh = nrs->meshV; - + mesh_t *mesh = nrs->meshV; - dfloat* dH; - if(nrs->elementType == QUADRILATERALS || nrs->elementType == HEXAHEDRA) { - dH = (dfloat*) calloc((mesh->N + 1),sizeof(dfloat)); + dfloat *dH; + if (nrs->elementType == QUADRILATERALS || nrs->elementType == HEXAHEDRA) { + dH = (dfloat *)calloc((mesh->N + 1), sizeof(dfloat)); - for(int n = 0; n < (mesh->N + 1); n++) { - if(n == 0) + for (int n = 0; n < (mesh->N + 1); n++) { + if (n == 0) dH[n] = mesh->gllz[n + 1] - mesh->gllz[n]; - else if(n == mesh->N) + else if (n == mesh->N) dH[n] = mesh->gllz[n] - mesh->gllz[n - 1]; else - dH[n] = 0.5 * ( mesh->gllz[n + 1] - mesh->gllz[n - 1]); + dH[n] = 0.5 * (mesh->gllz[n + 1] - mesh->gllz[n - 1]); } - for(int n = 0; n < (mesh->N + 1); n++) + for (int n = 0; n < (mesh->N + 1); n++) dH[n] = 1.0 / dH[n]; nrs->o_idH = platform->device.malloc((mesh->N + 1) * sizeof(dfloat), dH); @@ -30,11 +29,12 @@ void setup(nrs_t* nrs) firstTime = 0; } -dfloat computeCFL(nrs_t* nrs) +dfloat computeCFL(nrs_t *nrs) { - mesh_t* mesh = nrs->meshV; - - if(firstTime) setup(nrs); + mesh_t *mesh = nrs->meshV; + + if (firstTime) + setup(nrs); // Compute cfl factors i.e. dt* U / h nrs->cflKernel(mesh->Nelements, @@ -51,8 +51,8 @@ dfloat computeCFL(nrs_t* nrs) // finish reduction dfloat cfl = 0.f; - for(dlong n = 0; n < mesh->Nelements; ++n) - cfl = mymax(cfl, platform->mempool.slice0[n]); + for (dlong n = 0; n < mesh->Nelements; ++n) + cfl = mymax(cfl, platform->mempool.slice0[n]); dfloat gcfl = 0.f; MPI_Allreduce(&cfl, &gcfl, 1, MPI_DFLOAT, MPI_MAX, platform->comm.mpiComm); diff --git a/src/core/cfl.hpp b/src/timeStepper/cfl.hpp similarity index 72% rename from src/core/cfl.hpp rename to src/timeStepper/cfl.hpp index d8d50df8c..ff4453f8f 100644 --- a/src/core/cfl.hpp +++ b/src/timeStepper/cfl.hpp @@ -2,6 +2,6 @@ #define nekrs_cfl_hpp_ #include "nrs.hpp" -dfloat computeCFL(nrs_t* nrs); +dfloat computeCFL(nrs_t *nrs); #endif diff --git a/src/timeStepper/subCycling.cpp b/src/timeStepper/subCycling.cpp new file mode 100644 index 000000000..0882f55c1 --- /dev/null +++ b/src/timeStepper/subCycling.cpp @@ -0,0 +1,768 @@ +#include +#include +#include + +#include "linAlg.hpp" +#include "nrs.hpp" + +static void flops(mesh_t *mesh, int Nfields) +{ + const auto cubNq = mesh->cubNq; + const auto cubNp = mesh->cubNp; + const auto Nq = mesh->Nq; + const auto Np = mesh->Np; + const auto nEXT = 3; + const auto Nelements = mesh->Nelements; + double flopCount = 0.0; // per elem basis + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { + flopCount += 6. * cubNp * nEXT; // extrapolate U(r,s,t) to current time + flopCount += 6. * cubNp * cubNq * Nfields; // apply Dcub + flopCount += 3. * Np * Nfields; // compute NU + flopCount += 4. * Nq * (cubNp + cubNq * cubNq * Nq + cubNq * Nq * Nq) * Nfields; // interpolation + } + else { + flopCount = Nq * Nq * Nq * (6. * Nq + 6. * nEXT + 8.) * Nfields; + } + flopCount *= Nelements; + + platform->flopCounter->add("subcycling", flopCount); +} + +occa::memory velocitySubCycleMovingMesh(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U) +{ + mesh_t* mesh = nrs->meshV; + linAlg_t* linAlg = platform->linAlg; + + occa::memory &o_p0 = platform->o_mempool.slice0; + occa::memory &o_u1 = platform->o_mempool.slice3; + + occa::memory &o_r1 = platform->o_mempool.slice6; + occa::memory &o_r2 = platform->o_mempool.slice9; + occa::memory &o_r3 = platform->o_mempool.slice12; + occa::memory &o_r4 = platform->o_mempool.slice15; + + occa::memory &o_LMMe = platform->o_mempool.slice18; + + // Solve for Each SubProblem + for (int torder = nEXT - 1; torder >= 0; torder--) { + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + dlong toffset = torder * nrs->NVfields * nrs->fieldOffset; + const dlong offset = torder * nrs->fieldOffset; + nrs->subCycleInitU0Kernel(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + torder, + nEXT, + toffset, + offset, + nrs->coeffBDF[torder], + mesh->o_LMM, + o_U, + o_p0); + + // Advance subproblem from here from t^(n-torder) to t^(n-torder+1) + dfloat tsub = time; + for (int i = torder; i > 0; i--) + tsub -= nrs->dt[i]; + const dfloat sdt = nrs->dt[torder] / nrs->Nsubsteps; + + for (int ststep = 0; ststep < nrs->Nsubsteps; ++ststep) { + const dfloat tstage = tsub + ststep * sdt; + + o_u1.copyFrom(o_p0, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); + + for (int rk = 0; rk < nrs->nRK; ++rk) { + occa::memory o_rhs; + if (rk == 0) + o_rhs = o_r1; + if (rk == 1) + o_rhs = o_r2; + if (rk == 2) + o_rhs = o_r3; + if (rk == 3) + o_rhs = o_r4; + // Extrapolate velocity to subProblem stage time + const dfloat t = tstage + sdt * nrs->nodesRK[rk]; + const dfloat tn0 = time; + const dfloat tn1 = time - nrs->dt[1]; + const dfloat tn2 = time - (nrs->dt[1] + nrs->dt[2]); + dfloat extC[3] = {0., 0., 0.}; + switch (nEXT) { + case 1: + extC[0] = 1; + extC[1] = 0; + extC[2] = 0; + break; + case 2: + extC[0] = (t - tn1) / (tn0 - tn1); + extC[1] = (t - tn0) / (tn1 - tn0); + extC[2] = 0; + break; + case 3: + extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); + extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); + extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); + break; + } + + nrs->nStagesSum3Kernel(mesh->Nlocal, + nrs->fieldOffset, + nEXT, + extC[0], + extC[1], + extC[2], + mesh->o_LMM, + o_LMMe); + linAlg->aydxMany(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + 0, + 1.0, + o_LMMe, + o_u1); + + if (mesh->NglobalGatherElements) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + nrs->subCycleStrongCubatureVolumeKernel(mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + nrs->fieldOffset, + nrs->cubatureOffset, + 0, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_relUrst, + o_u1, + o_rhs); + else + nrs->subCycleStrongVolumeKernel(mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_D, + nrs->fieldOffset, + 0, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_relUrst, + o_u1, + o_rhs); + } + + oogs::start(o_rhs, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsAdd, + nrs->gsh); + + if (mesh->NlocalGatherElements) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + nrs->subCycleStrongCubatureVolumeKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + nrs->fieldOffset, + nrs->cubatureOffset, + 0, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_relUrst, + o_u1, + o_rhs); + else + nrs->subCycleStrongVolumeKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_D, + nrs->fieldOffset, + 0, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_relUrst, + o_u1, + o_rhs); + } + + oogs::finish(o_rhs, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsAdd, + nrs->gsh); + + flops(nrs->meshV, nrs->NVfields); + + linAlg->axmyMany(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + 0, + 1.0, + o_LMMe, + o_rhs); + + if (rk != 3) + linAlg->axpbyzMany(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + 1.0, + o_p0, + -sdt * nrs->coeffsfRK[rk + 1], + o_rhs, + o_u1); + else + nrs->subCycleRKKernel(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + sdt, + nrs->o_weightsRK, + o_r1, + o_r2, + o_r3, + o_r4, + o_p0); + } + } + } + return o_p0; +} +occa::memory velocitySubCycle( + nrs_t *nrs, int nEXT, dfloat time, occa::memory o_U) { + mesh_t *mesh = nrs->meshV; + linAlg_t *linAlg = platform->linAlg; + + // Solve for Each SubProblem + for (int torder = nEXT - 1; torder >= 0; torder--) { + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + dlong toffset = torder * nrs->NVfields * nrs->fieldOffset; + nrs->subCycleInitU0Kernel(mesh->Nlocal, + nrs->NVfields, + nrs->fieldOffset, + torder, + nEXT, + toffset, + 0, + nrs->coeffBDF[torder], + mesh->o_LMM, + o_U, + platform->o_mempool.slice0); + + // Advance subproblem from here from t^(n-torder) to t^(n-torder+1) + dfloat tsub = time; + for (int i = torder; i > 0; i--) + tsub -= nrs->dt[i]; + const dfloat sdt = nrs->dt[torder] / nrs->Nsubsteps; + + for (int ststep = 0; ststep < nrs->Nsubsteps; ++ststep) { + const dfloat tstage = tsub + ststep * sdt; + + platform->o_mempool.slice0.copyFrom(platform->o_mempool.slice0, + nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), + nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), + 0); + + for (int rk = 0; rk < nrs->nRK; ++rk) { + // Extrapolate velocity to subProblem stage time + const dfloat t = tstage + sdt * nrs->nodesRK[rk]; + const dfloat tn0 = time; + const dfloat tn1 = time - nrs->dt[1]; + const dfloat tn2 = time - (nrs->dt[1] + nrs->dt[2]); + dfloat extC[3] = {0., 0., 0.}; + switch (nEXT) { + case 1: + extC[0] = 1; + extC[1] = 0; + extC[2] = 0; + break; + case 2: + extC[0] = (t - tn1) / (tn0 - tn1); + extC[1] = (t - tn0) / (tn1 - tn0); + extC[2] = 0; + break; + case 3: + extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); + extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); + extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); + break; + } + + if (mesh->NglobalGatherElements) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + nrs->subCycleStrongCubatureVolumeKernel(mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + nrs->fieldOffset, + nrs->cubatureOffset, + rk * nrs->NVfields * nrs->fieldOffset, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice6); + else + nrs->subCycleStrongVolumeKernel(mesh->NglobalGatherElements, + mesh->o_globalGatherElementList, + mesh->o_D, + nrs->fieldOffset, + rk * nrs->NVfields * nrs->fieldOffset, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice6); + } + + occa::memory o_rhs; + if (rk == 0) + o_rhs = platform->o_mempool.slice6; + if (rk == 1) + o_rhs = platform->o_mempool.slice9; + if (rk == 2) + o_rhs = platform->o_mempool.slice12; + if (rk == 3) + o_rhs = platform->o_mempool.slice15; + + oogs::start(o_rhs, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsAdd, + nrs->gsh); + + if (mesh->NlocalGatherElements) { + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + nrs->subCycleStrongCubatureVolumeKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + nrs->fieldOffset, + nrs->cubatureOffset, + rk * nrs->NVfields * nrs->fieldOffset, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice6); + else + nrs->subCycleStrongVolumeKernel(mesh->NlocalGatherElements, + mesh->o_localGatherElementList, + mesh->o_D, + nrs->fieldOffset, + rk * nrs->NVfields * nrs->fieldOffset, + mesh->o_invLMM, + mesh->o_divU, + extC[0], + extC[1], + extC[2], + nrs->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice6); + } + + oogs::finish(o_rhs, + nrs->NVfields, + nrs->fieldOffset, + ogsDfloat, + ogsAdd, + nrs->gsh); + + flops(nrs->meshV, nrs->NVfields); + + nrs->subCycleRKUpdateKernel(mesh->Nlocal, + rk, + sdt, + nrs->fieldOffset, + nrs->o_coeffsfRK, + nrs->o_weightsRK, + platform->o_mempool.slice3, + platform->o_mempool.slice6, + platform->o_mempool.slice0); + } + } + } + linAlg->axmyMany(mesh->Nlocal, + 3, + nrs->fieldOffset, + 0, + 1.0, + mesh->o_LMM, + platform->o_mempool.slice0); + return platform->o_mempool.slice0; +} + +occa::memory scalarSubCycleMovingMesh(cds_t *cds, + int nEXT, + dfloat time, + int is, + occa::memory o_U, + occa::memory o_S) { + + linAlg_t *linAlg = platform->linAlg; + + occa::memory &o_r1 = platform->o_mempool.slice2; + occa::memory &o_r2 = platform->o_mempool.slice3; + occa::memory &o_r3 = platform->o_mempool.slice4; + occa::memory &o_r4 = platform->o_mempool.slice5; + + occa::memory &o_p0 = platform->o_mempool.slice0; + occa::memory &o_u1 = platform->o_mempool.slice6; + + occa::memory &o_LMMe = platform->o_mempool.slice1; + + // Solve for Each SubProblem + for (int torder = (nEXT - 1); torder >= 0; torder--) { + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + const dlong toffset = + cds->fieldOffsetScan[is] + torder * cds->fieldOffsetSum; + const dlong offset = torder * cds->fieldOffset[is]; + cds->subCycleInitU0Kernel(cds->mesh[0]->Nlocal, + 1, + cds->fieldOffset[is], + torder, + nEXT, + toffset, + offset, + cds->coeffBDF[torder], + cds->mesh[0]->o_LMM, + o_S, + o_p0); + + // Advance SubProblem to t^(n-torder+1) + dfloat tsub = time; + for (int i = torder; i > 0; i--) + tsub -= cds->dt[i]; + const dfloat sdt = cds->dt[torder] / cds->Nsubsteps; + + for (int ststep = 0; ststep < cds->Nsubsteps; ++ststep) { + const dfloat tstage = tsub + ststep * sdt; + o_u1.copyFrom(o_p0, cds->mesh[0]->Nlocal * sizeof(dfloat)); + for (int rk = 0; rk < cds->nRK; ++rk) { + occa::memory o_rhs; + if (rk == 0) + o_rhs = o_r1; + if (rk == 1) + o_rhs = o_r2; + if (rk == 2) + o_rhs = o_r3; + if (rk == 3) + o_rhs = o_r4; + + // Extrapolate velocity to subProblem stage time + const dfloat t = tstage + sdt * cds->nodesRK[rk]; + const dfloat tn0 = time; + const dfloat tn1 = time - cds->dt[1]; + const dfloat tn2 = time - (cds->dt[1] + cds->dt[2]); + dfloat extC[3] = {0., 0., 0.}; + switch (nEXT) { + case 1: + extC[0] = 1; + extC[1] = 0; + extC[2] = 0; + break; + case 2: + extC[0] = (t - tn1) / (tn0 - tn1); + extC[1] = (t - tn0) / (tn1 - tn0); + extC[2] = 0; + break; + case 3: + extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); + extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); + extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); + break; + } + cds->nStagesSum3Kernel(cds->mesh[0]->Nlocal, + cds->vFieldOffset, + nEXT, + extC[0], + extC[1], + extC[2], + cds->mesh[0]->o_LMM, + o_LMMe); + linAlg->aydx(cds->mesh[0]->Nlocal, 1.0, o_LMMe, o_u1); + + if (cds->meshV->NglobalGatherElements) { + if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel(cds->meshV->NglobalGatherElements, + cds->meshV->o_globalGatherElementList, + cds->meshV->o_cubDiffInterpT, + cds->meshV->o_cubInterpT, + cds->vFieldOffset, + cds->vCubatureOffset, + 0, + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_relUrst, + o_u1, + o_rhs); + else + cds->subCycleStrongVolumeKernel(cds->meshV->NglobalGatherElements, + cds->meshV->o_globalGatherElementList, + cds->meshV->o_D, + cds->vFieldOffset, + 0, + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_relUrst, + o_u1, + o_rhs); + } + + oogs::start( + o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); + + if (cds->meshV->NlocalGatherElements) { + if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel(cds->meshV->NlocalGatherElements, + cds->meshV->o_localGatherElementList, + cds->meshV->o_cubDiffInterpT, + cds->meshV->o_cubInterpT, + cds->vFieldOffset, + cds->vCubatureOffset, + 0, + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_relUrst, + o_u1, + o_rhs); + else + cds->subCycleStrongVolumeKernel(cds->meshV->NlocalGatherElements, + cds->meshV->o_localGatherElementList, + cds->meshV->o_D, + cds->vFieldOffset, + 0, + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_relUrst, + o_u1, + o_rhs); + } + + oogs::finish( + o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); + + flops(cds->mesh[0], 1); + + linAlg->axmy(cds->mesh[0]->Nlocal, 1.0, o_LMMe, o_rhs); + if (rk != 3) + linAlg->axpbyz(cds->mesh[0]->Nlocal, + 1.0, + o_p0, + -sdt * cds->coeffsfRK[rk + 1], + o_rhs, + o_u1); + else + cds->subCycleRKKernel(cds->mesh[0]->Nlocal, + sdt, + cds->o_weightsRK, + o_r1, + o_r2, + o_r3, + o_r4, + o_p0); + } + } + } + return o_p0; +} + +occa::memory scalarSubCycle(cds_t *cds, + int nEXT, + dfloat time, + int is, + occa::memory o_U, + occa::memory o_S) { + linAlg_t *linAlg = platform->linAlg; + + // Solve for Each SubProblem + for (int torder = (nEXT - 1); torder >= 0; torder--) { + // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) + const dlong toffset = + cds->fieldOffsetScan[is] + torder * cds->fieldOffsetSum; + cds->subCycleInitU0Kernel(cds->mesh[0]->Nlocal, + 1, + cds->fieldOffset[is], + torder, + nEXT, + toffset, + 0, + cds->coeffBDF[torder], + cds->mesh[0]->o_LMM, + o_S, + platform->o_mempool.slice0); + + // Advance SubProblem to t^(n-torder+1) + dfloat tsub = time; + for (int i = torder; i > 0; i--) + tsub -= cds->dt[i]; + const dfloat sdt = cds->dt[torder] / cds->Nsubsteps; + + for (int ststep = 0; ststep < cds->Nsubsteps; ++ststep) { + const dfloat tstage = tsub + ststep * sdt; + + platform->o_mempool.slice0.copyFrom(platform->o_mempool.slice0, + cds->fieldOffset[is] * sizeof(dfloat), + cds->fieldOffset[is] * sizeof(dfloat), + 0); + + for (int rk = 0; rk < cds->nRK; ++rk) { + // Extrapolate velocity to subProblem stage time + const dfloat t = tstage + sdt * cds->nodesRK[rk]; + const dfloat tn0 = time; + const dfloat tn1 = time - cds->dt[1]; + const dfloat tn2 = time - (cds->dt[1] + cds->dt[2]); + dfloat extC[3] = {0., 0., 0.}; + switch (nEXT) { + case 1: + extC[0] = 1; + extC[1] = 0; + extC[2] = 0; + break; + case 2: + extC[0] = (t - tn1) / (tn0 - tn1); + extC[1] = (t - tn0) / (tn1 - tn0); + extC[2] = 0; + break; + case 3: + extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); + extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); + extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); + break; + } + + if (cds->meshV->NglobalGatherElements) { + if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel( + cds->meshV->NglobalGatherElements, + cds->meshV->o_globalGatherElementList, + cds->meshV->o_cubDiffInterpT, + cds->meshV->o_cubInterpT, + cds->vFieldOffset, + cds->vCubatureOffset, + rk * cds->fieldOffset[is], + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice2); + else + cds->subCycleStrongVolumeKernel(cds->meshV->NglobalGatherElements, + cds->meshV->o_globalGatherElementList, + cds->meshV->o_D, + cds->vFieldOffset, + rk * cds->fieldOffset[is], + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice2); + } + + occa::memory o_rhs; + if (rk == 0) + o_rhs = platform->o_mempool.slice2; + if (rk == 1) + o_rhs = platform->o_mempool.slice3; + if (rk == 2) + o_rhs = platform->o_mempool.slice4; + if (rk == 3) + o_rhs = platform->o_mempool.slice5; + + oogs::start( + o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); + + if (cds->meshV->NlocalGatherElements) { + if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) + cds->subCycleStrongCubatureVolumeKernel( + cds->meshV->NlocalGatherElements, + cds->meshV->o_localGatherElementList, + cds->meshV->o_cubDiffInterpT, + cds->meshV->o_cubInterpT, + cds->vFieldOffset, + cds->vCubatureOffset, + rk * cds->fieldOffset[is], + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice2); + else + cds->subCycleStrongVolumeKernel(cds->meshV->NlocalGatherElements, + cds->meshV->o_localGatherElementList, + cds->meshV->o_D, + cds->vFieldOffset, + rk * cds->fieldOffset[is], + cds->mesh[0]->o_invLMM, + cds->mesh[0]->o_divU, + extC[0], + extC[1], + extC[2], + cds->o_Urst, + platform->o_mempool.slice0, + platform->o_mempool.slice2); + } + + oogs::finish( + o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); + + flops(cds->mesh[0], 1); + + cds->subCycleRKUpdateKernel(cds->meshV->Nlocal, + rk, + sdt, + cds->fieldOffset[is], + cds->o_coeffsfRK, + cds->o_weightsRK, + platform->o_mempool.slice1, + platform->o_mempool.slice2, + platform->o_mempool.slice0); + } + } + } + linAlg->axmy(cds->mesh[0]->Nlocal, + 1.0, + cds->mesh[0]->o_LMM, + platform->o_mempool.slice0); + return platform->o_mempool.slice0; +} + diff --git a/src/timeStepper/subCycling.hpp b/src/timeStepper/subCycling.hpp new file mode 100644 index 000000000..8a9629ed5 --- /dev/null +++ b/src/timeStepper/subCycling.hpp @@ -0,0 +1,14 @@ +#if !defined(nekrs_subcycle_hpp_) +#define nekrs_subcycle_hpp_ + +#include "nrs.hpp" + +occa::memory velocitySubCycle(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U); +occa::memory velocitySubCycleMovingMesh(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U); +occa::memory scalarSubCycleMovingMesh(cds_t *cds, int nEXT, dfloat time, + int is, occa::memory o_U, + occa::memory o_S); +occa::memory scalarSubCycle(cds_t *cds, int nEXT, dfloat time, int is, + occa::memory o_U, occa::memory o_S); + +#endif diff --git a/src/timeStepper/timeStepper.cpp b/src/timeStepper/timeStepper.cpp index 19d6b3fc3..bf683c9fe 100644 --- a/src/timeStepper/timeStepper.cpp +++ b/src/timeStepper/timeStepper.cpp @@ -2,15 +2,44 @@ #include #include +#include "nrs.hpp" #include "avm.hpp" #include "cfl.hpp" #include "constantFlowRate.hpp" -#include "linAlg.hpp" #include "nekInterfaceAdapter.hpp" -#include "nrs.hpp" #include "timeStepper.hpp" #include "tombo.hpp" +#include "subCycling.hpp" #include "udf.hpp" +#include "bcMap.hpp" +#include "bdry.hpp" + +namespace { + +void advectionFlops(mesh_t *mesh, int Nfields) +{ + const auto cubNq = mesh->cubNq; + const auto cubNp = mesh->cubNp; + const auto Nq = mesh->Nq; + const auto Np = mesh->Np; + const auto Nelements = mesh->Nelements; + double flopCount = 0.0; // per elem basis + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { + flopCount += 4. * Nq * (cubNp + cubNq * cubNq * Nq + cubNq * Nq * Nq); // interpolation + flopCount += 6. * cubNp * cubNq; // apply Dcub + flopCount += 5 * cubNp; // compute advection term on cubature mesh + flopCount += mesh->Np; // weight by inv. mass matrix + } + else { + flopCount += 8 * (Np * Nq + Np); + } + + flopCount *= Nelements; + flopCount *= Nfields; + + platform->flopCounter->add("advection", flopCount); +} +} // namespace void evaluateProperties(nrs_t *nrs, const double timeNew) { platform->timer.tic("udfProperties", 1); @@ -42,9 +71,9 @@ void evaluateProperties(nrs_t *nrs, const double timeNew) { namespace timeStepper { -double tElapsed = 0; -double tElapsedStepMin = std::numeric_limits::max(); -double tElapsedStepMax = std::numeric_limits::min(); +double tSolve = 0; +double tSolveStepMin = std::numeric_limits::max(); +double tSolveStepMax = std::numeric_limits::min(); void adjustDt(nrs_t* nrs, int tstep) { @@ -111,6 +140,8 @@ void adjustDt(nrs_t* nrs, int tstep) } } + nrs->CFL = CFL; + return; } const double unitTimeCFLold = (tstep == 1) ? CFL/nrs->dt[0] : nrs->unitTimeCFL; @@ -155,17 +186,11 @@ void adjustDt(nrs_t* nrs, int tstep) } } -void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { - const double tStart = MPI_Wtime(); - +void extrapolate(nrs_t *nrs) +{ mesh_t *mesh = nrs->meshV; - cds_t *cds = nrs->cds; - coeffs(nrs, dt, tstep); - - const bool movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); - if (nrs->flow) nrs->extrapolateKernel(mesh->Nelements, nrs->NVfields, @@ -183,18 +208,25 @@ void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { cds->o_coeffEXT, cds->o_S, cds->o_Se); +} + +void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) +{ + const double tStart = MPI_Wtime(); + mesh_t *mesh = nrs->meshV; + cds_t *cds = nrs->cds; + + const bool movingMesh = platform->options.compareArgs("MOVING MESH", "TRUE"); + + coeffs(nrs, dt, tstep); - dlong cubatureOffset; - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - cubatureOffset = std::max(nrs->fieldOffset, mesh->Nelements * mesh->cubNp); - else - cubatureOffset = nrs->fieldOffset; + extrapolate(nrs); if (nrs->Nsubsteps) { mesh_t *mesh = nrs->meshV; if (nrs->cht) mesh = nrs->cds->mesh[0]; - const dlong NbyteCubature = nrs->NVfields * cubatureOffset * sizeof(dfloat); + const dlong NbyteCubature = nrs->NVfields * nrs->cubatureOffset * sizeof(dfloat); for (int s = nrs->nEXT; s > 1; s--) { const dlong Nbyte = nrs->fieldOffset * sizeof(dfloat); if (movingMesh) { @@ -211,38 +243,49 @@ void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { (s - 2) * NbyteCubature); } } - if (movingMesh) + if (movingMesh) { + double flops = 18 * (mesh->Np * mesh->Nq + mesh->Np); + flops *= static_cast(mesh->Nelements); nrs->divergenceVolumeKernel(mesh->Nelements, mesh->o_vgeo, mesh->o_D, nrs->fieldOffset, mesh->o_U, mesh->o_divU); + platform->flopCounter->add("divergenceVolumeKernel", flops); + } } const bool relative = movingMesh && nrs->Nsubsteps; occa::memory &o_Urst = relative ? nrs->o_relUrst : nrs->o_Urst; mesh = nrs->meshV; - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) + double flopCount = 0.0; + + if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) { nrs->UrstCubatureKernel(mesh->Nelements, - mesh->o_D, - mesh->o_x, - mesh->o_y, - mesh->o_z, - mesh->o_cubInterpT, - mesh->o_cubw, - nrs->fieldOffset, - cubatureOffset, - nrs->o_U, - mesh->o_U, - o_Urst); - else + mesh->o_cubvgeo, + mesh->o_cubInterpT, + nrs->fieldOffset, + nrs->cubatureOffset, + nrs->o_U, + mesh->o_U, + o_Urst); + flopCount += 6 * mesh->Np * mesh->cubNq; + flopCount += 6 * mesh->Nq * mesh->Nq * mesh->cubNq * mesh->cubNq; + flopCount += 6 * mesh->Nq * mesh->cubNp; + flopCount += 24 * mesh->cubNp; + flopCount *= mesh->Nelements; + } + else { nrs->UrstKernel(mesh->Nelements, mesh->o_vgeo, nrs->fieldOffset, nrs->o_U, mesh->o_U, o_Urst); + flopCount += 24 * static_cast(mesh->Nlocal); + } + platform->flopCounter->add("Urst", flopCount); if (nrs->Nscalar) { platform->timer.tic("makeq", 1); @@ -275,6 +318,15 @@ void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { (s - 2) * NbyteScalar); } mesh->move(); + + if (bcMap::unalignedBoundary(mesh->cht, "velocity")) { + createZeroNormalMask(nrs, nrs->uvwSolver->o_EToB, nrs->o_EToBVVelocity, nrs->o_zeroNormalMaskVelocity); + } + + if (bcMap::unalignedBoundary(mesh->cht, "mesh") && platform->options.compareArgs("MESH SOLVER", "ELASTICITY")){ + createZeroNormalMask(nrs, nrs->meshSolver->o_EToB, nrs->o_EToBVMeshVelocity, nrs->o_zeroNormalMaskMeshVelocity); + } + if (nrs->cht) nrs->meshV->computeInvLMM(); for (int s = std::max(nrs->nEXT, mesh->nAB); s > 1; s--) { @@ -286,22 +338,25 @@ void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { platform->device.finish(); MPI_Barrier(platform->comm.mpiComm); const double tPreStep = MPI_Wtime() - tStart; + tSolve += tPreStep; const int isOutputStep = nrs->isOutputStep; - nrs->converged = false; + nrs->timeStepConverged = false; - double tElapsedStep = 0; - int stage = 0; + int iter = 0; do { platform->device.finish(); MPI_Barrier(platform->comm.mpiComm); - const double tStartStage = MPI_Wtime(); + const double tSolveStepStart = MPI_Wtime(); - stage++; + iter++; const dfloat timeNew = time + nrs->dt[0]; + ////////////////////////////////////////////// + applyDirichlet(nrs, timeNew); + if (nrs->Nscalar) - scalarSolve(nrs, timeNew, cds->o_S, stage); + scalarSolve(nrs, timeNew, cds->o_S, iter); evaluateProperties(nrs, timeNew); @@ -311,40 +366,43 @@ void step(nrs_t *nrs, dfloat time, dfloat dt, int tstep) { udf.div(nrs, timeNew, nrs->o_div); } - fluidSolve(nrs, timeNew, nrs->o_P, nrs->o_U, stage, tstep); + if (nrs->flow) + fluidSolve(nrs, timeNew, nrs->o_P, nrs->o_U, iter, tstep); + if(platform->options.compareArgs("MESH SOLVER", "ELASTICITY")) - meshSolve(nrs, timeNew, nrs->meshV->o_U, stage); + meshSolve(nrs, timeNew, nrs->meshV->o_U, iter); + ////////////////////////////////////////////// - nrs->converged = (udf.converged) ? udf.converged(nrs, stage) : true; + nrs->timeStepConverged = (udf.timeStepConverged) ? udf.timeStepConverged(nrs, iter) : true; platform->device.finish(); MPI_Barrier(platform->comm.mpiComm); - double tElapsedStage = MPI_Wtime() - tStartStage; - if (stage == 1) tElapsedStage += tPreStep; - tElapsedStep += tElapsedStage; - tElapsed += tElapsedStage; - - printInfo(nrs, timeNew, tstep, tElapsedStage, tElapsed); + double tSolveStep = MPI_Wtime() - tSolveStepStart; + tSolve += tSolveStep; + + if(tstep > 9) { + tSolveStepMin = std::fmin(tSolveStep, tSolveStepMin); + tSolveStepMax = std::fmax(tSolveStep, tSolveStepMax); + platform->timer.set("minSolveStep", tSolveStepMin); + platform->timer.set("maxSolveStep", tSolveStepMax); + } platform->timer.tic("udfExecuteStep", 1); nek::ifoutfld(0); nrs->isOutputStep = 0; - if (isOutputStep && nrs->converged) { + if (isOutputStep && nrs->timeStepConverged) { nek::ifoutfld(1); nrs->isOutputStep = 1; } if (udf.executeStep) udf.executeStep(nrs, timeNew, tstep); platform->timer.toc("udfExecuteStep"); - } while (!nrs->converged); - if(tstep > 9) { - tElapsedStepMin = std::fmin(tElapsedStep, tElapsedStepMin); - tElapsedStepMax = std::fmax(tElapsedStep, tElapsedStepMax); - platform->timer.set("minSolveStep", tElapsedStepMin); - platform->timer.set("maxSolveStep", tElapsedStepMax); - } - platform->timer.set("solve", tElapsed); + if (!nrs->timeStepConverged) + printInfo(nrs, timeNew, tstep); + } while (!nrs->timeStepConverged); + + platform->timer.set("solve", tSolve); nrs->dt[2] = nrs->dt[1]; nrs->dt[1] = nrs->dt[0]; @@ -405,9 +463,6 @@ void makeq( platform->timer.toc("udfSEqnSource"); } - const dlong cubatureOffset = - std::max(cds->vFieldOffset, cds->meshV->Nelements * cds->meshV->cubNp); - for (int is = 0; is < cds->NSfields; is++) { if (!cds->compute[is]) continue; @@ -426,6 +481,10 @@ void makeq( cds->o_rho, cds->o_S, o_FS); + + double flops = 6 * mesh->Np * mesh->Nq + 4 * mesh->Np; + flops *= static_cast(mesh->Nelements); + platform->flopCounter->add("scalarFilterRT", flops); } const int movingMesh = cds->options[is].compareArgs("MOVING MESH", "TRUE"); if (movingMesh && !cds->Nsubsteps) { @@ -438,41 +497,44 @@ void makeq( mesh->o_U, cds->o_S, o_FS); + double flops = 18 * mesh->Np * mesh->Nq + 21 * mesh->Np; + flops *= static_cast(mesh->Nelements); + platform->flopCounter->add("scalar advectMeshVelocity", flops); } occa::memory o_Usubcycling = platform->o_mempool.slice0; if (cds->options[is].compareArgs("ADVECTION", "TRUE")) { if (cds->Nsubsteps) { if (movingMesh) - o_Usubcycling = scalarStrongSubCycleMovingMesh( + o_Usubcycling = scalarSubCycleMovingMesh( cds, mymin(tstep, cds->nEXT), time, is, cds->o_U, cds->o_S); else - o_Usubcycling = scalarStrongSubCycle( + o_Usubcycling = scalarSubCycle( cds, mymin(tstep, cds->nEXT), time, is, cds->o_U, cds->o_S); } else { if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->advectionStrongCubatureVolumeKernel(cds->meshV->Nelements, - mesh->o_vgeo, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - cds->vFieldOffset, - isOffset, - cubatureOffset, - cds->o_S, - cds->o_Urst, - cds->o_rho, - platform->o_mempool.slice0); + cds->strongAdvectionCubatureVolumeKernel(cds->meshV->Nelements, + mesh->o_vgeo, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + cds->vFieldOffset, + isOffset, + cds->vCubatureOffset, + cds->o_S, + cds->o_Urst, + cds->o_rho, + platform->o_mempool.slice0); else - cds->advectionStrongVolumeKernel(cds->meshV->Nelements, - mesh->o_vgeo, - mesh->o_D, - cds->vFieldOffset, - isOffset, - cds->o_S, - cds->o_Urst, - cds->o_rho, - platform->o_mempool.slice0); + cds->strongAdvectionVolumeKernel(cds->meshV->Nelements, + mesh->o_vgeo, + mesh->o_D, + cds->vFieldOffset, + isOffset, + cds->o_S, + cds->o_Urst, + cds->o_rho, + platform->o_mempool.slice0); platform->linAlg->axpby(cds->meshV->Nelements * cds->meshV->Np, -1.0, platform->o_mempool.slice0, @@ -480,6 +542,8 @@ void makeq( o_FS, 0, isOffset); + + advectionFlops(cds->mesh[0], 1); } } else { platform->linAlg->fill(cds->fieldOffsetSum, 0.0, o_Usubcycling); @@ -498,6 +562,10 @@ void makeq( o_FS, cds->o_rho, o_BF); + + dfloat scalarSumMakef = (3 * cds->nEXT + 3) * static_cast(mesh->Nlocal); + scalarSumMakef += (cds->Nsubsteps) ? mesh->Nlocal : 3 * cds->nBDF * static_cast(mesh->Nlocal); + platform->flopCounter->add("scalarSumMakef", scalarSumMakef); } for (int s = std::max(cds->nBDF, cds->nEXT); s > 1; s--) { @@ -555,7 +623,7 @@ void makef( platform->timer.toc("udfUEqnSource"); } - if(platform->options.compareArgs("REGULARIZATION METHOD", "RELAXATION")) + if (platform->options.compareArgs("REGULARIZATION METHOD", "RELAXATION")) { nrs->filterRTKernel( mesh->Nelements, nrs->o_filterMT, @@ -563,6 +631,10 @@ void makef( nrs->fieldOffset, nrs->o_U, o_FU); + double flops = 24 * mesh->Np * mesh->Nq + 3 * mesh->Np; + flops *= static_cast(mesh->Nelements); + platform->flopCounter->add("velocityFilterRT", flops); + } if (movingMesh && !nrs->Nsubsteps) { nrs->advectMeshVelocityKernel(mesh->Nelements, @@ -572,43 +644,48 @@ void makef( mesh->o_U, nrs->o_U, o_FU); + double flops = 54 * mesh->Np * mesh->Nq + 63 * mesh->Np; + flops *= static_cast(mesh->Nelements); + platform->flopCounter->add("velocity advectMeshVelocity", flops); } occa::memory o_Usubcycling = platform->o_mempool.slice0; if (platform->options.compareArgs("ADVECTION", "TRUE")) { if (nrs->Nsubsteps) { if (movingMesh) - o_Usubcycling = velocityStrongSubCycleMovingMesh( + o_Usubcycling = velocitySubCycleMovingMesh( nrs, mymin(tstep, nrs->nEXT), time, nrs->o_U); else - o_Usubcycling = velocityStrongSubCycle( + o_Usubcycling = velocitySubCycle( nrs, mymin(tstep, nrs->nEXT), time, nrs->o_U); } else { if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - nrs->advectionStrongCubatureVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - mesh->o_cubProjectT, - nrs->fieldOffset, - std::max(nrs->fieldOffset, mesh->Nelements * mesh->cubNp), - nrs->o_U, - nrs->o_Urst, - platform->o_mempool.slice0); + nrs->strongAdvectionCubatureVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_cubDiffInterpT, + mesh->o_cubInterpT, + mesh->o_cubProjectT, + nrs->fieldOffset, + nrs->cubatureOffset, + nrs->o_U, + nrs->o_Urst, + platform->o_mempool.slice0); else - nrs->advectionStrongVolumeKernel(mesh->Nelements, - mesh->o_vgeo, - mesh->o_D, - nrs->fieldOffset, - nrs->o_U, - nrs->o_Urst, - platform->o_mempool.slice0); + nrs->strongAdvectionVolumeKernel(mesh->Nelements, + mesh->o_vgeo, + mesh->o_D, + nrs->fieldOffset, + nrs->o_U, + nrs->o_Urst, + platform->o_mempool.slice0); platform->linAlg->axpby(nrs->NVfields * nrs->fieldOffset, -1.0, platform->o_mempool.slice0, 1.0, o_FU); + + advectionFlops(nrs->meshV, nrs->NVfields); } } else { if (nrs->Nsubsteps) @@ -627,6 +704,16 @@ void makef( o_FU, o_BF); + dfloat sumMakefFlops = 0.0; + if (nrs->Nsubsteps) { + sumMakefFlops += (6 + 6 * nrs->nEXT) * static_cast(mesh->Nlocal); + } + else { + sumMakefFlops += (6 * nrs->nEXT + 12 * nrs->nBDF) * static_cast(mesh->Nlocal); + } + + platform->flopCounter->add("sumMakef", sumMakefFlops); + if (verbose) { const dfloat debugNorm = platform->linAlg->weightedNorm2Many(mesh->Nlocal, nrs->NVfields, @@ -677,862 +764,111 @@ void fluidSolve( } -void meshSolve(nrs_t* nrs, dfloat time, occa::memory o_U, int stage) -{ - mesh_t* mesh = nrs->meshV; - linAlg_t* linAlg = platform->linAlg; - - platform->timer.tic("meshSolve", 1); - nrs->setEllipticCoeffKernel( - mesh->Nlocal, - 1.0, - 0 * nrs->fieldOffset, - nrs->fieldOffset, - nrs->o_meshMue, - nrs->o_meshRho, - nrs->o_ellipticCoeff); - - occa::memory o_Unew = [&](nrs_t* nrs, dfloat time, int stage){ - mesh_t* mesh = nrs->meshV; - oogs_t* gsh = nrs->gsh; - - //enforce Dirichlet BCs - platform->linAlg->fill(nrs->NVfields*nrs->fieldOffset, -1.0*std::numeric_limits::max(), platform->o_mempool.slice3); - for (int sweep = 0; sweep < 2; sweep++) { - nrs->meshV->velocityDirichletKernel(mesh->Nelements, - nrs->fieldOffset, - mesh->o_vmapM, - nrs->o_EToBMesh, - nrs->o_U, - platform->o_mempool.slice3); - - //take care of Neumann-Dirichlet shared edges across elements - if(sweep == 0) oogs::startFinish(platform->o_mempool.slice3, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMax, gsh); - if(sweep == 1) oogs::startFinish(platform->o_mempool.slice3, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsMin, gsh); - } - - if (nrs->meshSolver->Nmasked) nrs->maskCopyKernel(nrs->meshSolver->Nmasked, 0*nrs->fieldOffset, nrs->meshSolver->o_maskIds, - platform->o_mempool.slice3, mesh->o_U); - - platform->linAlg->fill(nrs->NVfields*nrs->fieldOffset, 0, platform->o_mempool.slice3); - platform->o_mempool.slice0.copyFrom(mesh->o_U, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - ellipticSolve(nrs->meshSolver, platform->o_mempool.slice3, platform->o_mempool.slice0); - - // enforce C0 - oogs::startFinish(platform->o_mempool.slice0, nrs->NVfields, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); - platform->linAlg->axmyMany( - mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - 0, - 1.0, - nrs->meshSolver->o_invDegree, - platform->o_mempool.slice0 - ); - - return platform->o_mempool.slice0; - }(nrs, time, stage); - o_U.copyFrom(o_Unew, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - platform->timer.toc("meshSolve"); -} - -occa::memory velocityStrongSubCycleMovingMesh(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U) +void printInfo(nrs_t *nrs, dfloat time, int tstep) { - mesh_t* mesh = nrs->meshV; - linAlg_t* linAlg = platform->linAlg; - - occa::memory &o_p0 = platform->o_mempool.slice0; - occa::memory &o_u1 = platform->o_mempool.slice3; - - occa::memory &o_r1 = platform->o_mempool.slice6; - occa::memory &o_r2 = platform->o_mempool.slice9; - occa::memory &o_r3 = platform->o_mempool.slice12; - occa::memory &o_r4 = platform->o_mempool.slice15; - - occa::memory &o_LMMe = platform->o_mempool.slice18; - - const dlong cubatureOffset = - std::max(nrs->fieldOffset, mesh->cubNp * mesh->Nelements); - - // Solve for Each SubProblem - for (int torder = nEXT - 1; torder >= 0; torder--) { - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - dlong toffset = torder * nrs->NVfields * nrs->fieldOffset; - const dlong offset = torder * nrs->fieldOffset; - nrs->subCycleInitU0Kernel(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - torder, - nEXT, - toffset, - offset, - nrs->coeffBDF[torder], - mesh->o_LMM, - o_U, - o_p0); - - // Advance subproblem from here from t^(n-torder) to t^(n-torder+1) - dfloat tsub = time; - for (int i = torder; i > 0; i--) - tsub -= nrs->dt[i]; - const dfloat sdt = nrs->dt[torder] / nrs->Nsubsteps; - - for (int ststep = 0; ststep < nrs->Nsubsteps; ++ststep) { - const dfloat tstage = tsub + ststep * sdt; - - o_u1.copyFrom(o_p0, nrs->NVfields * nrs->fieldOffset * sizeof(dfloat)); - - for (int rk = 0; rk < nrs->nRK; ++rk) { - occa::memory o_rhs; - if (rk == 0) - o_rhs = o_r1; - if (rk == 1) - o_rhs = o_r2; - if (rk == 2) - o_rhs = o_r3; - if (rk == 3) - o_rhs = o_r4; - // Extrapolate velocity to subProblem stage time - const dfloat t = tstage + sdt * nrs->nodesRK[rk]; - const dfloat tn0 = time; - const dfloat tn1 = time - nrs->dt[1]; - const dfloat tn2 = time - (nrs->dt[1] + nrs->dt[2]); - dfloat extC[3] = {0., 0., 0.}; - switch (nEXT) { - case 1: - extC[0] = 1; - extC[1] = 0; - extC[2] = 0; - break; - case 2: - extC[0] = (t - tn1) / (tn0 - tn1); - extC[1] = (t - tn0) / (tn1 - tn0); - extC[2] = 0; - break; - case 3: - extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); - extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); - extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); - break; - } - - nrs->nStagesSum3Kernel(mesh->Nlocal, - nrs->fieldOffset, - nEXT, - extC[0], - extC[1], - extC[2], - mesh->o_LMM, - o_LMMe); - linAlg->aydxMany(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - 0, - 1.0, - o_LMMe, - o_u1); - - if (mesh->NglobalGatherElements) { - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - nrs->subCycleStrongCubatureVolumeKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - nrs->fieldOffset, - cubatureOffset, - 0, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_relUrst, - o_u1, - o_rhs); - else - nrs->subCycleStrongVolumeKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_D, - nrs->fieldOffset, - 0, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_relUrst, - o_u1, - o_rhs); - } - - oogs::start(o_rhs, - nrs->NVfields, - nrs->fieldOffset, - ogsDfloat, - ogsAdd, - nrs->gsh); - - if (mesh->NlocalGatherElements) { - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - nrs->subCycleStrongCubatureVolumeKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - nrs->fieldOffset, - cubatureOffset, - 0, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_relUrst, - o_u1, - o_rhs); - else - nrs->subCycleStrongVolumeKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_D, - nrs->fieldOffset, - 0, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_relUrst, - o_u1, - o_rhs); - } - - oogs::finish(o_rhs, - nrs->NVfields, - nrs->fieldOffset, - ogsDfloat, - ogsAdd, - nrs->gsh); - linAlg->axmyMany(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - 0, - 1.0, - o_LMMe, - o_rhs); - - if (rk != 3) - linAlg->axpbyzMany(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - 1.0, - o_p0, - -sdt * nrs->coeffsfRK[rk + 1], - o_rhs, - o_u1); - else - nrs->subCycleRKKernel(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - sdt, - nrs->o_weightsRK, - o_r1, - o_r2, - o_r3, - o_r4, - o_p0); - } - } - } - return o_p0; -} -occa::memory velocityStrongSubCycle( - nrs_t *nrs, int nEXT, dfloat time, occa::memory o_U) { - mesh_t *mesh = nrs->meshV; - linAlg_t *linAlg = platform->linAlg; - - dlong cubatureOffset; - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - cubatureOffset = std::max(nrs->fieldOffset, mesh->Nelements * mesh->cubNp); - else - cubatureOffset = nrs->fieldOffset; - - // Solve for Each SubProblem - for (int torder = nEXT - 1; torder >= 0; torder--) { - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - dlong toffset = torder * nrs->NVfields * nrs->fieldOffset; - nrs->subCycleInitU0Kernel(mesh->Nlocal, - nrs->NVfields, - nrs->fieldOffset, - torder, - nEXT, - toffset, - 0, - nrs->coeffBDF[torder], - mesh->o_LMM, - o_U, - platform->o_mempool.slice0); - - // Advance subproblem from here from t^(n-torder) to t^(n-torder+1) - dfloat tsub = time; - for (int i = torder; i > 0; i--) - tsub -= nrs->dt[i]; - const dfloat sdt = nrs->dt[torder] / nrs->Nsubsteps; - - for (int ststep = 0; ststep < nrs->Nsubsteps; ++ststep) { - const dfloat tstage = tsub + ststep * sdt; - - platform->o_mempool.slice0.copyFrom(platform->o_mempool.slice0, - nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), - nrs->NVfields * nrs->fieldOffset * sizeof(dfloat), - 0); - - for (int rk = 0; rk < nrs->nRK; ++rk) { - // Extrapolate velocity to subProblem stage time - const dfloat t = tstage + sdt * nrs->nodesRK[rk]; - const dfloat tn0 = time; - const dfloat tn1 = time - nrs->dt[1]; - const dfloat tn2 = time - (nrs->dt[1] + nrs->dt[2]); - dfloat extC[3] = {0., 0., 0.}; - switch (nEXT) { - case 1: - extC[0] = 1; - extC[1] = 0; - extC[2] = 0; - break; - case 2: - extC[0] = (t - tn1) / (tn0 - tn1); - extC[1] = (t - tn0) / (tn1 - tn0); - extC[2] = 0; - break; - case 3: - extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); - extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); - extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); - break; - } - - if (mesh->NglobalGatherElements) { - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - nrs->subCycleStrongCubatureVolumeKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - nrs->fieldOffset, - cubatureOffset, - rk * nrs->NVfields * nrs->fieldOffset, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice6); - else - nrs->subCycleStrongVolumeKernel(mesh->NglobalGatherElements, - mesh->o_globalGatherElementList, - mesh->o_D, - nrs->fieldOffset, - rk * nrs->NVfields * nrs->fieldOffset, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice6); - } - - occa::memory o_rhs; - if (rk == 0) - o_rhs = platform->o_mempool.slice6; - if (rk == 1) - o_rhs = platform->o_mempool.slice9; - if (rk == 2) - o_rhs = platform->o_mempool.slice12; - if (rk == 3) - o_rhs = platform->o_mempool.slice15; - - oogs::start(o_rhs, - nrs->NVfields, - nrs->fieldOffset, - ogsDfloat, - ogsAdd, - nrs->gsh); - - if (mesh->NlocalGatherElements) { - if (platform->options.compareArgs("ADVECTION TYPE", "CUBATURE")) - nrs->subCycleStrongCubatureVolumeKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_cubDiffInterpT, - mesh->o_cubInterpT, - nrs->fieldOffset, - cubatureOffset, - rk * nrs->NVfields * nrs->fieldOffset, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice6); - else - nrs->subCycleStrongVolumeKernel(mesh->NlocalGatherElements, - mesh->o_localGatherElementList, - mesh->o_D, - nrs->fieldOffset, - rk * nrs->NVfields * nrs->fieldOffset, - mesh->o_invLMM, - mesh->o_divU, - extC[0], - extC[1], - extC[2], - nrs->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice6); - } - - oogs::finish(o_rhs, - nrs->NVfields, - nrs->fieldOffset, - ogsDfloat, - ogsAdd, - nrs->gsh); - - nrs->subCycleRKUpdateKernel(mesh->Nlocal, - rk, - sdt, - nrs->fieldOffset, - nrs->o_coeffsfRK, - nrs->o_weightsRK, - platform->o_mempool.slice3, - platform->o_mempool.slice6, - platform->o_mempool.slice0); - } - } - } - linAlg->axmyMany(mesh->Nlocal, - 3, - nrs->fieldOffset, - 0, - 1.0, - mesh->o_LMM, - platform->o_mempool.slice0); - return platform->o_mempool.slice0; -} - -occa::memory scalarStrongSubCycleMovingMesh(cds_t *cds, - int nEXT, - dfloat time, - int is, - occa::memory o_U, - occa::memory o_S) { - - linAlg_t *linAlg = platform->linAlg; - - occa::memory &o_r1 = platform->o_mempool.slice2; - occa::memory &o_r2 = platform->o_mempool.slice3; - occa::memory &o_r3 = platform->o_mempool.slice4; - occa::memory &o_r4 = platform->o_mempool.slice5; - - occa::memory &o_p0 = platform->o_mempool.slice0; - occa::memory &o_u1 = platform->o_mempool.slice6; - - occa::memory &o_LMMe = platform->o_mempool.slice1; - - dlong cubatureOffset = - std::max(cds->vFieldOffset, cds->meshV->Nelements * cds->meshV->cubNp); - - // Solve for Each SubProblem - for (int torder = (nEXT - 1); torder >= 0; torder--) { - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - const dlong toffset = - cds->fieldOffsetScan[is] + torder * cds->fieldOffsetSum; - const dlong offset = torder * cds->fieldOffset[is]; - cds->subCycleInitU0Kernel(cds->mesh[0]->Nlocal, - 1, - cds->fieldOffset[is], - torder, - nEXT, - toffset, - offset, - cds->coeffBDF[torder], - cds->mesh[0]->o_LMM, - o_S, - o_p0); - - // Advance SubProblem to t^(n-torder+1) - dfloat tsub = time; - for (int i = torder; i > 0; i--) - tsub -= cds->dt[i]; - const dfloat sdt = cds->dt[torder] / cds->Nsubsteps; - - for (int ststep = 0; ststep < cds->Nsubsteps; ++ststep) { - const dfloat tstage = tsub + ststep * sdt; - o_u1.copyFrom(o_p0, cds->mesh[0]->Nlocal * sizeof(dfloat)); - for (int rk = 0; rk < cds->nRK; ++rk) { - occa::memory o_rhs; - if (rk == 0) - o_rhs = o_r1; - if (rk == 1) - o_rhs = o_r2; - if (rk == 2) - o_rhs = o_r3; - if (rk == 3) - o_rhs = o_r4; - - // Extrapolate velocity to subProblem stage time - const dfloat t = tstage + sdt * cds->nodesRK[rk]; - const dfloat tn0 = time; - const dfloat tn1 = time - cds->dt[1]; - const dfloat tn2 = time - (cds->dt[1] + cds->dt[2]); - dfloat extC[3] = {0., 0., 0.}; - switch (nEXT) { - case 1: - extC[0] = 1; - extC[1] = 0; - extC[2] = 0; - break; - case 2: - extC[0] = (t - tn1) / (tn0 - tn1); - extC[1] = (t - tn0) / (tn1 - tn0); - extC[2] = 0; - break; - case 3: - extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); - extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); - extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); - break; - } - cds->nStagesSum3Kernel(cds->mesh[0]->Nlocal, - cds->vFieldOffset, - nEXT, - extC[0], - extC[1], - extC[2], - cds->mesh[0]->o_LMM, - o_LMMe); - linAlg->aydx(cds->mesh[0]->Nlocal, 1.0, o_LMMe, o_u1); - - if (cds->meshV->NglobalGatherElements) { - if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - cds->meshV->NglobalGatherElements, - cds->meshV->o_globalGatherElementList, - cds->meshV->o_cubDiffInterpT, - cds->meshV->o_cubInterpT, - cds->vFieldOffset, - cubatureOffset, - 0, - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_relUrst, - o_u1, - o_rhs); - else - cds->subCycleStrongVolumeKernel(cds->meshV->NglobalGatherElements, - cds->meshV->o_globalGatherElementList, - cds->meshV->o_D, - cds->vFieldOffset, - 0, - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_relUrst, - o_u1, - o_rhs); - } - - oogs::start( - o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); - - if (cds->meshV->NlocalGatherElements) { - if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - cds->meshV->NlocalGatherElements, - cds->meshV->o_localGatherElementList, - cds->meshV->o_cubDiffInterpT, - cds->meshV->o_cubInterpT, - cds->vFieldOffset, - cubatureOffset, - 0, - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_relUrst, - o_u1, - o_rhs); - else - cds->subCycleStrongVolumeKernel(cds->meshV->NlocalGatherElements, - cds->meshV->o_localGatherElementList, - cds->meshV->o_D, - cds->vFieldOffset, - 0, - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_relUrst, - o_u1, - o_rhs); - } - - oogs::finish( - o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); - - linAlg->axmy(cds->mesh[0]->Nlocal, 1.0, o_LMMe, o_rhs); - if (rk != 3) - linAlg->axpbyz(cds->mesh[0]->Nlocal, - 1.0, - o_p0, - -sdt * cds->coeffsfRK[rk + 1], - o_rhs, - o_u1); - else - cds->subCycleRKKernel(cds->mesh[0]->Nlocal, - sdt, - cds->o_weightsRK, - o_r1, - o_r2, - o_r3, - o_r4, - o_p0); - } - } - } - return o_p0; -} -occa::memory scalarStrongSubCycle(cds_t *cds, - int nEXT, - dfloat time, - int is, - occa::memory o_U, - occa::memory o_S) { - linAlg_t *linAlg = platform->linAlg; - dlong offset = - std::max(cds->vFieldOffset, cds->meshV->Nelements * cds->meshV->cubNp); - - // Solve for Each SubProblem - for (int torder = (nEXT - 1); torder >= 0; torder--) { - // Initialize SubProblem Velocity i.e. Ud = U^(t-torder*dt) - const dlong toffset = - cds->fieldOffsetScan[is] + torder * cds->fieldOffsetSum; - cds->subCycleInitU0Kernel(cds->mesh[0]->Nlocal, - 1, - cds->fieldOffset[is], - torder, - nEXT, - toffset, - 0, - cds->coeffBDF[torder], - cds->mesh[0]->o_LMM, - o_S, - platform->o_mempool.slice0); - - // Advance SubProblem to t^(n-torder+1) - dfloat tsub = time; - for (int i = torder; i > 0; i--) - tsub -= cds->dt[i]; - const dfloat sdt = cds->dt[torder] / cds->Nsubsteps; - - for (int ststep = 0; ststep < cds->Nsubsteps; ++ststep) { - const dfloat tstage = tsub + ststep * sdt; - - platform->o_mempool.slice0.copyFrom(platform->o_mempool.slice0, - cds->fieldOffset[is] * sizeof(dfloat), - cds->fieldOffset[is] * sizeof(dfloat), - 0); - - for (int rk = 0; rk < cds->nRK; ++rk) { - // Extrapolate velocity to subProblem stage time - const dfloat t = tstage + sdt * cds->nodesRK[rk]; - const dfloat tn0 = time; - const dfloat tn1 = time - cds->dt[1]; - const dfloat tn2 = time - (cds->dt[1] + cds->dt[2]); - dfloat extC[3] = {0., 0., 0.}; - switch (nEXT) { - case 1: - extC[0] = 1; - extC[1] = 0; - extC[2] = 0; - break; - case 2: - extC[0] = (t - tn1) / (tn0 - tn1); - extC[1] = (t - tn0) / (tn1 - tn0); - extC[2] = 0; - break; - case 3: - extC[0] = (t - tn1) * (t - tn2) / ((tn0 - tn1) * (tn0 - tn2)); - extC[1] = (t - tn0) * (t - tn2) / ((tn1 - tn0) * (tn1 - tn2)); - extC[2] = (t - tn0) * (t - tn1) / ((tn2 - tn0) * (tn2 - tn1)); - break; - } - - if (cds->meshV->NglobalGatherElements) { - if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - cds->meshV->NglobalGatherElements, - cds->meshV->o_globalGatherElementList, - cds->meshV->o_cubDiffInterpT, - cds->meshV->o_cubInterpT, - cds->vFieldOffset, - offset, - rk * cds->fieldOffset[is], - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice2); - else - cds->subCycleStrongVolumeKernel(cds->meshV->NglobalGatherElements, - cds->meshV->o_globalGatherElementList, - cds->meshV->o_D, - cds->vFieldOffset, - rk * cds->fieldOffset[is], - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice2); - } - - occa::memory o_rhs; - if (rk == 0) - o_rhs = platform->o_mempool.slice2; - if (rk == 1) - o_rhs = platform->o_mempool.slice3; - if (rk == 2) - o_rhs = platform->o_mempool.slice4; - if (rk == 3) - o_rhs = platform->o_mempool.slice5; - - oogs::start( - o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); - - if (cds->meshV->NlocalGatherElements) { - if (cds->options[is].compareArgs("ADVECTION TYPE", "CUBATURE")) - cds->subCycleStrongCubatureVolumeKernel( - cds->meshV->NlocalGatherElements, - cds->meshV->o_localGatherElementList, - cds->meshV->o_cubDiffInterpT, - cds->meshV->o_cubInterpT, - cds->vFieldOffset, - offset, - rk * cds->fieldOffset[is], - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice2); - else - cds->subCycleStrongVolumeKernel(cds->meshV->NlocalGatherElements, - cds->meshV->o_localGatherElementList, - cds->meshV->o_D, - cds->vFieldOffset, - rk * cds->fieldOffset[is], - cds->mesh[0]->o_invLMM, - cds->mesh[0]->o_divU, - extC[0], - extC[1], - extC[2], - cds->o_Urst, - platform->o_mempool.slice0, - platform->o_mempool.slice2); - } - - oogs::finish( - o_rhs, 1, cds->fieldOffset[is], ogsDfloat, ogsAdd, cds->gsh); - - cds->subCycleRKUpdateKernel(cds->meshV->Nlocal, - rk, - sdt, - cds->fieldOffset[is], - cds->o_coeffsfRK, - cds->o_weightsRK, - platform->o_mempool.slice1, - platform->o_mempool.slice2, - platform->o_mempool.slice0); - } - } - } - linAlg->axmy(cds->mesh[0]->Nlocal, - 1.0, - cds->mesh[0]->o_LMM, - platform->o_mempool.slice0); - return platform->o_mempool.slice0; -} - -void printInfo( - nrs_t *nrs, dfloat time, int tstep, double tElapsedStep, double tElapsed) { cds_t *cds = nrs->cds; - const int enforceVerbose = tstep < 1001; + const double elapsedStep = platform->timer.query("elapsedStep", "DEVICE:MAX"); + const double elapsedStepSum = platform->timer.query("elapsedStepSum", "DEVICE:MAX"); + bool verboseInfo = platform->options.compareArgs("VERBOSE SOLVER INFO", "TRUE"); const dfloat cfl = computeCFL(nrs); dfloat divUErrVolAvg, divUErrL2; - if (platform->options.compareArgs("VERBOSE SOLVER INFO", "TRUE") || enforceVerbose){ + + if (verboseInfo){ computeDivUErr(nrs, divUErrVolAvg, divUErrL2); } if (platform->comm.mpiRank == 0) { - if (platform->options.compareArgs("VERBOSE SOLVER INFO", "TRUE") || - enforceVerbose) { + if (verboseInfo){ if (nrs->flow) { elliptic_t *solver = nrs->pSolver; - printf(" P : iter %03d resNorm00 %.2e resNorm0 %.2e resNorm %.2e\n", + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projP : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" P : iter %03d resNorm0 %.2e resNorm %.2e\n", solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm); if (nrs->uvwSolver) { solver = nrs->uvwSolver; - printf(" UVW: iter %03d resNorm00 %.2e resNorm0 %.2e " + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projUVW : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" UVW : iter %03d resNorm0 %.2e " "resNorm %.2e divErrNorms %.2e %.2e\n", solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm, divUErrVolAvg, divUErrL2); } else { solver = nrs->uSolver; - printf(" U : iter %03d resNorm00 %.2e resNorm0 %.2e " + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projU : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" U : iter %03d resNorm0 %.2e " "resNorm %.2e divErrNorms %.2e %.2e\n", solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm, divUErrVolAvg, divUErrL2); solver = nrs->vSolver; - printf(" V : iter %03d resNorm00 %.2e resNorm0 %.2e " + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projV : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" V : iter %03d resNorm0 %.2e " "resNorm %.2e\n", solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm); solver = nrs->wSolver; - printf(" W : iter %03d resNorm00 %.2e resNorm0 %.2e " + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projW : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" W : iter %03d resNorm0 %.2e " "resNorm %.2e\n", solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm); } @@ -1541,23 +877,51 @@ void printInfo( if(nrs->meshSolver) { elliptic_t* solver = nrs->meshSolver; - printf(" MSH: iter %03d resNorm00 %.2e resNorm0 %.2e resNorm %.2e\n", - solver->Niter, solver->res00Norm, solver->res0Norm, solver->resNorm); + if(solver->solutionProjection){ + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projMSH : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" MSH : iter %03d resNorm0 %.2e resNorm %.2e\n", + solver->Niter, solver->res0Norm, solver->resNorm); } for(int is = 0; is < nrs->Nscalar; is++) { if (cds->compute[is]) { elliptic_t *solver = cds->solver[is]; - printf(" S%02d: iter %03d resNorm00 %.2e resNorm0 %.2e " + if (solver->solutionProjection) { + const int prevVecs = solver->solutionProjection->getPrevNumVecsProjection(); + if (prevVecs > 0) { + printf(" projS%02d : resNorm0 %.2e resNorm %.2e ratio = %.3e %d/%d\n", + is, + solver->res00Norm, + solver->res0Norm, + solver->res00Norm / solver->res0Norm, + prevVecs, + solver->solutionProjection->getMaxNumVecsProjection()); + } + } + printf(" S%02d : iter %03d resNorm0 %.2e " "resNorm %.2e\n", is, solver->Niter, - solver->res00Norm, solver->res0Norm, solver->resNorm); } } } + + if(platform->options.compareArgs("CONSTANT FLOW RATE", "TRUE")){ + ConstantFlowRate::printInfo(nrs->meshV, verboseInfo); + } + + printf("step= %d t= %.8e dt=%.1e C= %.2f", tstep, time, nrs->dt[0], cfl); if (nrs->flow) { @@ -1575,18 +939,20 @@ void printInfo( for(int is = 0; is < nrs->Nscalar; is++) if(cds->compute[is]) printf(" S: %d", cds->solver[is]->Niter); - - printf(" eTimeStep= %.2es eTime= %.5es\n", tElapsedStep, tElapsed); + + if (nrs->timeStepConverged) + printf(" elapsedStep= %.2es elapsedStepSum= %.5es", elapsedStep, elapsedStepSum); + + printf("\n"); } - if (cfl > 30 || std::isnan(cfl) || std::isinf(cfl)) { + bool largeCFLCheck = (cfl > 30) && numberActiveFields(nrs); + + if (largeCFLCheck || std::isnan(cfl) || std::isinf(cfl)) { if (platform->comm.mpiRank == 0) std::cout << "Unreasonable CFL! Dying ...\n" << std::endl; ABORT(1); } - - if (tstep % 10 == 0) - fflush(stdout); } void computeDivUErr(nrs_t* nrs, dfloat& divUErrVolAvg, dfloat& divUErrL2) @@ -1598,6 +964,12 @@ void computeDivUErr(nrs_t* nrs, dfloat& divUErrVolAvg, dfloat& divUErrL2) nrs->fieldOffset, nrs->o_U, platform->o_mempool.slice0); + + double flops = 18 * (mesh->Np * mesh->Nq + mesh->Np); + flops *= static_cast(mesh->Nelements); + + platform->flopCounter->add("divergenceVolumeKernel", flops); + oogs::startFinish(platform->o_mempool.slice0, 1, nrs->fieldOffset, ogsDfloat, ogsAdd, nrs->gsh); platform->linAlg->axmy(mesh->Nlocal, 1.0, mesh->o_invLMM, platform->o_mempool.slice0); @@ -1613,6 +985,7 @@ void computeDivUErr(nrs_t* nrs, dfloat& divUErrVolAvg, dfloat& divUErrL2) mesh->o_LMM, platform->o_mempool.slice0, platform->comm.mpiComm) / sqrt(mesh->volume); + divUErrVolAvg = platform->linAlg->innerProd(mesh->Nlocal, mesh->o_LMM, platform->o_mempool.slice0, diff --git a/src/timeStepper/timeStepper.hpp b/src/timeStepper/timeStepper.hpp index c9405366e..e2884d811 100644 --- a/src/timeStepper/timeStepper.hpp +++ b/src/timeStepper/timeStepper.hpp @@ -12,7 +12,6 @@ void makef(nrs_t* nrs, dfloat time, int tstep, occa::memory o_FU, occa::memory o occa::memory velocityStrongSubCycle(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U); occa::memory velocityStrongSubCycleMovingMesh(nrs_t* nrs, int nEXT, dfloat time, occa::memory o_U); void fluidSolve(nrs_t* nrs, dfloat time, occa::memory o_P, occa::memory o_U, int stage, int tstep); -void meshSolve(nrs_t* nrs, dfloat time, occa::memory o_U, int stage); void makeq(nrs_t *nrs, dfloat time, int tstep, occa::memory o_FS, occa::memory o_BF); @@ -22,9 +21,7 @@ occa::memory scalarStrongSubCycleMovingMesh(cds_t *cds, int nEXT, dfloat time, occa::memory scalarStrongSubCycle(cds_t *cds, int nEXT, dfloat time, int is, occa::memory o_U, occa::memory o_S); void scalarSolve(nrs_t *nrs, dfloat time, occa::memory o_S, int stage); -void printInfo(nrs_t *nrs, dfloat time, int tstep, double tElapsedStep, - double tElapsed); - +void printInfo(nrs_t *nrs, dfloat time, int tstep); void computeDivUErr(nrs_t* nrs, dfloat& divUErrL1, dfloat& divUErrL2); } diff --git a/src/udf/CMakeLists.txt b/src/udf/CMakeLists.txt index 41376c9f5..e24b6e4f9 100644 --- a/src/udf/CMakeLists.txt +++ b/src/udf/CMakeLists.txt @@ -9,8 +9,6 @@ set(CMAKE_SHARED_LIBRARY_SUFFIX ".so") #set(CMAKE_VERBOSE_MAKEFILE on) -set(CMAKE_CXX_STANDARD 14) - set(NEKRS_INSTALL_DIR $ENV{NEKRS_INSTALL_DIR}) set(OGSDIR ${NEKRS_INSTALL_DIR}/gatherScatter) set(GSDIR ${NEKRS_INSTALL_DIR}/gslib) @@ -23,15 +21,16 @@ set(INCLUDE_DIRS ${NEKRS_INSTALL_DIR}/include/mesh ${NEKRS_INSTALL_DIR}/include/io ${NEKRS_INSTALL_DIR}/include/core - ${NEKRS_INSTALL_DIR}/include/core/utils + ${NEKRS_INSTALL_DIR}/include/utils ${NEKRS_INSTALL_DIR}/include/timeStepper ${NEKRS_INSTALL_DIR}/include/udf - ${NEKRS_INSTALL_DIR}/include/lns + ${NEKRS_INSTALL_DIR}/include/navierStokes ${NEKRS_INSTALL_DIR}/include/elliptic ${NEKRS_INSTALL_DIR}/include/elliptic/parAlmond ${NEKRS_INSTALL_DIR}/include/nekInterface ${NEKRS_INSTALL_DIR}/include/cds ${NEKRS_INSTALL_DIR}/include/linAlg + ${NEKRS_INSTALL_DIR}/include/postProcessing ${NEKRS_INSTALL_DIR}/occa/include ) diff --git a/src/udf/compileUDFKernels.cpp b/src/udf/compileUDFKernels.cpp new file mode 100644 index 000000000..e57e245c5 --- /dev/null +++ b/src/udf/compileUDFKernels.cpp @@ -0,0 +1,45 @@ +#include +#include "udf.hpp" + +occa::properties compileUDFKernels() +{ + const bool buildNodeLocal = useNodeLocalCache(); + + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + int N; + platform->options.getArgs("POLYNOMIAL DEGREE", N); + occa::properties kernelInfo = platform->kernelInfo + meshKernelProperties(N); + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + kernelInfo["include_paths"].asArray(); + + MPI_Barrier(platform->comm.mpiComm); + const double tStart = MPI_Wtime(); + if (platform->comm.mpiRank == 0) + printf("loading udf kernels ... "); + fflush(stdout); + + occa::properties kernelInfoBC = kernelInfo; + if (udf.loadKernels) { + // side-effect: kernelInfoBC will include any relevant user-defined kernel props + udf.loadKernels(kernelInfoBC); + } + const std::string bcDataFile = installDir + "/include/bdry/bcData.h"; + kernelInfoBC["includes"] += bcDataFile.c_str(); + std::string boundaryHeaderFileName; + platform->options.getArgs("DATA FILE", boundaryHeaderFileName); + kernelInfoBC["includes"] += realpath(boundaryHeaderFileName.c_str(), NULL); + + kernelInfoBC += meshKernelProperties(N); + + MPI_Barrier(platform->comm.mpiComm); + const double loadTime = MPI_Wtime() - tStart; + if (platform->comm.mpiRank == 0) + printf("done (%gs)\n", loadTime); + fflush(stdout); + + return kernelInfoBC; +} diff --git a/src/udf/udf.cpp b/src/udf/udf.cpp index 27a90d71f..7018be550 100644 --- a/src/udf/udf.cpp +++ b/src/udf/udf.cpp @@ -5,12 +5,14 @@ #include #include "udf.hpp" -#include "io.hpp" +#include "ioUtils.hpp" #include "platform.hpp" +#include "bcMap.hpp" UDF udf = {NULL, NULL, NULL, NULL}; static int velocityDirichletConditions = 0; +static int meshVelocityDirichletConditions = 0; static int velocityNeumannConditions = 0; static int pressureDirichletConditions = 0; static int scalarDirichletConditions = 0; @@ -45,6 +47,10 @@ void oudfFindDirichlet(std::string &field) std::cout << "WARNING: Cannot find oudf function: pressureDirichletConditions!\n"; // ABORT(EXIT_FAILURE); this bc is optional } + if(field.find("mesh") != std::string::npos && !meshVelocityDirichletConditions && !bcMap::useDerivedMeshBoundaryConditions()) { + if (platform->comm.mpiRank == 0) std::cout << "Cannot find oudf function: meshVelocityDirichletConditions!\n"; + ABORT(EXIT_FAILURE); + } } void oudfFindNeumann(std::string &field) @@ -80,9 +86,7 @@ void oudfInit(setupAide &options) int buildRank = platform->comm.mpiRank; MPI_Comm comm = platform->comm.mpiComm; - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); + const bool buildNodeLocal = useNodeLocalCache(); if(buildNodeLocal) { MPI_Comm_rank(platform->comm.mpiCommLocal, &buildRank); comm = platform->comm.mpiCommLocal; @@ -105,6 +109,13 @@ void oudfInit(setupAide &options) if(!found) out << "void velocityDirichletConditions(bcData *bc){}\n"; + found = std::regex_search(buffer.str(), std::regex(R"(\s*void\s+meshVelocityDirichletConditions)")); + meshVelocityDirichletConditions = found; + if(!found) + out << "void meshVelocityDirichletConditions(bcData *bc){\n" + " velocityDirichletConditions(bc);\n" + "}\n"; + found = std::regex_search(buffer.str(), std::regex(R"(\s*void\s+velocityNeumannConditions)")); velocityNeumannConditions = found; if(!found) @@ -125,15 +136,11 @@ void oudfInit(setupAide &options) if(!found) out << "void scalarDirichletConditions(bcData *bc){}\n"; - out << - "@kernel void __dummy__(int N) {" - " for (int i = 0; i < N; ++i; @tile(64, @outer, @inner)) {}" - "}"; - out.close(); } MPI_Bcast(&velocityDirichletConditions, 1, MPI_INT, 0, comm); + MPI_Bcast(&meshVelocityDirichletConditions, 1, MPI_INT, 0, comm); MPI_Bcast(&velocityNeumannConditions, 1, MPI_INT, 0, comm); MPI_Bcast(&pressureDirichletConditions, 1, MPI_INT, 0, comm); MPI_Bcast(&scalarNeumannConditions, 1, MPI_INT, 0, comm); @@ -146,9 +153,7 @@ void oudfInit(setupAide &options) void udfBuild(const char* udfFile, setupAide& options) { int buildRank = platform->comm.mpiRank; - int buildNodeLocal = 0; - if (getenv("NEKRS_BUILD_NODE_LOCAL")) - buildNodeLocal = std::stoi(getenv("NEKRS_BUILD_NODE_LOCAL")); + const bool buildNodeLocal = useNodeLocalCache(); if(buildNodeLocal) MPI_Comm_rank(platform->comm.mpiCommLocal, &buildRank); @@ -156,9 +161,9 @@ void udfBuild(const char* udfFile, setupAide& options) if(buildRank == 0){ double tStart = MPI_Wtime(); - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - std::string udf_dir = install_dir + "/udf"; + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + std::string udf_dir = installDir + "/udf"; std::string cache_dir; cache_dir.assign(getenv("NEKRS_CACHE_DIR")); @@ -188,19 +193,41 @@ void udfBuild(const char* udfFile, setupAide& options) char udfFileResolved[BUFSIZ]; realpath(udfFile, udfFileResolved); sprintf(cmd, - "cd %s/udf && cp -f %s udf.cpp && cp -f %s/CMakeLists.txt . && " - "rm -f *.so && cmake -Wno-dev -DCASE_DIR=\"%s\" -DCMAKE_CXX_COMPILER=\"$NEKRS_CXX\" " - "-DCMAKE_CXX_FLAGS=\"$NEKRS_CXXFLAGS\" . %s", - cache_dir.c_str(), - udfFileResolved, - udf_dir.c_str(), - case_dir.c_str(), - pipeToNull.c_str()); + "cp -f %s %s/udf/udf.cpp && cp -f %s/CMakeLists.txt %s/udf && rm -f %s/udf/*.so", + udfFileResolved, + cache_dir.c_str(), + udf_dir.c_str(), + cache_dir.c_str(), + cache_dir.c_str()); if(verbose && platform->comm.mpiRank == 0) printf("%s\n", cmd); if(system(cmd)) return EXIT_FAILURE; + + std::string cmakeFlags("-Wno-dev"); + if(verbose) cmakeFlags += " --trace-expand"; + std::string cmakeBuildDir = cache_dir + "/udf"; + sprintf(cmd, "cmake %s -S %s -B %s -DCASE_DIR=\"%s\" -DCMAKE_CXX_COMPILER=\"$NEKRS_CXX\" " + "-DCMAKE_CXX_FLAGS=\"$NEKRS_CXXFLAGS\" %s", + cmakeFlags.c_str(), + cmakeBuildDir.c_str(), + cmakeBuildDir.c_str(), + case_dir.c_str(), + pipeToNull.c_str()); + const int retVal = system(cmd); + if(verbose && platform->comm.mpiRank == 0) { + printf("%s (retVal: %d)\n", cmd, retVal); + } + if(retVal) return EXIT_FAILURE; } - sprintf(cmd, "cd %s/udf && make %s", cache_dir.c_str(), pipeToNull.c_str()); - if(system(cmd)) return EXIT_FAILURE; + + { + sprintf(cmd, "cd %s/udf && make %s", cache_dir.c_str(), pipeToNull.c_str()); + const int retVal = system(cmd); + if(verbose && platform->comm.mpiRank == 0) { + printf("%s (retVal: %d)\n", cmd, retVal); + } + if(retVal) return EXIT_FAILURE; + } + fileSync(udfLib.c_str()); if(platform->comm.mpiRank == 0) printf("done (%gs)\n", MPI_Wtime() - tStart); fflush(stdout); @@ -221,7 +248,7 @@ void* udfLoadFunction(const char* fname, int errchk) sprintf(udfLib, "%s/udf/libUDF.so", cache_dir); void* h, * fptr; - h = dlopen(udfLib, RTLD_LAZY | RTLD_GLOBAL); + h = dlopen(udfLib, RTLD_NOW | RTLD_GLOBAL); if (!h) goto errOpen; fptr = dlsym(h,fname); @@ -242,16 +269,16 @@ void* udfLoadFunction(const char* fname, int errchk) void udfLoad(void) { *(void**)(&udf.setup0) = udfLoadFunction("UDF_Setup0",0); - *(void**)(&udf.setup) = udfLoadFunction("UDF_Setup",1); + *(void**)(&udf.setup) = udfLoadFunction("UDF_Setup",0); *(void**)(&udf.loadKernels) = udfLoadFunction("UDF_LoadKernels",1); *(void**)(&udf.executeStep) = udfLoadFunction("UDF_ExecuteStep",0); } -occa::kernel udfBuildKernel(occa::properties kernelInfo, const char* function) +occa::kernel oudfBuildKernel(occa::properties kernelInfo, const char *function) { - std::string install_dir; - install_dir.assign(getenv("NEKRS_INSTALL_DIR")); - const std::string bcDataFile = install_dir + "/include/core/bcData.h"; + std::string installDir; + installDir.assign(getenv("NEKRS_INSTALL_DIR")); + const std::string bcDataFile = installDir + "/include/bdry/bcData.h"; kernelInfo["includes"] += bcDataFile.c_str(); // provide some common kernel args diff --git a/src/udf/udf.hpp b/src/udf/udf.hpp index 8728a3429..658dd7a91 100644 --- a/src/udf/udf.hpp +++ b/src/udf/udf.hpp @@ -7,6 +7,7 @@ #include "nekInterfaceAdapter.hpp" #include "parReader.hpp" #include "constantFlowRate.hpp" +#include "postProcessing.hpp" extern "C" { void UDF_Setup0(MPI_Comm comm, setupAide &options); @@ -38,7 +39,7 @@ struct UDF udfsEqnSource sEqnSource; udfproperties properties; udfdiv div; - udfconv converged; + udfconv timeStepConverged; }; extern UDF udf; @@ -49,6 +50,6 @@ void oudfInit(setupAide &options); void udfBuild(const char* udfFile, setupAide& options); void udfLoad(void); void* udfLoadFunction(const char* fname, int errchk); -occa::kernel udfBuildKernel(occa::properties kernelInfo, const char* function); +occa::kernel oudfBuildKernel(occa::properties kernelInfo, const char *function); #endif diff --git a/src/core/utils/inipp.cpp b/src/utils/inipp.cpp similarity index 96% rename from src/core/utils/inipp.cpp rename to src/utils/inipp.cpp index 252a54842..0b5a8f4c5 100644 --- a/src/core/utils/inipp.cpp +++ b/src/utils/inipp.cpp @@ -35,6 +35,8 @@ #include +int getDigitsRepresentation(int n) { return std::to_string(n).length(); } + namespace inipp { namespace detail @@ -116,7 +118,7 @@ void Ini::parse(std::stringstream & is, bool lowerValue) if (line.back() == char_section_end) { section = line.substr(1, length - 2); transform(section.begin(), section.end(), section.begin(), - std::ptr_fun(std::tolower)); + [](int c){return std::tolower(c);}); } else { errors.push_back(line); } @@ -124,7 +126,7 @@ void Ini::parse(std::stringstream & is, bool lowerValue) std::string variable(line.substr(0, pos)); std::string value(line.substr(pos + 1, length)); transform(variable.begin(), variable.end(), variable.begin(), - std::ptr_fun(std::tolower)); + [](int c){return std::tolower(c);}); detail::rtrim(variable); detail::ltrim(value); @@ -132,7 +134,7 @@ void Ini::parse(std::stringstream & is, bool lowerValue) lowerValue && value.back() == '"'; if (lowerValue && !inquotes) transform(value.begin(), value.end(), value.begin(), - std::ptr_fun(std::tolower)); + [](int c){return std::tolower(c);}); value.erase(std::remove(value.begin(), value.end(), '"'), value.end()); auto & sec = sections[section]; diff --git a/src/core/utils/inipp.hpp b/src/utils/inipp.hpp similarity index 90% rename from src/core/utils/inipp.hpp rename to src/utils/inipp.hpp index b95b586cb..f90d630c9 100644 --- a/src/core/utils/inipp.hpp +++ b/src/utils/inipp.hpp @@ -38,6 +38,8 @@ #include #include +int getDigitsRepresentation(int n); + namespace inipp { @@ -67,7 +69,7 @@ string_to_boolean_t string_to_boolean( const std::string s, bool strict = false s2.begin(), s2.end(), s2.begin(), - std::ptr_fun ( std::tolower ) + [](int c){return std::tolower(c);} ); // Does the string represent a FALSE? @@ -135,6 +137,24 @@ class Ini } } + template bool set(const String &key, const String &value, TT &&src) + { + if (sections[key].count(value)) { + if (std::is_same::value) { + sections[key][value] = src; + } + else { + std::ostringstream ss; + ss << src; + sections[key][value] = ss.str(); + } + return true; + } + else { + return false; + } + } + bool extract(const String & key, const String & value, String & dst) ; diff --git a/src/core/utils/mysort.cpp b/src/utils/mysort.cpp similarity index 100% rename from src/core/utils/mysort.cpp rename to src/utils/mysort.cpp diff --git a/src/core/utils/parallelSort.cpp b/src/utils/parallelSort.cpp similarity index 100% rename from src/core/utils/parallelSort.cpp rename to src/utils/parallelSort.cpp diff --git a/src/utils/randomVector.hpp b/src/utils/randomVector.hpp new file mode 100644 index 000000000..3222534d2 --- /dev/null +++ b/src/utils/randomVector.hpp @@ -0,0 +1,18 @@ +#include +#include +#include +#include +#include +template std::vector randomVector(int N) +{ + + std::default_random_engine dev; + std::uniform_real_distribution dist{0.0, 1.0}; + + auto gen = [&dist, &dev]() { return dist(dev); }; + + std::vector vec(N); + std::generate(vec.begin(), vec.end(), gen); + + return vec; +} \ No newline at end of file diff --git a/src/utils/setupAide.cpp b/src/utils/setupAide.cpp new file mode 100644 index 000000000..04971b4ef --- /dev/null +++ b/src/utils/setupAide.cpp @@ -0,0 +1,123 @@ +/* + + The MIT License (MIT) + + Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + + */ + +#include +#include "setupAide.hpp" + +std::string setupAide::getArgs(std::string key) const +{ + if(keyWordToDataMap.count(key) == 0){ + return ""; + } + return keyWordToDataMap.at(key); +} + +void setupAide::removeArgs(std::string key) +{ + auto iter = keyWordToDataMap.find(key); + if(iter != keyWordToDataMap.end()){ + keyWordToDataMap.erase(iter); + } +} + +void setupAide::setArgs(std::string key, std::string value) +{ + keyWordToDataMap[key] = value; +} + +int setupAide::getArgs(std::string key, std::vector < std::string >& m, std::string delimeter) const +{ + std::string args, current; + std::vector < std::string > argv; + int argc, size; + + args = getArgs(key); + + size = args.length(); + + current = ""; + + for(int i = 0; i < size; i++) { // TW + while( i < size && delimeter.find(args[i]) == std::string::npos ) + current += args[i++]; + + if(current.length()) + argv.push_back(current); + + current = ""; + } + + argc = argv.size(); + + if(!argc) + return 0; + + m.resize(argc); + + for(int i = 0; i < argc; i++) // TW + m[i] = argv[i]; + + return 1; +} + +int setupAide::compareArgs(std::string key, std::string token) const +{ + std::string foundToken; + if(getArgs(key,foundToken)) { + if(foundToken == token) + return 1; + if(foundToken.find(token) != std::string::npos) + return 2; + } + + return 0; +} + +std::ostream & operator << (std::ostream &os, const setupAide &aide){ + int maxLength = 0; + for(auto&& keyAndValuePair : aide.keyWordToDataMap) + { + const std::string key = keyAndValuePair.first; + int L = key.length(); + if(L > maxLength) + maxLength = L; + } + + std::string key, value; + + for(auto&& keyAndValuePair : aide.keyWordToDataMap) + { + std::tie(key, value) = keyAndValuePair; + os << "key: " << key << ","; + + for(int j = key.length(); j < maxLength; ++j) + os << " "; + + os << "value: " << value << std::endl; + } + + return os; + } diff --git a/src/core/setupAide.hpp b/src/utils/setupAide.hpp similarity index 76% rename from src/core/setupAide.hpp rename to src/utils/setupAide.hpp index 38729e0e6..0baa70be9 100644 --- a/src/core/setupAide.hpp +++ b/src/utils/setupAide.hpp @@ -38,39 +38,34 @@ SOFTWARE. #include #include "nrssys.hpp" +#include class setupAide { private: - std::vector data; - std::vector keyword; + std::map keyWordToDataMap; public: - setupAide(); - setupAide(std::string); + setupAide(){}; + ~setupAide() = default; - setupAide(const setupAide&); - setupAide& operator=(const setupAide&); + setupAide(const setupAide&) = default; + setupAide& operator=(const setupAide&) = default; - std::string readFile(std::string); - void read(std::string); + std::string getArgs(std::string) const; - std::string getArgs(std::string); + void removeArgs(std::string key); void setArgs(std::string key, std::string value); template - int getArgs(std::string, T&); + int getArgs(std::string, T&) const; template - int getArgs(std::string, std::vector&); + int getArgs(std::string, std::vector&) const; - int getArgs(std::string, std::vector&, std::string); + int getArgs(std::string, std::vector&, std::string) const; - - int compareArgs(std::string key, std::string token); - - std::vector &getData(){ return data; } - std::vector &getKeyword() { return keyword; } + int compareArgs(std::string key, std::string token) const; friend std::ostream & operator << (std::ostream &out, const setupAide &aide); }; diff --git a/src/core/setupAide.tpp b/src/utils/setupAide.tpp similarity index 67% rename from src/core/setupAide.tpp rename to src/utils/setupAide.tpp index fd5af0223..c3f90df5f 100644 --- a/src/core/setupAide.tpp +++ b/src/utils/setupAide.tpp @@ -1,5 +1,5 @@ template -int setupAide::getArgs(std::string key, T& t){ +int setupAide::getArgs(std::string key, T& t) const{ std::vector m; getArgs(key,m); @@ -10,12 +10,11 @@ int setupAide::getArgs(std::string key, T& t){ return 1; } - //printf("Failed to find [%s].\n", key.c_str()); return 0; } template -int setupAide::getArgs(std::string key, std::vector& m){ +int setupAide::getArgs(std::string key, std::vector& m) const { std::stringstream args; std::vector argv; int argc; @@ -29,7 +28,6 @@ int setupAide::getArgs(std::string key, std::vector& m){ argc = argv.size(); if(!argc){ - //printf("Failed to find [%s].\n", key.c_str()); return 0; } diff --git a/src/core/utils/tinyexpr.c b/src/utils/tinyexpr.c similarity index 100% rename from src/core/utils/tinyexpr.c rename to src/utils/tinyexpr.c diff --git a/src/core/utils/tinyexpr.h b/src/utils/tinyexpr.h similarity index 100% rename from src/core/utils/tinyexpr.h rename to src/utils/tinyexpr.h