From 65147ed815d96fa94a05d307c1d9980877b7d0e8 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 27 Aug 2024 16:53:23 +0200 Subject: [PATCH] Nvidia remastered (#464) * nvidia: update hello world following changes in #456 * update Nvidia backend to use the new LLVM infra * update Nvidia multiplication --- constantine/math_compiler/codegen_amdgpu.nim | 39 -- constantine/math_compiler/codegen_nvidia.nim | 35 +- .../math_compiler/impl_fields_dispatch.nim | 34 ++ .../math_compiler/impl_fields_globals.nim | 3 +- .../math_compiler/impl_fields_nvidia.nim | 444 +++++++++--------- constantine/math_compiler/impl_fields_sat.nim | 2 +- constantine/math_compiler/ir.nim | 22 +- constantine/math_compiler/pub_fields.nim | 38 +- constantine/platforms/llvm/asm_nvidia.nim | 11 +- tests/gpu/hello_world_nvidia.nim | 76 ++- tests/gpu/t_nvidia_fp.nim | 257 +++------- 11 files changed, 437 insertions(+), 524 deletions(-) create mode 100644 constantine/math_compiler/impl_fields_dispatch.nim diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim index c9e84b9ee..cd87b3640 100644 --- a/constantine/math_compiler/codegen_amdgpu.nim +++ b/constantine/math_compiler/codegen_amdgpu.nim @@ -67,45 +67,6 @@ proc hipDeviceInit*(deviceID = 0'i32): HipDevice = return hipDevice -# ############################################################ -# -# LLVM IR for AMD GPUs -# -# ############################################################ -# -# Note: -# __device__ functions for field and elliptic curve arithmetic -# might be compiled by default with scalar codegen -# -# We will need to either: -# - Derive explicitly a vectorized version of the warp/wave size (32) -# - Derive implicitly a vectorized version, probably with __forceinline__ - -proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) = - ## Create a public wrapper of a Hip device function - ## - ## A function named `addmod` can be found by appending _public - ## check hipModuleGetFunction(fnPointer, cuModule, "addmod_public") - - let pubName = fn.fnImpl.getName() & "_public" - let pubFn = module.addFunction(cstring(pubName), fn.fnTy) - - let ctx = module.getContext() - let builder = ctx.createBuilder() - defer: builder.dispose() - - let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") - builder.positionAtEnd(blck) - - var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) - for i, arg in mpairs(args): - arg = pubFn.getParam(i.uint32) - discard builder.call2(fn.fnTy, fn.fnImpl, args) - - # A public kernel must return void - builder.retVoid() - pubFn.setCallingConvention(AMDGPU_KERNEL) - # ############################################################ # # Code generation diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index fdc4c393b..ccacca53b 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -109,37 +109,6 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice = return cuDevice -# ############################################################ -# -# LLVM IR for Nvidia GPUs -# -# ############################################################ - -proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) = - ## Create a public wrapper of a cuda device function - ## - ## A function named `addmod` can be found by appending _public - ## check cuModuleGetFunction(fnPointer, cuModule, "addmod_public") - - let pubName = fn.fnImpl.getName() & "_public" - let pubFn = module.addFunction(cstring(pubName), 
fn.fnTy) - - let ctx = module.getContext() - let builder = ctx.createBuilder() - defer: builder.dispose() - - let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") - builder.positionAtEnd(blck) - - var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) - for i, arg in mpairs(args): - arg = pubFn.getParam(i.uint32) - discard builder.call2(fn.fnTy, fn.fnImpl, args) - - # A public kernel must return void - builder.retVoid() - module.tagCudaKernel((fn.fnTy, pubFn)) - # ############################################################ # # Code generation @@ -194,9 +163,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str # # ############################################################ -proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction = - # Public kernels are appended _public - let fnName = cm.genSymbol(opcode) & "_public" +proc getCudaKernel*(cuMod: CUmodule, fnName: string): CUfunction = check cuModuleGetFunction(result, cuMod, fnName) proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) = diff --git a/constantine/math_compiler/impl_fields_dispatch.nim b/constantine/math_compiler/impl_fields_dispatch.nim new file mode 100644 index 000000000..020a10dc8 --- /dev/null +++ b/constantine/math_compiler/impl_fields_dispatch.nim @@ -0,0 +1,34 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
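+
+# This module dispatches modular arithmetic to the saturated-limbs
+# implementation (x86-64, AMD GPU) or to the Nvidia carry-chain
+# implementation, depending on the assembler's backend.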
+ +import + constantine/platforms/llvm/llvm, + ./ir, + ./impl_fields_sat {.all.}, + ./impl_fields_nvidia {.all.} + +proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of {bkX86_64_Linux, bkAmdGpu}: + asy.modadd_sat(fd, r, a, b, M) + of bkNvidiaPTX: + asy.modadd_nvidia(fd, r, a, b, M) + +proc modsub*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of bkNvidiaPTX: + asy.modsub_nvidia(fd, r, a, b, M) + else: + doAssert false, "Unimplemented" + +proc mtymul*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of bkNvidiaPTX: + asy.mtymul_nvidia(fd, r, a, b, M) + else: + doAssert false, "Unimplemented" \ No newline at end of file diff --git a/constantine/math_compiler/impl_fields_globals.nim b/constantine/math_compiler/impl_fields_globals.nim index faac591ca..1957aa930 100644 --- a/constantine/math_compiler/impl_fields_globals.nim +++ b/constantine/math_compiler/impl_fields_globals.nim @@ -157,8 +157,7 @@ proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef = fd.wordTy ) - - return m0ninv + return asy.load2(fd.wordTy, m0ninv, "m0ninv") when isMainModule: let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux) diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 5843d02d8..b27a843c0 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -8,7 +8,8 @@ import constantine/platforms/llvm/[llvm, asm_nvidia], - ./ir + ./ir, + ./impl_fields_globals # ############################################################ # @@ -46,7 +47,9 @@ import # # We cannot use i256 on Nvidia target: https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp#L244-L276 -proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +const SectionName = "ctt.fields" + +proc finalSubMayOverflow(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, M: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -55,31 +58,27 @@ proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r ## ## To be used when the final substraction can ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) - - let bld = asy.builder - let fieldTy = cm.getFieldType(field) - let scratch = bld.makeArray(fieldTy) - let M = cm.getModulus(field) - let N = M.len + let N = fd.numWords + let scratch = asy.makeArray(fd.fieldTy) # Contains 0x0001 (if overflowed limbs) or 0x0000 - let overflowedLimbs = bld.add_ci(0'u32, 0'u32) + let overflowedLimbs = asy.br.add_ci(0'u32, 0'u32) # Now substract the modulus, and test a < M with the last borrow - scratch[0] = bld.sub_bo(a[0], M[0]) + scratch[0] = asy.br.sub_bo(a[0], M[0]) for i in 1 ..< N: - scratch[i] = bld.sub_bio(a[i], M[i]) + scratch[i] = asy.br.sub_bio(a[i], M[i]) # 1. if `overflowedLimbs`, underflowedModulus >= 0 # 2. if a >= M, underflowedModulus >= 0 # if underflowedModulus >= 0: a-M else: a # TODO: predicated mov instead? 
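+  # sub_bi consumes the borrow left by the modulus-subtraction chain above:
+  # underflowedModulus = overflowedLimbs - 0 - borrow(a - M)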
- let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32) + let underflowedModulus = asy.br.sub_bi(overflowedLimbs, 0'u32) for i in 0 ..< N: - r[i] = bld.slct(scratch[i], a[i], underflowedModulus) + r[i] = asy.br.slct(scratch[i], a[i], underflowedModulus) -proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubNoOverflow(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, M: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -88,275 +87,262 @@ proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, ## ## To be used when the modulus does not use the full bitwidth of the storing words ## (say using 255 bits for the modulus out of 256 available in words) - - let bld = asy.builder - let fieldTy = cm.getFieldType(field) - let scratch = bld.makeArray(fieldTy) - let M = cm.getModulus(field) - let N = M.len + let N = fd.numWords + let scratch = asy.makeArray(fd.fieldTy) # Now substract the modulus, and test a < M with the last borrow - scratch[0] = bld.sub_bo(a[0], M[0]) + scratch[0] = asy.br.sub_bo(a[0], M[0]) for i in 1 ..< N: - scratch[i] = bld.sub_bio(a[i], M[i]) + scratch[i] = asy.br.sub_bio(a[i], M[i]) # If it underflows here a was smaller than the modulus, which is what we want - let underflowedModulus = bld.sub_bi(0'u32, 0'u32) + let underflowedModulus = asy.br.sub_bi(0'u32, 0'u32) for i in 0 ..< N: - r[i] = bld.slct(scratch[i], a[i], underflowedModulus) + r[i] = asy.br.slct(scratch[i], a[i], underflowedModulus) -proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = +proc modadd_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` + let red = if fd.spareBits >= 1: "noo" + else: "mayo" + let name = "_modadd_" & red & ".u" & $fd.w & "x" & $fd.numWords - let procName = cm.genSymbol(block: - case field - of fp: opFpAdd - of fr: opFrAdd) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): - let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let addModKernel = asy.module.addFunction(cstring procName, addModTy) - let blck = asy.ctx.appendBasicBlock(addModKernel, "addModBody") - asy.builder.positionAtEnd(blck) + tagParameter(1, "sret") - let bld = asy.builder + let (rr, aa, bb, MM) = llvmParams - let r = bld.asArray(addModKernel.getParam(0), fieldTy) - let a = bld.asArray(addModKernel.getParam(1), fieldTy) - let b = bld.asArray(addModKernel.getParam(2), fieldTy) + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords - t[0] = bld.add_co(a[0], b[0]) - for i in 1 ..< N: - t[i] = bld.add_cio(a[i], b[i]) + t[0] = asy.br.add_co(a[0], b[0]) + for i in 1 ..< N: + t[i] = asy.br.add_cio(a[i], b[i]) - if cm.getSpareBits(field) >= 1: - asy.finalSubNoOverflow(cm, field, t, t) - else: - asy.finalSubMayOverflow(cm, field, t, t) + if fd.spareBits >= 1: + asy.finalSubNoOverflow(fd, t, t, M) + else: + asy.finalSubMayOverflow(fd, t, t, M) - bld.store(r, t) - bld.retVoid() + asy.store(r, t) + asy.br.retVoid() - return (addModTy, addModKernel) + asy.callFn(name, [r, a, b, M]) -proc 
field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = +proc modsub_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} = ## Generate an optimized modular substraction kernel ## with parameters `a, b, modulus: Limbs -> Limbs` + let name = "_modsub.u" & $fd.w & "x" & $fd.numWords - let procName = cm.genSymbol(block: - case field - of fp: opFpSub - of fr: opFrSub) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): - let subModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let subModKernel = asy.module.addFunction(cstring procName, subModTy) - let blck = asy.ctx.appendBasicBlock(subModKernel, "subModBody") - asy.builder.positionAtEnd(blck) + tagParameter(1, "sret") - let bld = asy.builder + let (rr, aa, bb, MM) = llvmParams - let r = bld.asArray(subModKernel.getParam(0), fieldTy) - let a = bld.asArray(subModKernel.getParam(1), fieldTy) - let b = bld.asArray(subModKernel.getParam(2), fieldTy) + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) - let zero = case cm.wordSize - of w32: constInt(asy.i32_t, 0) - of w64: constInt(asy.i64_t, 0) + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords - t[0] = bld.sub_bo(a[0], b[0]) - for i in 1 ..< N: - t[i] = bld.sub_bio(a[i], b[i]) + t[0] = asy.br.sub_bo(a[0], b[0]) + for i in 1 ..< N: + t[i] = asy.br.sub_bio(a[i], b[i]) - let underflowMask = bld.sub_bi(zero, zero) + let underflowMask = asy.br.sub_bi(fd.zero, fd.zero) - # If underflow - # TODO: predicated mov instead? - let M = (seq[ValueRef])(cm.getModulus(field)) - let maskedM = bld.makeArray(fieldTy) - for i in 0 ..< N: - maskedM[i] = bld.`and`(M[i], underflowMask) + # If underflow + # TODO: predicated mov instead? 
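+    # underflowMask = 0 - 0 - borrow(a - b): all-ones on borrow, all-zeroes
+    # otherwise, so maskedM below is either M or 0 and the correction
+    # addition stays branchless.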
+ let maskedM = asy.makeArray(fd.fieldTy) + for i in 0 ..< N: + maskedM[i] = asy.br.`and`(M[i], underflowMask) - block: - t[0] = bld.add_co(t[0], maskedM[0]) - for i in 1 ..< N-1: - t[i] = bld.add_cio(t[i], maskedM[i]) - if N > 1: - t[N-1] = bld.add_ci(t[N-1], maskedM[N-1]) + block: + t[0] = asy.br.add_co(t[0], maskedM[0]) + for i in 1 ..< N-1: + t[i] = asy.br.add_cio(t[i], maskedM[i]) + if N > 1: + t[N-1] = asy.br.add_ci(t[N-1], maskedM[N-1]) - bld.store(r, t) - bld.retVoid() + asy.store(r, t) + asy.br.retVoid() - return (subModTy, subModKernel) + asy.callFn(name, [r, a, b, M]) -proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub: bool): FnDef = +proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce: bool) = ## Generate an optimized modular multiplication kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - let procName = cm.genSymbol(block: - if skipFinalSub: - case field - of fp: opFpMulSkipFinalSub - of fr: opFrMulSkipFinalSub + let name = + if not finalReduce and fd.spareBits >= 2: + "_mty_mulur.u" & $fd.w & "x" & $fd.numWords & "b2" else: - case field - of fp: opFpMul - of fr: opFrMul) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) - - let mulModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let mulModKernel = asy.module.addFunction(cstring procName, mulModTy) - let blck = asy.ctx.appendBasicBlock(mulModKernel, "mulModBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let r = bld.asArray(mulModKernel.getParam(0), fieldTy) - let a = bld.asArray(mulModKernel.getParam(1), fieldTy) - let b = bld.asArray(mulModKernel.getParam(2), fieldTy) - - # Algorithm - # ----------------------------------------- - # - # On x86, with a single carry chain and a spare bit: - # - # for i=0 to N-1 - # (A, t[0]) <- a[0] * b[i] + t[0] - # m <- (t[0] * m0ninv) mod 2ʷ - # (C, _) <- m * M[0] + t[0] - # for j=1 to N-1 - # (A, t[j]) <- a[j] * b[i] + A + t[j] - # (C, t[j-1]) <- m * M[j] + C + t[j] - # - # t[N-1] = C + A - # - # with MULX, ADCX, ADOX dual carry chains - # - # for i=0 to N-1 - # for j=0 to N-1 - # (A,t[j]) := t[j] + a[j]*b[i] + A - # m := t[0]*m0ninv mod W - # C,_ := t[0] + m*M[0] - # for j=1 to N-1 - # (C,t[j-1]) := t[j] + m*M[j] + C - # t[N-1] = C + A - # - # In our case, we only have a single carry flag - # but we have a lot of registers - # and a multiply-accumulate instruction - # - # Hence we can use the dual carry chain approach - # one chain after the other instead of interleaved like on x86. 
- - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) - let m0ninv = ValueRef cm.getMontgomeryNegInverse0(field) - let M = (seq[ValueRef])(cm.getModulus(field)) - let zero = case cm.wordSize - of w32: constInt(asy.i32_t, 0) - of w64: constInt(asy.i64_t, 0) + doAssert fd.spareBits >= 1 + "_mty_mul.u" & $fd.w & "x" & $fd.numWords & "b1" - for i in 0 ..< N: - # Multiplication - # ------------------------------- - # for j=0 to N-1 - # (A,t[j]) := t[j] + a[j]*b[i] + A + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): + + tagParameter(1, "sret") + + let (rr, aa, bb, MM) = llvmParams + + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) + + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords + let m0ninv = asy.getM0ninv(fd) + + # Algorithm + # ----------------------------------------- # - # for 4 limbs, implicit column-wise carries + # On x86, with a single carry chain and a spare bit: # - # t[0] = t[0] + (a[0]*b[i]).lo - # t[1] = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi - # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi - # t[3] = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi - # overflow = (a[3]*b[i]).hi + # for i=0 to N-1 + # (A, t[0]) <- a[0] * b[i] + t[0] + # m <- (t[0] * m0ninv) mod 2ʷ + # (C, _) <- m * M[0] + t[0] + # for j=1 to N-1 + # (A, t[j]) <- a[j] * b[i] + A + t[j] + # (C, t[j-1]) <- m * M[j] + C + t[j] # - # or + # t[N-1] = C + A # - # t[0] = t[0] + (a[0]*b[i]).lo - # t[1] = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo - # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi - # t[3] = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo - # overflow = carry + (a[3]*b[i]).hi + # with MULX, ADCX, ADOX dual carry chains # - # Depending if we chain lo/hi or even/odd - # The even/odd carry chain is more likely to be optimized via μops-fusion - # as it's common to compute the full product. That said: - # - it's annoying if the number of limbs is odd with edge conditions. 
- # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic - let bi = b[i] - var A = ValueRef zero - - if i == 0: - for j in 0 ..< N: - t[j] = bld.mul(a[j], bi) - else: - t[0] = bld.mulloadd_co(a[0], bi, t[0]) - for j in 1 ..< N: - t[j] = bld.mulloadd_cio(a[j], bi, t[j]) - if N > 1: - A = bld.add_ci(zero, zero) - if N > 1: - t[1] = bld.mulhiadd_co(a[0], bi, t[1]) - for j in 2 ..< N: - t[j] = bld.mulhiadd_cio(a[j-1], bi, t[j]) - A = bld.mulhiadd_ci(a[N-1], bi, A) - - # Reduction - # ------------------------------- + # for i=0 to N-1 + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A # m := t[0]*m0ninv mod W - # # C,_ := t[0] + m*M[0] # for j=1 to N-1 # (C,t[j-1]) := t[j] + m*M[j] + C # t[N-1] = C + A # - # for 4 limbs, implicit column-wise carries - # _ = t[0] + (m*M[0]).lo - # t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi - # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi - # t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi - # t[3] = A + carry + (m*M[3]).hi - # - # or + # In our case, we only have a single carry flag + # but we have a lot of registers + # and a multiply-accumulate instruction # - # _ = t[0] + (m*M[0]).lo - # t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo - # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi - # t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo - # t[3] = A + carry + (m*M[3]).hi - - let m = bld.mul(t[0], m0ninv) - let _ = bld.mulloadd_co(m, M[0], t[0]) - for j in 1 ..< N: - t[j-1] = bld.mulloadd_cio(m, M[j], t[j]) - t[N-1] = bld.add_ci(A, 0) - if N > 1: - t[0] = bld.mulhiadd_co(m, M[0], t[0]) - for j in 1 ..< N-1: - t[j] = bld.mulhiadd_cio(m, M[j], t[j]) - t[N-1] = bld.mulhiadd_ci(m, M[N-1], t[N-1]) - else: - t[0] = bld.mulhiadd(m, M[0], t[0]) + # Hence we can use the dual carry chain approach + # one chain after the other instead of interleaved like on x86. + + for i in 0 ..< N: + # Multiplication + # ------------------------------- + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A + # + # for 4 limbs, implicit column-wise carries + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi + # overflow = (a[3]*b[i]).hi + # + # or + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo + # overflow = carry + (a[3]*b[i]).hi + # + # Depending if we chain lo/hi or even/odd + # The even/odd carry chain is more likely to be optimized via μops-fusion + # as it's common to compute the full product. That said: + # - it's annoying if the number of limbs is odd with edge conditions. 
+      # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic
+      let bi = b[i]
+      var A = fd.zero
+
+      if i == 0:
+        for j in 0 ..< N:
+          t[j] = asy.br.mul(a[j], bi)
+      else:
+        t[0] = asy.br.mulloadd_co(a[0], bi, t[0])
+        for j in 1 ..< N:
+          t[j] = asy.br.mulloadd_cio(a[j], bi, t[j])
+        if N > 1:
+          A = asy.br.add_ci(fd.zero, fd.zero)
+      if N > 1:
+        t[1] = asy.br.mulhiadd_co(a[0], bi, t[1])
+        for j in 2 ..< N:
+          t[j] = asy.br.mulhiadd_cio(a[j-1], bi, t[j])
+        A = asy.br.mulhiadd_ci(a[N-1], bi, A)
+
+      # Reduction
+      # -------------------------------
+      #   m := t[0]*m0ninv mod W
+      #
+      #   C,_ := t[0] + m*M[0]
+      #   for j=1 to N-1
+      #     (C,t[j-1]) := t[j] + m*M[j] + C
+      #   t[N-1] = C + A
+      #
+      # for 4 limbs, implicit column-wise carries
+      #     _  = t[0] + (m*M[0]).lo
+      #   t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi
+      #   t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+      #   t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi
+      #   t[3] = A + carry + (m*M[3]).hi
+      #
+      # or
+      #
+      #     _  = t[0] + (m*M[0]).lo
+      #   t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo
+      #   t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+      #   t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
+      #   t[3] = A + carry + (m*M[3]).hi
+
+      let m = asy.br.mul(t[0], m0ninv)
+      let _ = asy.br.mulloadd_co(m, M[0], t[0])
+      for j in 1 ..< N:
+        t[j-1] = asy.br.mulloadd_cio(m, M[j], t[j])
+      t[N-1] = asy.br.add_ci(A, 0)
+      if N > 1:
+        t[0] = asy.br.mulhiadd_co(m, M[0], t[0])
+        for j in 1 ..< N-1:
+          t[j] = asy.br.mulhiadd_cio(m, M[j], t[j])
+        t[N-1] = asy.br.mulhiadd_ci(m, M[N-1], t[N-1])
+      else:
+        t[0] = asy.br.mulhiadd(m, M[0], t[0])
 
-  if not skipFinalSub:
-    asy.finalSubNoOverflow(cm, field, t, t)
+    if finalReduce:
+      asy.finalSubNoOverflow(fd, t, t, M)
 
-  bld.store(r, t)
-  bld.retVoid()
+    asy.store(r, t)
+    asy.br.retVoid()
 
-  return (mulModTy, mulModKernel)
+  asy.callFn(name, [r, a, b, M])
 
-proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef =
+proc mtymul_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce = true) {.used.} =
   ## Generate an optimized modular multiplication kernel
   ## with parameters `a, b, modulus: Limbs -> Limbs`
-  return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub)
+
+  # TODO: spareBits == 0
+  asy.mtymul_CIOS_sparebit(fd, r, a, b, M, finalReduce)
diff --git a/constantine/math_compiler/impl_fields_sat.nim b/constantine/math_compiler/impl_fields_sat.nim
index cd52f96f2..ab8fa4bfd 100644
--- a/constantine/math_compiler/impl_fields_sat.nim
+++ b/constantine/math_compiler/impl_fields_sat.nim
@@ -121,7 +121,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, fd: FieldDescriptor, rr, a, M: Val
   let t = asy.br.select(borrow, a, a_minus_M)
   asy.store(rr, t)
 
-proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+proc modadd_sat(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} =
   ## Generate an optimized modular addition kernel
   ## with parameters `a, b, modulus: Limbs -> Limbs`
 
diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim
index ab80e036a..db46a273f 100644
--- a/constantine/math_compiler/ir.nim
+++ b/constantine/math_compiler/ir.nim
@@ -259,11 +259,11 @@ proc makeArray*(asy: Assembler_LLVM, elemTy: TypeRef, len: uint32): Array =
 
 proc `[]`*(a: Array, index: SomeInteger): ValueRef {.inline.}=
   # First dereference the array pointer with 0, then access the `index`
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = 
a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [constInt(a.int32_t, 0), constInt(a.int32_t, uint64 index)])
   a.builder.load2(a.elemTy, pelem)
 
 proc `[]=`*(a: Array, index: SomeInteger, val: ValueRef) {.inline.}=
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [constInt(a.int32_t, 0), constInt(a.int32_t, uint64 index)])
   a.builder.store(val, pelem)
 
 proc store*(asy: Assembler_LLVM, dst: Array, src: Array) {.inline.}=
@@ -345,9 +345,10 @@ proc defineGlobalConstant*(
 
 proc tagCudaKernel(asy: Assembler_LLVM, fn: ValueRef) =
   ## Tag a function as a Cuda Kernel, i.e. callable from host
-  let returnTy = fn.getTypeOf().getReturnType()
-  doAssert returnTy.isVoid(), block:
-    "Kernels must not return values but function returns " & $returnTy.getTypeKind()
+  # We cannot get the full function type from its impl so we cannot do this check.
+  # let returnTy = fn.getTypeOf().getReturnType()
+  # doAssert returnTy.isVoid(), block:
+  #   "Kernels must not return values but function returns " & $returnTy.getTypeKind()
 
   asy.module.addNamedMetadataOperand(
     "nvvm.annotations",
@@ -360,8 +361,11 @@ proc tagCudaKernel(asy: Assembler_LLVM, fn: ValueRef) =
 
 proc setPublic(asy: Assembler_LLVM, fn: ValueRef) =
   case asy.backend
-  of bkAmdGpu: fn.setFnCallConv(AMDGPU_KERNEL)
-  of bkNvidiaPtx: asy.tagCudaKernel(fn)
+  of bkAmdGpu:
+    fn.setFnCallConv(AMDGPU_KERNEL)
+  of bkNvidiaPtx:
+    # asy.tagCudaKernel(fn)
+    fn.setFnCallConv(PTX_Kernel)
   else: discard
 
 # ############################################################
@@ -474,7 +478,7 @@ template llvmFnDef[N: static int](
       savedLoc = blck
 
     let llvmParams {.inject.} = unpackParams(asy.br, paramsTys)
-    template tagParameter(idx: int, attr: string) {.inject.} =
+    template tagParameter(idx: int, attr: string) {.inject, used.} =
       let a = asy.ctx.createAttr(attr)
       fn.addAttribute(cint idx, a)
     body
@@ -484,7 +488,7 @@ template llvmFnDef[N: static int](
     fn.setLinkage(linkInternal)
   else:
     asy.setPublic(fn)
-  fn.setSection(sectionName)
+  fn.setSection(cstring sectionName)
   asy.addAttributes(fn, attrs)
 
   asy.br.positionAtEnd(savedLoc)
diff --git a/constantine/math_compiler/pub_fields.nim b/constantine/math_compiler/pub_fields.nim
index 51f92da3e..83e602644 100644
--- a/constantine/math_compiler/pub_fields.nim
+++ b/constantine/math_compiler/pub_fields.nim
@@ -10,13 +10,13 @@ import
   constantine/platforms/llvm/llvm,
   ./ir,
   ./impl_fields_globals,
-  ./impl_fields_sat
+  ./impl_fields_dispatch
 
 proc genFpAdd*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
   ## Generate a public field addition proc
   ## with signature
   ##   void name(FieldType r, FieldType a, FieldType b)
-  ## with r the result and a, b the operants
+  ## with r the result and a, b the operands
   ## and return the corresponding name to call it
 
   let name = fd.name & "_add"
@@ -28,3 +28,37 @@ proc genFpAdd*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
     asy.br.retVoid()
 
   return name
+
+proc genFpSub*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
+  ## Generate a public field subtraction proc
+  ## with signature
+  ##   void name(FieldType r, FieldType a, FieldType b)
+  ## with r the result and a, b the operands
+  ## and return the corresponding name to call it
+
+  let name = fd.name & "_sub"
+  asy.llvmPublicFnDef(name, "ctt." 
& fd.name, asy.void_t, [fd.fieldTy, fd.fieldTy, fd.fieldTy]):
+    let M = asy.getModulusPtr(fd)
+
+    let (r, a, b) = llvmParams
+    asy.modsub(fd, r, a, b, M)
+    asy.br.retVoid()
+
+  return name
+
+proc genFpMul*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
+  ## Generate a public field multiplication proc
+  ## with signature
+  ##   void name(FieldType r, FieldType a, FieldType b)
+  ## with r the result and a, b the operands
+  ## and return the corresponding name to call it
+
+  let name = fd.name & "_mul"
+  asy.llvmPublicFnDef(name, "ctt." & fd.name, asy.void_t, [fd.fieldTy, fd.fieldTy, fd.fieldTy]):
+    let M = asy.getModulusPtr(fd)
+
+    let (r, a, b) = llvmParams
+    asy.mtymul(fd, r, a, b, M) # TODO: for now we only support Montgomery representation
+    asy.br.retVoid()
+
+  return name
\ No newline at end of file
diff --git a/constantine/platforms/llvm/asm_nvidia.nim b/constantine/platforms/llvm/asm_nvidia.nim
index c56c9063f..c3de09024 100644
--- a/constantine/platforms/llvm/asm_nvidia.nim
+++ b/constantine/platforms/llvm/asm_nvidia.nim
@@ -98,9 +98,9 @@ macro genInstr(body: untyped): untyped =
     instrBody.add quote do:
       let `ctx` {.used.} = builder.getContext()
       # lhs: ValueRef or uint32 or uint64
-      let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
+      let `numBits` = when `lhs` is ValueRef: `lhs`.getTypeOf().getIntTypeWidth()
                       else: 8*sizeof(`lhs`)
-      let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
+      let `regTy` = when `lhs` is ValueRef: `lhs`.getTypeOf()
                     elif `lhs` is uint32: `ctx`.int32_t()
                     elif `lhs` is uint64: `ctx`.int64_t()
                     else: {.error "Unsupported input type " & $typeof(`lhs`).}
@@ -225,8 +225,7 @@ macro genInstr(body: untyped): untyped =
      #   else: constInt(uint64(op))
      opArray.add nnkWhenStmt.newTree(
        nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"ValueRef"), op),
-        nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"ConstValueRef"), newCall(ident"ValueRef", op)),
-        nnkElse.newTree(newCall(ident"ValueRef", newCall(ident"constInt", regTy, newCall(ident"uint64", op))))
+        nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
      )
      # builder.call2(ty, inlineASM, [lhs, rhs], name)
      instrBody.add newCall(
@@ -249,14 +248,14 @@ macro genInstr(body: untyped): untyped =
        opDefs.add newIdentDefs(
          operands[i],
          nnkInfix.newTree(ident"or",
-            nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
+            nnkInfix.newTree(ident"or", ident"ValueRef", ident"uint32"),
            ident"uint64")
        )
      elif constraint == "rn":
        opDefs.add newIdentDefs(
          operands[i],
          nnkInfix.newTree(ident"or",
-            ident"AnyValueRef",
+            ident"ValueRef",
            ident"uint32")
        )
      else:
diff --git a/tests/gpu/hello_world_nvidia.nim b/tests/gpu/hello_world_nvidia.nim
index f2a069cfc..7e26caf40 100644
--- a/tests/gpu/hello_world_nvidia.nim
+++ b/tests/gpu/hello_world_nvidia.nim
@@ -8,7 +8,8 @@
 
 import
   constantine/platforms/llvm/llvm,
-  constantine/math_compiler/codegen_nvidia
+  constantine/platforms/abis/nvidia_abi {.all.},
+  constantine/platforms/abis/c_abi
 
 # ############################################################
 #
@@ -79,7 +80,17 @@ proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
 #
 # ############################################################
 
-template check*(status: NvvmResult) =
+template check(status: CUresult) =
+  ## Check the status code of a CUDA operation
+  ## Exit program with error if failure
+
+  let code = status # ensure that the input expression is evaluated once only
+  if code != CUDA_SUCCESS:
+    writeStackTrace()
+    
stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + +template check(status: NvvmResult) = let code = status # Assign so execution is done once only. if code != NVVM_SUCCESS: stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code @@ -148,6 +159,22 @@ proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): echo "Nvidia JIT compiler Hello World" +proc tagCudaKernel(module: ModuleRef, fnTy: TypeRef, fnImpl: ValueRef) = + ## Tag a function as a Cuda Kernel, i.e. callable from host + + doAssert fnTy.getReturnType().isVoid(), block: + "Kernels must not return values but function returns " & $fnTy.getReturnType().getTypeKind() + + let ctx = module.getContext() + module.addNamedMetadataOperand( + "nvvm.annotations", + ctx.asValueRef(ctx.metadataNode([ + fnImpl.asMetadataRef(), + ctx.metadataNode("kernel"), + constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef() + ])) + ) + proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) = # ###################################### @@ -175,7 +202,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe block: let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let addKernel = module.addFunction(addKernelName, addType) - let blck = ctx.appendBasicBlock(addKernel, "addBody") + let blck = ctx.appendBasicBlock(addKernel) builder.positionAtEnd(blck) let r = addKernel.getParam(0) let a = addKernel.getParam(1) @@ -184,12 +211,12 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(sum, r) builder.retVoid() - module.wrapInCallableCudaKernel((addType, addKernel)) + module.tagCudaKernel(addType, addKernel) block: let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let mulKernel = module.addFunction(mulKernelName, mulType) - let blck = ctx.appendBasicBlock(mulKernel, "mulBody") + let blck = ctx.appendBasicBlock(mulKernel) builder.positionAtEnd(blck) let r = mulKernel.getParam(0) let a = mulKernel.getParam(1) @@ -198,7 +225,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(prod, r) builder.retVoid() - module.wrapInCallableCudaKernel((mulType, mulKernel)) + module.tagCudaKernel(mulType, mulKernel) module.verify(AbortProcessAction) @@ -228,7 +255,33 @@ type PTXviaLlvmNvptx proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction = - check cuModuleGetFunction(result, cuMod, fnName & "_public") + check cuModuleGetFunction(result, cuMod, fnName[0].unsafeAddr) + +proc cudaDeviceInit(deviceID = 0'i32): CUdevice = + + check cuInit(deviceID.uint32) + + var devCount: int32 + check cuDeviceGetCount(devCount) + if devCount == 0: + echo "cudaDeviceInit error: no devices supporting CUDA" + quit 1 + + var cuDevice: CUdevice + check cuDeviceGet(cuDevice, deviceID) + var name = newString(128) + check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice) + echo "Using CUDA Device [", deviceID, "]: ", cstring(name) + + var major, minor: int32 + check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice) + check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice) + echo "Compute Capability: SM ", major, ".", minor + if major < 6: + echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later" + quit 1 + + return cuDevice proc 
main(backend: CodegenBackend) = @@ -271,7 +324,7 @@ proc main(backend: CodegenBackend) = var cuCtx: CUcontext var cuMod: CUmodule check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) + check cuModuleLoadData(cuMod, ptx[0].unsafeAddr) let addKernel = cuMod.getCudaKernel(addKernelName) let mulKernel = cuMod.getCudaKernel(mulKernelName) @@ -323,9 +376,10 @@ proc main(backend: CodegenBackend) = check cuCtxDestroy(cuCtx) cuCtx = CUcontext(nil) -echo "\n\nCompilation via Nvidia NVVM\n###########################\n" -main(PTXviaNvidiaNvvm) -echo "\n\nEnd: Compilation via Nvidia NVVM\n################################" +# echo "\n\nCompilation via Nvidia NVVM\n###########################\n" +# main(PTXviaNvidiaNvvm) +# echo "\n\nEnd: Compilation via Nvidia NVVM\n################################" +echo "[Skip] Compilation via Nvidia NVVM, incompatibilities between LLVM IR and NVVM IR" echo "\n\nCompilation via LLVM NVPTX\n##########################\n" main(PTXviaLlvmNvptx) diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim index af4de0075..ec5e8a2ed 100644 --- a/tests/gpu/t_nvidia_fp.nim +++ b/tests/gpu/t_nvidia_fp.nim @@ -14,9 +14,9 @@ import constantine/platforms/llvm/llvm, constantine/platforms/static_for, constantine/named/algebras, - constantine/math/io/io_bigints, constantine/math/arithmetic, - constantine/math_compiler/[ir, impl_fields_nvidia, codegen_nvidia], + constantine/math/io/[io_bigints, io_fields], + constantine/math_compiler/[ir, pub_fields, codegen_nvidia], # Test utilities helpers/prng_unsafe @@ -28,33 +28,6 @@ echo "test_nvidia_fp xoshiro512** seed: ", seed const Iters = 10 -proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, wordSize: WordSize): T = - CurveMetadata.init( - asy.ctx, - $curve & "_", wordSize, - fpBits = uint32 Fp[curve].bits(), - fpMod = Fp[curve].getModulus().toHex(), - frBits = uint32 Fr[curve].bits(), - frMod = Fr[curve].getModulus().toHex()) - -proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpAdd = asy.field_add_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpAdd) - let frAdd = asy.field_add_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frAdd) - -proc genFieldSubPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpSub = asy.field_sub_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpSub) - let frSub = asy.field_sub_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frSub) - -proc genFieldMulPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpMul = asy.field_mul_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpMul) - let frMul = asy.field_mul_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frMul) - # Init LLVM # ------------------------- initializeFullNVPTXTarget() @@ -66,173 +39,55 @@ var sm: tuple[major, minor: int32] check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice) check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice) -proc t_field_add(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldAddPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, w64) - asy.genFieldAddPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) - - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - 
check cuCtx.cuCtxDestroy() - - let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd) - let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd) - let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd) - let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd) - - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) - - var rCPU, rGPU_32, rGPU_64: Fp[curve] - - rCPU.sum(a, b) - fpAdd32.exec(rGPU_32, a, b) - fpAdd64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) - - var rCPU, rGPU_32, rGPU_64: Fr[curve] - - rCPU.sum(a, b) - frAdd32.exec(rGPU_32, a, b) - frAdd64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - -proc t_field_sub(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldSubPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, w64) - asy.genFieldSubPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) - - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - check cuCtx.cuCtxDestroy() - - let fpSub32 = cuMod.getCudaKernel(cm32, opFpSub) - let fpSub64 = cuMod.getCudaKernel(cm64, opFpSub) - let frSub32 = cuMod.getCudaKernel(cm32, opFrSub) - let frSub64 = cuMod.getCudaKernel(cm64, opFrSub) - - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) - - var rCPU, rGPU_32, rGPU_64: Fp[curve] - - rCPU.diff(a, b) - fpSub32.exec(rGPU_32, a, b) - fpSub64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) - - var rCPU, rGPU_32, rGPU_64: Fr[curve] - - rCPU.diff(a, b) - frSub32.exec(rGPU_32, a, b) - frSub64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - -proc t_field_mul(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldMulPTX(cm32) - - # 64-bit integer fused-multiply-add with carry is buggy: - # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py - # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 - - # let cm64 = CurveMetadata.init(asy, curve, w64) - # asy.genFieldMulPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) +template gen_binop_test( + testName: untyped, + kernGenerator: untyped, + cpuFn: untyped) = - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - check cuCtx.cuCtxDestroy() - let fpMul32 = cuMod.getCudaKernel(cm32, opFpMul) - let frMul32 = cuMod.getCudaKernel(cm32, opFrMul) - # let fpMul64 = cuMod.getCudaKernel(cm64, opFpMul) - # let frMul64 = cuMod.getCudaKernel(cm64, opFrMul) + proc testName[Name: static Algebra](field: type FF[Name], wordSize: int) = + # Codegen + # ------------------------- + let name = if field is Fp: $Name & "_fp" + else: $Name & 
"_fr" + let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & name & $wordSize)) + let fd = asy.ctx.configureField( + name, field.bits(), + field.getModulus().toHex(), + v = 1, w = wordSize + ) - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) + asy.definePrimitives(fd) + let kernName = asy.kernGenerator(fd) + let ptx = asy.codegenNvidiaPTX(sm) - var rCPU, rGPU_32: Fp[curve] # rGPU_64 + # GPU exec + # ------------------------- + var cuCtx: CUcontext + var cuMod: CUmodule + check cuCtxCreate(cuCtx, 0, cudaDevice) + check cuModuleLoadData(cuMod, ptx) + defer: + check cuMod.cuModuleUnload() + check cuCtx.cuCtxDestroy() - rCPU.prod(a, b) - fpMul32.exec(rGPU_32, a, b) - # fpMul64.exec(rGPU_64, a, b) + let kernel = cuMod.getCudaKernel(kernName) - doAssert bool(rCPU == rGPU_32) - # doAssert bool(rCPU == rGPU_64) + for i in 0 ..< Iters: + let a = rng.random_long01Seq(field) + let b = rng.random_long01Seq(field) - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) + var rCPU, rGPU: field - var rCPU, rGPU_32: Fr[curve] # rGPU_64 + rCPU.cpuFn(a, b) + kernel.exec(rGPU, a, b) - rCPU.prod(a, b) - frMul32.exec(rGPU_32, a, b) - # frMul64.exec(rGPU_64, a, b) + doAssert bool(rCPU == rGPU) - doAssert bool(rCPU == rGPU_32) - # doAssert bool(rCPU == rGPU_64) +gen_binop_test(t_field_add, genFpAdd, sum) +gen_binop_test(t_field_sub, genFpSub, diff) +gen_binop_test(t_field_mul, genFpMul, prod) proc main() = const curves = [ @@ -253,11 +108,31 @@ proc main() = suite "[Nvidia GPU] Field Arithmetic": staticFor i, 0, curves.len: const curve = curves[i] - test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve: - t_field_add(curve) - test "Nvidia GPU field substraction (𝔽p, 𝔽r) for " & $curve: - t_field_sub(curve) - test "Nvidia GPU field multiplication (𝔽p, 𝔽r) for " & $curve: - t_field_mul(curve) + for wordSize in [32, 64]: + test "Nvidia GPU field addition 𝔽p " & $wordSize & "-bit for " & $curve: + t_field_add(Fp[curve], wordSize) + test "Nvidia GPU field substraction 𝔽p " & $wordSize & "-bit for " & $curve: + t_field_sub(Fp[curve], wordSize) + test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve: + if wordSize == 64: + skip() + # 64-bit integer fused-multiply-add with carry is buggy: + # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py + # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 + else: + t_field_mul(Fp[curve], wordSize) + + test "Nvidia GPU field addition 𝔽r " & $wordSize & "-bit for " & $curve: + t_field_add(Fr[curve], wordSize) + test "Nvidia GPU field substraction 𝔽r " & $wordSize & "-bit for " & $curve: + t_field_sub(Fr[curve], wordSize) + test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve: + if wordSize == 64: + skip() + # 64-bit integer fused-multiply-add with carry is buggy: + # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py + # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 + else: + t_field_mul(Fr[curve], wordSize) main()