From 65147ed815d96fa94a05d307c1d9980877b7d0e8 Mon Sep 17 00:00:00 2001 From: Mamy Ratsimbazafy Date: Tue, 27 Aug 2024 16:53:23 +0200 Subject: [PATCH] Nvidia remastered (#464) * nvidia: update hello world following changes in #456 * update Nvidia backend to use the new LLVM infra * update Nvidia multiplication --- constantine/math_compiler/codegen_amdgpu.nim | 39 -- constantine/math_compiler/codegen_nvidia.nim | 35 +- .../math_compiler/impl_fields_dispatch.nim | 34 ++ .../math_compiler/impl_fields_globals.nim | 3 +- .../math_compiler/impl_fields_nvidia.nim | 444 +++++++++--------- constantine/math_compiler/impl_fields_sat.nim | 2 +- constantine/math_compiler/ir.nim | 22 +- constantine/math_compiler/pub_fields.nim | 38 +- constantine/platforms/llvm/asm_nvidia.nim | 11 +- tests/gpu/hello_world_nvidia.nim | 76 ++- tests/gpu/t_nvidia_fp.nim | 257 +++------- 11 files changed, 437 insertions(+), 524 deletions(-) create mode 100644 constantine/math_compiler/impl_fields_dispatch.nim diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim index c9e84b9ee..cd87b3640 100644 --- a/constantine/math_compiler/codegen_amdgpu.nim +++ b/constantine/math_compiler/codegen_amdgpu.nim @@ -67,45 +67,6 @@ proc hipDeviceInit*(deviceID = 0'i32): HipDevice = return hipDevice -# ############################################################ -# -# LLVM IR for AMD GPUs -# -# ############################################################ -# -# Note: -# __device__ functions for field and elliptic curve arithmetic -# might be compiled by default with scalar codegen -# -# We will need to either: -# - Derive explicitly a vectorized version of the warp/wave size (32) -# - Derive implicitly a vectorized version, probably with __forceinline__ - -proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) = - ## Create a public wrapper of a Hip device function - ## - ## A function named `addmod` can be found by appending _public - ## check hipModuleGetFunction(fnPointer, cuModule, "addmod_public") - - let pubName = fn.fnImpl.getName() & "_public" - let pubFn = module.addFunction(cstring(pubName), fn.fnTy) - - let ctx = module.getContext() - let builder = ctx.createBuilder() - defer: builder.dispose() - - let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") - builder.positionAtEnd(blck) - - var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) - for i, arg in mpairs(args): - arg = pubFn.getParam(i.uint32) - discard builder.call2(fn.fnTy, fn.fnImpl, args) - - # A public kernel must return void - builder.retVoid() - pubFn.setCallingConvention(AMDGPU_KERNEL) - # ############################################################ # # Code generation diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index fdc4c393b..ccacca53b 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -109,37 +109,6 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice = return cuDevice -# ############################################################ -# -# LLVM IR for Nvidia GPUs -# -# ############################################################ - -proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) = - ## Create a public wrapper of a cuda device function - ## - ## A function named `addmod` can be found by appending _public - ## check cuModuleGetFunction(fnPointer, cuModule, "addmod_public") - - let pubName = fn.fnImpl.getName() & "_public" - let pubFn = module.addFunction(cstring(pubName), 
fn.fnTy) - - let ctx = module.getContext() - let builder = ctx.createBuilder() - defer: builder.dispose() - - let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody") - builder.positionAtEnd(blck) - - var args = newSeq[ValueRef](fn.fnTy.countParamTypes()) - for i, arg in mpairs(args): - arg = pubFn.getParam(i.uint32) - discard builder.call2(fn.fnTy, fn.fnImpl, args) - - # A public kernel must return void - builder.retVoid() - module.tagCudaKernel((fn.fnTy, pubFn)) - # ############################################################ # # Code generation @@ -194,9 +163,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str # # ############################################################ -proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction = - # Public kernels are appended _public - let fnName = cm.genSymbol(opcode) & "_public" +proc getCudaKernel*(cuMod: CUmodule, fnName: string): CUfunction = check cuModuleGetFunction(result, cuMod, fnName) proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) = diff --git a/constantine/math_compiler/impl_fields_dispatch.nim b/constantine/math_compiler/impl_fields_dispatch.nim new file mode 100644 index 000000000..020a10dc8 --- /dev/null +++ b/constantine/math_compiler/impl_fields_dispatch.nim @@ -0,0 +1,34 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. 
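+
+# This module dispatches modular arithmetic to the saturated-limbs
+# implementation (x86-64, AMD GPU) or to the Nvidia carry-chain
+# implementation, depending on the assembler's backend.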
+ +import + constantine/platforms/llvm/llvm, + ./ir, + ./impl_fields_sat {.all.}, + ./impl_fields_nvidia {.all.} + +proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of {bkX86_64_Linux, bkAmdGpu}: + asy.modadd_sat(fd, r, a, b, M) + of bkNvidiaPTX: + asy.modadd_nvidia(fd, r, a, b, M) + +proc modsub*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of bkNvidiaPTX: + asy.modsub_nvidia(fd, r, a, b, M) + else: + doAssert false, "Unimplemented" + +proc mtymul*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) = + case asy.backend + of bkNvidiaPTX: + asy.mtymul_nvidia(fd, r, a, b, M) + else: + doAssert false, "Unimplemented" \ No newline at end of file diff --git a/constantine/math_compiler/impl_fields_globals.nim b/constantine/math_compiler/impl_fields_globals.nim index faac591ca..1957aa930 100644 --- a/constantine/math_compiler/impl_fields_globals.nim +++ b/constantine/math_compiler/impl_fields_globals.nim @@ -157,8 +157,7 @@ proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef = fd.wordTy ) - - return m0ninv + return asy.load2(fd.wordTy, m0ninv, "m0ninv") when isMainModule: let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux) diff --git a/constantine/math_compiler/impl_fields_nvidia.nim b/constantine/math_compiler/impl_fields_nvidia.nim index 5843d02d8..b27a843c0 100644 --- a/constantine/math_compiler/impl_fields_nvidia.nim +++ b/constantine/math_compiler/impl_fields_nvidia.nim @@ -8,7 +8,8 @@ import constantine/platforms/llvm/[llvm, asm_nvidia], - ./ir + ./ir, + ./impl_fields_globals # ############################################################ # @@ -46,7 +47,9 @@ import # # We cannot use i256 on Nvidia target: https://github.com/llvm/llvm-project/blob/llvmorg-18.1.8/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp#L244-L276 -proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +const SectionName = "ctt.fields" + +proc finalSubMayOverflow(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, M: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -55,31 +58,27 @@ proc finalSubMayOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r ## ## To be used when the final substraction can ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) - - let bld = asy.builder - let fieldTy = cm.getFieldType(field) - let scratch = bld.makeArray(fieldTy) - let M = cm.getModulus(field) - let N = M.len + let N = fd.numWords + let scratch = asy.makeArray(fd.fieldTy) # Contains 0x0001 (if overflowed limbs) or 0x0000 - let overflowedLimbs = bld.add_ci(0'u32, 0'u32) + let overflowedLimbs = asy.br.add_ci(0'u32, 0'u32) # Now substract the modulus, and test a < M with the last borrow - scratch[0] = bld.sub_bo(a[0], M[0]) + scratch[0] = asy.br.sub_bo(a[0], M[0]) for i in 1 ..< N: - scratch[i] = bld.sub_bio(a[i], M[i]) + scratch[i] = asy.br.sub_bio(a[i], M[i]) # 1. if `overflowedLimbs`, underflowedModulus >= 0 # 2. if a >= M, underflowedModulus >= 0 # if underflowedModulus >= 0: a-M else: a # TODO: predicated mov instead? 
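+  # sub_bi consumes the borrow left by the modulus-subtraction chain above:
+  # underflowedModulus = overflowedLimbs - 0 - borrow(a - M)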
- let underflowedModulus = bld.sub_bi(overflowedLimbs, 0'u32) + let underflowedModulus = asy.br.sub_bi(overflowedLimbs, 0'u32) for i in 0 ..< N: - r[i] = bld.slct(scratch[i], a[i], underflowedModulus) + r[i] = asy.br.slct(scratch[i], a[i], underflowedModulus) -proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) = +proc finalSubNoOverflow(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, M: Array) = ## If a >= Modulus: r <- a-M ## else: r <- a ## @@ -88,275 +87,262 @@ proc finalSubNoOverflow(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, ## ## To be used when the modulus does not use the full bitwidth of the storing words ## (say using 255 bits for the modulus out of 256 available in words) - - let bld = asy.builder - let fieldTy = cm.getFieldType(field) - let scratch = bld.makeArray(fieldTy) - let M = cm.getModulus(field) - let N = M.len + let N = fd.numWords + let scratch = asy.makeArray(fd.fieldTy) # Now substract the modulus, and test a < M with the last borrow - scratch[0] = bld.sub_bo(a[0], M[0]) + scratch[0] = asy.br.sub_bo(a[0], M[0]) for i in 1 ..< N: - scratch[i] = bld.sub_bio(a[i], M[i]) + scratch[i] = asy.br.sub_bio(a[i], M[i]) # If it underflows here a was smaller than the modulus, which is what we want - let underflowedModulus = bld.sub_bi(0'u32, 0'u32) + let underflowedModulus = asy.br.sub_bi(0'u32, 0'u32) for i in 0 ..< N: - r[i] = bld.slct(scratch[i], a[i], underflowedModulus) + r[i] = asy.br.slct(scratch[i], a[i], underflowedModulus) -proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = +proc modadd_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} = ## Generate an optimized modular addition kernel ## with parameters `a, b, modulus: Limbs -> Limbs` + let red = if fd.spareBits >= 1: "noo" + else: "mayo" + let name = "_modadd_" & red & ".u" & $fd.w & "x" & $fd.numWords - let procName = cm.genSymbol(block: - case field - of fp: opFpAdd - of fr: opFrAdd) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): - let addModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let addModKernel = asy.module.addFunction(cstring procName, addModTy) - let blck = asy.ctx.appendBasicBlock(addModKernel, "addModBody") - asy.builder.positionAtEnd(blck) + tagParameter(1, "sret") - let bld = asy.builder + let (rr, aa, bb, MM) = llvmParams - let r = bld.asArray(addModKernel.getParam(0), fieldTy) - let a = bld.asArray(addModKernel.getParam(1), fieldTy) - let b = bld.asArray(addModKernel.getParam(2), fieldTy) + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords - t[0] = bld.add_co(a[0], b[0]) - for i in 1 ..< N: - t[i] = bld.add_cio(a[i], b[i]) + t[0] = asy.br.add_co(a[0], b[0]) + for i in 1 ..< N: + t[i] = asy.br.add_cio(a[i], b[i]) - if cm.getSpareBits(field) >= 1: - asy.finalSubNoOverflow(cm, field, t, t) - else: - asy.finalSubMayOverflow(cm, field, t, t) + if fd.spareBits >= 1: + asy.finalSubNoOverflow(fd, t, t, M) + else: + asy.finalSubMayOverflow(fd, t, t, M) - bld.store(r, t) - bld.retVoid() + asy.store(r, t) + asy.br.retVoid() - return (addModTy, addModKernel) + asy.callFn(name, [r, a, b, M]) -proc 
field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef = +proc modsub_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} = ## Generate an optimized modular substraction kernel ## with parameters `a, b, modulus: Limbs -> Limbs` + let name = "_modsub.u" & $fd.w & "x" & $fd.numWords - let procName = cm.genSymbol(block: - case field - of fp: opFpSub - of fr: opFrSub) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): - let subModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let subModKernel = asy.module.addFunction(cstring procName, subModTy) - let blck = asy.ctx.appendBasicBlock(subModKernel, "subModBody") - asy.builder.positionAtEnd(blck) + tagParameter(1, "sret") - let bld = asy.builder + let (rr, aa, bb, MM) = llvmParams - let r = bld.asArray(subModKernel.getParam(0), fieldTy) - let a = bld.asArray(subModKernel.getParam(1), fieldTy) - let b = bld.asArray(subModKernel.getParam(2), fieldTy) + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) - let zero = case cm.wordSize - of w32: constInt(asy.i32_t, 0) - of w64: constInt(asy.i64_t, 0) + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords - t[0] = bld.sub_bo(a[0], b[0]) - for i in 1 ..< N: - t[i] = bld.sub_bio(a[i], b[i]) + t[0] = asy.br.sub_bo(a[0], b[0]) + for i in 1 ..< N: + t[i] = asy.br.sub_bio(a[i], b[i]) - let underflowMask = bld.sub_bi(zero, zero) + let underflowMask = asy.br.sub_bi(fd.zero, fd.zero) - # If underflow - # TODO: predicated mov instead? - let M = (seq[ValueRef])(cm.getModulus(field)) - let maskedM = bld.makeArray(fieldTy) - for i in 0 ..< N: - maskedM[i] = bld.`and`(M[i], underflowMask) + # If underflow + # TODO: predicated mov instead? 
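+    # underflowMask = 0 - 0 - borrow(a - b): all-ones on borrow, all-zeroes
+    # otherwise, so maskedM below is either M or 0 and the correction
+    # addition stays branchless.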
+ let maskedM = asy.makeArray(fd.fieldTy) + for i in 0 ..< N: + maskedM[i] = asy.br.`and`(M[i], underflowMask) - block: - t[0] = bld.add_co(t[0], maskedM[0]) - for i in 1 ..< N-1: - t[i] = bld.add_cio(t[i], maskedM[i]) - if N > 1: - t[N-1] = bld.add_ci(t[N-1], maskedM[N-1]) + block: + t[0] = asy.br.add_co(t[0], maskedM[0]) + for i in 1 ..< N-1: + t[i] = asy.br.add_cio(t[i], maskedM[i]) + if N > 1: + t[N-1] = asy.br.add_ci(t[N-1], maskedM[N-1]) - bld.store(r, t) - bld.retVoid() + asy.store(r, t) + asy.br.retVoid() - return (subModTy, subModKernel) + asy.callFn(name, [r, a, b, M]) -proc field_mul_CIOS_sparebit_gen(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub: bool): FnDef = +proc mtymul_CIOS_sparebit(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce: bool) = ## Generate an optimized modular multiplication kernel ## with parameters `a, b, modulus: Limbs -> Limbs` - let procName = cm.genSymbol(block: - if skipFinalSub: - case field - of fp: opFpMulSkipFinalSub - of fr: opFrMulSkipFinalSub + let name = + if not finalReduce and fd.spareBits >= 2: + "_mty_mulur.u" & $fd.w & "x" & $fd.numWords & "b2" else: - case field - of fp: opFpMul - of fr: opFrMul) - let fieldTy = cm.getFieldType(field) - let pFieldTy = pointer_t(fieldTy) - - let mulModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy]) - let mulModKernel = asy.module.addFunction(cstring procName, mulModTy) - let blck = asy.ctx.appendBasicBlock(mulModKernel, "mulModBody") - asy.builder.positionAtEnd(blck) - - let bld = asy.builder - - let r = bld.asArray(mulModKernel.getParam(0), fieldTy) - let a = bld.asArray(mulModKernel.getParam(1), fieldTy) - let b = bld.asArray(mulModKernel.getParam(2), fieldTy) - - # Algorithm - # ----------------------------------------- - # - # On x86, with a single carry chain and a spare bit: - # - # for i=0 to N-1 - # (A, t[0]) <- a[0] * b[i] + t[0] - # m <- (t[0] * m0ninv) mod 2ʷ - # (C, _) <- m * M[0] + t[0] - # for j=1 to N-1 - # (A, t[j]) <- a[j] * b[i] + A + t[j] - # (C, t[j-1]) <- m * M[j] + C + t[j] - # - # t[N-1] = C + A - # - # with MULX, ADCX, ADOX dual carry chains - # - # for i=0 to N-1 - # for j=0 to N-1 - # (A,t[j]) := t[j] + a[j]*b[i] + A - # m := t[0]*m0ninv mod W - # C,_ := t[0] + m*M[0] - # for j=1 to N-1 - # (C,t[j-1]) := t[j] + m*M[j] + C - # t[N-1] = C + A - # - # In our case, we only have a single carry flag - # but we have a lot of registers - # and a multiply-accumulate instruction - # - # Hence we can use the dual carry chain approach - # one chain after the other instead of interleaved like on x86. 
- - let t = bld.makeArray(fieldTy) - let N = cm.getNumWords(field) - let m0ninv = ValueRef cm.getMontgomeryNegInverse0(field) - let M = (seq[ValueRef])(cm.getModulus(field)) - let zero = case cm.wordSize - of w32: constInt(asy.i32_t, 0) - of w64: constInt(asy.i64_t, 0) + doAssert fd.spareBits >= 1 + "_mty_mul.u" & $fd.w & "x" & $fd.numWords & "b1" - for i in 0 ..< N: - # Multiplication - # ------------------------------- - # for j=0 to N-1 - # (A,t[j]) := t[j] + a[j]*b[i] + A + asy.llvmInternalFnDef( + name, SectionName, + asy.void_t, toTypes([r, a, b, M]), + {kHot}): + + tagParameter(1, "sret") + + let (rr, aa, bb, MM) = llvmParams + + # Pointers are opaque in LLVM now + let r = asy.asArray(rr, fd.fieldTy) + let a = asy.asArray(aa, fd.fieldTy) + let b = asy.asArray(bb, fd.fieldTy) + let M = asy.asArray(MM, fd.fieldTy) + + let t = asy.makeArray(fd.fieldTy) + let N = fd.numWords + let m0ninv = asy.getM0ninv(fd) + + # Algorithm + # ----------------------------------------- # - # for 4 limbs, implicit column-wise carries + # On x86, with a single carry chain and a spare bit: # - # t[0] = t[0] + (a[0]*b[i]).lo - # t[1] = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi - # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi - # t[3] = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi - # overflow = (a[3]*b[i]).hi + # for i=0 to N-1 + # (A, t[0]) <- a[0] * b[i] + t[0] + # m <- (t[0] * m0ninv) mod 2ʷ + # (C, _) <- m * M[0] + t[0] + # for j=1 to N-1 + # (A, t[j]) <- a[j] * b[i] + A + t[j] + # (C, t[j-1]) <- m * M[j] + C + t[j] # - # or + # t[N-1] = C + A # - # t[0] = t[0] + (a[0]*b[i]).lo - # t[1] = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo - # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi - # t[3] = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo - # overflow = carry + (a[3]*b[i]).hi + # with MULX, ADCX, ADOX dual carry chains # - # Depending if we chain lo/hi or even/odd - # The even/odd carry chain is more likely to be optimized via μops-fusion - # as it's common to compute the full product. That said: - # - it's annoying if the number of limbs is odd with edge conditions. 
- # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic - let bi = b[i] - var A = ValueRef zero - - if i == 0: - for j in 0 ..< N: - t[j] = bld.mul(a[j], bi) - else: - t[0] = bld.mulloadd_co(a[0], bi, t[0]) - for j in 1 ..< N: - t[j] = bld.mulloadd_cio(a[j], bi, t[j]) - if N > 1: - A = bld.add_ci(zero, zero) - if N > 1: - t[1] = bld.mulhiadd_co(a[0], bi, t[1]) - for j in 2 ..< N: - t[j] = bld.mulhiadd_cio(a[j-1], bi, t[j]) - A = bld.mulhiadd_ci(a[N-1], bi, A) - - # Reduction - # ------------------------------- + # for i=0 to N-1 + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A # m := t[0]*m0ninv mod W - # # C,_ := t[0] + m*M[0] # for j=1 to N-1 # (C,t[j-1]) := t[j] + m*M[j] + C # t[N-1] = C + A # - # for 4 limbs, implicit column-wise carries - # _ = t[0] + (m*M[0]).lo - # t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi - # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi - # t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi - # t[3] = A + carry + (m*M[3]).hi - # - # or + # In our case, we only have a single carry flag + # but we have a lot of registers + # and a multiply-accumulate instruction # - # _ = t[0] + (m*M[0]).lo - # t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo - # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi - # t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo - # t[3] = A + carry + (m*M[3]).hi - - let m = bld.mul(t[0], m0ninv) - let _ = bld.mulloadd_co(m, M[0], t[0]) - for j in 1 ..< N: - t[j-1] = bld.mulloadd_cio(m, M[j], t[j]) - t[N-1] = bld.add_ci(A, 0) - if N > 1: - t[0] = bld.mulhiadd_co(m, M[0], t[0]) - for j in 1 ..< N-1: - t[j] = bld.mulhiadd_cio(m, M[j], t[j]) - t[N-1] = bld.mulhiadd_ci(m, M[N-1], t[N-1]) - else: - t[0] = bld.mulhiadd(m, M[0], t[0]) + # Hence we can use the dual carry chain approach + # one chain after the other instead of interleaved like on x86. + + for i in 0 ..< N: + # Multiplication + # ------------------------------- + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A + # + # for 4 limbs, implicit column-wise carries + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi + # overflow = (a[3]*b[i]).hi + # + # or + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo + # overflow = carry + (a[3]*b[i]).hi + # + # Depending if we chain lo/hi or even/odd + # The even/odd carry chain is more likely to be optimized via μops-fusion + # as it's common to compute the full product. That said: + # - it's annoying if the number of limbs is odd with edge conditions. 
+      # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic
+      let bi = b[i]
+      var A = fd.zero
+
+      if i == 0:
+        for j in 0 ..< N:
+          t[j] = asy.br.mul(a[j], bi)
+      else:
+        t[0] = asy.br.mulloadd_co(a[0], bi, t[0])
+        for j in 1 ..< N:
+          t[j] = asy.br.mulloadd_cio(a[j], bi, t[j])
+        if N > 1:
+          A = asy.br.add_ci(fd.zero, fd.zero)
+      if N > 1:
+        t[1] = asy.br.mulhiadd_co(a[0], bi, t[1])
+        for j in 2 ..< N:
+          t[j] = asy.br.mulhiadd_cio(a[j-1], bi, t[j])
+        A = asy.br.mulhiadd_ci(a[N-1], bi, A)
+
+      # Reduction
+      # -------------------------------
+      #   m := t[0]*m0ninv mod W
+      #
+      #   C,_ := t[0] + m*M[0]
+      #   for j=1 to N-1
+      #     (C,t[j-1]) := t[j] + m*M[j] + C
+      #   t[N-1] = C + A
+      #
+      # for 4 limbs, implicit column-wise carries
+      #     _  = t[0] + (m*M[0]).lo
+      #   t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi
+      #   t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+      #   t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi
+      #   t[3] = A + carry + (m*M[3]).hi
+      #
+      # or
+      #
+      #     _  = t[0] + (m*M[0]).lo
+      #   t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo
+      #   t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+      #   t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
+      #   t[3] = A + carry + (m*M[3]).hi
+
+      let m = asy.br.mul(t[0], m0ninv)
+      let _ = asy.br.mulloadd_co(m, M[0], t[0])
+      for j in 1 ..< N:
+        t[j-1] = asy.br.mulloadd_cio(m, M[j], t[j])
+      t[N-1] = asy.br.add_ci(A, 0)
+      if N > 1:
+        t[0] = asy.br.mulhiadd_co(m, M[0], t[0])
+        for j in 1 ..< N-1:
+          t[j] = asy.br.mulhiadd_cio(m, M[j], t[j])
+        t[N-1] = asy.br.mulhiadd_ci(m, M[N-1], t[N-1])
+      else:
+        t[0] = asy.br.mulhiadd(m, M[0], t[0])
 
-  if not skipFinalSub:
-    asy.finalSubNoOverflow(cm, field, t, t)
+    if finalReduce:
+      asy.finalSubNoOverflow(fd, t, t, M)
 
-  bld.store(r, t)
-  bld.retVoid()
+    asy.store(r, t)
+    asy.br.retVoid()
 
-  return (mulModTy, mulModKernel)
+  asy.callFn(name, [r, a, b, M])
 
-proc field_mul_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, skipFinalSub = false): FnDef =
+proc mtymul_nvidia(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef, finalReduce = true) {.used.} =
   ## Generate an optimized modular multiplication kernel
   ## with parameters `a, b, modulus: Limbs -> Limbs`
-  return asy.field_mul_CIOS_sparebit_gen(cm, field, skipFinalSub)
+
+  # TODO: spareBits == 0
+  asy.mtymul_CIOS_sparebit(fd, r, a, b, M, finalReduce)
diff --git a/constantine/math_compiler/impl_fields_sat.nim b/constantine/math_compiler/impl_fields_sat.nim
index cd52f96f2..ab8fa4bfd 100644
--- a/constantine/math_compiler/impl_fields_sat.nim
+++ b/constantine/math_compiler/impl_fields_sat.nim
@@ -121,7 +121,7 @@ proc finalSubNoOverflow*(asy: Assembler_LLVM, fd: FieldDescriptor, rr, a, M: Val
   let t = asy.br.select(borrow, a, a_minus_M)
   asy.store(rr, t)
 
-proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+proc modadd_sat(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) {.used.} =
   ## Generate an optimized modular addition kernel
   ## with parameters `a, b, modulus: Limbs -> Limbs`
 
diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim
index ab80e036a..db46a273f 100644
--- a/constantine/math_compiler/ir.nim
+++ b/constantine/math_compiler/ir.nim
@@ -259,11 +259,11 @@ proc makeArray*(asy: Assembler_LLVM, elemTy: TypeRef, len: uint32): Array =
 
 proc `[]`*(a: Array, index: SomeInteger): ValueRef {.inline.}=
   # First dereference the array pointer with 0, then access the `index`
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = 
a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [constInt(a.int32_t, 0), constInt(a.int32_t, uint64 index)])
   a.builder.load2(a.elemTy, pelem)
 
 proc `[]=`*(a: Array, index: SomeInteger, val: ValueRef) {.inline.}=
-  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [ValueRef constInt(a.int32_t, 0), ValueRef constInt(a.int32_t, uint64 index)])
+  let pelem = a.builder.getElementPtr2_InBounds(a.arrayTy, a.buf, [constInt(a.int32_t, 0), constInt(a.int32_t, uint64 index)])
   a.builder.store(val, pelem)
 
 proc store*(asy: Assembler_LLVM, dst: Array, src: Array) {.inline.}=
@@ -345,9 +345,10 @@ proc defineGlobalConstant*(
 
 proc tagCudaKernel(asy: Assembler_LLVM, fn: ValueRef) =
   ## Tag a function as a Cuda Kernel, i.e. callable from host
-  let returnTy = fn.getTypeOf().getReturnType()
-  doAssert returnTy.isVoid(), block:
-    "Kernels must not return values but function returns " & $returnTy.getTypeKind()
+  # We cannot get the full function type from its impl so we cannot do this check.
+  # let returnTy = fn.getTypeOf().getReturnType()
+  # doAssert returnTy.isVoid(), block:
+  #   "Kernels must not return values but function returns " & $returnTy.getTypeKind()
 
   asy.module.addNamedMetadataOperand(
     "nvvm.annotations",
@@ -360,8 +361,11 @@ proc tagCudaKernel(asy: Assembler_LLVM, fn: ValueRef) =
 
 proc setPublic(asy: Assembler_LLVM, fn: ValueRef) =
   case asy.backend
-  of bkAmdGpu: fn.setFnCallConv(AMDGPU_KERNEL)
-  of bkNvidiaPtx: asy.tagCudaKernel(fn)
+  of bkAmdGpu:
+    fn.setFnCallConv(AMDGPU_KERNEL)
+  of bkNvidiaPtx:
+    # asy.tagCudaKernel(fn)
+    fn.setFnCallConv(PTX_Kernel)
   else: discard
 
 # ############################################################
@@ -474,7 +478,7 @@ template llvmFnDef[N: static int](
       savedLoc = blck
 
     let llvmParams {.inject.} = unpackParams(asy.br, paramsTys)
-    template tagParameter(idx: int, attr: string) {.inject.} =
+    template tagParameter(idx: int, attr: string) {.inject, used.} =
       let a = asy.ctx.createAttr(attr)
       fn.addAttribute(cint idx, a)
     body
@@ -484,7 +488,7 @@ template llvmFnDef[N: static int](
     fn.setLinkage(linkInternal)
   else:
     asy.setPublic(fn)
-  fn.setSection(sectionName)
+  fn.setSection(cstring sectionName)
   asy.addAttributes(fn, attrs)
 
   asy.br.positionAtEnd(savedLoc)
diff --git a/constantine/math_compiler/pub_fields.nim b/constantine/math_compiler/pub_fields.nim
index 51f92da3e..83e602644 100644
--- a/constantine/math_compiler/pub_fields.nim
+++ b/constantine/math_compiler/pub_fields.nim
@@ -10,13 +10,13 @@ import
   constantine/platforms/llvm/llvm,
   ./ir,
   ./impl_fields_globals,
-  ./impl_fields_sat
+  ./impl_fields_dispatch
 
 proc genFpAdd*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
   ## Generate a public field addition proc
   ## with signature
   ##   void name(FieldType r, FieldType a, FieldType b)
-  ## with r the result and a, b the operants
+  ## with r the result and a, b the operands
   ## and return the corresponding name to call it
 
   let name = fd.name & "_add"
@@ -28,3 +28,37 @@ proc genFpAdd*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
     asy.br.retVoid()
 
   return name
+
+proc genFpSub*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
+  ## Generate a public field subtraction proc
+  ## with signature
+  ##   void name(FieldType r, FieldType a, FieldType b)
+  ## with r the result and a, b the operands
+  ## and return the corresponding name to call it
+
+  let name = fd.name & "_sub"
+  asy.llvmPublicFnDef(name, "ctt." 
& fd.name, asy.void_t, [fd.fieldTy, fd.fieldTy, fd.fieldTy]):
+    let M = asy.getModulusPtr(fd)
+
+    let (r, a, b) = llvmParams
+    asy.modsub(fd, r, a, b, M)
+    asy.br.retVoid()
+
+  return name
+
+proc genFpMul*(asy: Assembler_LLVM, fd: FieldDescriptor): string =
+  ## Generate a public field multiplication proc
+  ## with signature
+  ##   void name(FieldType r, FieldType a, FieldType b)
+  ## with r the result and a, b the operands
+  ## and return the corresponding name to call it
+
+  let name = fd.name & "_mul"
+  asy.llvmPublicFnDef(name, "ctt." & fd.name, asy.void_t, [fd.fieldTy, fd.fieldTy, fd.fieldTy]):
+    let M = asy.getModulusPtr(fd)
+
+    let (r, a, b) = llvmParams
+    asy.mtymul(fd, r, a, b, M) # TODO: for now we only support Montgomery representation
+    asy.br.retVoid()
+
+  return name
\ No newline at end of file
diff --git a/constantine/platforms/llvm/asm_nvidia.nim b/constantine/platforms/llvm/asm_nvidia.nim
index c56c9063f..c3de09024 100644
--- a/constantine/platforms/llvm/asm_nvidia.nim
+++ b/constantine/platforms/llvm/asm_nvidia.nim
@@ -98,9 +98,9 @@ macro genInstr(body: untyped): untyped =
     instrBody.add quote do:
       let `ctx` {.used.} = builder.getContext()
       # lhs: ValueRef or uint32 or uint64
-      let `numBits` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf().getIntTypeWidth()
+      let `numBits` = when `lhs` is ValueRef: `lhs`.getTypeOf().getIntTypeWidth()
                       else: 8*sizeof(`lhs`)
-      let `regTy` = when `lhs` is ValueRef|ConstValueRef: `lhs`.getTypeOf()
+      let `regTy` = when `lhs` is ValueRef: `lhs`.getTypeOf()
                     elif `lhs` is uint32: `ctx`.int32_t()
                     elif `lhs` is uint64: `ctx`.int64_t()
                     else: {.error "Unsupported input type " & $typeof(`lhs`).}
@@ -225,8 +225,7 @@ macro genInstr(body: untyped): untyped =
      #   else: constInt(uint64(op))
      opArray.add nnkWhenStmt.newTree(
        nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"ValueRef"), op),
-        nnkElifBranch.newTree(nnkInfix.newTree(ident"is", op, bindSym"ConstValueRef"), newCall(ident"ValueRef", op)),
-        nnkElse.newTree(newCall(ident"ValueRef", newCall(ident"constInt", regTy, newCall(ident"uint64", op))))
+        nnkElse.newTree(newCall(ident"constInt", regTy, newCall(ident"uint64", op)))
      )
      # builder.call2(ty, inlineASM, [lhs, rhs], name)
      instrBody.add newCall(
@@ -249,14 +248,14 @@ macro genInstr(body: untyped): untyped =
        opDefs.add newIdentDefs(
          operands[i],
          nnkInfix.newTree(ident"or",
-            nnkInfix.newTree(ident"or", ident"AnyValueRef", ident"uint32"),
+            nnkInfix.newTree(ident"or", ident"ValueRef", ident"uint32"),
            ident"uint64")
        )
      elif constraint == "rn":
        opDefs.add newIdentDefs(
          operands[i],
          nnkInfix.newTree(ident"or",
-            ident"AnyValueRef",
+            ident"ValueRef",
            ident"uint32")
        )
      else:
diff --git a/tests/gpu/hello_world_nvidia.nim b/tests/gpu/hello_world_nvidia.nim
index f2a069cfc..7e26caf40 100644
--- a/tests/gpu/hello_world_nvidia.nim
+++ b/tests/gpu/hello_world_nvidia.nim
@@ -8,7 +8,8 @@
 
 import
   constantine/platforms/llvm/llvm,
-  constantine/math_compiler/codegen_nvidia
+  constantine/platforms/abis/nvidia_abi {.all.},
+  constantine/platforms/abis/c_abi
 
 # ############################################################
 #
@@ -79,7 +80,17 @@ proc nvvmGetProgramLog*(prog: NvvmProgram; buffer: ptr char): NvvmResult
 #
 # ############################################################
 
-template check*(status: NvvmResult) =
+template check(status: CUresult) =
+  ## Check the status code of a CUDA operation
+  ## Exit program with error if failure
+
+  let code = status # ensure that the input expression is evaluated once only
+  if code != CUDA_SUCCESS:
+    writeStackTrace()
+    
stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + quit 1 + +template check(status: NvvmResult) = let code = status # Assign so execution is done once only. if code != NVVM_SUCCESS: stderr.write astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code @@ -148,6 +159,22 @@ proc ptxCodegenViaLlvmNvptx(module: ModuleRef, sm: tuple[major, minor: int32]): echo "Nvidia JIT compiler Hello World" +proc tagCudaKernel(module: ModuleRef, fnTy: TypeRef, fnImpl: ValueRef) = + ## Tag a function as a Cuda Kernel, i.e. callable from host + + doAssert fnTy.getReturnType().isVoid(), block: + "Kernels must not return values but function returns " & $fnTy.getReturnType().getTypeKind() + + let ctx = module.getContext() + module.addNamedMetadataOperand( + "nvvm.annotations", + ctx.asValueRef(ctx.metadataNode([ + fnImpl.asMetadataRef(), + ctx.metadataNode("kernel"), + constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef() + ])) + ) + proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKernelName: string) = # ###################################### @@ -175,7 +202,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe block: let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let addKernel = module.addFunction(addKernelName, addType) - let blck = ctx.appendBasicBlock(addKernel, "addBody") + let blck = ctx.appendBasicBlock(addKernel) builder.positionAtEnd(blck) let r = addKernel.getParam(0) let a = addKernel.getParam(1) @@ -184,12 +211,12 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(sum, r) builder.retVoid() - module.wrapInCallableCudaKernel((addType, addKernel)) + module.tagCudaKernel(addType, addKernel) block: let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false)) let mulKernel = module.addFunction(mulKernelName, mulType) - let blck = ctx.appendBasicBlock(mulKernel, "mulBody") + let blck = ctx.appendBasicBlock(mulKernel) builder.positionAtEnd(blck) let r = mulKernel.getParam(0) let a = mulKernel.getParam(1) @@ -198,7 +225,7 @@ proc writeExampleAddMul(ctx: ContextRef, module: ModuleRef, addKernelName, mulKe builder.store(prod, r) builder.retVoid() - module.wrapInCallableCudaKernel((mulType, mulKernel)) + module.tagCudaKernel(mulType, mulKernel) module.verify(AbortProcessAction) @@ -228,7 +255,33 @@ type PTXviaLlvmNvptx proc getCudaKernel(cuMod: CUmodule, fnName: string): CUfunction = - check cuModuleGetFunction(result, cuMod, fnName & "_public") + check cuModuleGetFunction(result, cuMod, fnName[0].unsafeAddr) + +proc cudaDeviceInit(deviceID = 0'i32): CUdevice = + + check cuInit(deviceID.uint32) + + var devCount: int32 + check cuDeviceGetCount(devCount) + if devCount == 0: + echo "cudaDeviceInit error: no devices supporting CUDA" + quit 1 + + var cuDevice: CUdevice + check cuDeviceGet(cuDevice, deviceID) + var name = newString(128) + check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice) + echo "Using CUDA Device [", deviceID, "]: ", cstring(name) + + var major, minor: int32 + check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice) + check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice) + echo "Compute Capability: SM ", major, ".", minor + if major < 6: + echo "Error: Device ",deviceID," is not sm_60 (Pascal generation, GTX 1080) or later" + quit 1 + + return cuDevice proc 
main(backend: CodegenBackend) = @@ -271,7 +324,7 @@ proc main(backend: CodegenBackend) = var cuCtx: CUcontext var cuMod: CUmodule check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) + check cuModuleLoadData(cuMod, ptx[0].unsafeAddr) let addKernel = cuMod.getCudaKernel(addKernelName) let mulKernel = cuMod.getCudaKernel(mulKernelName) @@ -323,9 +376,10 @@ proc main(backend: CodegenBackend) = check cuCtxDestroy(cuCtx) cuCtx = CUcontext(nil) -echo "\n\nCompilation via Nvidia NVVM\n###########################\n" -main(PTXviaNvidiaNvvm) -echo "\n\nEnd: Compilation via Nvidia NVVM\n################################" +# echo "\n\nCompilation via Nvidia NVVM\n###########################\n" +# main(PTXviaNvidiaNvvm) +# echo "\n\nEnd: Compilation via Nvidia NVVM\n################################" +echo "[Skip] Compilation via Nvidia NVVM, incompatibilities between LLVM IR and NVVM IR" echo "\n\nCompilation via LLVM NVPTX\n##########################\n" main(PTXviaLlvmNvptx) diff --git a/tests/gpu/t_nvidia_fp.nim b/tests/gpu/t_nvidia_fp.nim index af4de0075..ec5e8a2ed 100644 --- a/tests/gpu/t_nvidia_fp.nim +++ b/tests/gpu/t_nvidia_fp.nim @@ -14,9 +14,9 @@ import constantine/platforms/llvm/llvm, constantine/platforms/static_for, constantine/named/algebras, - constantine/math/io/io_bigints, constantine/math/arithmetic, - constantine/math_compiler/[ir, impl_fields_nvidia, codegen_nvidia], + constantine/math/io/[io_bigints, io_fields], + constantine/math_compiler/[ir, pub_fields, codegen_nvidia], # Test utilities helpers/prng_unsafe @@ -28,33 +28,6 @@ echo "test_nvidia_fp xoshiro512** seed: ", seed const Iters = 10 -proc init(T: type CurveMetadata, asy: Assembler_LLVM, curve: static Algebra, wordSize: WordSize): T = - CurveMetadata.init( - asy.ctx, - $curve & "_", wordSize, - fpBits = uint32 Fp[curve].bits(), - fpMod = Fp[curve].getModulus().toHex(), - frBits = uint32 Fr[curve].bits(), - frMod = Fr[curve].getModulus().toHex()) - -proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpAdd = asy.field_add_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpAdd) - let frAdd = asy.field_add_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frAdd) - -proc genFieldSubPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpSub = asy.field_sub_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpSub) - let frSub = asy.field_sub_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frSub) - -proc genFieldMulPTX(asy: Assembler_LLVM, cm: CurveMetadata) = - let fpMul = asy.field_mul_gen(cm, fp) - asy.module.wrapInCallableCudaKernel(fpMul) - let frMul = asy.field_mul_gen(cm, fr) - asy.module.wrapInCallableCudaKernel(frMul) - # Init LLVM # ------------------------- initializeFullNVPTXTarget() @@ -66,173 +39,55 @@ var sm: tuple[major, minor: int32] check cuDeviceGetAttribute(sm.major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDevice) check cuDeviceGetAttribute(sm.minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDevice) -proc t_field_add(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldAddPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, w64) - asy.genFieldAddPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) - - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - 
check cuCtx.cuCtxDestroy() - - let fpAdd32 = cuMod.getCudaKernel(cm32, opFpAdd) - let fpAdd64 = cuMod.getCudaKernel(cm64, opFpAdd) - let frAdd32 = cuMod.getCudaKernel(cm32, opFrAdd) - let frAdd64 = cuMod.getCudaKernel(cm64, opFrAdd) - - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) - - var rCPU, rGPU_32, rGPU_64: Fp[curve] - - rCPU.sum(a, b) - fpAdd32.exec(rGPU_32, a, b) - fpAdd64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) - - var rCPU, rGPU_32, rGPU_64: Fr[curve] - - rCPU.sum(a, b) - frAdd32.exec(rGPU_32, a, b) - frAdd64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - -proc t_field_sub(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldSubPTX(cm32) - let cm64 = CurveMetadata.init(asy, curve, w64) - asy.genFieldSubPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) - - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - check cuCtx.cuCtxDestroy() - - let fpSub32 = cuMod.getCudaKernel(cm32, opFpSub) - let fpSub64 = cuMod.getCudaKernel(cm64, opFpSub) - let frSub32 = cuMod.getCudaKernel(cm32, opFrSub) - let frSub64 = cuMod.getCudaKernel(cm64, opFrSub) - - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) - - var rCPU, rGPU_32, rGPU_64: Fp[curve] - - rCPU.diff(a, b) - fpSub32.exec(rGPU_32, a, b) - fpSub64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) - - var rCPU, rGPU_32, rGPU_64: Fr[curve] - - rCPU.diff(a, b) - frSub32.exec(rGPU_32, a, b) - frSub64.exec(rGPU_64, a, b) - - doAssert bool(rCPU == rGPU_32) - doAssert bool(rCPU == rGPU_64) - -proc t_field_mul(curve: static Algebra) = - # Codegen - # ------------------------- - let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve)) - let cm32 = CurveMetadata.init(asy, curve, w32) - asy.genFieldMulPTX(cm32) - - # 64-bit integer fused-multiply-add with carry is buggy: - # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py - # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 - - # let cm64 = CurveMetadata.init(asy, curve, w64) - # asy.genFieldMulPTX(cm64) - - let ptx = asy.codegenNvidiaPTX(sm) +template gen_binop_test( + testName: untyped, + kernGenerator: untyped, + cpuFn: untyped) = - # GPU exec - # ------------------------- - var cuCtx: CUcontext - var cuMod: CUmodule - check cuCtxCreate(cuCtx, 0, cudaDevice) - check cuModuleLoadData(cuMod, ptx) - defer: - check cuMod.cuModuleUnload() - check cuCtx.cuCtxDestroy() - let fpMul32 = cuMod.getCudaKernel(cm32, opFpMul) - let frMul32 = cuMod.getCudaKernel(cm32, opFrMul) - # let fpMul64 = cuMod.getCudaKernel(cm64, opFpMul) - # let frMul64 = cuMod.getCudaKernel(cm64, opFrMul) + proc testName[Name: static Algebra](field: type FF[Name], wordSize: int) = + # Codegen + # ------------------------- + let name = if field is Fp: $Name & "_fp" + else: $Name & 
"_fr" + let asy = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & name & $wordSize)) + let fd = asy.ctx.configureField( + name, field.bits(), + field.getModulus().toHex(), + v = 1, w = wordSize + ) - # Fp - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fp[curve]) - let b = rng.random_long01Seq(Fp[curve]) + asy.definePrimitives(fd) + let kernName = asy.kernGenerator(fd) + let ptx = asy.codegenNvidiaPTX(sm) - var rCPU, rGPU_32: Fp[curve] # rGPU_64 + # GPU exec + # ------------------------- + var cuCtx: CUcontext + var cuMod: CUmodule + check cuCtxCreate(cuCtx, 0, cudaDevice) + check cuModuleLoadData(cuMod, ptx) + defer: + check cuMod.cuModuleUnload() + check cuCtx.cuCtxDestroy() - rCPU.prod(a, b) - fpMul32.exec(rGPU_32, a, b) - # fpMul64.exec(rGPU_64, a, b) + let kernel = cuMod.getCudaKernel(kernName) - doAssert bool(rCPU == rGPU_32) - # doAssert bool(rCPU == rGPU_64) + for i in 0 ..< Iters: + let a = rng.random_long01Seq(field) + let b = rng.random_long01Seq(field) - # Fr - for i in 0 ..< Iters: - let a = rng.random_long01Seq(Fr[curve]) - let b = rng.random_long01Seq(Fr[curve]) + var rCPU, rGPU: field - var rCPU, rGPU_32: Fr[curve] # rGPU_64 + rCPU.cpuFn(a, b) + kernel.exec(rGPU, a, b) - rCPU.prod(a, b) - frMul32.exec(rGPU_32, a, b) - # frMul64.exec(rGPU_64, a, b) + doAssert bool(rCPU == rGPU) - doAssert bool(rCPU == rGPU_32) - # doAssert bool(rCPU == rGPU_64) +gen_binop_test(t_field_add, genFpAdd, sum) +gen_binop_test(t_field_sub, genFpSub, diff) +gen_binop_test(t_field_mul, genFpMul, prod) proc main() = const curves = [ @@ -253,11 +108,31 @@ proc main() = suite "[Nvidia GPU] Field Arithmetic": staticFor i, 0, curves.len: const curve = curves[i] - test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve: - t_field_add(curve) - test "Nvidia GPU field substraction (𝔽p, 𝔽r) for " & $curve: - t_field_sub(curve) - test "Nvidia GPU field multiplication (𝔽p, 𝔽r) for " & $curve: - t_field_mul(curve) + for wordSize in [32, 64]: + test "Nvidia GPU field addition 𝔽p " & $wordSize & "-bit for " & $curve: + t_field_add(Fp[curve], wordSize) + test "Nvidia GPU field substraction 𝔽p " & $wordSize & "-bit for " & $curve: + t_field_sub(Fp[curve], wordSize) + test "Nvidia GPU field multiplication 𝔽p " & $wordSize & "-bit for " & $curve: + if wordSize == 64: + skip() + # 64-bit integer fused-multiply-add with carry is buggy: + # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py + # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 + else: + t_field_mul(Fp[curve], wordSize) + + test "Nvidia GPU field addition 𝔽r " & $wordSize & "-bit for " & $curve: + t_field_add(Fr[curve], wordSize) + test "Nvidia GPU field substraction 𝔽r " & $wordSize & "-bit for " & $curve: + t_field_sub(Fr[curve], wordSize) + test "Nvidia GPU field multiplication 𝔽r " & $wordSize & "-bit for " & $curve: + if wordSize == 64: + skip() + # 64-bit integer fused-multiply-add with carry is buggy: + # https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py + # https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 + else: + t_field_mul(Fr[curve], wordSize) main()