Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test in HIP emulating OpenMP Copy and Zero-Copy configurations. #882

Open
wants to merge 1 commit into
base: aomp-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 156 additions & 0 deletions examples/hip/CopyVsZeroCopy/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#-----------------------------------------------------------------------
#
# Makefile: Cuda clang demo Makefile for both amdgcn and nvptx targets.
# amdgcn targets begin with gfx. nvptx targets begin with sm_
#
# Run "make help" to see how to use this Makefile
#
#-----------------------------------------------------------------------
# MIT License
# Copyright (c) 2017 Advanced Micro Devices, Inc. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use, copy,
# modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

TESTNAME =ep
FILETYPE =cpp

UNAMEP = $(shell uname -m)
AOMP_CPUTARGET = $(UNAMEP)-pc-linux-gnu
ifeq ($(UNAMEP),ppc64le)
AOMP_CPUTARGET = ppc64le-linux-gnu
endif
ifeq ($(AOMP),)
# --- Standard Makefile check for AOMP installation ---
ifeq ("$(wildcard $(AOMP))","")
ifneq ($(AOMP),)
$(warning AOMP not found at $(AOMP))
endif
AOMP = $(HOME)/rocm/aomp
ifeq ("$(wildcard $(AOMP))","")
$(warning AOMP not found at $(AOMP))
AOMP = /usr/lib/aomp
ifeq ("$(wildcard $(AOMP))","")
$(warning AOMP not found at $(AOMP))
$(error Please install AOMP or correctly set env-var AOMP)
endif
endif
endif
# --- End Standard Makefile check for AOMP installation ---
endif
ifeq ($(CUDA),)
CUDA = /usr/local/cuda
endif
ifeq ($(AOMP_GPU),)
INSTALLED_GPU = $(shell $(AOMP)/bin/mygpu -d gfx900) # Default AOMP_GPU is gfx900 which is vega
AOMP_GPU ?= $(INSTALLED_GPU)
endif
ifeq ($(TARGETS),)
TARGETS =--offload-arch=$(AOMP_GPU)$(AOMP_TARGET_FEATURES)
endif

CC =$(AOMP)/bin/clang++

# compiler automatically adds "libdevice/<target>/ to -L opts
LFLAGS =-L$(AOMP)/lib

# Add cudart only if we have an Nvidia sm_ target
ifeq (sm_,$(findstring sm_,$(TARGETS)))
LFLAGS +=-L$(CUDA)/targets/$(UNAMEP)-linux/lib -lcudart -Wl,-rpath,$(CUDA)/targets/$(UNAMEP)-linux/lib -std=c++11
CFLAGS +=-x cuda -I$(CUDA)/include
else
AOMPHIP ?= $(AOMP)
PFILE = $(AOMPHIP)/bin/hipcc
ifeq ("$(wildcard $(PFILE))","")
AOMPHIP = $(AOMP)/..
PFILE = $(AOMPHIP)/bin/hipcc
ifeq ("$(wildcard $(PFILE))","")
AOMPHIP = $(AOMP)/../..
endif
endif

VERS = $(shell $(AOMP)/bin/clang --version | grep -oP '(?<=clang version )[0-9.]+')
ifeq ($(shell expr $(VERS) \>= 12.0), 1)
RPTH = -Wl,-rpath,$(AOMPHIP)/lib
endif
HIPLIBS = -L $(AOMPHIP)/hip -L $(AOMPHIP)/lib $(RPTH)
CFLAGS = -x hip -std=c++11 $(HIPLIBS) -lamdhip64 -mcode-object-version=4
endif

# ----- Demo compile and link in one step, no object code saved
$(TESTNAME): $(TESTNAME).$(FILETYPE)
$(CC) $(CFLAGS) $(TARGETS) $(LFLAGS) $^ -o $@

run: $(TESTNAME)
./$(TESTNAME)

# ---- Demo compile and link in two steps, object saved
$(TESTNAME).o: $(TESTNAME).$(FILETYPE)
$(CC) -c $(CFLAGS) $(TARGETS) $^

obin: $(TESTNAME).o
$(CC) $(LFLAGS) $^ -o obin

run_obin: obin
./obin

# ---- Demo compile to intermediates LLVMIR or assembly
$(TESTNAME).ll: $(TESTNAME).$(FILETYPE)
$(CC) -c -S -emit-llvm $(CFLAGS) $(TARGETS) $^

$(TESTNAME).s: $(TESTNAME).$(FILETYPE)
$(CC) -c -S $(CFLAGS) $(TARGETS) $^

help:
@echo
@echo "Makefile Help:"
@echo " Source: $(TESTNAME).$(FILETYPE)"
@echo " Compiler: $(CC)"
@echo " Compiler flags: $(CFLAGS)"
@echo
@echo "Avalable Targets:"
@echo " make // build binary $(TESTNAME)"
@echo " make run // run $(TESTNAME)"
@echo " make $(TESTNAME).o // compile, be, & assemble : -c"
@echo " make obin // link step only"
@echo " make run_obin // run obin "
@echo " make $(TESTNAME).s // compile & backend steps : -c -S"
@echo " make $(TESTNAME).ll // compile step only : -c -S -emit-llvm"
@echo " make clean // cleanup files"
@echo " make help // this help"
@echo
@echo "Environment Variables:"
@echo " AOMP default: $(HOME)/rocm/aomp value: $(AOMP)"
@echo " AOMP_GPU default: gfx900 value: $(AOMP_GPU)"
@echo " CUDA default: /usr/local/cuda value: $(CUDA)"
@echo " TARGETS default: --offload-arch=$(AOMP_GPU)"
@echo " value: $(TARGETS)"
@echo
@echo "Link Flags:"
@echo " Link flags: $(LFLAGS)"
@echo

# Cleanup anything this makefile can create
clean:
@[ -f ./$(TESTNAME) ] && rm ./$(TESTNAME) ; true
@[ -f ./obin ] && rm ./obin ; true
@[ -f ./$(TESTNAME).ll ] && rm *.ll ; true
@[ -f ./$(TESTNAME).o ] && rm $(TESTNAME).o ; true
@[ -f ./$(TESTNAME).s ] && rm *.s ; true
11 changes: 11 additions & 0 deletions examples/hip/CopyVsZeroCopy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
CopyVsZeroCopy - Demonstrate performance difference on MI300A between Copy and Zero-Copy configurations.
=======================================================
This test is used to monitor performance difference between OpenMP's matching Copy and Zero-Copy configurations when
programmed in HIP.

To build in Copy configuration, use:
HSA_XNACK=0 make run

To build in Zero-Copy configuration, use
HSA_XNACK=1 make run

107 changes: 107 additions & 0 deletions examples/hip/CopyVsZeroCopy/ep.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// MIT License
//
// Copyright (c) 2017 Advanced Micro Devices, Inc. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies
// of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// This program replicates OpenMP behavior for two (extremely reduced)
// kernels of the benchmark SPECaccel 2023 452.ep, emulating OpenMP's
// copy and zero-copy runtime behaviors.


#include <cstdlib>
#include <cstdio>
#include <sys/time.h>
#include <iostream>
#include <cmath>

#include "hip/hip_runtime.h"

__global__ void init_xx(double *xx, int length) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i > length) return;
xx[i] = 1.0;
}

__global__ void inc_xx(double *xx, int blksize, int nk) {
int k = threadIdx.x + blockIdx.x*blockDim.x;
if (k >= blksize) {
return;
}
for(int i=0; i<2*nk; i++) {
xx[k*2*nk + i] += 1.0;
}
return;
}

int main() {
int blksize = 15000;
int nk = 65536;
double *xx = (double *)malloc(blksize*2*nk*sizeof(double));
int m = 40;
int mk = 16;
int mm = m - mk;
int np = (1 << mm);
int numblks = ceil( (double)np / (double) blksize);
hipError_t err;

printf("numblks = %d\n", numblks);

char *HSA_XNACK_Env = getenv("HSA_XNACK");
bool isXnackEnabled = false;
if (HSA_XNACK_Env) {
int HSA_XNACK_Val = atoi(HSA_XNACK_Env);
isXnackEnabled = (HSA_XNACK_Val > 0) ? true : false;
}

double *d_xx = nullptr;
//#pragma omp target enter data map(alloc:xx[0:blksize*2*nk])
if (!isXnackEnabled) { // Copy
printf("OpenMP Copy configuration\n");
err = hipMalloc(&d_xx, blksize*2*nk*sizeof(double));
if (err != HIP_SUCCESS) {
printf("Cannot allocate device memory\n");
return 0;
}
//hipMemcpy(d_xx, xx, blksize*2*nk*sizeof(double), hipMemcpyHostToDevice);
} else {
printf("OpenMP Zero-Copy configuration\n");
d_xx = xx; // zero-copy
}

for (int blk=0; blk < 10; ++blk) {
printf("blk=%d\n", blk);
// #pragma omp target teams loop collapse(2)
// for(int k=0; k<blksize; k++)
// for(int i=0; i<2*nk; i++)
// xx[k*2*nk + i] = 1.0;
init_xx<<<7680000, 256, 0>>>(d_xx, blksize*2*nk);
hipDeviceSynchronize();
// #pragma omp target teams loop
// for (int k = 0; k < blksize; k++)
// for(int i=0; i<2*nk; i++)
// xx[k*2*nk + i] += 1.0;
inc_xx<<<938, 16, 0>>>(d_xx, blksize, nk);
hipDeviceSynchronize();
}

return 0;
}