Skip to content

Commit

Permalink
AARCH64 port using CMA, based on juj#261
Browse files Browse the repository at this point in the history
  • Loading branch information
hakmo committed Nov 10, 2023
1 parent 157115a commit e52ce40
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
*.S
*.symvers
*.order
build/*
16 changes: 15 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ set(DEFAULT_TO_SINGLE_CORE_BOARD OFF)
set(DEFAULT_TO_ARMV6Z OFF)
set(DEFAULT_TO_ARMV7A OFF)
set(DEFAULT_TO_ARMV8A OFF)
set(DEFAULT_USE_VCSM_CMA OFF)

# http://ozzmaker.com/check-raspberry-software-hardware-version-command-line/
if (BOARD_REVISION MATCHES "(0002)|(0003)|(0004)|(0005)|(0006)|(0007)|(0008)|(0009)" OR BOARD_REVISION MATCHES "(000d)|(000e)|(000f)|(0010)|(0011)|(0012)" OR BOARD_REVISION MATCHES "(900092)|(900093)|(9000c1)")
Expand All @@ -46,7 +47,20 @@ if (SINGLE_CORE_BOARD)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSINGLE_CORE_BOARD=1")
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
# Architecture selection. A 64-bit (AARCH64) build drops the 32-bit ARM ABI/float
# flags, defines TIMER_32BIT for the C++ sources and makes USE_VCSM_CMA default to ON.
# Every other build keeps the 32-bit ARM hard-float flag set.
option(AARCH64 "Target a Raspberry Pi with aarch64 architecture" NO)
if (NOT AARCH64)
  # 32-bit ARM build
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
else()
  # 64-bit ARM build
  message(STATUS "Enable AARCH64 build")
  set(DEFAULT_USE_VCSM_CMA ON)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlittle-endian -funsafe-math-optimizations -DTIMER_32BIT")
endif()

# USE_VCSM_CMA: user-overridable switch whose default comes from DEFAULT_USE_VCSM_CMA
# (initialised to OFF earlier in this file and set to ON by the AARCH64 branch).
option(USE_VCSM_CMA "Map Memory from CPU instead of GPU" ${DEFAULT_USE_VCSM_CMA})
if (USE_VCSM_CMA)
message(STATUS "Enabling Map Memory from CPU instead of GPU")
# Passes -DUSE_VCSM_CMA=1 to the C++ compiler so that code guarded by
# "#ifdef USE_VCSM_CMA" is compiled in.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_VCSM_CMA=1")
endif()

option(ARMV6Z "Target a Raspberry Pi with ARMv6Z instruction set (Pi 1A, 1A+, 1B, 1B+, Zero, Zero W)" ${DEFAULT_TO_ARMV6Z})
if (ARMV6Z)
Expand Down
63 changes: 63 additions & 0 deletions cma.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifdef USE_VCSM_CMA

#include "config.h"
#include "cma.h"
#include "util.h"
#include <sys/ioctl.h>
#include <fcntl.h>
#include <syslog.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static int cma_fd = -1;
#define PAGE_SIZE 4096

// Opens the /dev/vcsm-cma device node read/write with O_SYNC and stores the
// resulting descriptor in cma_fd, where AllocateCMA() and CloseVCSM() pick it up.
// Reports the failure through FATAL_ERROR if the device cannot be opened.
void OpenVCSM(void) {
  const int deviceFd = open("/dev/vcsm-cma", O_RDWR|O_SYNC);
  cma_fd = deviceFd;
  if (deviceFd < 0) {
    FATAL_ERROR("can't open /dev/vcsm-cma");
  }
}

// Closes the /dev/vcsm-cma descriptor opened by OpenVCSM(), if there is one.
// cma_fd is reset to -1 afterwards so that a repeated call is a no-op (instead of
// closing whatever unrelated descriptor may have reused the number) and so that a
// later AllocateCMA() fails cleanly rather than issuing an ioctl on a stale descriptor.
void CloseVCSM(void) {
  if (cma_fd < 0) {
    return; // never opened, or already closed
  }
  close(cma_fd);
  cma_fd = -1;
}

// Capacity in bytes of Allocate::name, including the terminating NUL.
const int NAME_LENGTH = 32;

// Argument block passed by AllocateCMA() to the allocation ioctl on /dev/vcsm-cma.
// NOTE(review): the field order and widths presumably have to mirror the kernel
// driver's allocation ioctl structure exactly - do not reorder or resize fields
// without confirming against the kernel header.
struct Allocate {
/* user -> kernel: filled in by AllocateCMA() before the ioctl */
uint32_t size; // requested size in bytes; AllocateCMA() rounds it up to PAGE_SIZE
uint32_t num; // AllocateCMA() always passes 1
uint32_t flags; // AllocateCMA() passes 0, which it labels "NO cache"
uint32_t pad; // left zero by the memset in AllocateCMA()
char name[NAME_LENGTH]; // NUL-terminated copy of the caller's "reason" string

/* kernel -> user: read back by AllocateCMA() after the ioctl */
int32_t fd; // descriptor for the allocation; a negative value is treated as failure
uint32_t vcHandle; // copied to CMAInfo::vcHandle (presumably the VideoCore-side handle)
uint64_t dmaAddr; // copied to CMAInfo::dmaAddr (used by dma.cpp as the bus address)
};

// Allocates a block of memory through the /dev/vcsm-cma device opened by OpenVCSM().
//   reason: label copied (truncated to NAME_LENGTH-1 characters) into the request's
//           name field; NULL is accepted and results in an empty name.
//   req:    number of bytes wanted; rounded up to a multiple of PAGE_SIZE.
//   res:    receives the rounded size, the allocation's file descriptor, its
//           vcHandle and its DMA address on success. Must not be NULL.
// Returns 0 on success and -1 on failure (NULL res, a zero or too large request,
// a failing ioctl, or a negative descriptor coming back from the driver).
int AllocateCMA(const char* reason, size_t req, CMAInfo* res) {
  if (res == NULL) {
    return -1;
  }
  // The request's size field is only 32 bits wide. Reject anything that would not
  // fit after page alignment instead of silently truncating it and handing back a
  // smaller block than the caller asked for. A zero-byte request is rejected as
  // well, since it cannot produce a usable mapping.
  const size_t maxRequest = (size_t)0xFFFFFFFFu - (PAGE_SIZE - 1);
  if (req == 0 || req > maxRequest) {
    return -1;
  }
  Allocate ctx;
  memset(&ctx, 0, sizeof(ctx));
  ctx.size = (uint32_t)ALIGN_UP(req, PAGE_SIZE);
  ctx.flags = 0; // NO cache
  if (reason != NULL) {
    // The preceding memset plus the NAME_LENGTH-1 limit keep the name NUL-terminated.
    strncpy(ctx.name, reason, NAME_LENGTH - 1);
  }
  ctx.num = 1;
  if (ioctl(cma_fd, _IOR('J', 0x5A, struct Allocate), &ctx) < 0 || ctx.fd < 0) { // allocate cmd
    return -1;
  }
  res->size = ctx.size;
  res->vcHandle = ctx.vcHandle;
  res->dmaAddr = ctx.dmaAddr;
  res->fd = ctx.fd;
  return 0;
}

#endif
16 changes: 16 additions & 0 deletions cma.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#pragma once
#ifdef USE_VCSM_CMA

#include <memory.h>
#include <inttypes.h>
// Result of a successful AllocateCMA() call.
struct CMAInfo {
size_t size; // allocated size in bytes (the request rounded up to the page size)
uintptr_t dmaAddr; // DMA address reported by the driver for the allocation
uint32_t fd; // file descriptor representing the allocation
uint32_t vcHandle; // handle value reported by the driver (presumably VideoCore-side - confirm)
};

// Opens /dev/vcsm-cma; must be called before AllocateCMA().
void OpenVCSM(void);
// Closes the descriptor opened by OpenVCSM(), if it is open.
void CloseVCSM(void);
// Allocates at least 'req' bytes (rounded up to the page size), using 'reason' as the
// allocation's name, and fills '*res'. Returns 0 on success, -1 on failure.
int AllocateCMA(const char* reason, size_t req, CMAInfo* res);
#endif
39 changes: 36 additions & 3 deletions dma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
#include "util.h"
#include "mailbox.h"

#ifdef USE_VCSM_CMA
#include "cma.h"
#endif

#ifdef USE_DMA_TRANSFERS

#define BCM2835_PERI_BASE 0x3F000000
Expand All @@ -36,6 +40,9 @@ struct GpuMemory
void *virtualAddr;
uintptr_t busAddress;
uint32_t sizeBytes;
#ifdef USE_VCSM_CMA
uint32_t vcHandle;
#endif
};

#define NUM_DMA_CBS 1024
Expand Down Expand Up @@ -127,7 +134,33 @@ void FreeDMAChannel(int channel)
#define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)

uint64_t totalGpuMemoryUsed = 0;
#ifdef USE_VCSM_CMA

// Releases a block obtained from AllocateUncachedGpuMemory(): removes the CPU
// mapping, closes the descriptor that backs the allocation and takes the block's
// size off the totalGpuMemoryUsed counter so the total tracks live allocations.
void FreeUncachedGpuMemory(GpuMemory mem) {
  // A block whose mmap() failed carries MAP_FAILED here; there is nothing to unmap then.
  if (mem.virtualAddr != MAP_FAILED) {
    munmap(mem.virtualAddr, mem.sizeBytes);
  }
  close(mem.allocationHandle);
  totalGpuMemoryUsed -= mem.sizeBytes;
}

// Allocates numBytes (rounded up by AllocateCMA) of memory through the CMA allocator
// and maps it into this process. The returned GpuMemory carries the CPU mapping
// (virtualAddr), the DMA/bus address (busAddress), the mapped size (sizeBytes), the
// backing file descriptor (allocationHandle) and the driver's vcHandle.
// 'reason' names the allocation and is echoed in the log line.
// Allocation or mapping failures are reported through FATAL_ERROR.
GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason) {
  CMAInfo ctx;
  if (AllocateCMA(reason, numBytes, &ctx) != 0) {
    FATAL_ERROR("alloc cma failed");
  }
  GpuMemory mem;
  mem.sizeBytes = ctx.size;
  mem.busAddress = ctx.dmaAddr;
  mem.allocationHandle = ctx.fd;
  mem.vcHandle = ctx.vcHandle;
  mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, ctx.fd, 0);
  if (mem.virtualAddr == MAP_FAILED) {
    // Nothing was mapped, so only the allocation's descriptor has to be released.
    close(ctx.fd);
    FATAL_ERROR("Failed to mmap CMA memory!");
  }
  // Count the block only once it is actually usable.
  totalGpuMemoryUsed += mem.sizeBytes;
  // uint64_t is 'unsigned long' on 64-bit targets, so cast explicitly to match %llu.
  printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, (unsigned long long)totalGpuMemoryUsed);
  return mem;
}
#else
// Allocates the given number of bytes in GPU side memory, and returns the virtual address and physical bus address of the allocated memory block.
// The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
// this kind of memory to pass data blocks over to the DMA controller to process.
Expand All @@ -154,7 +187,7 @@ void FreeUncachedGpuMemory(GpuMemory mem)
Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
}

#endif
volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
{
if (channelNumber < 0 || channelNumber >= BCM2835_NUM_DMA_CHANNELS)
Expand Down Expand Up @@ -720,8 +753,8 @@ void SPIDMATransfer(SPITask *task)
while((dmaTx->cs & BCM2835_DMA_CS_ACTIVE))
{
CheckSPIDMAChannelsNotStolen();
if (tick() - dmaTaskStart > 5000000)
FATAL_ERROR("DMA TX channel has stalled!");
if (tick() - dmaTaskStart > 5000000)
FATAL_ERROR("DMA TX channel has stalled!");
}
while((dmaRx->cs & BCM2835_DMA_CS_ACTIVE))
{
Expand Down
22 changes: 20 additions & 2 deletions spi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ void ChipSelectHigh();
#define TOGGLE_CHIP_SELECT_LINE() ((void)0)
#endif

#ifdef USE_VCSM_CMA
#include "cma.h"
#endif

static uint32_t writeCounter = 0;

#define WRITE_FIFO(word) do { \
Expand All @@ -49,7 +53,11 @@ volatile SPIRegisterFile *spi = 0;

// Points to the system timer register. N.B. the spec sheet says the counter is split into low and high 32-bit parts, at a 32-bit aligned (but not 64-bit aligned) address. Profiling shows
// that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
// With TIMER_32BIT (defined by the AARCH64 CMake option) the timer is accessed through
// the systemTimer register struct from tick.h, whose tick() reads the low and high
// words as two separate 32-bit loads.
#ifdef TIMER_32BIT
volatile systemTimer *systemTimerRegister = 0;
#else
// Otherwise tick() dereferences this pointer as one 64-bit load.
volatile uint64_t *systemTimerRegister = 0;
#endif

void DumpSPICS(uint32_t reg)
{
Expand Down Expand Up @@ -510,13 +518,20 @@ int InitSPI()
// Memory map GPIO and SPI peripherals for direct access
mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
#ifdef USE_VCSM_CMA
OpenVCSM();
#endif
printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#ifdef TIMER_32BIT
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE);
#else
systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
#endif
// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
#endif

uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
Expand Down Expand Up @@ -658,6 +673,9 @@ void DeinitSPI()
close(mem_fd);
mem_fd = -1;
}
#ifdef USE_VCSM_CMA
CloseVCSM();
#endif

#ifndef KERNEL_MODULE_CLIENT

Expand Down
13 changes: 13 additions & 0 deletions tick.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,21 @@
#include <unistd.h>

// Initialized in spi.cpp along with the rest of the BCM2835 peripheral:
#ifdef TIMER_32BIT
// Layout of the system timer register block, accessed as individual 32-bit words.
// systemTimerRegister points at the start of the block (BCM2835_TIMER_BASE).
struct __attribute__((packed, aligned(4))) systemTimer {
  volatile uint32_t cs;   // first word of the block (presumably control/status); not read by tick()
  volatile uint32_t clo;  // low 32 bits of the free-running counter
  volatile uint32_t chi;  // high 32 bits of the free-running counter
  volatile uint32_t c[4]; // remaining words (presumably the compare registers); not read by tick()
};
#define TIMER_TYPE systemTimer
extern volatile systemTimer* systemTimerRegister;

// Returns the 64-bit counter assembled from its two 32-bit halves.
// The halves cannot be read atomically: if the low word wraps between the two
// loads, pairing an old half with a new one yields a value that is off by 2^32
// ticks, which callers comparing against a timeout would see as a huge jump.
// So the high word is re-read after the low word, and the read is retried until
// the high word is stable around the low-word load.
static inline uint64_t ReadSystemTimer64(void)
{
  uint32_t hi = systemTimerRegister->chi;
  for (;;)
  {
    const uint32_t lo = systemTimerRegister->clo;
    const uint32_t hiAfter = systemTimerRegister->chi;
    if (hiAfter == hi)
      return ((uint64_t)hi << 32) | (uint64_t)lo;
    hi = hiAfter; // low word wrapped in between; try again with the new high word
  }
}
#define tick() ReadSystemTimer64()
#else
// The counter is read with a single 64-bit load.
#define TIMER_TYPE uint64_t
extern volatile uint64_t *systemTimerRegister;
#define tick() (*systemTimerRegister)
#endif

#endif

Expand Down

0 comments on commit e52ce40

Please sign in to comment.