From e52ce40a645f7bc7f1aafa87553d78c64df64adc Mon Sep 17 00:00:00 2001
From: hakmo
Date: Fri, 10 Nov 2023 09:18:53 -0700
Subject: [PATCH] AARCH64 port using CMA, based on https://github.com/juj/fbcp-ili9341/pull/261

---
Review notes (text between "---" and the first "diff --git" is not applied):
- Line breaks and indentation of this patch were restored from a copy in which
  they had been collapsed. Indentation and blank context lines are therefore a
  best-effort reconstruction, and the "index" blob ids no longer describe the
  edited files; regenerate the patch from a tree before relying on it.
- cma.cpp, cma.h: the header names of the system #include lines were missing;
  they are restored from the symbols each file uses.
- cma.cpp: CloseVCSM() resets cma_fd, AllocateCMA() rejects a NULL reason, and
  the PAGE_SIZE macro is renamed CMA_PAGE_SIZE so that it cannot clash with a
  system definition.
- dma.cpp: a failed mmap() no longer reaches munmap(MAP_FAILED, ...), the usage
  counter is updated only after a successful mapping, and the %llu argument is
  cast to unsigned long long.
- dma.cpp: the hunk in SPIDMATransfer() that only re-indented two lines is
  dropped because its indentation could not be recovered.
- tick.h: with TIMER_32BIT, tick() re-reads chi so that a carry from clo
  between the two 32-bit loads cannot produce a torn 64-bit value. The leading
  context line (an #include whose header name was missing) is trimmed.

 .gitignore     |  1 +
 CMakeLists.txt | 16 ++++++++++-
 cma.cpp        | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++
 cma.h          | 18 +++++++++++++
 dma.cpp        | 40 +++++++++++++++++++++++++++-
 spi.cpp        | 22 +++++++++++++--
 tick.h         | 24 +++++++++++++++++
 7 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 cma.cpp
 create mode 100644 cma.h

diff --git a/.gitignore b/.gitignore
index 06fb2a5..a412732 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 *.S
 *.symvers
 *.order
+build/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index abd94b2..2feef96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ set(DEFAULT_TO_SINGLE_CORE_BOARD OFF)
 set(DEFAULT_TO_ARMV6Z OFF)
 set(DEFAULT_TO_ARMV7A OFF)
 set(DEFAULT_TO_ARMV8A OFF)
+set(DEFAULT_USE_VCSM_CMA OFF)
 
 # http://ozzmaker.com/check-raspberry-software-hardware-version-command-line/
 if (BOARD_REVISION MATCHES "(0002)|(0003)|(0004)|(0005)|(0006)|(0007)|(0008)|(0009)" OR BOARD_REVISION MATCHES "(000d)|(000e)|(000f)|(0010)|(0011)|(0012)" OR BOARD_REVISION MATCHES "(900092)|(900093)|(9000c1)")
@@ -46,7 +47,20 @@ if (SINGLE_CORE_BOARD)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSINGLE_CORE_BOARD=1")
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
+option(AARCH64 "Target a Raspberry Pi with aarch64 architecture" NO)
+if (AARCH64)
+  message(STATUS "Enable AARCH64 build")
+  set(DEFAULT_USE_VCSM_CMA ON)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlittle-endian -funsafe-math-optimizations -DTIMER_32BIT")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -marm -mabi=aapcs-linux -mhard-float -mfloat-abi=hard -mlittle-endian -mtls-dialect=gnu2 -funsafe-math-optimizations")
+endif()
+
+option(USE_VCSM_CMA "Map Memory from CPU instead of GPU" ${DEFAULT_USE_VCSM_CMA})
+if (USE_VCSM_CMA)
+  message(STATUS "Enabling Map Memory from CPU instead of GPU")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_VCSM_CMA=1")
+endif()
 
 option(ARMV6Z "Target a Raspberry Pi with ARMv6Z instruction set (Pi 1A, 1A+, 1B, 1B+, Zero, Zero W)" ${DEFAULT_TO_ARMV6Z})
 if (ARMV6Z)
diff --git a/cma.cpp b/cma.cpp
new file mode 100644
index 0000000..a916c9f
--- /dev/null
+++ b/cma.cpp
@@ -0,0 +1,72 @@
+#ifdef USE_VCSM_CMA
+
+#include "config.h"
+#include "cma.h"
+#include "util.h"
+#include <fcntl.h>      // open, O_RDWR, O_SYNC
+#include <stddef.h>     // size_t, NULL
+#include <stdint.h>     // uint32_t, int32_t, uint64_t
+#include <string.h>     // memset, strncpy
+#include <sys/ioctl.h>  // ioctl, _IOR
+#include <unistd.h>     // close
+
+// Descriptor of the opened /dev/vcsm-cma device; -1 while the device is closed.
+static int cma_fd = -1;
+// Requested sizes are rounded up to a multiple of this many bytes.
+#define CMA_PAGE_SIZE 4096
+
+// Opens /dev/vcsm-cma; reports a fatal error when the device cannot be opened.
+void OpenVCSM(void) {
+  cma_fd = open("/dev/vcsm-cma", O_RDWR|O_SYNC);
+  if (cma_fd < 0) FATAL_ERROR("can't open /dev/vcsm-cma");
+}
+
+// Closes the device if it is open. Calling it again afterwards is a no-op.
+void CloseVCSM(void) {
+  if (cma_fd >= 0) {
+    close(cma_fd);
+    cma_fd = -1; // forget the stale descriptor so a second call cannot close an unrelated fd
+  }
+}
+
+const int NAME_LENGTH = 32;
+
+// Argument block of the allocate ioctl issued in AllocateCMA().
+// NOTE(review): presumably mirrors the kernel's vc-sm-cma allocation struct - confirm against the driver header.
+struct Allocate {
+  /* user -> kernel */
+  uint32_t size;
+  uint32_t num;
+  uint32_t flags;
+  uint32_t pad;
+  char name[NAME_LENGTH];
+
+  /* kernel -> user */
+  int32_t fd;
+  uint32_t vcHandle;
+  uint64_t dmaAddr;
+};
+
+// Allocates one uncached buffer of at least req bytes (rounded up to CMA_PAGE_SIZE), labelled with reason.
+// On success fills *res and returns 0; returns -1 on a NULL argument or when the ioctl fails.
+int AllocateCMA(const char* reason, size_t req, CMAInfo* res) {
+  if (res == NULL || reason == NULL) {
+    return -1;
+  }
+  Allocate ctx;
+  memset(&ctx, 0, sizeof(ctx));
+  ctx.size = ALIGN_UP(req, CMA_PAGE_SIZE);
+  ctx.flags = 0; // NO cache
+  strncpy((char*)ctx.name, reason, NAME_LENGTH -1); // ctx was zeroed above, so name stays NUL-terminated
+  ctx.num = 1;
+  if (ioctl(cma_fd, _IOR('J', 0x5A, struct Allocate), &ctx) < 0 || ctx.fd < 0) { // allocate cmd
+    return -1;
+  }
+  res->size = ctx.size;
+  res->vcHandle = ctx.vcHandle;
+  res->dmaAddr = ctx.dmaAddr;
+  res->fd = ctx.fd;
+  return 0;
+}
+
+#endif
diff --git a/cma.h b/cma.h
new file mode 100644
index 0000000..01f2029
--- /dev/null
+++ b/cma.h
@@ -0,0 +1,18 @@
+#pragma once
+#ifdef USE_VCSM_CMA
+
+#include <stddef.h>  // size_t
+#include <stdint.h>  // uintptr_t, uint32_t
+
+// Result of AllocateCMA(): describes one allocated buffer.
+struct CMAInfo {
+  size_t size;        // allocated size in bytes (the request rounded up)
+  uintptr_t dmaAddr;  // address reported by the allocate ioctl; dma.cpp stores it as the bus address
+  uint32_t fd;        // descriptor of the buffer; dma.cpp passes it to mmap() and close()
+  uint32_t vcHandle;  // handle value reported by the allocate ioctl
+};
+
+void OpenVCSM(void);
+void CloseVCSM(void);
+int AllocateCMA(const char* reason, size_t req, CMAInfo* res);
+#endif
diff --git a/dma.cpp b/dma.cpp
index d2844e2..8ab7535 100644
--- a/dma.cpp
+++ b/dma.cpp
@@ -14,6 +14,10 @@
 #include "util.h"
 #include "mailbox.h"
 
+#ifdef USE_VCSM_CMA
+#include "cma.h"
+#endif
+
 #ifdef USE_DMA_TRANSFERS
 
 #define BCM2835_PERI_BASE 0x3F000000
@@ -36,6 +40,9 @@ struct GpuMemory
   void *virtualAddr;
   uintptr_t busAddress;
   uint32_t sizeBytes;
+#ifdef USE_VCSM_CMA
+  uint32_t vcHandle;
+#endif
 };
 
 #define NUM_DMA_CBS 1024
@@ -127,7 +134,38 @@ void FreeDMAChannel(int channel)
 #define VIRT_TO_BUS(block, x) ((uintptr_t)(x) - (uintptr_t)((block).virtualAddr) + (block).busAddress)
 
 uint64_t totalGpuMemoryUsed = 0;
+#ifdef USE_VCSM_CMA
+// Releases a block obtained from the CMA variant of AllocateUncachedGpuMemory(): unmaps it and closes its descriptor.
+void FreeUncachedGpuMemory(GpuMemory mem) {
+  munmap(mem.virtualAddr, mem.sizeBytes);
+  close(mem.allocationHandle);
+}
+
+// Allocates numBytes (rounded up by AllocateCMA) through /dev/vcsm-cma and maps the buffer into this process.
+// Reports a fatal error when either the allocation or the mapping fails.
+GpuMemory AllocateUncachedGpuMemory(uint32_t numBytes, const char *reason) {
+  GpuMemory mem;
+  CMAInfo ctx;
+  if (AllocateCMA(reason, numBytes, &ctx) != 0) {
+    FATAL_ERROR("alloc cma failed");
+  }
+  mem.sizeBytes = ctx.size;
+  mem.busAddress = ctx.dmaAddr;
+  mem.allocationHandle = ctx.fd;
+  mem.vcHandle = ctx.vcHandle;
+  mem.virtualAddr = mmap(0, mem.sizeBytes, PROT_READ | PROT_WRITE, MAP_SHARED, ctx.fd, 0);
+  if (mem.virtualAddr == MAP_FAILED) {
+    // No mapping exists yet, so only the descriptor is released; munmap(MAP_FAILED, ...) would be invalid.
+    close(ctx.fd);
+    FATAL_ERROR("Failed to mmap CMA memory!");
+  }
+  totalGpuMemoryUsed += mem.sizeBytes; // count the block only once it is actually mapped
+  // uint64_t is not 'unsigned long long' on every target, so cast to match %llu.
+  printf("Allocated %u bytes of GPU memory for %s (bus address=%p). Total GPU memory used: %llu bytes\n", mem.sizeBytes, reason, (void*)mem.busAddress, (unsigned long long)totalGpuMemoryUsed);
+  return mem;
+}
 
+#else
 // Allocates the given number of bytes in GPU side memory, and returns the virtual address and physical bus address of the allocated memory block.
 // The virtual address holds an uncached view to the allocated memory, so writes and reads to that memory address bypass the L1 and L2 caches. Use
 // this kind of memory to pass data blocks over to the DMA controller to process.
@@ -154,7 +192,7 @@ void FreeUncachedGpuMemory(GpuMemory mem)
   Mailbox(MEM_UNLOCK_MESSAGE, mem.allocationHandle);
   Mailbox(MEM_FREE_MESSAGE, mem.allocationHandle);
 }
-
+#endif
 volatile DMAChannelRegisterFile *GetDMAChannel(int channelNumber)
 {
   if (channelNumber < 0 || channelNumber >= BCM2835_NUM_DMA_CHANNELS)
diff --git a/spi.cpp b/spi.cpp
index d156f2d..3270a03 100644
--- a/spi.cpp
+++ b/spi.cpp
@@ -33,6 +33,10 @@ void ChipSelectHigh();
 #define TOGGLE_CHIP_SELECT_LINE() ((void)0)
 #endif
 
+#ifdef USE_VCSM_CMA
+#include "cma.h"
+#endif
+
 static uint32_t writeCounter = 0;
 
 #define WRITE_FIFO(word) do { \
@@ -49,7 +53,11 @@ volatile SPIRegisterFile *spi = 0;
 
 // Points to the system timer register. N.B. spec sheet says this is two low and high parts, in an 32-bit aligned (but not 64-bit aligned) address. Profiling shows
 // that Pi 3 Model B does allow reading this as a u64 load, and even when unaligned, it is around 30% faster to do so compared to loading in parts "lo | (hi << 32)".
+#ifdef TIMER_32BIT
+volatile systemTimer *systemTimerRegister = 0;
+#else
 volatile uint64_t *systemTimerRegister = 0;
+#endif
 
 void DumpSPICS(uint32_t reg)
 {
@@ -510,13 +518,20 @@ int InitSPI()
   // Memory map GPIO and SPI peripherals for direct access
   mem_fd = open("/dev/mem", O_RDWR|O_SYNC);
   if (mem_fd < 0) FATAL_ERROR("can't open /dev/mem (run as sudo)");
+#ifdef USE_VCSM_CMA
+  OpenVCSM();
+#endif
   printf("bcm_host_get_peripheral_address: %p, bcm_host_get_peripheral_size: %u, bcm_host_get_sdram_address: %p\n", bcm_host_get_peripheral_address(), bcm_host_get_peripheral_size(), bcm_host_get_sdram_address());
   bcm2835 = mmap(NULL, bcm_host_get_peripheral_size(), (PROT_READ | PROT_WRITE), MAP_SHARED, mem_fd, bcm_host_get_peripheral_address());
   if (bcm2835 == MAP_FAILED) FATAL_ERROR("mapping /dev/mem failed");
   spi = (volatile SPIRegisterFile*)((uintptr_t)bcm2835 + BCM2835_SPI0_BASE);
   gpio = (volatile GPIORegisterFile*)((uintptr_t)bcm2835 + BCM2835_GPIO_BASE);
-  systemTimerRegister = (volatile uint64_t*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
-  // TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
+#ifdef TIMER_32BIT
+  systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE);
+#else
+  systemTimerRegister = (volatile TIMER_TYPE*)((uintptr_t)bcm2835 + BCM2835_TIMER_BASE + 0x04); // Generates an unaligned 64-bit pointer, but seems to be fine.
+#endif
+// TODO: On graceful shutdown, (ctrl-c signal?) close(mem_fd)
 #endif
 
   uint32_t currentBcmCoreSpeed = MailboxRet2(0x00030002/*Get Clock Rate*/, 0x4/*CORE*/);
@@ -658,6 +673,9 @@ void DeinitSPI()
     close(mem_fd);
     mem_fd = -1;
   }
+#ifdef USE_VCSM_CMA
+  CloseVCSM();
+#endif
 
 #ifndef KERNEL_MODULE_CLIENT
 
diff --git a/tick.h b/tick.h
index 2036168..118d0d7 100644
--- a/tick.h
+++ b/tick.h
@@ -6,6 +6,30 @@
 
 // Initialized in spi.cpp along with the rest of the BCM2835 peripheral:
+#ifdef TIMER_32BIT
+// Register block mapped at BCM2835_TIMER_BASE (see InitSPI() in spi.cpp).
+struct __attribute__((packed, aligned(4))) systemTimer {
+  volatile uint32_t cs;
+  volatile uint32_t clo;
+  volatile uint32_t chi;
+  volatile uint32_t c[4];
+};
+#define TIMER_TYPE systemTimer
+extern volatile systemTimer* systemTimerRegister;
+// Combines chi:clo into one 64-bit value. chi is read again after clo and the read is
+// retried if it changed, so a carry out of clo between the two loads cannot yield a torn value.
+static inline uint64_t ReadSystemTimer64(void) {
+  uint32_t hi, lo;
+  do {
+    hi = systemTimerRegister->chi;
+    lo = systemTimerRegister->clo;
+  } while (hi != systemTimerRegister->chi);
+  return ((uint64_t)hi << 32) | lo;
+}
+#define tick() ReadSystemTimer64()
+#else
+#define TIMER_TYPE uint64_t
 extern volatile uint64_t *systemTimerRegister;
 #define tick() (*systemTimerRegister)
+#endif
 
 #endif