diff --git a/src/allocator.cpp b/src/allocator.cpp
index 913d86ad7b88..6d5f5290e03a 100644
--- a/src/allocator.cpp
+++ b/src/allocator.cpp
@@ -508,7 +508,7 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
         if (vkdev->info.type == 1)
         {
             // integrated gpu, prefer unified memory
-            memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
+            memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
         }
         else
         {
@@ -758,7 +758,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
         if (vkdev->info.type == 1)
         {
             // integrated gpu, prefer unified memory
-            memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
+            memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
         }
         else
         {
@@ -805,7 +805,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
         if (vkdev->info.type == 1)
         {
             // integrated gpu, prefer unified memory
-            memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
+            memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
         }
         else
         {
@@ -929,9 +929,7 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size)
     // setup memory type
     if (memory_type_index == (uint32_t)-1)
    {
-        // integrated gpu, prefer unified memory
-        // discrete gpu, prefer the small pcie mappable memory, or fallback to host visible only anyway otherwise
-        memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+        memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
     }
 
     ptr->memory = allocate_memory(memoryRequirements.size);
@@ -1113,7 +1111,7 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma
 {
     VkImageMemory* ptr = new VkImageMemory;
 
-    ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT);
+    ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
 
     VkMemoryRequirements memoryRequirements;
     vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
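Note on the allocator hunks above: for integrated GPUs (vkdev->info.type == 1) the blob and weight allocators stop preferring VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, and the staging allocator now treats VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT as a flag to avoid rather than one to prefer. The body of find_memory_index is not part of this patch; the sketch below shows the required/preferred/preferred-not selection its call sites imply. It is an assumption for illustration, not ncnn's actual implementation:

    #include <vulkan/vulkan.h>

    // Hypothetical stand-in for vkdev->find_memory_index(...): choose a memory
    // type permitted by memory_type_bits that carries all `required` flags,
    // favoring types that also carry `preferred` flags and lack `preferred_not`.
    static uint32_t find_memory_index_sketch(const VkPhysicalDeviceMemoryProperties& props,
                                             uint32_t memory_type_bits, VkFlags required,
                                             VkFlags preferred, VkFlags preferred_not)
    {
        uint32_t fallback = (uint32_t)-1;
        for (uint32_t i = 0; i < props.memoryTypeCount; i++)
        {
            if (!(memory_type_bits & (1u << i)))
                continue; // type not allowed for this resource

            VkFlags flags = props.memoryTypes[i].propertyFlags;
            if ((flags & required) != required)
                continue; // missing a mandatory property

            if ((flags & preferred) == preferred && !(flags & preferred_not))
                return i; // best match

            if (fallback == (uint32_t)-1)
                fallback = i; // remember the first acceptable type
        }
        return fallback;
    }

With HOST_COHERENT no longer insisted on for the unified-memory path, host writes through a mapped pointer must be flushed explicitly; the flush added in VkTransfer::submit_and_wait further down pairs with this change.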
diff --git a/src/command.cpp b/src/command.cpp
index 9261cb243565..99f1ff90e466 100644
--- a/src/command.cpp
+++ b/src/command.cpp
@@ -332,6 +332,23 @@ void VkCompute::record_write_timestamp(uint32_t query)
 }
 #endif // NCNN_BENCHMARK
 
+void VkCompute::record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index)
+{
+    if (queue_family_index == src_queue_family_index)
+        return;
+
+    if (vkdev->info.support_VK_KHR_push_descriptor)
+        return queue_transfer_acquire_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize, src_queue_family_index);
+
+    record_type r;
+    r.type = 16;
+    r.queue_transfer_acquire_barrier.buffer = m.buffer();
+    r.queue_transfer_acquire_barrier.offset = m.buffer_offset();
+    r.queue_transfer_acquire_barrier.size = m.total() * m.elemsize;
+    r.queue_transfer_acquire_barrier.src_queue_family_index = src_queue_family_index;
+    delayed_records.push_back(r);
+}
+
 #if __ANDROID_API__ >= 26
 void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m)
 {
@@ -870,6 +887,9 @@ int VkCompute::submit_and_wait()
            case 15:
                 compute_host_barrier(r.compute_host_barrier.buffer, r.compute_host_barrier.offset, r.compute_host_barrier.size);
                 break;
+            case 16:
+                queue_transfer_acquire_barrier(r.queue_transfer_acquire_barrier.buffer, r.queue_transfer_acquire_barrier.offset, r.queue_transfer_acquire_barrier.size, r.queue_transfer_acquire_barrier.src_queue_family_index);
+                break;
             }
         }
 
@@ -1180,6 +1200,27 @@ void VkCompute::compute_host_barrier(VkBuffer buffer, size_t offset, size_t size
     vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
 }
 
+void VkCompute::queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index)
+{
+//     fprintf(stderr, "cmd queue_transfer_acquire_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, src_queue_family_index, queue_family_index);
+
+    VkBufferMemoryBarrier bufferBarrier;
+    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    bufferBarrier.pNext = 0;
+    bufferBarrier.srcAccessMask = 0;
+    bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+    bufferBarrier.srcQueueFamilyIndex = src_queue_family_index;
+    bufferBarrier.dstQueueFamilyIndex = queue_family_index;
+    bufferBarrier.buffer = buffer;
+    bufferBarrier.offset = offset;
+    bufferBarrier.size = size;
+
+    VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+    VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
+
+    vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
+}
+
 void VkCompute::initial_image_compute_barrier(VkImage image)
 {
 //     fprintf(stderr, "cmd initial_image_compute_barrier %p %lu %lu\n", image, oldlayout, newlayout);
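Vulkan requires ownership transfer of a VK_SHARING_MODE_EXCLUSIVE buffer between queue families to be expressed as a matched barrier pair: a release on the source family and an acquire on the destination family, both naming the same buffer range and the same srcQueueFamilyIndex/dstQueueFamilyIndex. The acquire half above mirrors the release half added to VkTransfer below. A condensed sketch of the pairing, with illustrative queue-family and buffer parameters:

    #include <vulkan/vulkan.h>

    // Illustrative pairing of the two barrier halves; in the patch the release
    // is recorded on the transfer queue and the acquire on the compute queue.
    static void make_ownership_transfer_pair(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
                                             uint32_t transfer_queue_family, uint32_t compute_queue_family,
                                             VkBufferMemoryBarrier* release, VkBufferMemoryBarrier* acquire)
    {
        VkBufferMemoryBarrier b;
        b.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        b.pNext = 0;
        b.srcAccessMask = 0;
        b.dstAccessMask = 0;
        b.srcQueueFamilyIndex = transfer_queue_family;
        b.dstQueueFamilyIndex = compute_queue_family;
        b.buffer = buffer;
        b.offset = offset;
        b.size = size;

        *release = b;
        release->srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; // make the copy available
        release->dstAccessMask = 0;                            // ignored on the release half

        *acquire = b;
        acquire->srcAccessMask = 0;                            // ignored on the acquire half
        acquire->dstAccessMask = VK_ACCESS_SHADER_READ_BIT;    // make it visible to shaders
    }

The patch orders the two submissions with a fence wait rather than a semaphore; that is simpler and still guarantees the release completes before the acquire executes.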
@@ -1275,9 +1316,13 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt)
     // set weight blob as readonly
     dst.data->state = 4;
 
-    if (dst.allocator->mappable)
+    // we can skip queue transfer and staging buffer allocation
+    // only on unified memory architecture and unified compute/transfer queue
+    // which is usually the case on integrated gpu / cpu
+    if (dst.allocator->mappable && queue_family_index == vkdev->info.compute_queue_family_index)
     {
         dst.upload(src_flattened);
+        return;
     }
 
@@ -1317,6 +1362,8 @@ int VkTransfer::submit_and_wait()
         mapped_ptr_offset += alignSize(r.size, buffer_offset_alignment);
     }
 
+    staging_vkallocator->flush(staging_data);
+
     begin_command_buffer();
 
 //     fprintf(stderr, "cmd transfer %p %lu\n", staging_data->buffer, staging_buffer_size);
@@ -1332,32 +1379,32 @@ int VkTransfer::submit_and_wait()
         staging_buffer_offset += alignSize(r.size, buffer_offset_alignment);
     }
 
-//     // finish TODO queue owner transfer release
-//     std::vector<VkBufferMemoryBarrier> bufferBarriers(transfer_count);
-//     for (int i=0; i<transfer_count; i++)
-//     {
-//         const record_type& r = delayed_records[i];
-//
-//         bufferBarriers[i].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
-//         bufferBarriers[i].pNext = 0;
-//         bufferBarriers[i].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
-//         bufferBarriers[i].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
-//         bufferBarriers[i].srcQueueFamilyIndex = queue_family_index;
-//         bufferBarriers[i].dstQueueFamilyIndex = vkdev->info.compute_queue_family_index;
-//         bufferBarriers[i].buffer = r.vkmat.buffer();
-//         bufferBarriers[i].offset = r.vkmat.buffer_offset();
-//         bufferBarriers[i].size = r.size;
-//     }
-//
-//     VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
-//     VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
-//
-//     vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, transfer_count, bufferBarriers.data(), 0, 0);
+    // owner transfer release
+    for (int i=0; i<transfer_count; i++)
+    {
+        const record_type& r = delayed_records[i];
+
+        queue_transfer_release_barrier(r.vkmat.buffer(), r.vkmat.buffer_offset(), r.size, vkdev->info.compute_queue_family_index);
+    }
 
     end_command_buffer();
 
     int ret = queue_submit_and_wait_fence();
 
+    // compute queue owner transfer acquire
+    {
+        VkCompute cmd(vkdev);
+
+        for (int i=0; i<transfer_count; i++)
+        {
+            const record_type& r = delayed_records[i];
+
+            cmd.record_queue_transfer_acquire(r.vkmat, queue_family_index);
+        }
+
+        cmd.submit_and_wait();
+    }
+
     staging_vkallocator->fastFree(staging_data);
     staging_data = 0;
@@ -1386,6 +1433,27 @@ void VkTransfer::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vect
 
     vkCmdCopyBuffer(command_buffer, src, dst, regions.size(), regions.data());
 }
 
+void VkTransfer::queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index)
+{
+//     fprintf(stderr, "cmd queue_transfer_release_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, queue_family_index, dst_queue_family_index);
+
+    VkBufferMemoryBarrier bufferBarrier;
+    bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    bufferBarrier.pNext = 0;
+    bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    bufferBarrier.dstAccessMask = 0;
+    bufferBarrier.srcQueueFamilyIndex = queue_family_index;
+    bufferBarrier.dstQueueFamilyIndex = dst_queue_family_index;
+    bufferBarrier.buffer = buffer;
+    bufferBarrier.offset = offset;
+    bufferBarrier.size = size;
+
+    VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
+    VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
+
+    vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
+}
+
 } // namespace ncnn
 
 #endif // NCNN_VULKAN
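The staging_vkallocator->flush(staging_data) call added above is the host-side counterpart of the allocator changes: once a mappable allocation is not guaranteed VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, CPU writes must be flushed before the copy commands read them. VkAllocator::flush is not shown in this patch; assuming it wraps vkFlushMappedMemoryRanges over the staging allocation, it boils down to something like:

    #include <vulkan/vulkan.h>

    // Sketch only: `memory` stands for the VkDeviceMemory backing the mapped
    // staging buffer. On HOST_COHERENT memory the flush is redundant but harmless.
    static void flush_mapped_allocation(VkDevice device, VkDeviceMemory memory)
    {
        VkMappedMemoryRange range;
        range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
        range.pNext = 0;
        range.memory = memory;
        range.offset = 0;
        range.size = VK_WHOLE_SIZE; // otherwise offset/size must be multiples of nonCoherentAtomSize

        vkFlushMappedMemoryRanges(device, 1, &range);
    }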
diff --git a/src/command.h b/src/command.h
index 49ab022e08c4..cc161cb2c27c 100644
--- a/src/command.h
+++ b/src/command.h
@@ -69,7 +69,11 @@ class VkCompute : public Command
     void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m);
 
+#if NCNN_BENCHMARK
     void record_write_timestamp(uint32_t query);
+#endif // NCNN_BENCHMARK
+
+    void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index);
 
 #if __ANDROID_API__ >= 26
     void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m);
@@ -134,6 +138,7 @@ class VkCompute : public Command
     void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size);
     void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
     void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size);
+    void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index);
     void initial_image_compute_barrier(VkImage image);
 #if __ANDROID_API__ >= 26
     void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo);
@@ -165,6 +170,7 @@ class VkCompute : public Command
         // 13=transfer-host barrier
         // 14=host-compute barrier
         // 15=compute-host barrier
+        // 16=queue-transfer-acquire barrier
         int type;
 
         union
@@ -187,6 +193,7 @@ class VkCompute : public Command
         struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier;
         struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier;
         struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier;
+        struct { VkBuffer buffer; size_t offset; size_t size; uint32_t src_queue_family_index; } queue_transfer_acquire_barrier;
         };
 
         std::vector<VkBufferCopy> regions;
@@ -218,6 +225,7 @@ class VkTransfer : public Command
     // recording issue
     void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
     void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
+    void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index);
 
 protected:
     size_t buffer_offset_alignment;
diff --git a/src/gpu.cpp b/src/gpu.cpp
index 12009503d453..1c5b74edb5e0 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -31,7 +31,7 @@
 #if __ANDROID__
 #define ENABLE_VALIDATION_LAYER 0
 #else
-#define ENABLE_VALIDATION_LAYER 1
+#define ENABLE_VALIDATION_LAYER 0
 #endif
 
 namespace ncnn {
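Taken together, the command.cpp/command.h changes move weight upload onto the dedicated transfer queue and hand buffer ownership to the compute queue afterwards, while the gpu.cpp hunk simply disables the validation layer by default on non-Android builds as well. Based on the signatures visible in this patch, a caller drives the new upload path roughly like this (vkdev, weight_mat, weight_gpu and opt are illustrative names; the VkTransfer constructor is assumed to take the device, matching the VkCompute usage shown above):

    // VkTransfer records the copies on the transfer queue; submit_and_wait()
    // emits the release barriers, fence-waits, then issues a small VkCompute
    // submission that performs the matching acquire on the compute queue.
    ncnn::VkTransfer cmd(vkdev);
    cmd.record_upload(weight_mat, weight_gpu, opt);
    int ret = cmd.submit_and_wait();

On integrated GPUs, when the destination allocator is mappable and the transfer and compute queue families coincide, record_upload short-circuits to dst.upload() and the staging copy plus ownership transfer is skipped entirely.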