do not enforce coherent memory type, queue transfer after uploading model weight
nihui committed Jan 2, 2020
1 parent 038666e commit b361b24
Showing 4 changed files with 104 additions and 30 deletions.
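The allocator changes below stop requiring VK_MEMORY_PROPERTY_HOST_COHERENT_BIT for device-local, host-visible memory on integrated GPUs, so host writes through a mapped pointer are no longer automatically visible to the device; the transfer path compensates with the explicit staging_vkallocator->flush(staging_data) call added in src/command.cpp. A minimal plain-Vulkan sketch of that write-then-flush pattern (illustrative names, not ncnn's helpers):

```cpp
#include <vulkan/vulkan.h>
#include <cstring>

// Write data through a mapped pointer, then flush the range so the device
// sees it even though the memory type is HOST_VISIBLE but not HOST_COHERENT.
void write_and_flush(VkDevice device, VkDeviceMemory memory,
                     void* mapped_ptr, const void* src, size_t size)
{
    memcpy(mapped_ptr, src, size);

    VkMappedMemoryRange range;
    range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    range.pNext = 0;
    range.memory = memory;
    range.offset = 0;
    range.size = VK_WHOLE_SIZE; // otherwise offset/size must respect nonCoherentAtomSize

    vkFlushMappedMemoryRanges(device, 1, &range); // check the VkResult in real code
}
```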
12 changes: 5 additions & 7 deletions src/allocator.cpp
@@ -508,7 +508,7 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
if (vkdev->info.type == 1)
{
// integrated gpu, prefer unified memory
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
}
else
{
@@ -758,7 +758,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
if (vkdev->info.type == 1)
{
// integrated gpu, prefer unified memory
memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
}
else
{
@@ -805,7 +805,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
if (vkdev->info.type == 1)
{
// integrated gpu, prefer unified memory
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
}
else
{
@@ -929,9 +929,7 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size)
// setup memory type
if (memory_type_index == (uint32_t)-1)
{
// integrated gpu, prefer unified memory
// discrete gpu, prefer the small pcie mappable memory, or fallback to host visible only anyway otherwise
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
}

ptr->memory = allocate_memory(memoryRequirements.size);
@@ -1113,7 +1111,7 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma
{
VkImageMemory* ptr = new VkImageMemory;

ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT);
ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

VkMemoryRequirements memoryRequirements;
vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);
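The find_memory_index calls above now ask only for VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT and prefer VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT on integrated GPUs, without insisting on coherent memory. The exact semantics of ncnn's find_memory_index arguments are not shown in this diff; the helper below is a hedged illustration of the general required/preferred selection pattern against VkPhysicalDeviceMemoryProperties:

```cpp
#include <vulkan/vulkan.h>
#include <cstdint>

// Pick a memory type that has all `required` flags, preferring types that
// also have `preferred` flags. Returns UINT32_MAX if nothing matches.
// Illustrative helper only, not ncnn's find_memory_index.
uint32_t pick_memory_type(VkPhysicalDevice physical_device,
                          uint32_t memory_type_bits,
                          VkMemoryPropertyFlags required,
                          VkMemoryPropertyFlags preferred)
{
    VkPhysicalDeviceMemoryProperties props;
    vkGetPhysicalDeviceMemoryProperties(physical_device, &props);

    uint32_t best = UINT32_MAX;
    for (uint32_t i = 0; i < props.memoryTypeCount; i++)
    {
        if (!(memory_type_bits & (1u << i)))
            continue;

        VkMemoryPropertyFlags flags = props.memoryTypes[i].propertyFlags;
        if ((flags & required) != required)
            continue;

        if ((flags & preferred) == preferred)
            return i;       // has everything we want

        if (best == UINT32_MAX)
            best = i;       // first acceptable fallback
    }
    return best;
}

// e.g. on an integrated GPU:
//   pick_memory_type(pd, req.memoryTypeBits,
//                    VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
//                    VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
```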
112 changes: 90 additions & 22 deletions src/command.cpp
@@ -332,6 +332,23 @@ void VkCompute::record_write_timestamp(uint32_t query)
}
#endif // NCNN_BENCHMARK

void VkCompute::record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index)
{
if (queue_family_index == src_queue_family_index)
return;

if (vkdev->info.support_VK_KHR_push_descriptor)
return queue_transfer_acquire_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize, src_queue_family_index);

record_type r;
r.type = 16;
r.queue_transfer_acquire_barrier.buffer = m.buffer();
r.queue_transfer_acquire_barrier.offset = m.buffer_offset();
r.queue_transfer_acquire_barrier.size = m.total() * m.elemsize;
r.queue_transfer_acquire_barrier.src_queue_family_index = src_queue_family_index;
delayed_records.push_back(r);
}

#if __ANDROID_API__ >= 26
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m)
{
@@ -870,6 +887,9 @@ int VkCompute::submit_and_wait()
case 15:
compute_host_barrier(r.compute_host_barrier.buffer, r.compute_host_barrier.offset, r.compute_host_barrier.size);
break;
case 16:
queue_transfer_acquire_barrier(r.queue_transfer_acquire_barrier.buffer, r.queue_transfer_acquire_barrier.offset, r.queue_transfer_acquire_barrier.size, r.queue_transfer_acquire_barrier.src_queue_family_index);
break;
}
}

@@ -1180,6 +1200,27 @@ void VkCompute::compute_host_barrier(VkBuffer buffer, size_t offset, size_t size)
vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
}

void VkCompute::queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index)
{
// fprintf(stderr, "cmd queue_transfer_acquire_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, src_queue_family_index, queue_family_index);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
bufferBarrier.pNext = 0;
bufferBarrier.srcAccessMask = 0;
bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
bufferBarrier.srcQueueFamilyIndex = src_queue_family_index;
bufferBarrier.dstQueueFamilyIndex = queue_family_index;
bufferBarrier.buffer = buffer;
bufferBarrier.offset = offset;
bufferBarrier.size = size;

VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
}

void VkCompute::initial_image_compute_barrier(VkImage image)
{
// fprintf(stderr, "cmd initial_image_compute_barrier %p %lu %lu\n", image, oldlayout, newlayout);
@@ -1275,9 +1316,13 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt)
// set weight blob as readonly
dst.data->state = 4;

if (dst.allocator->mappable)
// we can skip queue transfer and staging buffer allocation
// only on unified memory architecture and unified compute/transfer queue
// which is usually the case on integrated gpu / cpu
if (dst.allocator->mappable && queue_family_index == vkdev->info.compute_queue_family_index)
{
dst.upload(src_flattened);

return;
}

@@ -1317,6 +1362,8 @@ int VkTransfer::submit_and_wait()
mapped_ptr_offset += alignSize(r.size, buffer_offset_alignment);
}

staging_vkallocator->flush(staging_data);

begin_command_buffer();

// fprintf(stderr, "cmd transfer %p %lu\n", staging_data->buffer, staging_buffer_size);
@@ -1332,32 +1379,32 @@ int VkTransfer::submit_and_wait()
staging_buffer_offset += alignSize(r.size, buffer_offset_alignment);
}

// // finish TODO queue owner transfer release
// std::vector<VkBufferMemoryBarrier> bufferBarriers(transfer_count);
// for (int i=0; i<transfer_count; i++)
// {
// const record_type& r = delayed_records[i];
//
// bufferBarriers[i].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
// bufferBarriers[i].pNext = 0;
// bufferBarriers[i].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
// bufferBarriers[i].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
// bufferBarriers[i].srcQueueFamilyIndex = queue_family_index;
// bufferBarriers[i].dstQueueFamilyIndex = vkdev->info.compute_queue_family_index;
// bufferBarriers[i].buffer = r.vkmat.buffer();
// bufferBarriers[i].offset = r.vkmat.buffer_offset();
// bufferBarriers[i].size = r.size;
// }
//
// VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
// VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
//
// vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, transfer_count, bufferBarriers.data(), 0, 0);
// owner transfer release
for (int i=0; i<transfer_count; i++)
{
const record_type& r = delayed_records[i];

queue_transfer_release_barrier(r.vkmat.buffer(), r.vkmat.buffer_offset(), r.size, vkdev->info.compute_queue_family_index);
}

end_command_buffer();

int ret = queue_submit_and_wait_fence();

// compute queue owner transfer acquire
{
VkCompute cmd(vkdev);

for (int i=0; i<transfer_count; i++)
{
const record_type& r = delayed_records[i];

cmd.record_queue_transfer_acquire(r.vkmat, queue_family_index);
}

cmd.submit_and_wait();
}

// deallocate staging buffer
staging_vkallocator->fastFree(staging_data);
staging_data = 0;
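record_upload above takes the direct-upload fast path only when the weight allocator is mappable and the transfer queue family matches vkdev->info.compute_queue_family_index; otherwise submit_and_wait stages the data, releases ownership, and hands the buffers to the compute queue. A hedged sketch (not ncnn's device-probing code) of locating a queue family that can serve both roles, which avoids the ownership transfer entirely:

```cpp
#include <vulkan/vulkan.h>
#include <vector>
#include <cstdint>

// Find a queue family that supports compute (and therefore, implicitly,
// transfer), so uploads and dispatches can share one family and no queue
// family ownership transfer is needed.
uint32_t find_unified_queue_family(VkPhysicalDevice physical_device)
{
    uint32_t count = 0;
    vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &count, 0);
    std::vector<VkQueueFamilyProperties> families(count);
    vkGetPhysicalDeviceQueueFamilyProperties(physical_device, &count, families.data());

    for (uint32_t i = 0; i < count; i++)
    {
        if (families[i].queueFlags & VK_QUEUE_COMPUTE_BIT)
            return i;
    }
    return UINT32_MAX; // no compute-capable queue family found
}
```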
@@ -1386,6 +1433,27 @@ void VkTransfer::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions)
vkCmdCopyBuffer(command_buffer, src, dst, regions.size(), regions.data());
}

void VkTransfer::queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index)
{
// fprintf(stderr, "cmd queue_transfer_release_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, queue_family_index, dst_queue_family_index);

VkBufferMemoryBarrier bufferBarrier;
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
bufferBarrier.pNext = 0;
bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
bufferBarrier.dstAccessMask = 0;
bufferBarrier.srcQueueFamilyIndex = queue_family_index;
bufferBarrier.dstQueueFamilyIndex = dst_queue_family_index;
bufferBarrier.buffer = buffer;
bufferBarrier.offset = offset;
bufferBarrier.size = size;

VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;

vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
}

} // namespace ncnn

#endif // NCNN_VULKAN
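The release barrier recorded by VkTransfer::queue_transfer_release_barrier and the acquire barrier recorded by VkCompute::queue_transfer_acquire_barrier form a queue family ownership transfer pair: both halves must name the same buffer range and the same src/dst queue family indices, with the release submitted on the transfer queue before the acquire is submitted on the compute queue. A compact plain-Vulkan sketch of that pairing (assumed command buffers transfer_cmd and compute_cmd, synchronized between submits; not ncnn's wrappers):

```cpp
#include <vulkan/vulkan.h>

// Ownership transfer of `buffer` from the transfer queue family to the
// compute queue family. Both halves describe the same range and the same
// family pair; only the access masks and pipeline stages differ.
void release_on_transfer_queue(VkCommandBuffer transfer_cmd, VkBuffer buffer,
                               VkDeviceSize offset, VkDeviceSize size,
                               uint32_t transfer_family, uint32_t compute_family)
{
    VkBufferMemoryBarrier barrier = {};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.dstAccessMask = 0; // ignored on the releasing queue
    barrier.srcQueueFamilyIndex = transfer_family;
    barrier.dstQueueFamilyIndex = compute_family;
    barrier.buffer = buffer;
    barrier.offset = offset;
    barrier.size = size;
    vkCmdPipelineBarrier(transfer_cmd, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
                         0, 0, 0, 1, &barrier, 0, 0);
}

void acquire_on_compute_queue(VkCommandBuffer compute_cmd, VkBuffer buffer,
                              VkDeviceSize offset, VkDeviceSize size,
                              uint32_t transfer_family, uint32_t compute_family)
{
    VkBufferMemoryBarrier barrier = {};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcAccessMask = 0; // ignored on the acquiring queue
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
    barrier.srcQueueFamilyIndex = transfer_family;
    barrier.dstQueueFamilyIndex = compute_family;
    barrier.buffer = buffer;
    barrier.offset = offset;
    barrier.size = size;
    vkCmdPipelineBarrier(compute_cmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                         0, 0, 0, 1, &barrier, 0, 0);
}
```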
8 changes: 8 additions & 0 deletions src/command.h
@@ -69,7 +69,11 @@ class VkCompute : public Command

void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m);

#if NCNN_BENCHMARK
void record_write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK

void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index);

#if __ANDROID_API__ >= 26
void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m);
@@ -134,6 +138,7 @@ class VkCompute : public Command
void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size);
void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size);
void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index);
void initial_image_compute_barrier(VkImage image);
#if __ANDROID_API__ >= 26
void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo);
@@ -165,6 +170,7 @@ class VkCompute : public Command
// 13=transfer-host barrier
// 14=host-compute barrier
// 15=compute-host barrier
// 16=queue-transfer-acquire barrier
int type;

union
Expand All @@ -187,6 +193,7 @@ class VkCompute : public Command
struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier;
struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier;
};

std::vector<VkBufferCopy> regions;
@@ -218,6 +225,7 @@ class VkTransfer : public Command
// recording issue
void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index);

protected:
size_t buffer_offset_alignment;
2 changes: 1 addition & 1 deletion src/gpu.cpp
@@ -31,7 +31,7 @@
#if __ANDROID__
#define ENABLE_VALIDATION_LAYER 0
#else
#define ENABLE_VALIDATION_LAYER 1
#define ENABLE_VALIDATION_LAYER 0
#endif

namespace ncnn {
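The src/gpu.cpp change turns ENABLE_VALIDATION_LAYER back to 0 on desktop builds as well, so validation is opt-in everywhere. For reference, enabling it generally means requesting the Khronos validation layer at instance creation; a generic sketch, not ncnn's actual instance-creation code:

```cpp
#include <vulkan/vulkan.h>

// Create a VkInstance with the Khronos validation layer enabled.
VkInstance create_instance_with_validation()
{
    const char* layers[] = { "VK_LAYER_KHRONOS_validation" };

    VkApplicationInfo app = {};
    app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    app.apiVersion = VK_API_VERSION_1_0;

    VkInstanceCreateInfo info = {};
    info.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    info.pApplicationInfo = &app;
    info.enabledLayerCount = 1;
    info.ppEnabledLayerNames = layers;

    VkInstance instance = 0;
    vkCreateInstance(&info, 0, &instance); // check the VkResult in real code
    return instance;
}
```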
