CPU的设计理念是顺序执行,对并行执行并不擅长,而GPU正是为高并行而设计的。因此,使用GPU进行运算,配合合适的并行算法,可以大大提高程序的运行效率。
本文使用C/C++调用DirectX 11 Compute Shader(DirectCompute)实现简单GPU计算。
环境要求:
如果觉得Compute Shader 4.0不够用,还可以改成Compute Shader 5.0,不过硬件要支持DirectX 11才行。
环境初始化基本流程:
进行GPU计算的方法:
GPU线程的各参数满足下图所示含义:
除本帖以外,还可参考微软官方示例:
GPU程序:
<code>// dcompute.hlsl - the GPU program to run

// Constant buffer (its size must be a multiple of 16 bytes)
cbuffer CB : register(b0)
{
    unsigned int a;
    unsigned int b;
    unsigned int c;
    unsigned int d;
};

// u0 corresponds to the UnorderedAccessView
RWStructuredBuffer<unsigned int> Data : register(u0);

// Entry point.
// Note: cs_4_0 only supports numthreads(M, N, 1); only cs_5_0 supports (M, N, P).
[numthreads(4, 1, 1)]
void main(uint3 Gid  : SV_GroupID,          // group ID (the three arguments of Dispatch)
          uint3 DTid : SV_DispatchThreadID, // global thread ID
          uint3 GTid : SV_GroupThreadID,    // thread ID inside the group (the three numthreads arguments)
          uint  GI   : SV_GroupIndex)       // flattened index inside the group
{
    // Each thread stores the sum of the four constants into the element
    // selected by its global X coordinate.
    Data[DTid.x] = a + b + c + d;
}
</code>
主程序(C语言版本,需VS2013+):
主程序(C++版本):
<code class="language-cpp">// dcomupte.cpp - 运行GPU程序的程序 #include <stdio.h> #include <windows.h> #include <d3d11.h> #include <d3dcompiler.h> #include <atlbase.h> // CComPtr<t> #pragma comment(lib, "d3d11.lib") #pragma comment(lib, "d3dcompiler.lib") // CComPtr<t> g_obj的使用 // 初始化:&g_obj // 已经初始化取地址:&g_obj.p // 调用成员函数:g_obj-> // 释放:g_obj = NULL; // 工具类型,HRESULT返回值转换为此类型,可自动抛出异常 struct comexcept { explicit comexcept(HRESULT ret) : hr(ret) { if (FAILED(hr)) throw *this; } HRESULT hr; }; // 基础对象 CComPtr<id3d11device> g_dev; // 设备对象 CComPtr<id3d11devicecontext> g_immctx; // 设备上下文对象 D3D_FEATURE_LEVEL g_level; // Direct3D支持级别 CComPtr<id3dblob> g_cs_sort_code; // GPU程序编译后的字节码 CComPtr<id3d11computeshader> g_cs_sort; // GPU程序对象 // 资源对象 CComPtr<id3d11buffer> g_constbuf; // 常量内存 CComPtr<id3d11buffer> g_gpubuf; // GPU内存 CComPtr<id3d11shaderresourceview> g_gpubuf_srv; // GPU内存Shader资源视图绑定(多步Shader计算会用到) CComPtr<id3d11unorderedaccessview> g_gpubuf_uav; // GPU内存乱序访问视图绑定 CComPtr<id3d11buffer> g_cpubuf; // CPU内存(用来读取GPU内存数据) // 常量内存的结构 // 注意:大小必须是16的倍数,否则会失败 typedef struct ConstBuffer { UINT a; UINT b; UINT c; UINT d; }ConstBuffer; #define NUM_ELEMENTS 16 void DoCompute(); // 入口点(初始化资源) int main(int argc, char *argv[]) { // 支持的设备级别 D3D_FEATURE_LEVEL dlevel[] = { D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, D3D_FEATURE_LEVEL_10_0, }; // 创建设备 // D3D_DRIVER_TYPE_HARDWARE = 使用GPU // D3D_DRIVER_TYPE_WARP = 使用CPU (comexcept)D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, 0, dlevel, sizeof dlevel / sizeof dlevel[0], D3D11_SDK_VERSION, &g_dev, &g_level, &g_immctx); // 检查是否支持Compute Shader 4.0 D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts; (comexcept)g_dev->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts)); if (!hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x) (comexcept)E_FAIL; // 编译HLSL CComPtr<id3dblob> cs_errors; HRESULT hrcompile = D3DCompileFromFile(L"dcompute.hlsl", NULL, NULL, "main", "cs_4_0", 0, 0, &g_cs_sort_code, 
&cs_errors); if (cs_errors) { printf("%s", cs_errors->GetBufferPointer()); return 0; // 直接结束程序而不是抛出异常,确保能显示编译错误 } (comexcept)hrcompile; // 创建Compute Shader (comexcept)g_dev->CreateComputeShader(g_cs_sort_code->GetBufferPointer(), g_cs_sort_code->GetBufferSize(), NULL, &g_cs_sort); // 创建常量内存(必须是16的倍数,对应b0寄存器) D3D11_BUFFER_DESC constant_buffer_desc = { sizeof (ConstBuffer), D3D11_USAGE_DEFAULT, D3D11_BIND_CONSTANT_BUFFER, 0, 0, 0 }; (comexcept)g_dev->CreateBuffer(&constant_buffer_desc, NULL, &g_constbuf); // 创建GPU内存 D3D11_BUFFER_DESC buffer_desc = { NUM_ELEMENTS * sizeof(UINT), D3D11_USAGE_DEFAULT, D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE, 0, D3D11_RESOURCE_MISC_BUFFER_STRUCTURED, sizeof (UINT) }; (comexcept)g_dev->CreateBuffer(&buffer_desc, NULL, &g_gpubuf); // 创建GPU内存的Shader资源视图绑定(对应t0寄存器) D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc = { DXGI_FORMAT_UNKNOWN, D3D11_SRV_DIMENSION_BUFFER }; srvbuffer_desc.Buffer.NumElements = NUM_ELEMENTS; (comexcept)g_dev->CreateShaderResourceView(g_gpubuf, &srvbuffer_desc, &g_gpubuf_srv); // 创建GPU内存的乱序访问视图绑定(对应u0寄存器) D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc = { DXGI_FORMAT_UNKNOWN, D3D11_UAV_DIMENSION_BUFFER }; uavbuffer_desc.Buffer.NumElements = NUM_ELEMENTS; (comexcept)g_dev->CreateUnorderedAccessView(g_gpubuf, &uavbuffer_desc, &g_gpubuf_uav); // 创建CPU传输内存 D3D11_BUFFER_DESC readback_buffer_desc = { NUM_ELEMENTS * sizeof (UINT), D3D11_USAGE_STAGING, 0, D3D11_CPU_ACCESS_READ, 0, sizeof (UINT) }; (comexcept)g_dev->CreateBuffer(&readback_buffer_desc, NULL, &g_cpubuf); DoCompute(); return 0; } // 计算主程序 void DoCompute(void) { // 设置常量内存为1,2,3,4 // 并将常量内存绑定到b0寄存器 ConstBuffer cb = { 1, 2, 3, 4 }; g_immctx->UpdateSubresource(g_constbuf, 0, NULL, &cb, 0, 0); g_immctx->CSSetConstantBuffers(0, 1, &g_constbuf.p); // 引用已创建的对象要用&xxx.p而不是&xxx // 设置GPU内存为0 // 并将GPU内存绑定到u0寄存器 UINT buf[NUM_ELEMENTS] = { 0 }; g_immctx->UpdateSubresource(g_gpubuf, 0, NULL, &buf[0], 0, 0); g_immctx->CSSetUnorderedAccessViews(0, 1, 
&g_gpubuf_uav.p, NULL); // 进行运算(4,4,1线程组,注意cs_4_0只支持M,N,1,只有cs_5_0才支持M,N,P) g_immctx->CSSetShader(g_cs_sort, NULL, 0); g_immctx->Dispatch(4, 1, 1); // 将GPU内存数据复制到CPU D3D11_MAPPED_SUBRESOURCE mapped = { 0 }; g_immctx->CopyResource(g_cpubuf, g_gpubuf); (comexcept)g_immctx->Map(g_cpubuf, 0, D3D11_MAP_READ, 0, &mapped); memcpy(&buf[0], mapped.pData, NUM_ELEMENTS * sizeof(UINT)); g_immctx->Unmap(g_cpubuf, 0); // 显示数据 for (int i = 0; i < NUM_ELEMENTS; i++) { printf("%d ", buf[i]); } printf("\n"); } </id3dblob></id3d11buffer></id3d11unorderedaccessview></id3d11shaderresourceview></id3d11buffer></id3d11buffer></id3d11computeshader></id3dblob></id3d11devicecontext></id3d11device></t></t></atlbase.h></d3dcompiler.h></d3d11.h></windows.h></stdio.h></code>
[修改于 8年2个月前 - 2016/10/24 11:30:41]
整理了一下官方示例中的GPU双调排序算法,主要是使用CComPtr<T>和comexcept简化程序结构。
同时这也是一个很好的Compute Shader入门例子。
GPU程序:
<code>//--------------------------------------------------------------------------------------
// File: ComputeShaderSort11.hlsl
//
// This file contains the compute shaders to perform GPU sorting using DirectX 11.
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//--------------------------------------------------------------------------------------

#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16

//--------------------------------------------------------------------------------------
// Constant buffers
//--------------------------------------------------------------------------------------
// b# registers hold constant buffers
cbuffer CB : register( b0 )
{
    unsigned int g_iLevel;
    unsigned int g_iLevelMask;
    unsigned int g_iWidth;
    unsigned int g_iHeight;
};

//--------------------------------------------------------------------------------------
// Structured buffers
//--------------------------------------------------------------------------------------
// t# registers hold ShaderResourceViews
// u# registers hold UnorderedAccessViews
StructuredBuffer<unsigned int> Input : register( t0 );
RWStructuredBuffer<unsigned int> Data : register( u0 );

//--------------------------------------------------------------------------------------
// Bitonic sort compute shader
//--------------------------------------------------------------------------------------
groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE]; // data shared inside one thread group

[numthreads(BITONIC_BLOCK_SIZE, 1, 1)] // threads per group X*Y*Z; Z must be 1 for cs_4_0, cs_5_0 has no such limit
void BitonicSort( uint3 Gid  : SV_GroupID,          // group ID
                  uint3 DTid : SV_DispatchThreadID, // global thread ID
                  uint3 GTid : SV_GroupThreadID,    // thread ID inside the group
                  uint  GI   : SV_GroupIndex )      // flattened thread index inside the group
{
    // Load the group-shared data from the unordered access view
    shared_data[GI] = Data[DTid.x];
    GroupMemoryBarrierWithGroupSync(); // wait until all group-shared accesses have finished and every thread reached this call

    // Sort the group-shared data
    for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
    {
        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x)) ? shared_data[GI ^ j] : shared_data[GI];
        // Every thread must have read its operands before any thread overwrites them...
        GroupMemoryBarrierWithGroupSync();
        shared_data[GI] = result;
        // ...and every write must be finished before the next pass reads again.
        GroupMemoryBarrierWithGroupSync();
    }

    // Store the group-shared data back to the unordered access view
    Data[DTid.x] = shared_data[GI];
}

//--------------------------------------------------------------------------------------
// Matrix transpose compute shader
//--------------------------------------------------------------------------------------
groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];

[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
void MatrixTranspose( uint3 Gid  : SV_GroupID,
                      uint3 DTid : SV_DispatchThreadID,
                      uint3 GTid : SV_GroupThreadID,
                      uint  GI   : SV_GroupIndex )
{
    // Each thread loads one element of the tile into group-shared memory
    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];
    GroupMemoryBarrierWithGroupSync();
    // Swap the tile origin (x <-> y) but keep this thread's offset inside the tile,
    // then read the element from the transposed position in shared memory.
    uint2 XY = DTid.yx - GTid.yx + GTid.xy;
    Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];
}
</code>
CPU程序:
<code class="language-cpp">// dcomupte.cpp - 运行GPU程序的程序 #include <stdio.h> #include <windows.h> #include <d3d11.h> #include <d3dcompiler.h> #include <atlbase.h> // CComPtr<t> #include <vector> #include <random> #include <algorithm> #pragma comment(lib, "d3d11.lib") #pragma comment(lib, "d3dcompiler.lib") // CComPtr<t> g_obj的使用 // 初始化:&g_obj // 已经初始化取地址:&g_obj.p // 调用成员函数:g_obj-> // 释放:g_obj = NULL; // 工具类型,HRESULT返回值转换为此类型,可自动抛出异常 struct comexcept { explicit comexcept(HRESULT ret) : hr(ret) { if (FAILED(hr)) throw *this; } HRESULT hr; }; // 基础对象 CComPtr<id3d11device> g_dev; // 设备对象 CComPtr<id3d11devicecontext> g_immctx; // 设备上下文对象 D3D_FEATURE_LEVEL g_level; // Direct3D支持级别 CComPtr<id3dblob> g_cs_sort_code; // GPU程序编译后的字节码:双调排序 CComPtr<id3d11computeshader> g_cs_sort; // GPU程序对象:双调排序 CComPtr<id3dblob> g_cs_transpose_code; // GPU程序编译后的字节码:矩阵转置 CComPtr<id3d11computeshader> g_cs_transpose; // GPU程序对象:矩阵转置 // 资源对象 CComPtr<id3d11buffer> g_constbuf; // 常量内存 CComPtr<id3d11buffer> g_gpubuf1; // GPU内存1 CComPtr<id3d11shaderresourceview> g_gpubuf1_srv; // GPU内存1的Shader资源视图绑定 CComPtr<id3d11unorderedaccessview> g_gpubuf1_uav; // GPU内存1乱序访问视图绑定 CComPtr<id3d11buffer> g_gpubuf2; // GPU内存2 CComPtr<id3d11shaderresourceview> g_gpubuf2_srv; // GPU内存2的Shader资源视图绑定 CComPtr<id3d11unorderedaccessview> g_gpubuf2_uav; // GPU内存2的乱序访问视图绑定 CComPtr<id3d11buffer> g_cpubuf; // CPU内存(用来读取GPU内存数据) // 常量内存的结构 // 注意:大小必须是16的倍数,否则会失败 struct ConstBuffer { UINT iLevel; UINT iLevelMask; UINT iWidth; UINT iHeight; }; const UINT NUM_ELEMENTS = 512 * 512; const UINT BITONIC_BLOCK_SIZE = 512; const UINT TRANSPOSE_BLOCK_SIZE = 16; const UINT MATRIX_WIDTH = BITONIC_BLOCK_SIZE; const UINT MATRIX_HEIGHT = NUM_ELEMENTS / BITONIC_BLOCK_SIZE; std::vector<uint> data(NUM_ELEMENTS); std::vector<uint> results(NUM_ELEMENTS); void DoCompute(); // 入口点(初始化资源) int main(int argc, char *argv[]) { // 生成随机数据 std::random_device rd; std::mt19937 mt(rd()); std::generate(data.begin(), data.end(), [&] { return mt(); }); // 支持的设备级别 
D3D_FEATURE_LEVEL dlevel[] = { D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, D3D_FEATURE_LEVEL_10_0, }; // 创建设备 // D3D_DRIVER_TYPE_HARDWARE = 使用GPU // D3D_DRIVER_TYPE_WARP = 使用CPU (comexcept)D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, 0, dlevel, sizeof dlevel / sizeof dlevel[0], D3D11_SDK_VERSION, &g_dev, &g_level, &g_immctx); // 检查是否支持Compute Shader 4.0 D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts; (comexcept)g_dev->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts)); if (!hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x) (comexcept)E_FAIL; // 编译HLSL CComPtr<id3dblob> cs_sort_errors, cs_transpose_errors; HRESULT hr_sort = D3DCompileFromFile(L"gpusort.hlsl", NULL, NULL, "BitonicSort", "cs_4_0", 0, 0, &g_cs_sort_code, &cs_sort_errors); HRESULT hr_transpose = D3DCompileFromFile(L"gpusort.hlsl", NULL, NULL, "MatrixTranspose", "cs_4_0", 0, 0, &g_cs_transpose_code, &cs_transpose_errors); if (cs_sort_errors || cs_transpose_errors) { if (cs_sort_errors) printf("%s", cs_sort_errors->GetBufferPointer()); if (cs_transpose_errors) printf("%s", cs_transpose_errors->GetBufferPointer()); return 0; // 直接结束程序而不是抛出异常,确保能显示编译错误 } (comexcept)hr_sort; (comexcept)hr_transpose; // 创建Compute Shader (comexcept)g_dev->CreateComputeShader(g_cs_sort_code->GetBufferPointer(), g_cs_sort_code->GetBufferSize(), NULL, &g_cs_sort); (comexcept)g_dev->CreateComputeShader(g_cs_transpose_code->GetBufferPointer(), g_cs_transpose_code->GetBufferSize(), NULL, &g_cs_transpose); // 创建常量内存(必须是16的倍数,对应b0寄存器) D3D11_BUFFER_DESC constant_buffer_desc = { sizeof(ConstBuffer), D3D11_USAGE_DEFAULT, D3D11_BIND_CONSTANT_BUFFER, 0, 0, 0 }; (comexcept)g_dev->CreateBuffer(&constant_buffer_desc, NULL, &g_constbuf); // 创建两块GPU内存 D3D11_BUFFER_DESC buffer_desc = { NUM_ELEMENTS * sizeof(UINT), D3D11_USAGE_DEFAULT, D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE, 0, D3D11_RESOURCE_MISC_BUFFER_STRUCTURED, sizeof(UINT) }; 
(comexcept)g_dev->CreateBuffer(&buffer_desc, NULL, &g_gpubuf1); (comexcept)g_dev->CreateBuffer(&buffer_desc, NULL, &g_gpubuf2); // 创建两块GPU内存的Shader资源视图绑定(对应t0寄存器) D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc = { DXGI_FORMAT_UNKNOWN, D3D11_SRV_DIMENSION_BUFFER }; srvbuffer_desc.Buffer.NumElements = NUM_ELEMENTS; (comexcept)g_dev->CreateShaderResourceView(g_gpubuf1, &srvbuffer_desc, &g_gpubuf1_srv); (comexcept)g_dev->CreateShaderResourceView(g_gpubuf2, &srvbuffer_desc, &g_gpubuf2_srv); // 创建两块GPU内存的乱序访问视图绑定(对应u0寄存器) D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc = { DXGI_FORMAT_UNKNOWN, D3D11_UAV_DIMENSION_BUFFER }; uavbuffer_desc.Buffer.NumElements = NUM_ELEMENTS; (comexcept)g_dev->CreateUnorderedAccessView(g_gpubuf1, &uavbuffer_desc, &g_gpubuf1_uav); (comexcept)g_dev->CreateUnorderedAccessView(g_gpubuf2, &uavbuffer_desc, &g_gpubuf2_uav); // 创建CPU传输内存 D3D11_BUFFER_DESC readback_buffer_desc = { NUM_ELEMENTS * sizeof(UINT), D3D11_USAGE_STAGING, 0, D3D11_CPU_ACCESS_READ, 0, sizeof(UINT) }; (comexcept)g_dev->CreateBuffer(&readback_buffer_desc, NULL, &g_cpubuf); // 调用GPU运算 printf("Begin GPU sorting...\n"); DoCompute(); printf("End GPU sorting...\n"); // 调用CPU作对照运算 printf("Begin CPU sorting...\n"); std::sort(data.begin(), data.end()); printf("End CPU sorting...\n"); // 比较GPU和CPU的运算结果 bool same = true; for (int i = 0; i < NUM_ELEMENTS; i++) { if (data[i] != results[i]) { same = false; break; } } printf("Comparison %s\n", same ? 
"SUCCEEDED" : "FAILED"); return 0; } // 设置常量 void SetConstants(UINT iLevel, UINT iLevelMask, UINT iWidth, UINT iHeight) { ConstBuffer cb = { iLevel, iLevelMask, iWidth, iHeight }; g_immctx->UpdateSubresource(g_constbuf, 0, nullptr, &cb, 0, 0); g_immctx->CSSetConstantBuffers(0, 1, &g_constbuf.p); } // 计算主程序 void DoCompute(void) { // 上传数据 g_immctx->UpdateSubresource(g_gpubuf1, 0, nullptr, &data[0], 0, 0); // 排序数据 // 先按不大于块大小的level对行数据进行排序 for (UINT level = 2; level <= bitonic_block_size; level="level" * 2) { setconstants(level, level, matrix_height, matrix_width); 对行数据进行排序 g_immctx->CSSetUnorderedAccessViews(0, 1, &g_gpubuf1_uav.p, nullptr); g_immctx->CSSetShader(g_cs_sort, nullptr, 0); g_immctx->Dispatch(NUM_ELEMENTS / BITONIC_BLOCK_SIZE, 1, 1); } // 然后按大于块大小的level对行列数据进行排序 // 转置,排序列,转置,排序行 for (UINT level = (BITONIC_BLOCK_SIZE * 2); level <= num_elements; level="level" * 2) { setconstants((level bitonic_block_size), (level & ~num_elements) bitonic_block_size, matrix_width, matrix_height); 将数据由buffer1转置并存到buffer2 id3d11shaderresourceview* pviewnullptr="nullptr;" g_immctx->CSSetShaderResources(0, 1, &pViewnullptr); g_immctx->CSSetUnorderedAccessViews(0, 1, &g_gpubuf2_uav.p, nullptr); g_immctx->CSSetShaderResources(0, 1, &g_gpubuf1_srv.p); g_immctx->CSSetShader(g_cs_transpose, nullptr, 0); g_immctx->Dispatch(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, 1); // 排序转置后的列数据 g_immctx->CSSetShader(g_cs_sort, nullptr, 0); g_immctx->Dispatch(NUM_ELEMENTS / BITONIC_BLOCK_SIZE, 1, 1); SetConstants(BITONIC_BLOCK_SIZE, level, MATRIX_HEIGHT, MATRIX_WIDTH); // 将数据由buffer2转置并存回buffer1 g_immctx->CSSetShaderResources(0, 1, &pViewnullptr); g_immctx->CSSetUnorderedAccessViews(0, 1, &g_gpubuf1_uav.p, nullptr); g_immctx->CSSetShaderResources(0, 1, &g_gpubuf2_srv.p); g_immctx->CSSetShader(g_cs_transpose, nullptr, 0); g_immctx->Dispatch(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE, MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE, 1); // 排序行数据 g_immctx->CSSetShader(g_cs_sort, 
nullptr, 0); g_immctx->Dispatch(NUM_ELEMENTS / BITONIC_BLOCK_SIZE, 1, 1); } // 下载数据 D3D11_MAPPED_SUBRESOURCE MappedResource = { 0 }; g_immctx->CopyResource(g_cpubuf, g_gpubuf1); (comexcept)g_immctx->Map(g_cpubuf, 0, D3D11_MAP_READ, 0, &MappedResource); memcpy(&results[0], MappedResource.pData, NUM_ELEMENTS * sizeof(UINT)); g_immctx->Unmap(g_cpubuf, 0); } </=></=></id3dblob></uint></uint></id3d11buffer></id3d11unorderedaccessview></id3d11shaderresourceview></id3d11buffer></id3d11unorderedaccessview></id3d11shaderresourceview></id3d11buffer></id3d11buffer></id3d11computeshader></id3dblob></id3d11computeshader></id3dblob></id3d11devicecontext></id3d11device></t></algorithm></random></vector></t></atlbase.h></d3dcompiler.h></d3d11.h></windows.h></stdio.h></code>
除了[RW]StructuredBuffer以外,DirectX 11 Compute Shader还支持[RW]ByteAddressBuffer,可以实现非结构化数据访问,不过使用比较麻烦,应用也比较少,这里就不介绍了,感兴趣的可以看一下微软官方示例中的BasicCompute11子示例。
上述双调排序程序中使用了GroupMemoryBarrierWithGroupSync函数,它可以等待组内所有共享数据访问结束,且组内所有线程均到达此调用。HLSL还提供了其他用于线程间同步的函数,包括一组内存屏障函数(*MemoryBarrier*)和一组原子操作函数(Interlocked*):
<code>AllMemoryBarrier AllMemoryBarrierWithGroupSync DeviceMemoryBarrier DeviceMemoryBarrierWithGroupSync GroupMemoryBarrier GroupMemoryBarrierWithGroupSync InterlockedAdd InterlockedAnd InterlockedCompareExchange InterlockedCompareStore InterlockedExchange InterlockedMax InterlockedMin InterlockedOr InterlockedXor </code>
时段 | 个数 |
---|---|
{{f.startingTime}}点 - {{f.endTime}}点 | {{f.fileCount}} |
200字以内,仅用于支线交流,主线讨论请采用回复功能。