所需环境:
GPU寄存器:
除了StructuredBuffer和RWStructuredBuffer以外,还有ByteAddressBuffer和RWByteAddressBuffer,它们使用比较麻烦。
GPU线程模型:
4个线程组,每个线程组4个线程,计算1+2+3+4=10。
<code class="language-cs">using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using SharpDX;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using SharpDX.D3DCompiler;
using D3D11 = SharpDX.Direct3D11;
using DXGI = SharpDX.DXGI;

// Assembly references:
//   SharpDX.dll
//   SharpDX.Direct3D11.dll
//   SharpDX.DXGI.dll
//   SharpDX.D3DCompiler.dll
// Additional native DLLs:
//   d3dx11_43.dll     (DirectX Redist June 2010)
//   d3dcompiler_43.dll (DirectX Redist June 2010)
//   d3dcsx_43.dll     (DirectX Redist June 2010)

namespace dcomputecsharp
{
    /// <summary>
    /// Minimal DirectCompute example: uploads four constants (1, 2, 3, 4) to a
    /// constant buffer, dispatches 4 thread groups of 4 threads each, lets every
    /// thread write a + b + c + d into its own slot of a 16-element structured
    /// buffer, then copies the buffer back to the CPU and prints it.
    /// </summary>
    class Program
    {
        // HLSL source of the compute shader. The text is compiled at run time, so
        // every line break matters: a "//" comment runs to the end of its line.
        static string CS = @"
// 要运行的GPU程序

// 常量缓冲区(必须为16的倍数)
cbuffer CB : register(b0)
{
    unsigned int a;
    unsigned int b;
    unsigned int c;
    unsigned int d;
};

// u0对应UnorderedAccessView
RWStructuredBuffer<unsigned int> Data : register(u0);

// 主程序(注意cs_4_0只支持M,N,1,只有cs_5_0才支持M,N,P)
[numthreads(4, 1, 1)]
void main(uint3 Gid : SV_GroupID,            // 组别ID(Dispatch函数三个参数)
          uint3 DTid : SV_DispatchThreadID,  // 总ID
          uint3 GTid : SV_GroupThreadID,     // 组内线程ID(numthreads属性三个参数)
          uint GI : SV_GroupIndex)           // 组内序号
{
    Data[DTid.x] = a + b + c + d;
}
";

        static Device dev;
        static DeviceContext immctx;
        static CompilationResult cs_main_comp;
        static ComputeShader cs_main;
        static D3D11.Buffer constbuf;
        static D3D11.Buffer gpubuf1;
        static ShaderResourceView gpubuf1_srv;
        static UnorderedAccessView gpubuf1_uav;
        static D3D11.Buffer cpubuf;

        // CPU-side mirror of the shader's "CB" constant buffer (4 x uint = 16 bytes).
        struct ConstBuffer
        {
            public uint a;
            public uint b;
            public uint c;
            public uint d;
        }

        /// <summary>
        /// Creates the device, shader and buffers, runs <see cref="DoCompute"/>,
        /// and disposes every created object in reverse order of creation.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                // Create the device and fetch its immediate context.
                //   DriverType.Hardware : run on the GPU
                //   DriverType.Warp     : run on the CPU (Windows 7 needs KB2670838 for WARP11)
                dev = new Device(DriverType.Hardware, DeviceCreationFlags.None,
                    FeatureLevel.Level_11_0, FeatureLevel.Level_10_1, FeatureLevel.Level_10_0);
                immctx = dev.ImmediateContext;

                // Bail out early when Compute Shader 4.0 is not available.
                if (!dev.CheckFeatureSupport(Feature.D3D10XHardwareOptions))
                {
                    Console.WriteLine("No support for compute shaders.");
                    return;
                }

                // Compile the HLSL source and create the compute shader object.
                cs_main_comp = ShaderBytecode.Compile(CS, "main", "cs_4_0");
                cs_main = new ComputeShader(dev, cs_main_comp.Bytecode, null);

                // Constant buffer; its size must be a multiple of 16 bytes.
                constbuf = new D3D11.Buffer(dev, 16, ResourceUsage.Default,
                    BindFlags.ConstantBuffer, CpuAccessFlags.None, ResourceOptionFlags.None, 0);

                // GPU-side structured buffer holding 16 uint elements.
                gpubuf1 = new D3D11.Buffer(dev, sizeof(uint) * 16, ResourceUsage.Default,
                    BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None,
                    ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Shader resource view over the GPU buffer.
                ShaderResourceViewDescription srvdesc = new ShaderResourceViewDescription();
                srvdesc.Format = DXGI.Format.Unknown;
                srvdesc.Dimension = ShaderResourceViewDimension.Buffer;
                srvdesc.Buffer.ElementCount = 16;
                gpubuf1_srv = new ShaderResourceView(dev, gpubuf1, srvdesc);

                // Unordered access view over the GPU buffer (bound to register u0).
                UnorderedAccessViewDescription uavdesc = new UnorderedAccessViewDescription();
                uavdesc.Format = DXGI.Format.Unknown;
                uavdesc.Dimension = UnorderedAccessViewDimension.Buffer;
                uavdesc.Buffer.ElementCount = 16;
                gpubuf1_uav = new UnorderedAccessView(dev, gpubuf1, uavdesc);

                // Staging buffer used to read the results back on the CPU.
                cpubuf = new D3D11.Buffer(dev, sizeof(uint) * 16, ResourceUsage.Staging,
                    BindFlags.None, CpuAccessFlags.Read,
                    ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Run the computation.
                DoCompute();
            }
            finally
            {
                // Release everything that was created, newest first. Fields are
                // still null when creation failed part-way through.
                if (cpubuf != null) { cpubuf.Dispose(); }
                if (gpubuf1_uav != null) { gpubuf1_uav.Dispose(); }
                if (gpubuf1_srv != null) { gpubuf1_srv.Dispose(); }
                if (gpubuf1 != null) { gpubuf1.Dispose(); }
                if (constbuf != null) { constbuf.Dispose(); }
                if (cs_main != null) { cs_main.Dispose(); }
                if (cs_main_comp != null) { cs_main_comp.Dispose(); }
                if (immctx != null) { immctx.Dispose(); }
                if (dev != null) { dev.Dispose(); }
            }
        }

        /// <summary>
        /// Uploads the constants and a zeroed input array, dispatches the shader,
        /// copies the result into the staging buffer and prints the 16 values.
        /// </summary>
        private static void DoCompute()
        {
            // Set the shader constants.
            ConstBuffer cb = new ConstBuffer() { a = 1, b = 2, c = 3, d = 4 };
            immctx.UpdateSubresource(ref cb, constbuf);
            immctx.ComputeShader.SetConstantBuffer(0, constbuf);

            // Upload the (all-zero) initial data and bind the UAV to slot 0.
            uint[] buf = new uint[16];
            immctx.UpdateSubresource(buf, gpubuf1);
            immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);

            // Run the shader: 4 groups x numthreads(4, 1, 1) = 16 threads.
            immctx.ComputeShader.SetShader(cs_main, null, 0);
            immctx.Dispatch(4, 1, 1);

            // Download the results through the staging buffer.
            immctx.CopyResource(gpubuf1, cpubuf);
            DataStream ds;
            immctx.MapSubresource(cpubuf, 0, MapMode.Read, MapFlags.None, out ds);
            uint[] outbuf;
            try
            {
                outbuf = ds.ReadRange<uint>(16);
            }
            finally
            {
                // Always release the stream and unmap, even if the read throws.
                ds.Dispose();
                immctx.UnmapSubresource(cpubuf, 0);
            }

            // Print the results.
            for (int i = 0; i < 16; i++)
            {
                Console.Write("{0} ", outbuf[i]);
            }
            Console.WriteLine();
        }
    }
}
</code>
双调排序算法(一种可并行的排序算法),微软官方示例的SharpDX移植版本(原版使用C++)。
<code class="language-cs">using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using SharpDX;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using SharpDX.D3DCompiler;
using D3D11 = SharpDX.Direct3D11;
using DXGI = SharpDX.DXGI;

// Assembly references:
//   SharpDX.dll
//   SharpDX.Direct3D11.dll
//   SharpDX.DXGI.dll
//   SharpDX.D3DCompiler.dll
// Additional native DLLs:
//   d3dx11_43.dll     (DirectX Redist June 2010)
//   d3dcompiler_43.dll (DirectX Redist June 2010)
//   d3dcsx_43.dll     (DirectX Redist June 2010)

namespace dcomputecsharp
{
    /// <summary>
    /// SharpDX port of a GPU bitonic sort: sorts NUM_ELEMENTS random uint values
    /// with two compute shaders (BitonicSort and MatrixTranspose), then sorts the
    /// same data on the CPU with Array.Sort and reports whether both results match.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the published listing was damaged by HTML extraction from the
    /// first "&lt;=" in the shader onwards (lower-cased identifiers, dropped
    /// operators and repeated tokens). This text is a reconstruction from the
    /// surviving tokens and the structure of the original C++ sample; verify it
    /// against the author's original before relying on it.
    /// </remarks>
    class Program
    {
        // HLSL source holding both compute shaders. The text is compiled at run
        // time, so every line break matters: a "//" comment runs to end of line.
        static string CS = @"
//--------------------------------------------------------------------------------------
// 块大小定义
//--------------------------------------------------------------------------------------
#define BITONIC_BLOCK_SIZE 512
#define TRANSPOSE_BLOCK_SIZE 16

//--------------------------------------------------------------------------------------
// 常量缓冲区
//--------------------------------------------------------------------------------------
// b# 寄存器表示ConstantBuffer(常量缓冲区)
cbuffer CB : register( b0 )
{
    unsigned int g_iLevel;
    unsigned int g_iLevelMask;
    unsigned int g_iWidth;
    unsigned int g_iHeight;
};

//--------------------------------------------------------------------------------------
// 结构化缓冲区
//--------------------------------------------------------------------------------------
// t# 寄存器表示ShaderResourceView(Shader资源视图)
// u# 寄存器表示UnorderedAccessView(乱序访问视图)
StructuredBuffer<unsigned int> Input : register( t0 );
RWStructuredBuffer<unsigned int> Data : register( u0 );

//--------------------------------------------------------------------------------------
// 双调排序GPU程序(Compute Shader)
//--------------------------------------------------------------------------------------
groupshared unsigned int shared_data[BITONIC_BLOCK_SIZE]; // 组内共享的内部数据

[numthreads(BITONIC_BLOCK_SIZE, 1, 1)] // 组内线程数X*Y*Z,其中cs_4_0中Z必须为1,cs_5_0没有这个限制
void BitonicSort( uint3 Gid : SV_GroupID,            // 组ID
                  uint3 DTid : SV_DispatchThreadID,  // 总线程ID
                  uint3 GTid : SV_GroupThreadID,     // 组内线程ID
                  uint GI : SV_GroupIndex )          // 组内线程序号
{
    // 从乱序访问视图加载组内共享的内部数据
    shared_data[GI] = Data[DTid.x];
    GroupMemoryBarrierWithGroupSync(); // 等待组内所有共享数据访问结束,且所有程序均到达此调用

    // 对组内共享的内部数据进行排序
    for (unsigned int j = g_iLevel >> 1 ; j > 0 ; j >>= 1)
    {
        unsigned int result = ((shared_data[GI & ~j] <= shared_data[GI | j]) == (bool)(g_iLevelMask & DTid.x))? shared_data[GI ^ j] : shared_data[GI];
        GroupMemoryBarrierWithGroupSync();
        shared_data[GI] = result;
        GroupMemoryBarrierWithGroupSync();
    }

    // 将组内共享的内部数据存回乱序访问视图
    Data[DTid.x] = shared_data[GI];
}

//--------------------------------------------------------------------------------------
// 矩阵转置GPU程序(Compute Shader)
//--------------------------------------------------------------------------------------
groupshared unsigned int transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];

[numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
void MatrixTranspose( uint3 Gid : SV_GroupID,
                      uint3 DTid : SV_DispatchThreadID,
                      uint3 GTid : SV_GroupThreadID,
                      uint GI : SV_GroupIndex )
{
    transpose_shared_data[GI] = Input[DTid.y * g_iWidth + DTid.x];
    GroupMemoryBarrierWithGroupSync();
    uint2 XY = DTid.yx - GTid.yx + GTid.xy;
    Data[XY.y * g_iHeight + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];
}
";

        static Device dev;
        static DeviceContext immctx;
        static CompilationResult cs_sort_comp;
        static ComputeShader cs_sort;
        static CompilationResult cs_transpose_comp;
        static ComputeShader cs_transpose;
        static D3D11.Buffer constbuf;
        static D3D11.Buffer gpubuf1;
        static ShaderResourceView gpubuf1_srv;
        static UnorderedAccessView gpubuf1_uav;
        static D3D11.Buffer gpubuf2;
        static ShaderResourceView gpubuf2_srv;
        static UnorderedAccessView gpubuf2_uav;
        static D3D11.Buffer cpubuf;

        // CPU-side mirror of the shader's "CB" constant buffer (4 x uint = 16 bytes).
        struct ConstBuffer
        {
            public uint iLevel;
            public uint iLevelMask;
            public uint iWidth;
            public uint iHeight;
        }

        // Problem size. The two block sizes must match the #defines in the shader.
        const uint NUM_ELEMENTS = 512 * 512;
        const uint BITONIC_BLOCK_SIZE = 512;
        const uint TRANSPOSE_BLOCK_SIZE = 16;
        const uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
        const uint MATRIX_HEIGHT = NUM_ELEMENTS / BITONIC_BLOCK_SIZE;

        /// <summary>
        /// Creates the device, both shaders and all buffers/views, runs
        /// <see cref="DoCompute"/>, and disposes every created object.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                // Create the device and fetch its immediate context.
                //   DriverType.Hardware : run on the GPU
                //   DriverType.Warp     : run on the CPU (Windows 7 needs KB2670838 for WARP11)
                dev = new Device(DriverType.Hardware, DeviceCreationFlags.None,
                    FeatureLevel.Level_11_0, FeatureLevel.Level_10_1, FeatureLevel.Level_10_0);
                immctx = dev.ImmediateContext;

                // Bail out early when Compute Shader 4.0 is not available.
                if (!dev.CheckFeatureSupport(Feature.D3D10XHardwareOptions))
                {
                    Console.WriteLine("No support for compute shaders.");
                    return;
                }

                // Compile both entry points and create the compute shader objects.
                cs_sort_comp = ShaderBytecode.Compile(CS, "BitonicSort", "cs_4_0");
                cs_sort = new ComputeShader(dev, cs_sort_comp.Bytecode, null);
                cs_transpose_comp = ShaderBytecode.Compile(CS, "MatrixTranspose", "cs_4_0");
                cs_transpose = new ComputeShader(dev, cs_transpose_comp.Bytecode, null);

                // Constant buffer; its size must be a multiple of 16 bytes.
                constbuf = new D3D11.Buffer(dev, 16, ResourceUsage.Default,
                    BindFlags.ConstantBuffer, CpuAccessFlags.None, ResourceOptionFlags.None, 0);

                // Two GPU-side structured buffers (data and transpose scratch).
                gpubuf1 = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Default,
                    BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None,
                    ResourceOptionFlags.BufferStructured, sizeof(uint));
                gpubuf2 = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Default,
                    BindFlags.ShaderResource | BindFlags.UnorderedAccess, CpuAccessFlags.None,
                    ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Shader resource views over both GPU buffers.
                ShaderResourceViewDescription srvdesc = new ShaderResourceViewDescription();
                srvdesc.Format = DXGI.Format.Unknown;
                srvdesc.Dimension = ShaderResourceViewDimension.Buffer;
                srvdesc.Buffer.ElementCount = (int)NUM_ELEMENTS;
                gpubuf1_srv = new ShaderResourceView(dev, gpubuf1, srvdesc);
                gpubuf2_srv = new ShaderResourceView(dev, gpubuf2, srvdesc);

                // Unordered access views over both GPU buffers.
                UnorderedAccessViewDescription uavdesc = new UnorderedAccessViewDescription();
                uavdesc.Format = DXGI.Format.Unknown;
                uavdesc.Dimension = UnorderedAccessViewDimension.Buffer;
                uavdesc.Buffer.ElementCount = (int)NUM_ELEMENTS;
                gpubuf1_uav = new UnorderedAccessView(dev, gpubuf1, uavdesc);
                gpubuf2_uav = new UnorderedAccessView(dev, gpubuf2, uavdesc);

                // Staging buffer used to read the results back on the CPU.
                cpubuf = new D3D11.Buffer(dev, sizeof(uint) * (int)NUM_ELEMENTS, ResourceUsage.Staging,
                    BindFlags.None, CpuAccessFlags.Read,
                    ResourceOptionFlags.BufferStructured, sizeof(uint));

                // Run the computation.
                DoCompute();
            }
            finally
            {
                // Release everything that was created, newest first. Fields are
                // still null when creation failed part-way through.
                if (cpubuf != null) { cpubuf.Dispose(); }
                if (gpubuf2_uav != null) { gpubuf2_uav.Dispose(); }
                if (gpubuf2_srv != null) { gpubuf2_srv.Dispose(); }
                if (gpubuf2 != null) { gpubuf2.Dispose(); }
                if (gpubuf1_uav != null) { gpubuf1_uav.Dispose(); }
                if (gpubuf1_srv != null) { gpubuf1_srv.Dispose(); }
                if (gpubuf1 != null) { gpubuf1.Dispose(); }
                if (constbuf != null) { constbuf.Dispose(); }
                if (cs_transpose != null) { cs_transpose.Dispose(); }
                if (cs_transpose_comp != null) { cs_transpose_comp.Dispose(); }
                if (cs_sort != null) { cs_sort.Dispose(); }
                if (cs_sort_comp != null) { cs_sort_comp.Dispose(); }
                if (immctx != null) { immctx.Dispose(); }
                if (dev != null) { dev.Dispose(); }
            }
        }

        /// <summary>
        /// Writes the four shader constants into the constant buffer and binds it
        /// to compute-shader slot 0.
        /// </summary>
        /// <param name="iLevel">Value for g_iLevel (start of the inner sort loop).</param>
        /// <param name="iLevelMask">Value for g_iLevelMask (selects sort direction per thread).</param>
        /// <param name="iWidth">Value for g_iWidth (row stride read by MatrixTranspose).</param>
        /// <param name="iHeight">Value for g_iHeight (row stride written by MatrixTranspose).</param>
        private static void SetConstants(uint iLevel, uint iLevelMask, uint iWidth, uint iHeight)
        {
            ConstBuffer cb = new ConstBuffer()
            {
                iLevel = iLevel,
                iLevelMask = iLevelMask,
                iWidth = iWidth,
                iHeight = iHeight
            };
            immctx.UpdateSubresource(ref cb, constbuf);
            immctx.ComputeShader.SetConstantBuffer(0, constbuf);
        }

        /// <summary>
        /// Generates random data, sorts it on the GPU, reads it back, sorts the
        /// same data on the CPU and prints whether the two results are identical.
        /// </summary>
        private static void DoCompute()
        {
            Console.WriteLine("Generating random data");

            // Generate the random input.
            uint[] buf = new uint[NUM_ELEMENTS];
            Random rand = new Random(Environment.TickCount);
            for (int i = 0; i < NUM_ELEMENTS; i++)
            {
                buf[i] = (uint)rand.NextLong();
            }

            Console.WriteLine("Random data generated");
            Console.WriteLine("GPU sorting begins");

            // Upload the data and bind buffer 1 as the read/write target.
            immctx.UpdateSubresource(buf, gpubuf1);
            immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);

            // Sort the data.
            // First sort the rows for every level not larger than the block size.
            for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level = level * 2)
            {
                SetConstants(level, level, MATRIX_HEIGHT, MATRIX_WIDTH);

                // Sort the row data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
            }

            // Then sort rows and columns for the levels larger than the block size:
            // transpose, sort the columns, transpose back, sort the rows.
            for (uint level = (BITONIC_BLOCK_SIZE * 2); level <= NUM_ELEMENTS; level = level * 2)
            {
                SetConstants((level / BITONIC_BLOCK_SIZE), (level & ~NUM_ELEMENTS) / BITONIC_BLOCK_SIZE, MATRIX_WIDTH, MATRIX_HEIGHT);

                // Transpose the data from buffer 1 into buffer 2. The SRV slot is
                // cleared first so a buffer is never bound as SRV and UAV at once.
                immctx.ComputeShader.SetShaderResource(0, null);
                immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf2_uav);
                immctx.ComputeShader.SetShaderResource(0, gpubuf1_srv);
                immctx.ComputeShader.SetShader(cs_transpose, null, 0);
                immctx.Dispatch((int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE), (int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE), 1);

                // Sort the transposed column data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);

                SetConstants(BITONIC_BLOCK_SIZE, level, MATRIX_HEIGHT, MATRIX_WIDTH);

                // Transpose the data from buffer 2 back into buffer 1.
                immctx.ComputeShader.SetShaderResource(0, null);
                immctx.ComputeShader.SetUnorderedAccessView(0, gpubuf1_uav);
                immctx.ComputeShader.SetShaderResource(0, gpubuf2_srv);
                immctx.ComputeShader.SetShader(cs_transpose, null, 0);
                immctx.Dispatch((int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE), (int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE), 1);

                // Sort the row data.
                immctx.ComputeShader.SetShader(cs_sort, null, 0);
                immctx.Dispatch((int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
            }

            // Download the results through the staging buffer.
            immctx.CopyResource(gpubuf1, cpubuf);
            DataStream ds;
            immctx.MapSubresource(cpubuf, 0, MapMode.Read, MapFlags.None, out ds);
            uint[] outbuf;
            try
            {
                outbuf = ds.ReadRange<uint>((int)NUM_ELEMENTS);
            }
            finally
            {
                // Always release the stream and unmap, even if the read throws.
                ds.Dispose();
                immctx.UnmapSubresource(cpubuf, 0);
            }

            Console.WriteLine("GPU sorting ends");
            Console.WriteLine("CPU sorting begins");

            // CPU reference sort (the original author notes Array.Sort is fast
            // enough here that the difference to the GPU sort is barely visible).
            Array.Sort(buf);

            Console.WriteLine("CPU sorting ends");
            Console.WriteLine("Comparing");

            // Check that both results are identical.
            bool same = true;
            for (int i = 0; i < NUM_ELEMENTS; i++)
            {
                if (buf[i] != outbuf[i])
                {
                    same = false;
                    break;
                }
            }
            Console.WriteLine("Result: {0}", same);
        }
    }
}
</code>
[修改于 7年7个月前 - 2016/11/06 20:05:40]
时段 | 个数 |
---|---|
{{f.startingTime}}点 - {{f.endTime}}点 | {{f.fileCount}} |