实现流程

在C#中设置好存储粒子信息的数组（ParticleArray），并使用DrawMeshInstancedIndirect对粒子（cube）进行实例化操作。
在ComputeShader中对传入的ParticleArray中的数据进行计算（主要是对位置进行偏移计算）。
在Shader中根据ParticleArray中的数据对粒子（cube）的顶点进行偏移，并运用数据中的uv对贴图进行采样渲染。

粒子数据结构体

Particle Info Struct

/// <summary>
/// Per-particle record stored in the GPU particle buffer. The HLSL <c>Particle</c>
/// struct declares the same three fields in the same order, so the two must be kept in sync.
/// </summary>
private struct Particle
{
    /// <summary>Current position; rewritten by the compute kernel and read by the vertex shader.</summary>
    public Vector3 position;
    /// <summary>Position the compute kernel pulls <see cref="position"/> toward in its morph branch.</summary>
    public Vector3 customPosition;
    /// <summary>Texture coordinate the vertex shader forwards for texture sampling.</summary>
    public Vector2 uv;
}

初始化粒子数组

Initialize Particle Array

// Create one particle per texel of the source texture.
m_Width = texture.width;
m_Height = texture.height;
m_Amount = m_Width * m_Height;

// Dividing by (size - 1) maps the first texel to 0 and the last texel to 1.
// The divisor is clamped to 1 so that a texture that is only one texel wide or
// tall yields 0 instead of 0 / 0 = NaN positions and uvs.
float maxColumn = Mathf.Max(m_Width - 1, 1);
float maxRow = Mathf.Max(m_Height - 1, 1);

Particle[] particleArray = new Particle[m_Amount];
for (int i = 0; i < m_Width; i++)
{
    float x = i / maxColumn;
    for (int j = 0; j < m_Height; j++)
    {
        float y = j / maxRow;

        // All texels of column i are stored contiguously.
        int id = i * m_Height + j;

        // Shift the [0, 1] grid so that it is centred on the local origin.
        particleArray[id].position = new Vector3(x - 0.5f, y - 0.5f, 0);
        // The particle starts on its own target position.
        particleArray[id].customPosition = particleArray[id].position;
        particleArray[id].uv = new Vector2(x, y);
    }
}

// Upload the initial state; the stride is the managed size of the Particle struct.
m_ParticleBuffer = new ComputeBuffer(m_Amount, Marshal.SizeOf(typeof(Particle)));
m_ParticleBuffer.SetData(particleArray);

// Bind the same buffer to the compute kernel (which updates it) and to the
// material (which reads it while rendering).
m_KernelID = computeShader.FindKernel("CSMain");
computeShader.SetBuffer(m_KernelID, k_ParticleBuffer, m_ParticleBuffer);
material.SetBuffer(k_ParticleBuffer, m_ParticleBuffer);

初始化粒子实例数据

Initialize Particle Instance Info

// Gather the five values of the indirect draw-argument buffer for sub-mesh 0.
uint indexCountPerInstance = particleMesh.GetIndexCount(0);
uint instanceCount = (uint)m_Amount;
uint startIndexLocation = particleMesh.GetIndexStart(0);
uint baseVertexLocation = particleMesh.GetBaseVertex(0);
uint startInstanceLocation = 0;

// Order: index count per instance, instance count, start index,
// base vertex, start instance.
uint[] args =
{
    indexCountPerInstance,
    instanceCount,
    startIndexLocation,
    baseVertexLocation,
    startInstanceLocation
};

// A single element holding all five uints, flagged as indirect arguments.
int argsStride = args.Length * sizeof(uint);
m_ArgsBuffer = new ComputeBuffer(1, argsStride, ComputeBufferType.IndirectArguments);
m_ArgsBuffer.SetData(args);

执行CS并实例化粒子对象

Dispatch CS && Draw Instance

// Run the particle kernel before issuing the draw that reads the same buffer.
computeShader.Dispatch(m_KernelID, m_ThreadGroupCount, 1, 1);

// The vertex shader multiplies this matrix with each particle's own scale/offset matrix.
material.SetMatrix(k_LocalToWorldMatrix, transform.localToWorldMatrix);

// NOTE(review): these bounds are fixed at the origin with unit size, while the
// shader moves the instances by transform.localToWorldMatrix and the kernel
// displaces them over time - confirm the draw is not culled when the object
// or the particles leave this volume.
Bounds drawBounds = new Bounds(Vector3.zero, Vector3.one);

// Draw every particle instance of sub-mesh 0 with one indirect call.
Graphics.DrawMeshInstancedIndirect(particleMesh, 0, material, drawBounds, m_ArgsBuffer);

在多线程中移动粒子

ComputeShader

// Declares CSMain as a compute kernel; the C# side looks it up by this name.
#pragma kernel CSMain

// Per-particle record. The C# Particle struct declares the same fields in the
// same order, so the two definitions must be kept in sync.
struct Particle
{
    // Current position, updated by CSMain.
    float3 position;
    // Target that position is pulled toward in the morph branch of CSMain.
    float3 customPosition;
    // Texture coordinate; not touched by CSMain.
    float2 uv;
};
// Read-write view of the particle buffer, indexed by dispatch thread id in CSMain.
RWStructuredBuffer<Particle> _ParticleBuffer;

// Moves one particle per thread. While _Time is below _MorphTime the particle
// eases toward customPosition; after that, while _Time is below _CurlTime, it is
// displaced by curlNoise; otherwise it is left untouched.
// NOTE(review): id.x is not checked against the particle count - confirm that
// thread groups * 256 never exceeds the length of _ParticleBuffer.
[numthreads(256, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    const uint index = id.x;
    const float3 current = _ParticleBuffer[index].position;

    if (_Time < _MorphTime)
    {
        // Cover 10% of the remaining distance to the target on every dispatch.
        const float3 toTarget = _ParticleBuffer[index].customPosition - current;
        _ParticleBuffer[index].position = current + toTarget * 0.1;
    }
    else if (_Time < _CurlTime)
    {
        // Step along the noise vector sampled at the current position.
        _ParticleBuffer[index].position = current + curlNoise(current) * 0.05;
    }
}

渲染粒子实例

Shader

// Builds the matrix for one particle: a uniform scale by _Size with pos as the
// translation column, pre-multiplied by _LocalToWorldMatrix.
float4x4 GetObjectToWorldMatrix(float3 pos)
{
    // Filled row by row: scale on the diagonal, translation in the last column.
    float4x4 particleMatrix;
    particleMatrix[0] = float4(_Size, 0, 0, pos.x);
    particleMatrix[1] = float4(0, _Size, 0, pos.y);
    particleMatrix[2] = float4(0, 0, _Size, pos.z);
    particleMatrix[3] = float4(0, 0, 0, 1);

    return mul(_LocalToWorldMatrix, particleMatrix);
}

// Vertex stage: looks up the particle for this instance, places the mesh vertex
// with the particle's matrix and passes the particle's uv on to the fragment stage.
v2f vert (appdata v, uint instanceID : SV_InstanceID)
{
    // One buffer fetch provides both the position and the uv.
    Particle particle = _ParticleBuffer[instanceID];
    float4 positionWS = mul(GetObjectToWorldMatrix(particle.position), v.vertex);

    v2f o;
    o.pos = TransformWorldToHClip(positionWS);
    o.uv = particle.uv;
    return o;
}

// Fragment stage: samples both textures at the particle's uv and blends from
// the origin texture to the target texture by _Lerp.
half4 frag (v2f i) : SV_Target
{
    half4 originColor = tex2D(_OriginTex, i.uv);
    half4 targetColor = tex2D(_TargetTex, i.uv);
    return lerp(originColor, targetColor, _Lerp);
}

附 ComputeShader简述

Compute Shader简称cs，随DX11（Shader Model 5.0）正式引入；DX10级别的硬件只能通过cs_4_x获得受限的支持。
cs可以做通用计算，在GPU上执行主要的计算过程，最终再将结果传递给CPU，这类非图形计算称为GPGPU。
cs虽然不在渲染流水线中，但它支持读写GPU资源，可以将运行结果直接传递到渲染管线，减少了从显存到内存的时间开销。
基本概念：
- GroupID：线程组ID
- GroupThreadID：组内线程ID
- DispatchThreadID：线程全局ID
- DispatchThreadID = GroupID * numthreads + GroupThreadID
- warp：GPU调度的基本单元
- StructuredBuffer<类型>：只读的结构化缓冲区
- RWStructuredBuffer<类型>：可读写的结构化缓冲区

TODO

curlNoise函数还没来得及深究，后续再看看相关的算法。