Performance improvements using alignment and pipelining

Hi Egor,

I wrote a blog post about [alignment and pipelining](https://mijailovic.net/2018/07/20/alignment-and-pipelining/), which you could use to further boost the performance of most examples in your repository. In essence, you can transform this code:

```csharp
public static int Sum(int[] source)
{
  const int VectorSizeInInts = 8;

  fixed (int* ptr = &source[0])
  {
    var pos = 0;
    var sum = Avx.SetZeroVector256<int>();

    for (; pos <= source.Length - VectorSizeInInts; pos += VectorSizeInInts)
    {
      var current = Avx.LoadVector256(ptr + pos);
      sum = Avx2.Add(current, sum);
    }

    var temp = stackalloc int[VectorSizeInInts];
    Avx.Store(temp, sum);

    var final = Sum(temp, VectorSizeInInts);
    final += Sum(ptr + pos, source.Length - pos);

    return final;
  }
}
```

Into this:

```csharp
public static int SumAlignedPipelined(int[] source)
{
  const ulong AlignmentMask = 31UL;
  const int VectorSizeInInts = 8;
  const int BlockSizeInInts = 32;

  fixed (int* ptr = &source[0])
  {
    var aligned = (int*)(((ulong)ptr + AlignmentMask) & ~AlignmentMask);
    var pos = (int)(aligned - ptr);
    var sum = Avx.SetZeroVector256<int>();
    var final = Sum(ptr, pos);

    for (; pos <= source.Length - BlockSizeInInts; pos += BlockSizeInInts)
    {
      var block0 = Avx.LoadAlignedVector256(ptr + pos + 0 * VectorSizeInInts);
      var block1 = Avx.LoadAlignedVector256(ptr + pos + 1 * VectorSizeInInts);
      var block2 = Avx.LoadAlignedVector256(ptr + pos + 2 * VectorSizeInInts);
      var block3 = Avx.LoadAlignedVector256(ptr + pos + 3 * VectorSizeInInts);

      sum = Avx2.Add(block0, sum);
      sum = Avx2.Add(block1, sum);
      sum = Avx2.Add(block2, sum);
      sum = Avx2.Add(block3, sum);
    }

    for (; pos <= source.Length - VectorSizeInInts; pos += VectorSizeInInts)
    {
      var current = Avx.LoadAlignedVector256(ptr + pos);
      sum = Avx2.Add(current, sum);
    }

    var temp = stackalloc int[VectorSizeInInts];
    Avx.Store(temp, sum);

    final += Sum(temp, VectorSizeInInts);
    final += Sum(ptr + pos, source.Length - pos);

    return final;
  }
}
```

On my machine this results in 27% boost in performance when working with aligned arrays, and 34% on unaligned arrays (some arrays in your benchmarks are aligned, and some are not, which can results in around 10% difference in performance).

I don't have the time to send you a pull request, but I though you might be interested in this.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Performance improvements using alignment and pipelining #4

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Performance improvements using alignment and pipelining #4

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions