diff --git a/README.md b/README.md index 19fb4b6..e39e063 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ All results are obtained from the same toaster, with the same load, so compariso Tested frameworks: - [Arch](https://github.com/genaray/Arch) - [DefaultEcs](https://github.com/Doraku/DefaultEcs) -- [Fennecs](https://github.com/thygrrr/fennecs) +- [**fenn**ecs](https://fennecs.tech) - [Flecs.Net](https://github.com/BeanCheeseBurrito/Flecs.NET) - [Friflo.Engine.ECS](https://github.com/friflo/Friflo.Json.Fliox/blob/main/Engine/README.md) - [Leopotam.Ecs](https://github.com/Leopotam/ecs) using what I believe is a nuget package not made by the actual author and compiled in debug... diff --git a/source/Ecs.CSharp.Benchmark/Capabilities.cs b/source/Ecs.CSharp.Benchmark/Capabilities.cs new file mode 100644 index 0000000..3a2b46c --- /dev/null +++ b/source/Ecs.CSharp.Benchmark/Capabilities.cs @@ -0,0 +1,69 @@ +using BenchmarkDotNet.Configs; + +namespace Ecs.CSharp.Benchmark +{ + /// + /// Capability Requirements for specific tests. + /// Add your own intrinsics or other system dependencies here. + /// + /// + /// Usage: Add a category to it and apply exclusions in the ApplyExclusions method. + /// (this is an EXCLUSIVE category filter, it turns OFF all categories it matches) + /// Then, set your own BenchmarkCategory to include the CapabilityCategory string. + /// + /// + /// + /// [BenchmarkCategory( + /// Categories.Fennecs, + /// Capabilities.Avx2 + /// )] + /// public void Raw_AVX2() + /// + /// + internal static class Capabilities + { + // These are common vectorized instruction set categories. + // x86/x64 + public const string Avx2 = nameof(System.Runtime.Intrinsics.X86.Avx2); + public const string Avx = nameof(System.Runtime.Intrinsics.X86.Avx); + public const string Sse3 = nameof(System.Runtime.Intrinsics.X86.Sse3); + public const string Sse2 = nameof(System.Runtime.Intrinsics.X86.Sse2); + + // Arm + public const string AdvSimd = nameof(System.Runtime.Intrinsics.Arm.AdvSimd); + + /// + /// This applies capability-based exclusions as filters to the config. + /// + /// a Benchmark Config, e.g. as used in Program.cs + public static IConfig WithCapabilityExclusions(this IConfig self) + { + if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Avx2)); + } + + if (!System.Runtime.Intrinsics.X86.Avx.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Avx)); + } + + if (!System.Runtime.Intrinsics.X86.Sse3.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Sse3)); + } + + if (!System.Runtime.Intrinsics.X86.Sse2.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Sse2)); + } + + if (!System.Runtime.Intrinsics.Arm.AdvSimd.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(AdvSimd)); + } + + return self; + } + } +} diff --git a/source/Ecs.CSharp.Benchmark/Categories.cs b/source/Ecs.CSharp.Benchmark/Categories.cs index 2e0e883..2812643 100644 --- a/source/Ecs.CSharp.Benchmark/Categories.cs +++ b/source/Ecs.CSharp.Benchmark/Categories.cs @@ -1,5 +1,13 @@ -namespace Ecs.CSharp.Benchmark +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using BenchmarkDotNet.Filters; +using BenchmarkDotNet.Running; + +namespace Ecs.CSharp.Benchmark { + /// + /// Prefixes / ECS package names for benchmarks, used as BenchMarkDotNet categories. 
+ /// internal static class Categories { public const string Arch = "Arch"; @@ -14,10 +22,28 @@ internal static class Categories public const string SveltoECS = "Svelto.ECS"; public const string Morpeh = "Morpeh"; public const string FlecsNet = "FlecsNet"; - public const string Fennecs = "Fennecs"; + public const string Fennecs = "fennecs"; public const string TinyEcs = "TinyEcs"; public const string CreateEntity = "CreateEntity"; public const string System = "System"; } + + /// + /// Excludes a given category from benchmarks. + /// (used by Program.cs) + /// + /// + /// When an exclusion is PRESENT, then all benchmarks that HAVE the category will be EXCLUDED. + /// + /// + /// CategoryExclusion("foo") will exclude all benchmarks that have the "foo" category. + /// + public class CategoryExclusion(string category) : IFilter + { + public bool Predicate([NotNull] BenchmarkCase benchmarkCase) + { + return !benchmarkCase.Descriptor.Categories.Contains(category); + } + } } diff --git a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs index cc67db5..8892bd5 100644 --- a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs +++ b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs @@ -1,38 +1,52 @@ using System; +using System.Runtime.CompilerServices; using fennecs; namespace Ecs.CSharp.Benchmark.Contexts { namespace Fennecs_Components { - internal struct Component1 + internal record struct Component1 { + public static implicit operator Component1(int value) => new() { Value = value }; + public static implicit operator Component2(Component1 self) => new() { Value = self.Value }; + public static implicit operator Component3(Component1 self) => new() { Value = self.Value }; + public static implicit operator int (Component1 c) => c.Value; + public int Value; } - internal struct Component2 + internal record struct Component2 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator Component1(Component2 self) => new() { Value = self.Value }; + public static implicit operator Component2(int value) => new() { Value = value }; + public static implicit operator Component3(Component2 self) => new() { Value = self.Value }; + public static implicit operator int (Component2 c) => c.Value; + public int Value; } - internal struct Component3 + internal record struct Component3 { + public static implicit operator Component1(Component3 self) => new() { Value = self.Value }; + public static implicit operator Component2(Component3 self) => new() { Value = self.Value }; + public static implicit operator Component3(int value) => new() { Value = value }; + public static implicit operator int (Component3 c) => c.Value; + public int Value; } } - internal class FennecsBaseContext : IDisposable + internal class FennecsBaseContext(int entityCount) : IDisposable { - public World World { get; } + public World World { get; } = new World(entityCount * 2); - public FennecsBaseContext() - { - World = new World(); - } - - public void Dispose() + public virtual void Dispose() { World.Dispose(); } + public FennecsBaseContext() : this(100000) + { } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs index 0939b51..ae76576 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs @@ -8,17 +8,16 @@ namespace 
Ecs.CSharp.Benchmark public partial class CreateEntityWithOneComponent { [Context] private readonly FennecsBaseContext _fennecs; - + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn().Add(); - } + world.Entity() + .Add(new Component1()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs index 8689522..29d900e 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs @@ -3,26 +3,24 @@ using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class CreateEntityWithThreeComponents { - [Context] - private readonly FennecsBaseContext _fennecs; - + [Context] private readonly FennecsBaseContext _fennecs; + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn() - .Add() - .Add() - .Add(); - } + world.Entity() + .Add(new Component1()) + .Add(new Component2()) + .Add(new Component3()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs index 4c2016d..b12b478 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs @@ -8,18 +8,17 @@ namespace Ecs.CSharp.Benchmark public partial class CreateEntityWithTwoComponents { [Context] private readonly FennecsBaseContext _fennecs; - + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn(). 
- Add().Add(); - } + world.Entity() + .Add(new Component1()) + .Add(new Component2()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj index 5c24029..0b4e018 100644 --- a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj +++ b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj @@ -37,7 +37,7 @@ - + diff --git a/source/Ecs.CSharp.Benchmark/Program.cs b/source/Ecs.CSharp.Benchmark/Program.cs index 1e9cb5a..3a91a4a 100644 --- a/source/Ecs.CSharp.Benchmark/Program.cs +++ b/source/Ecs.CSharp.Benchmark/Program.cs @@ -14,18 +14,26 @@ BenchmarkSwitcher benchmark = BenchmarkSwitcher.FromTypes(new[] { - typeof(CreateEntityWithOneComponent), - typeof(CreateEntityWithTwoComponents), - typeof(CreateEntityWithThreeComponents), - typeof(SystemWithOneComponent), typeof(SystemWithTwoComponents), typeof(SystemWithThreeComponents), - typeof(SystemWithTwoComponentsMultipleComposition) + typeof(SystemWithTwoComponentsMultipleComposition), + + //Moving lighter tests to the back makes the estimated time display more reliable + typeof(CreateEntityWithOneComponent), + typeof(CreateEntityWithTwoComponents), + typeof(CreateEntityWithThreeComponents), }); -IConfig configuration = DefaultConfig.Instance.WithOptions(ConfigOptions.DisableOptimizationsValidator); + +IConfig configuration = DefaultConfig.Instance + .WithOptions(ConfigOptions.DisableOptimizationsValidator) + .WithCapabilityExclusions(); + +#if RANK_RESULTS + configuration = configuration.WithOrderer(new BenchmarkDotNet.Order.DefaultOrderer(BenchmarkDotNet.Order.SummaryOrderPolicy.FastestToSlowest)); +#endif if (args.Length > 0) { diff --git a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs index 3690e37..01080b2 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs @@ -1,23 +1,28 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithOneComponent { [Context] private readonly FennecsContext _fennecs; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + public readonly Stream query; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); + query = World.Stream(); for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) @@ -27,34 +32,159 @@ public FennecsContext(int entityCount, int entityPadding) World.Spawn().Add(); } + + query.Query.Batch(Batch.AddConflict.Replace) + .Add(new Component1 + { + Value = 0 + }) + .Submit(); + } + + public override void Dispose() + { + query.Query.Dispose(); + base.Dispose(); } } [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark (Description = "fennecs(For)")] + public void Fennecs_For() + { + _fennecs.query.For(static (ref Component1 v) => { v.Value++; }); + } + + // Disabled for now. 
+ // This API is available in fennecs 0.3.x and later, but is not optimized yet. + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark (Description = "fennecs(Batch)")] + public void Fennecs_Batch() { - _fennecs.query.For((ref Component1 comp0) => comp0.Value++); + int newValue = _fennecs.query.Query[0].Ref().Value + 1; + _fennecs.query.Blit(newValue); } [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark (Description = "fennecs(Job)")] public void Fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 v) { v.Value++; }, 1024); + _fennecs.query.Job(static (ref Component1 v) => { v.Value++; }); } - - [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark(Description = "fennecs(Blit)")] + public void Fennecs_Raw_Blit() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + // This does exactly what the system does, but it is wholly dependent + // on the precondition of the benchmark. (so... it's taking a shortcut) + // fennecs 0.4.0 or 0.5.0 will provide a literal Blit method that + // works like this for fast updating of large swathes of component + // data. + Component1 newValue = new Component1 + { + // We can safely do this because we will never get called here with + // an empty archetype / zero size memory slab + Value = mem1.Span[0].Value + 1 + }; + mem1.Span.Fill(newValue); + }); + } + + #region Raw Runners + + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void Fennecs_Raw_AVX2() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe + { + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Avx.Store(p1 + i, Avx2.Add(v1, Vector256.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } + } + }); + } + + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void Fennecs_Raw_SSE2() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe + { + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Sse2.Store(p1 + i, Sse2.Add(v1, Vector128.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } + } + }); + } + + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void Fennecs_Raw_AdvSimd() { - _fennecs.query.Raw(delegate(Memory vectors) + _fennecs.query.Raw(delegate(Memory mem1) { - foreach (ref var v in vectors.Span) + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe { - v.Value++; + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + AdvSimd.Store(p1 + i, AdvSimd.Add(v1, Vector128.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } } }); } + + #endregion } } diff --git 
a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs index 170429d..ac73ec9 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithOneComponent))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs index c657a88..75cc219 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs @@ -1,23 +1,28 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithThreeComponents { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + internal readonly Stream stream; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) @@ -38,42 +43,282 @@ public FennecsContext(int entityCount, int entityPadding) } World.Spawn().Add() - .Add(new Component2 { Value = 1 }) - .Add(new Component3 { Value = 1 }); + .Add(new Component2 {Value = 1}) + .Add(new Component3 {Value = 1}); } + + stream = World.Query().Stream(); + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + /// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2, ref Component3 c3) => c1.Value += c2.Value + c3.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1.Value = c1.Value + c2.Value + c3.Value; + }); + } + + + /// + /// Experimental Implicit value type to compare performance in a tight loop. + /// + /// + /// It's very convenient, but is about 20% slower than the For runner with values + /// (it still gets inlined well!) + /// + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark(Description = "fennecs(Implicit)")] + public void fennecs_For_Implicit() + { + Stream.For( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1 = c1 + c2 + c3; + }); } + + /// + /// fennecs Job runners are the most scalable runners. 
+ /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2, ref Component3 c3) { c1.Value += c2.Value + c3.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1.Value = c1.Value + c2.Value + c3.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. + /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. 
+ /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V, Memory c3V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + Span c3S = c3V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + c1S[i].Value = c1S[i].Value + c2S[i].Value + c3S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V, Memory c3V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 v3 = Avx.LoadVector256(p3 + i); + Vector256 sum = Avx2.Add(v1, Avx2.Add(v2, v3)); + + Avx.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i] = p1[i] + p2[i] + p3[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V, Memory c3V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 v3 = Sse2.LoadVector128(p3 + i); + Vector128 sum = Sse2.Add(v1, Sse2.Add(v2, v3)); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i] = p1[i] + p2[i] + p3[i]; + } + } + } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V, Memory c3V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v, Memory c3v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - var c3vs = c3v.Span; - - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 v3 = AdvSimd.LoadVector128(p3 + i); + Vector128 sum = AdvSimd.Add(v1, AdvSimd.Add(v2, v3)); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value + c3vs[i].Value; + p1[i] = p1[i] + p2[i] + p3[i]; } - }); + } } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs index 408108b..e2454c2 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithThreeComponents))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs index 142b48b..5f7f9b6 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs @@ -1,73 +1,300 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable ConvertToCompoundAssignment +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithTwoComponents { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + internal readonly Stream stream; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); + stream = World.Query().Stream(); + for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) { Entity padding = World.Spawn(); - switch (j % 2) + switch (j % 3) { case 0: padding.Add(); break; - case 1: padding.Add(); break; } } - World.Spawn().Add().Add(new Component2 { Value = 1 }); + World.Spawn().Add() + .Add(new Component2 + { + Value = 1 + }); } + + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + 
/// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + /// + /// fennecs Job runners are the most scalable runners. + /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. 
+ /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. + /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. + /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + // Compound Assignment is not as optimized as explicit assignment + c1S[i].Value = c1S[i].Value + c2S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 sum = Avx2.Add(v1, v2); + + Avx.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 sum = Sse2.Add(v1, v2); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value; + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; } - }); + } + } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 sum = AdvSimd.Add(v1, v2); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs index aeba499..b93d115 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponents))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs index 42f8788..82e01e9 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs @@ -1,30 +1,38 @@ using System; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable ConvertToCompoundAssignment + +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithTwoComponentsMultipleComposition { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - private record struct Padding1(); + private struct Padding1; - private record struct Padding2(); + private struct Padding2; - private record struct Padding3(); + private struct Padding3; - private record struct Padding4(); + private struct Padding4; - public Query query; + public readonly Stream stream; - public FennecsContext(int entityCount) + public FennecsContext(int entityCount) : base(entityCount) { - query = World.Query().Build(); for (int i = 0; i < entityCount; ++i) { Entity entity = World.Spawn().Add().Add(new Component2 { Value = 1 }); @@ -44,37 +52,254 @@ public FennecsContext(int entityCount) break; } } + + stream = World.Query().Stream(); + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + /// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. 
+ /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + /// + /// fennecs Job runners are the most scalable runners. + /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. 
+ /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. + /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + // Compound Assignment is not as optimized as explicit assignment + c1S[i].Value = c1S[i].Value + c2S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value; + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 sum = Avx2.Add(v1, v2); + + Avx.Store(p1 + i, sum); } - }); + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 sum = Sse2.Add(v1, v2); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 sum = AdvSimd.Add(v1, v2); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs index e5fa72a..6c661ea 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponentsMultipleComposition))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]
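
Notes on the vectorized Raw workloads: every `Raw_Workload_*` method in this patch uses the same structure — process as many full SIMD lanes as possible, then finish the remainder with scalar code. The sketch below distills that pattern for a plain `int[]`; the `SimdPatternSketch` type and `AddInPlace` method are hypothetical illustrations (not part of the patch), but the intrinsics (`Avx.LoadVector256`, `Avx2.Add`, `Avx.Store`) are the same ones the workloads call, and the code assumes `AllowUnsafeBlocks`, which the benchmark project already enables.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class SimdPatternSketch
{
    // Adds `addend` to every element, mirroring the Raw_Workload_* layout:
    // full 256-bit lanes via AVX2 first, then a scalar loop for the tail.
    public static unsafe void AddInPlace(int[] values, int addend)
    {
        if (!Avx2.IsSupported)
        {
            for (int i = 0; i < values.Length; i++) values[i] += addend;
            return;
        }

        fixed (int* p = values)
        {
            int count = values.Length;
            int vectorSize = Vector256<int>.Count;        // 8 ints per 256-bit register
            int vectorEnd = count - (count % vectorSize); // last index covered by full vectors
            Vector256<int> add = Vector256.Create(addend);

            for (int i = 0; i < vectorEnd; i += vectorSize)
            {
                Vector256<int> v = Avx.LoadVector256(p + i);
                Avx.Store(p + i, Avx2.Add(v, add));
            }

            for (int i = vectorEnd; i < count; i++)       // scalar remainder (at most 7 elements)
            {
                p[i] += addend;
            }
        }
    }
}
```

The `vectorEnd = count - (count % vectorSize)` computation ensures the pointer loop only ever touches complete lanes; the scalar tail handles whatever is left, which is exactly the remainder handling repeated in each Raw workload above.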
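
The patch also adds per-suite categories (e.g. `nameof(SystemWithTwoComponents)`) alongside the existing package categories, and wires the capability exclusions into the config in Program.cs. A hedged sketch of how the pieces compose when selecting a subset of benchmarks — `AllCategoriesFilter` is BenchmarkDotNet's stock inclusive filter, while the exact wiring shown here is illustrative and not part of the patch:

```csharp
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using Ecs.CSharp.Benchmark;

// Illustrative Program.cs-style wiring (assumption, not from the patch): run only the
// fennecs cases of one suite while keeping the hardware-capability exclusions active.
IConfig configuration = DefaultConfig.Instance
    .WithOptions(ConfigOptions.DisableOptimizationsValidator)
    .WithCapabilityExclusions()             // drops e.g. AVX2 benchmarks on CPUs without AVX2
    .AddFilter(new AllCategoriesFilter(new[]
    {
        Categories.Fennecs,                 // "fennecs"
        nameof(SystemWithTwoComponents)     // per-suite category introduced by this change
    }));
```

Because `CategoryExclusion` is exclusive (it removes any case carrying the matched category) and `AllCategoriesFilter` is inclusive, both can coexist in one config: BenchmarkDotNet only runs a case if it passes every registered `IFilter`.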