diff --git a/README.md b/README.md
index 19fb4b6..e39e063 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ All results are obtained from the same toaster, with the same load, so compariso
Tested frameworks:
- [Arch](https://github.com/genaray/Arch)
- [DefaultEcs](https://github.com/Doraku/DefaultEcs)
-- [Fennecs](https://github.com/thygrrr/fennecs)
+- [**fenn**ecs](https://fennecs.tech)
- [Flecs.Net](https://github.com/BeanCheeseBurrito/Flecs.NET)
- [Friflo.Engine.ECS](https://github.com/friflo/Friflo.Json.Fliox/blob/main/Engine/README.md)
- [Leopotam.Ecs](https://github.com/Leopotam/ecs) using what I believe is a nuget package not made by the actual author and compiled in debug...
diff --git a/source/Ecs.CSharp.Benchmark/Capabilities.cs b/source/Ecs.CSharp.Benchmark/Capabilities.cs
new file mode 100644
index 0000000..3a2b46c
--- /dev/null
+++ b/source/Ecs.CSharp.Benchmark/Capabilities.cs
@@ -0,0 +1,69 @@
+using BenchmarkDotNet.Configs;
+
+namespace Ecs.CSharp.Benchmark
+{
+ /// <summary>
+ /// Capability Requirements for specific tests.
+ /// Add your own intrinsics or other system dependencies here.
+ /// </summary>
+ /// <remarks>
+ /// Usage: Add a category here and apply exclusions in the WithCapabilityExclusions method.
+ /// (this is an EXCLUSIVE category filter, it turns OFF all categories it matches)
+ /// Then, set your own BenchmarkCategory to include the CapabilityCategory string.
+ /// </remarks>
+ /// <example>
+ /// <code>
+ /// [BenchmarkCategory(
+ /// Categories.Fennecs,
+ /// Capabilities.Avx2
+ /// )]
+ /// public void Raw_AVX2()
+ /// </code>
+ /// </example>
+ internal static class Capabilities
+ {
+ // These are common vectorized instruction set categories.
+ // x86/x64
+ public const string Avx2 = nameof(System.Runtime.Intrinsics.X86.Avx2);
+ public const string Avx = nameof(System.Runtime.Intrinsics.X86.Avx);
+ public const string Sse3 = nameof(System.Runtime.Intrinsics.X86.Sse3);
+ public const string Sse2 = nameof(System.Runtime.Intrinsics.X86.Sse2);
+
+ // Arm
+ public const string AdvSimd = nameof(System.Runtime.Intrinsics.Arm.AdvSimd);
+
+ /// <summary>
+ /// This applies capability-based exclusions as filters to the config.
+ /// </summary>
+ /// <param name="self">a Benchmark Config, e.g. as used in Program.cs</param>
+ public static IConfig WithCapabilityExclusions(this IConfig self)
+ {
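+ // Each unsupported instruction set below adds a CategoryExclusion filter, so benchmarks tagged with that capability are skipped on this machine.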
+ if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported)
+ {
+ self = self.AddFilter(new CategoryExclusion(Avx2));
+ }
+
+ if (!System.Runtime.Intrinsics.X86.Avx.IsSupported)
+ {
+ self = self.AddFilter(new CategoryExclusion(Avx));
+ }
+
+ if (!System.Runtime.Intrinsics.X86.Sse3.IsSupported)
+ {
+ self = self.AddFilter(new CategoryExclusion(Sse3));
+ }
+
+ if (!System.Runtime.Intrinsics.X86.Sse2.IsSupported)
+ {
+ self = self.AddFilter(new CategoryExclusion(Sse2));
+ }
+
+ if (!System.Runtime.Intrinsics.Arm.AdvSimd.IsSupported)
+ {
+ self = self.AddFilter(new CategoryExclusion(AdvSimd));
+ }
+
+ return self;
+ }
+ }
+}
diff --git a/source/Ecs.CSharp.Benchmark/Categories.cs b/source/Ecs.CSharp.Benchmark/Categories.cs
index 2e0e883..2812643 100644
--- a/source/Ecs.CSharp.Benchmark/Categories.cs
+++ b/source/Ecs.CSharp.Benchmark/Categories.cs
@@ -1,5 +1,13 @@
-namespace Ecs.CSharp.Benchmark
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using BenchmarkDotNet.Filters;
+using BenchmarkDotNet.Running;
+
+namespace Ecs.CSharp.Benchmark
{
+ /// <summary>
+ /// Prefixes / ECS package names for benchmarks, used as BenchmarkDotNet categories.
+ /// </summary>
internal static class Categories
{
public const string Arch = "Arch";
@@ -14,10 +22,28 @@ internal static class Categories
public const string SveltoECS = "Svelto.ECS";
public const string Morpeh = "Morpeh";
public const string FlecsNet = "FlecsNet";
- public const string Fennecs = "Fennecs";
+ public const string Fennecs = "fennecs";
public const string TinyEcs = "TinyEcs";
public const string CreateEntity = "CreateEntity";
public const string System = "System";
}
+
+ /// <summary>
+ /// Excludes a given category from benchmarks.
+ /// (used by Program.cs)
+ /// </summary>
+ /// <remarks>
+ /// When an exclusion is PRESENT, then all benchmarks that HAVE the category will be EXCLUDED.
+ /// </remarks>
+ /// <example>
+ /// CategoryExclusion("foo") will exclude all benchmarks that have the "foo" category.
+ /// </example>
+ public class CategoryExclusion(string category) : IFilter
+ {
+ public bool Predicate([NotNull] BenchmarkCase benchmarkCase)
+ {
+ return !benchmarkCase.Descriptor.Categories.Contains(category);
+ }
+ }
}
diff --git a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs
index cc67db5..8892bd5 100644
--- a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs
+++ b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs
@@ -1,38 +1,52 @@
using System;
+using System.Runtime.CompilerServices;
using fennecs;
namespace Ecs.CSharp.Benchmark.Contexts
{
namespace Fennecs_Components
{
- internal struct Component1
+ internal record struct Component1
{
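+ // Implicit conversions to int and between component types let benchmark variants write arithmetic such as c1 = c1 + c2 + c3 directly on the structs.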
+ public static implicit operator Component1(int value) => new() { Value = value };
+ public static implicit operator Component2(Component1 self) => new() { Value = self.Value };
+ public static implicit operator Component3(Component1 self) => new() { Value = self.Value };
+ public static implicit operator int (Component1 c) => c.Value;
+
public int Value;
}
- internal struct Component2
+ internal record struct Component2
{
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static implicit operator Component1(Component2 self) => new() { Value = self.Value };
+ public static implicit operator Component2(int value) => new() { Value = value };
+ public static implicit operator Component3(Component2 self) => new() { Value = self.Value };
+ public static implicit operator int (Component2 c) => c.Value;
+
public int Value;
}
- internal struct Component3
+ internal record struct Component3
{
+ public static implicit operator Component1(Component3 self) => new() { Value = self.Value };
+ public static implicit operator Component2(Component3 self) => new() { Value = self.Value };
+ public static implicit operator Component3(int value) => new() { Value = value };
+ public static implicit operator int (Component3 c) => c.Value;
+
public int Value;
}
}
- internal class FennecsBaseContext : IDisposable
+ internal class FennecsBaseContext(int entityCount) : IDisposable
{
- public World World { get; }
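+ // The World is sized to twice the entity count, presumably to leave headroom for padding entities.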
+ public World World { get; } = new World(entityCount * 2);
- public FennecsBaseContext()
- {
- World = new World();
- }
-
- public void Dispose()
+ public virtual void Dispose()
{
World.Dispose();
}
+ public FennecsBaseContext() : this(100000)
+ { }
}
}
diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs
index 0939b51..ae76576 100644
--- a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs
@@ -8,17 +8,16 @@ namespace Ecs.CSharp.Benchmark
public partial class CreateEntityWithOneComponent
{
[Context] private readonly FennecsBaseContext _fennecs;
-
+
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
+ [Benchmark(Description = "fennecs")]
public void Fennecs()
{
World world = _fennecs.World;
- for (int i = 0; i < EntityCount; ++i)
- {
- world.Spawn().Add<Component1>();
- }
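+ // Bulk-spawn API: configure one entity template, then stamp out EntityCount copies in a single Spawn call.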
+ world.Entity()
+ .Add(new Component1())
+ .Spawn(EntityCount);
}
}
}
diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs
index 8689522..29d900e 100644
--- a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs
@@ -3,26 +3,24 @@
using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components;
using fennecs;
+// ReSharper disable once CheckNamespace
namespace Ecs.CSharp.Benchmark
{
public partial class CreateEntityWithThreeComponents
{
- [Context]
- private readonly FennecsBaseContext _fennecs;
-
+ [Context] private readonly FennecsBaseContext _fennecs;
+
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
+ [Benchmark(Description = "fennecs")]
public void Fennecs()
{
World world = _fennecs.World;
- for (int i = 0; i < EntityCount; ++i)
- {
- world.Spawn()
- .Add<Component1>()
- .Add<Component2>()
- .Add<Component3>();
- }
+ world.Entity()
+ .Add(new Component1())
+ .Add(new Component2())
+ .Add(new Component3())
+ .Spawn(EntityCount);
}
}
}
diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs
index 4c2016d..b12b478 100644
--- a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs
@@ -8,18 +8,17 @@ namespace Ecs.CSharp.Benchmark
public partial class CreateEntityWithTwoComponents
{
[Context] private readonly FennecsBaseContext _fennecs;
-
+
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
+ [Benchmark(Description = "fennecs")]
public void Fennecs()
{
World world = _fennecs.World;
- for (int i = 0; i < EntityCount; ++i)
- {
- world.Spawn().
- Add<Component1>().Add<Component2>();
- }
+ world.Entity()
+ .Add(new Component1())
+ .Add(new Component2())
+ .Spawn(EntityCount);
}
}
}
diff --git a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj
index 5c24029..0b4e018 100644
--- a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj
+++ b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj
@@ -37,7 +37,7 @@
-
+
diff --git a/source/Ecs.CSharp.Benchmark/Program.cs b/source/Ecs.CSharp.Benchmark/Program.cs
index 1e9cb5a..3a91a4a 100644
--- a/source/Ecs.CSharp.Benchmark/Program.cs
+++ b/source/Ecs.CSharp.Benchmark/Program.cs
@@ -14,18 +14,26 @@
BenchmarkSwitcher benchmark = BenchmarkSwitcher.FromTypes(new[]
{
- typeof(CreateEntityWithOneComponent),
- typeof(CreateEntityWithTwoComponents),
- typeof(CreateEntityWithThreeComponents),
-
typeof(SystemWithOneComponent),
typeof(SystemWithTwoComponents),
typeof(SystemWithThreeComponents),
- typeof(SystemWithTwoComponentsMultipleComposition)
+ typeof(SystemWithTwoComponentsMultipleComposition),
+
+ //Moving lighter tests to the back makes the estimated time display more reliable
+ typeof(CreateEntityWithOneComponent),
+ typeof(CreateEntityWithTwoComponents),
+ typeof(CreateEntityWithThreeComponents),
});
-IConfig configuration = DefaultConfig.Instance.WithOptions(ConfigOptions.DisableOptimizationsValidator);
+
+IConfig configuration = DefaultConfig.Instance
+ .WithOptions(ConfigOptions.DisableOptimizationsValidator)
+ .WithCapabilityExclusions();
+
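+// Optional: define RANK_RESULTS to order each summary table from fastest to slowest.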
+#if RANK_RESULTS
+ configuration = configuration.WithOrderer(new BenchmarkDotNet.Order.DefaultOrderer(BenchmarkDotNet.Order.SummaryOrderPolicy.FastestToSlowest));
+#endif
if (args.Length > 0)
{
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs
index 3690e37..01080b2 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs
@@ -1,23 +1,28 @@
using System;
-using System.Runtime.CompilerServices;
+using System.Buffers;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using Ecs.CSharp.Benchmark.Contexts;
using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components;
using fennecs;
+// ReSharper disable once CheckNamespace
namespace Ecs.CSharp.Benchmark
{
public partial class SystemWithOneComponent
{
[Context] private readonly FennecsContext _fennecs;
+ // ReSharper disable once ClassNeverInstantiated.Local
private sealed class FennecsContext : FennecsBaseContext
{
- public Query<Component1> query;
+ public readonly Stream<Component1> query;
- public FennecsContext(int entityCount, int entityPadding)
+ public FennecsContext(int entityCount, int entityPadding) : base(entityCount)
{
- query = World.Query<Component1>().Build();
+ query = World.Stream<Component1>();
for (int i = 0; i < entityCount; ++i)
{
for (int j = 0; j < entityPadding; ++j)
@@ -27,34 +32,159 @@ public FennecsContext(int entityCount, int entityPadding)
World.Spawn().Add<Component1>();
}
+
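+ // Batch-write Component1 { Value = 0 } to every entity in the query (Replace overwrites existing values), presumably so each run starts from a known value.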
+ query.Query.Batch(Batch.AddConflict.Replace)
+ .Add(new Component1
+ {
+ Value = 0
+ })
+ .Submit();
+ }
+
+ public override void Dispose()
+ {
+ query.Query.Dispose();
+ base.Dispose();
}
}
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_ForEach()
+ [Benchmark (Description = "fennecs(For)")]
+ public void Fennecs_For()
+ {
+ _fennecs.query.For(static (ref Component1 v) => { v.Value++; });
+ }
+
+ // Disabled for now.
+ // This API is available in fennecs 0.3.x and later, but is not optimized yet.
+ //[BenchmarkCategory(Categories.Fennecs)]
+ //[Benchmark (Description = "fennecs(Batch)")]
+ public void Fennecs_Batch()
{
- _fennecs.query.For((ref Component1 comp0) => comp0.Value++);
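+ // Blit fills every Component1 in the query with the same value, so read one representative value first and bump it.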
+ int newValue = _fennecs.query.Query[0].Ref<Component1>().Value + 1;
+ _fennecs.query.Blit(newValue);
}
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
+ [Benchmark (Description = "fennecs(Job)")]
public void Fennecs_Job()
{
- _fennecs.query.Job(delegate(ref Component1 v) { v.Value++; }, 1024);
+ _fennecs.query.Job(static (ref Component1 v) => { v.Value++; });
}
-
- [BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Raw()
+
+ //[BenchmarkCategory(Categories.Fennecs)]
+ //[Benchmark(Description = "fennecs(Blit)")]
+ public void Fennecs_Raw_Blit()
+ {
+ _fennecs.query.Raw(delegate(Memory<Component1> mem1)
+ {
+ // This does exactly what the system does, but it is wholly dependent
+ // on the precondition of the benchmark. (so... it's taking a shortcut)
+ // fennecs 0.4.0 or 0.5.0 will provide a literal Blit method that
+ // works like this for fast updating of large swathes of component
+ // data.
+ Component1 newValue = new Component1
+ {
+ // We can safely do this because we will never get called here with
+ // an empty archetype / zero size memory slab
+ Value = mem1.Span[0].Value + 1
+ };
+ mem1.Span.Fill(newValue);
+ });
+ }
+
+ #region Raw Runners
+
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)]
+ [Benchmark(Description = "fennecs(AVX2)")]
+ public void Fennecs_Raw_AVX2()
+ {
+ _fennecs.query.Raw(delegate(Memory<Component1> mem1)
+ {
+ int count = mem1.Length;
+
+ using MemoryHandle handle1 = mem1.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)handle1.Pointer;
+
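+ // Increment 8 ints per iteration with 256-bit AVX2 registers; the scalar loop below handles the remainder.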
+ int vectorSize = Vector256<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector256<int> v1 = Avx.LoadVector256(p1 + i);
+ Avx.Store(p1 + i, Avx2.Add(v1, Vector256<int>.One));
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ p1[i]++;
+ }
+ }
+ });
+ }
+
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)]
+ [Benchmark(Description = "fennecs(SSE2)")]
+ public void Fennecs_Raw_SSE2()
+ {
+ _fennecs.query.Raw(delegate(Memory<Component1> mem1)
+ {
+ int count = mem1.Length;
+
+ using MemoryHandle handle1 = mem1.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)handle1.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = Sse2.LoadVector128(p1 + i);
+ Sse2.Store(p1 + i, Sse2.Add(v1, Vector128<int>.One));
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ p1[i]++;
+ }
+ }
+ });
+ }
+
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)]
+ [Benchmark(Description = "fennecs(AdvSIMD)")]
+ public void Fennecs_Raw_AdvSimd()
{
- _fennecs.query.Raw(delegate(Memory<Component1> vectors)
+ _fennecs.query.Raw(delegate(Memory<Component1> mem1)
{
- foreach (ref var v in vectors.Span)
+ int count = mem1.Length;
+
+ using MemoryHandle handle1 = mem1.Pin();
+
+ unsafe
{
- v.Value++;
+ int* p1 = (int*)handle1.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = AdvSimd.LoadVector128(p1 + i);
+ AdvSimd.Store(p1 + i, AdvSimd.Add(v1, Vector128<int>.One));
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ p1[i]++;
+ }
}
});
}
+
+ #endregion
}
}
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs
index 170429d..ac73ec9 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs
@@ -2,7 +2,7 @@
namespace Ecs.CSharp.Benchmark
{
- [BenchmarkCategory(Categories.System)]
+ [BenchmarkCategory(Categories.System, nameof(SystemWithOneComponent))]
[MemoryDiagnoser]
#if CHECK_CACHE_MISSES
[HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs
index c657a88..75cc219 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs
@@ -1,23 +1,28 @@
using System;
-using System.Runtime.CompilerServices;
+using System.Buffers;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using Ecs.CSharp.Benchmark.Contexts;
using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components;
using fennecs;
+// ReSharper disable once CheckNamespace
namespace Ecs.CSharp.Benchmark
{
public partial class SystemWithThreeComponents
{
[Context] private readonly FennecsContext _fennecs;
+ private Stream<Component1, Component2, Component3> Stream => _fennecs.stream;
+ // ReSharper disable once ClassNeverInstantiated.Local
private sealed class FennecsContext : FennecsBaseContext
{
- public Query<Component1, Component2, Component3> query;
+ internal readonly Stream<Component1, Component2, Component3> stream;
- public FennecsContext(int entityCount, int entityPadding)
+ public FennecsContext(int entityCount, int entityPadding) : base(entityCount)
{
- query = World.Query<Component1, Component2, Component3>().Build();
for (int i = 0; i < entityCount; ++i)
{
for (int j = 0; j < entityPadding; ++j)
@@ -38,42 +43,282 @@ public FennecsContext(int entityCount, int entityPadding)
}
World.Spawn().Add<Component1>()
- .Add(new Component2 { Value = 1 })
- .Add(new Component3 { Value = 1 });
+ .Add(new Component2 {Value = 1})
+ .Add(new Component3 {Value = 1});
}
+
+ stream = World.Query<Component1, Component2, Component3>().Stream();
+ }
+
+ public override void Dispose()
+ {
+ stream.Query.Dispose();
+ base.Dispose();
}
}
+ /// <summary>
+ /// fennecs For runners are the classic swiss army knife of this ECS.
+ /// </summary>
+ /// <remarks>
+ /// They are the most versatile and offer decent single-threaded baseline performance to boot.
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_ForEach()
+ [Benchmark(Description = "fennecs(For)")]
+ public void fennecs_For()
{
- _fennecs.query.For((ref Component1 c1, ref Component2 c2, ref Component3 c3) => c1.Value += c2.Value + c3.Value);
+ Stream.For(
+ static (ref Component1 c1, ref Component2 c2, ref Component3 c3) =>
+ {
+ c1.Value = c1.Value + c2.Value + c3.Value;
+ });
+ }
+
+
+ /// <summary>
+ /// Experimental Implicit value type to compare performance in a tight loop.
+ /// </summary>
+ /// <remarks>
+ /// It's very convenient, but is about 20% slower than the For runner with values
+ /// (it still gets inlined well!)
+ /// </remarks>
+ //[BenchmarkCategory(Categories.Fennecs)]
+ //[Benchmark(Description = "fennecs(Implicit)")]
+ public void fennecs_For_Implicit()
+ {
+ Stream.For(
+ static (ref Component1 c1, ref Component2 c2, ref Component3 c3) =>
+ {
+ c1 = c1 + c2 + c3;
+ });
}
+
+ /// <summary>
+ /// fennecs Job runners are the most scalable runners.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// They're still an area for improvement :)
+ /// </para>
+ /// <para>
+ /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores),
+ /// or large numbers of entities in many big archetypes. They only start paying off at around
+ /// 500,000 components when the individual work steps are simple (e.g. vector multiplications).
+ /// </para>
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Job()
+ [Benchmark(Description = $"fennecs(Job)")]
+ public void fennecs_Job()
{
- _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2, ref Component3 c3) { c1.Value += c2.Value + c3.Value; }, 1024);
+ Stream.Job(
+ static (ref Component1 c1, ref Component2 c2, ref Component3 c3) =>
+ {
+ c1.Value = c1.Value + c2.Value + c3.Value;
+ });
}
+
+ // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
+ // Raw runners are intended to process data or transfer it via the fastest available means.
+ // Example use cases:
+ // - transfer data to/from GPU
+ // - transfer data to/from Game Engine
+ // - Disk, Database, or Network I/O
+ // - SIMD calculations
+ // - snapshotting / copying / rollback / compression / hashing / diffing / permutation
+ // - etc.
+ //
+ // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD
+ // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope.
+ // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!)
+
+ #region Raw Runners
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// </summary>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Raw()
+ [Benchmark(Description = "fennecs(Raw)")]
+ public void fennecs_Raw()
+ {
+ Stream.Raw(Raw_Workload_Unoptimized);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (AVX2)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AVX2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)]
+ [Benchmark(Description = "fennecs(AVX2)")]
+ public void fennecs_Raw_AVX2()
+ {
+ Stream.Raw(Raw_Workload_AVX2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support SSE2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)]
+ [Benchmark(Description = "fennecs(SSE2)")]
+ public void fennecs_Raw_SSE2()
+ {
+ Stream.Raw(Raw_Workload_SSE2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AdvSIMD.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)]
+ [Benchmark(Description = "fennecs(AdvSIMD)")]
+ public void fennecs_Raw_AdvSIMD()
+ {
+ Stream.Raw(Raw_Workload_AdvSIMD);
+ }
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// Treating the Memory Slabs basically as Arrays.
+ /// </summary>
+ /// <remarks>
+ /// However, RyuJIT is able to optimize this workload to a degree,
+ /// especially if we use an explicit assignment instead of a compound assignment
+ /// for our addition.
+ /// </remarks>
+ private static void Raw_Workload_Unoptimized(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
+ {
+ Span<Component1> c1S = c1V.Span;
+ Span<Component2> c2S = c2V.Span;
+ Span<Component3> c3S = c3V.Span;
+
+ for (int i = 0; i < c1S.Length; i++)
+ {
+ c1S[i].Value = c1S[i].Value + c2S[i].Value + c3S[i].Value;
+ }
+ }
+
+ /// <summary>
+ /// AVX2 workload for fennecs(Raw)
+ /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel.
+ /// (256 bits)
+ /// </summary>
+ private static void Raw_Workload_AVX2(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+ using MemoryHandle mem3 = c3V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+ int* p3 = (int*)mem3.Pointer;
+
+ int vectorSize = Vector256<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector256<int> v1 = Avx.LoadVector256(p1 + i);
+ Vector256<int> v2 = Avx.LoadVector256(p2 + i);
+ Vector256<int> v3 = Avx.LoadVector256(p3 + i);
+ Vector256<int> sum = Avx2.Add(v1, Avx2.Add(v2, v3));
+
+ Avx.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ p1[i] = p1[i] + p2[i] + p3[i];
+ }
+ }
+ }
+
+ /// <summary>
+ /// SSE2 workload for fennecs(Raw)
+ /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_SSE2(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+ using MemoryHandle mem3 = c3V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+ int* p3 = (int*)mem3.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = Sse2.LoadVector128(p1 + i);
+ Vector128<int> v2 = Sse2.LoadVector128(p2 + i);
+ Vector128<int> v3 = Sse2.LoadVector128(p3 + i);
+ Vector128<int> sum = Sse2.Add(v1, Sse2.Add(v2, v3));
+
+ Sse2.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ p1[i] = p1[i] + p2[i] + p3[i];
+ }
+ }
+ }
+
+ /// <summary>
+ /// AdvSIMD workload for fennecs(Raw)
+ /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_AdvSIMD(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
{
- _fennecs.query.Raw(delegate(Memory<Component1> c1v, Memory<Component2> c2v, Memory<Component3> c3v)
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+ using MemoryHandle mem3 = c3V.Pin();
+
+ unsafe
{
- var c1vs = c1v.Span;
- var c2vs = c2v.Span;
- var c3vs = c3v.Span;
-
- for (int i = 0; i < c1vs.Length; ++i)
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+ int* p3 = (int*)mem3.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = AdvSimd.LoadVector128(p1 + i);
+ Vector128<int> v2 = AdvSimd.LoadVector128(p2 + i);
+ Vector128<int> v3 = AdvSimd.LoadVector128(p3 + i);
+ Vector128<int> sum = AdvSimd.Add(v1, AdvSimd.Add(v2, v3));
+
+ AdvSimd.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
{
- ref Component1 c1 = ref c1vs[i];
- c1.Value += c2vs[i].Value + c3vs[i].Value;
+ p1[i] = p1[i] + p2[i] + p3[i];
}
- });
+ }
}
+
+ #endregion
}
}
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs
index 408108b..e2454c2 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs
@@ -2,7 +2,7 @@
namespace Ecs.CSharp.Benchmark
{
- [BenchmarkCategory(Categories.System)]
+ [BenchmarkCategory(Categories.System, nameof(SystemWithThreeComponents))]
[MemoryDiagnoser]
#if CHECK_CACHE_MISSES
[HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs
index 142b48b..5f7f9b6 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs
@@ -1,73 +1,300 @@
using System;
-using System.Runtime.CompilerServices;
+using System.Buffers;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using Ecs.CSharp.Benchmark.Contexts;
using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components;
using fennecs;
+// ReSharper disable ConvertToCompoundAssignment
+// ReSharper disable once CheckNamespace
namespace Ecs.CSharp.Benchmark
{
public partial class SystemWithTwoComponents
{
[Context] private readonly FennecsContext _fennecs;
+ private Stream<Component1, Component2> Stream => _fennecs.stream;
+ // ReSharper disable once ClassNeverInstantiated.Local
private sealed class FennecsContext : FennecsBaseContext
{
- public Query<Component1, Component2> query;
+ internal readonly Stream<Component1, Component2> stream;
- public FennecsContext(int entityCount, int entityPadding)
+ public FennecsContext(int entityCount, int entityPadding) : base(entityCount)
{
- query = World.Query<Component1, Component2>().Build();
+ stream = World.Query<Component1, Component2>().Stream();
+
for (int i = 0; i < entityCount; ++i)
{
for (int j = 0; j < entityPadding; ++j)
{
Entity padding = World.Spawn();
- switch (j % 2)
+ switch (j % 3)
{
case 0:
padding.Add<Component1>();
break;
-
case 1:
padding.Add<Component2>();
break;
}
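+ // Note: with j % 3 there is no case 2, so every third padding entity is left without any padding component.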
}
- World.Spawn().Add<Component1>().Add(new Component2 { Value = 1 });
+ World.Spawn().Add<Component1>()
+ .Add(new Component2
+ {
+ Value = 1
+ });
}
+
+ }
+
+ public override void Dispose()
+ {
+ stream.Query.Dispose();
+ base.Dispose();
}
}
+ /// <summary>
+ /// fennecs For runners are the classic swiss army knife of this ECS.
+ /// </summary>
+ /// <remarks>
+ /// They are the most versatile and offer decent single-threaded baseline performance to boot.
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_ForEach()
+ [Benchmark(Description = "fennecs(For)")]
+ public void fennecs_For()
{
- _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value);
+ Stream.For(
+ static (ref Component1 c1, ref Component2 c2) =>
+ {
+ c1.Value = c1.Value + c2.Value;
+ });
}
+
+ /// <summary>
+ /// fennecs Job runners are the most scalable runners.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// They're still an area for improvement :)
+ /// </para>
+ /// <para>
+ /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores),
+ /// or large numbers of entities in many big archetypes. They only start paying off at around
+ /// 500,000 components when the individual work steps are simple (e.g. vector multiplications).
+ /// </para>
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Job()
+ [Benchmark(Description = $"fennecs(Job)")]
+ public void fennecs_Job()
{
- _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024);
+ Stream.Job(
+ static (ref Component1 c1, ref Component2 c2) =>
+ {
+ c1.Value = c1.Value + c2.Value;
+ });
}
+
+ // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
+ // Raw runners are intended to process data or transfer it via the fastest available means.
+ // Example use cases:
+ // - transfer data to/from GPU
+ // - transfer data to/from Game Engine
+ // - Disk, Database, or Network I/O
+ // - SIMD calculations
+ // - snapshotting / copying / rollback / compression / hashing / diffing / permutation
+ // - etc.
+ //
+ // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD
+ // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope.
+ // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!)
+
+ #region Raw Runners
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// </summary>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Raw()
+ [Benchmark(Description = "fennecs(Raw)")]
+ public void fennecs_Raw()
+ {
+ Stream.Raw(Raw_Workload_Unoptimized);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (AVX2)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AVX2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)]
+ [Benchmark(Description = "fennecs(AVX2)")]
+ public void fennecs_Raw_AVX2()
+ {
+ Stream.Raw(Raw_Workload_AVX2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support SSE2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)]
+ [Benchmark(Description = "fennecs(SSE2)")]
+ public void fennecs_Raw_SSE2()
+ {
+ Stream.Raw(Raw_Workload_SSE2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AdvSIMD.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)]
+ [Benchmark(Description = "fennecs(AdvSIMD)")]
+ public void fennecs_Raw_AdvSIMD()
+ {
+ Stream.Raw(Raw_Workload_AdvSIMD);
+ }
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// Treating the Memory Slabs basically as Arrays.
+ /// </summary>
+ /// <remarks>
+ /// However, RyuJIT is able to optimize this workload to a degree,
+ /// especially if we use an explicit assignment instead of a compound assignment
+ /// for our addition.
+ /// </remarks>
+ private static void Raw_Workload_Unoptimized(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ Span<Component1> c1S = c1V.Span;
+ Span<Component2> c2S = c2V.Span;
+
+ for (int i = 0; i < c1S.Length; i++)
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ c1S[i].Value = c1S[i].Value + c2S[i].Value;
+ }
+ }
+
+ /// <summary>
+ /// AVX2 workload for fennecs(Raw)
+ /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel.
+ /// (256 bits)
+ /// </summary>
+ private static void Raw_Workload_AVX2(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector256<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector256<int> v1 = Avx.LoadVector256(p1 + i);
+ Vector256<int> v2 = Avx.LoadVector256(p2 + i);
+ Vector256<int> sum = Avx2.Add(v1, v2);
+
+ Avx.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
+ }
+ }
+ }
+
+ /// <summary>
+ /// SSE2 workload for fennecs(Raw)
+ /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_SSE2(Memory<Component1> c1V, Memory<Component2> c2V)
{
- _fennecs.query.Raw(delegate(Memory<Component1> c1v, Memory<Component2> c2v)
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
{
- var c1vs = c1v.Span;
- var c2vs = c2v.Span;
- for (int i = 0; i < c1vs.Length; ++i)
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = Sse2.LoadVector128(p1 + i);
+ Vector128<int> v2 = Sse2.LoadVector128(p2 + i);
+ Vector128<int> sum = Sse2.Add(v1, v2);
+
+ Sse2.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
{
- ref Component1 c1 = ref c1vs[i];
- c1.Value += c2vs[i].Value;
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
}
- });
+ }
+ }
+
+ /// <summary>
+ /// AdvSIMD workload for fennecs(Raw)
+ /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_AdvSIMD(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = AdvSimd.LoadVector128(p1 + i);
+ Vector128<int> v2 = AdvSimd.LoadVector128(p2 + i);
+ Vector128<int> sum = AdvSimd.Add(v1, v2);
+
+ AdvSimd.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
+ }
+ }
}
+
+ #endregion
}
}
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs
index aeba499..b93d115 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs
@@ -2,7 +2,7 @@
namespace Ecs.CSharp.Benchmark
{
- [BenchmarkCategory(Categories.System)]
+ [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponents))]
[MemoryDiagnoser]
#if CHECK_CACHE_MISSES
[HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs
index 42f8788..82e01e9 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs
@@ -1,30 +1,38 @@
using System;
+using System.Buffers;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using Ecs.CSharp.Benchmark.Contexts;
using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components;
using fennecs;
+// ReSharper disable ConvertToCompoundAssignment
+
+// ReSharper disable once CheckNamespace
namespace Ecs.CSharp.Benchmark
{
public partial class SystemWithTwoComponentsMultipleComposition
{
[Context] private readonly FennecsContext _fennecs;
+ private Stream<Component1, Component2> Stream => _fennecs.stream;
+ // ReSharper disable once ClassNeverInstantiated.Local
private sealed class FennecsContext : FennecsBaseContext
{
- private record struct Padding1();
+ private struct Padding1;
- private record struct Padding2();
+ private struct Padding2;
- private record struct Padding3();
+ private struct Padding3;
- private record struct Padding4();
+ private struct Padding4;
- public Query<Component1, Component2> query;
+ public readonly Stream<Component1, Component2> stream;
- public FennecsContext(int entityCount)
+ public FennecsContext(int entityCount) : base(entityCount)
{
- query = World.Query<Component1, Component2>().Build();
for (int i = 0; i < entityCount; ++i)
{
Entity entity = World.Spawn().Add<Component1>().Add(new Component2 { Value = 1 });
@@ -44,37 +52,254 @@ public FennecsContext(int entityCount)
break;
}
}
+
+ stream = World.Query<Component1, Component2>().Stream();
+ }
+
+ public override void Dispose()
+ {
+ stream.Query.Dispose();
+ base.Dispose();
}
}
+ /// <summary>
+ /// fennecs For runners are the classic swiss army knife of this ECS.
+ /// </summary>
+ /// <remarks>
+ /// They are the most versatile and offer decent single-threaded baseline performance to boot.
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_ForEach()
+ [Benchmark(Description = "fennecs(For)")]
+ public void fennecs_For()
{
- _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value);
+ Stream.For(
+ static (ref Component1 c1, ref Component2 c2) =>
+ {
+ c1.Value = c1.Value + c2.Value;
+ });
}
+
+ /// <summary>
+ /// fennecs Job runners are the most scalable runners.
+ /// </summary>
+ /// <remarks>
+ /// <para>
+ /// They're still an area for improvement :)
+ /// </para>
+ /// <para>
+ /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores),
+ /// or large numbers of entities in many big archetypes. They only start paying off at around
+ /// 500,000 components when the individual work steps are simple (e.g. vector multiplications).
+ /// </para>
+ /// </remarks>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Job()
+ [Benchmark(Description = $"fennecs(Job)")]
+ public void fennecs_Job()
{
- _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024);
+ Stream.Job(
+ static (ref Component1 c1, ref Component2 c2) =>
+ {
+ c1.Value = c1.Value + c2.Value;
+ });
}
+
+ // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
+ // Raw runners are intended to process data or transfer it via the fastest available means.
+ // Example use cases:
+ // - transfer data to/from GPU
+ // - transfer data to/from Game Engine
+ // - Disk, Database, or Network I/O
+ // - SIMD calculations
+ // - snapshotting / copying / rollback / compression / hashing / diffing / permutation
+ // - etc.
+ //
+ // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD
+ // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope.
+ // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!)
+
+ #region Raw Runners
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// </summary>
[BenchmarkCategory(Categories.Fennecs)]
- [Benchmark]
- public void Fennecs_Raw()
+ [Benchmark(Description = "fennecs(Raw)")]
+ public void fennecs_Raw()
+ {
+ Stream.Raw(Raw_Workload_Unoptimized);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (AVX2)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AVX2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)]
+ [Benchmark(Description = "fennecs(AVX2)")]
+ public void fennecs_Raw_AVX2()
+ {
+ Stream.Raw(Raw_Workload_AVX2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support SSE2.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)]
+ [Benchmark(Description = "fennecs(SSE2)")]
+ public void fennecs_Raw_SSE2()
+ {
+ Stream.Raw(Raw_Workload_SSE2);
+ }
+
+ /// <summary>
+ /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD)
+ /// </summary>
+ /// <remarks>
+ /// This benchmark is automatically excluded if the current environment does not support AdvSIMD.
+ /// </remarks>
+ [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)]
+ [Benchmark(Description = "fennecs(AdvSIMD)")]
+ public void fennecs_Raw_AdvSIMD()
+ {
+ Stream.Raw(Raw_Workload_AdvSIMD);
+ }
+
+ /// <summary>
+ /// Unoptimized workload for fennecs(Raw)
+ /// Treating the Memory Slabs basically as Arrays.
+ /// </summary>
+ /// <remarks>
+ /// However, RyuJIT is able to optimize this workload to a degree,
+ /// especially if we use an explicit assignment instead of a compound assignment
+ /// for our addition.
+ /// </remarks>
+ private static void Raw_Workload_Unoptimized(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ Span<Component1> c1S = c1V.Span;
+ Span<Component2> c2S = c2V.Span;
+
+ for (int i = 0; i < c1S.Length; i++)
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ c1S[i].Value = c1S[i].Value + c2S[i].Value;
+ }
+ }
+
+ /// <summary>
+ /// AVX2 workload for fennecs(Raw)
+ /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel.
+ /// (256 bits)
+ /// </summary>
+ private static void Raw_Workload_AVX2(Memory<Component1> c1V, Memory<Component2> c2V)
{
- _fennecs.query.Raw(delegate(Memory<Component1> c1v, Memory<Component2> c2v)
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
{
- var c1vs = c1v.Span;
- var c2vs = c2v.Span;
- for (int i = 0; i < c1vs.Length; ++i)
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector256<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
{
- ref Component1 c1 = ref c1vs[i];
- c1.Value += c2vs[i].Value;
+ Vector256<int> v1 = Avx.LoadVector256(p1 + i);
+ Vector256<int> v2 = Avx.LoadVector256(p2 + i);
+ Vector256<int> sum = Avx2.Add(v1, v2);
+
+ Avx.Store(p1 + i, sum);
}
- });
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
+ }
+ }
+ }
+
+ /// <summary>
+ /// SSE2 workload for fennecs(Raw)
+ /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_SSE2(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = Sse2.LoadVector128(p1 + i);
+ Vector128<int> v2 = Sse2.LoadVector128(p2 + i);
+ Vector128<int> sum = Sse2.Add(v1, v2);
+
+ Sse2.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
+ }
+ }
}
+
+ /// <summary>
+ /// AdvSIMD workload for fennecs(Raw)
+ /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel.
+ /// (128 bits)
+ /// </summary>
+ private static void Raw_Workload_AdvSIMD(Memory<Component1> c1V, Memory<Component2> c2V)
+ {
+ int count = c1V.Length;
+
+ using MemoryHandle mem1 = c1V.Pin();
+ using MemoryHandle mem2 = c2V.Pin();
+
+ unsafe
+ {
+ int* p1 = (int*)mem1.Pointer;
+ int* p2 = (int*)mem2.Pointer;
+
+ int vectorSize = Vector128<int>.Count;
+ int vectorEnd = count - (count % vectorSize);
+ for (int i = 0; i < vectorEnd; i += vectorSize)
+ {
+ Vector128<int> v1 = AdvSimd.LoadVector128(p1 + i);
+ Vector128<int> v2 = AdvSimd.LoadVector128(p2 + i);
+ Vector128<int> sum = AdvSimd.Add(v1, v2);
+
+ AdvSimd.Store(p1 + i, sum);
+ }
+
+ for (int i = vectorEnd; i < count; i++) // remaining elements
+ {
+ // Compound Assignment is not as optimized as explicit assignment
+ p1[i] = p1[i] + p2[i];
+ }
+ }
+ }
+
+ #endregion
}
}
diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs
index e5fa72a..6c661ea 100644
--- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs
+++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs
@@ -2,7 +2,7 @@
namespace Ecs.CSharp.Benchmark
{
- [BenchmarkCategory(Categories.System)]
+ [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponentsMultipleComposition))]
[MemoryDiagnoser]
#if CHECK_CACHE_MISSES
[HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]