diff --git a/README.md b/README.md index 19fb4b6..e39e063 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ All results are obtained from the same toaster, with the same load, so compariso Tested frameworks: - [Arch](https://github.com/genaray/Arch) - [DefaultEcs](https://github.com/Doraku/DefaultEcs) -- [Fennecs](https://github.com/thygrrr/fennecs) +- [**fenn**ecs](https://fennecs.tech) - [Flecs.Net](https://github.com/BeanCheeseBurrito/Flecs.NET) - [Friflo.Engine.ECS](https://github.com/friflo/Friflo.Json.Fliox/blob/main/Engine/README.md) - [Leopotam.Ecs](https://github.com/Leopotam/ecs) using what I believe is a nuget package not made by the actual author and compiled in debug... diff --git a/source/Ecs.CSharp.Benchmark/Capabilities.cs b/source/Ecs.CSharp.Benchmark/Capabilities.cs new file mode 100644 index 0000000..3a2b46c --- /dev/null +++ b/source/Ecs.CSharp.Benchmark/Capabilities.cs @@ -0,0 +1,69 @@ +using BenchmarkDotNet.Configs; + +namespace Ecs.CSharp.Benchmark +{ + /// + /// Capability Requirements for specific tests. + /// Add your own intrinsics or other system dependencies here. + /// + /// + /// Usage: Add a category to it and apply exclusions in the ApplyExclusions method. + /// (this is an EXCLUSIVE category filter, it turns OFF all categories it matches) + /// Then, set your own BenchmarkCategory to include the CapabilityCategory string. + /// + /// + /// + /// [BenchmarkCategory( + /// Categories.Fennecs, + /// Capabilities.Avx2 + /// )] + /// public void Raw_AVX2() + /// + /// + internal static class Capabilities + { + // These are common vectorized instruction set categories. + // x86/x64 + public const string Avx2 = nameof(System.Runtime.Intrinsics.X86.Avx2); + public const string Avx = nameof(System.Runtime.Intrinsics.X86.Avx); + public const string Sse3 = nameof(System.Runtime.Intrinsics.X86.Sse3); + public const string Sse2 = nameof(System.Runtime.Intrinsics.X86.Sse2); + + // Arm + public const string AdvSimd = nameof(System.Runtime.Intrinsics.Arm.AdvSimd); + + /// + /// This applies capability-based exclusions as filters to the config. + /// + /// a Benchmark Config, e.g. as used in Program.cs + public static IConfig WithCapabilityExclusions(this IConfig self) + { + if (!System.Runtime.Intrinsics.X86.Avx2.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Avx2)); + } + + if (!System.Runtime.Intrinsics.X86.Avx.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Avx)); + } + + if (!System.Runtime.Intrinsics.X86.Sse3.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Sse3)); + } + + if (!System.Runtime.Intrinsics.X86.Sse2.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(Sse2)); + } + + if (!System.Runtime.Intrinsics.Arm.AdvSimd.IsSupported) + { + self = self.AddFilter(new CategoryExclusion(AdvSimd)); + } + + return self; + } + } +} diff --git a/source/Ecs.CSharp.Benchmark/Categories.cs b/source/Ecs.CSharp.Benchmark/Categories.cs index 2e0e883..2812643 100644 --- a/source/Ecs.CSharp.Benchmark/Categories.cs +++ b/source/Ecs.CSharp.Benchmark/Categories.cs @@ -1,5 +1,13 @@ -namespace Ecs.CSharp.Benchmark +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using BenchmarkDotNet.Filters; +using BenchmarkDotNet.Running; + +namespace Ecs.CSharp.Benchmark { + /// + /// Prefixes / ECS package names for benchmarks, used as BenchMarkDotNet categories. 
+ /// internal static class Categories { public const string Arch = "Arch"; @@ -14,10 +22,28 @@ internal static class Categories public const string SveltoECS = "Svelto.ECS"; public const string Morpeh = "Morpeh"; public const string FlecsNet = "FlecsNet"; - public const string Fennecs = "Fennecs"; + public const string Fennecs = "fennecs"; public const string TinyEcs = "TinyEcs"; public const string CreateEntity = "CreateEntity"; public const string System = "System"; } + + /// + /// Excludes a given category from benchmarks. + /// (used by Program.cs) + /// + /// + /// When an exclusion is PRESENT, then all benchmarks that HAVE the category will be EXCLUDED. + /// + /// + /// CategoryExclusion("foo") will exclude all benchmarks that have the "foo" category. + /// + public class CategoryExclusion(string category) : IFilter + { + public bool Predicate([NotNull] BenchmarkCase benchmarkCase) + { + return !benchmarkCase.Descriptor.Categories.Contains(category); + } + } } diff --git a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs index cc67db5..8892bd5 100644 --- a/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs +++ b/source/Ecs.CSharp.Benchmark/Contexts/FennecsBaseContext.cs @@ -1,38 +1,52 @@ using System; +using System.Runtime.CompilerServices; using fennecs; namespace Ecs.CSharp.Benchmark.Contexts { namespace Fennecs_Components { - internal struct Component1 + internal record struct Component1 { + public static implicit operator Component1(int value) => new() { Value = value }; + public static implicit operator Component2(Component1 self) => new() { Value = self.Value }; + public static implicit operator Component3(Component1 self) => new() { Value = self.Value }; + public static implicit operator int (Component1 c) => c.Value; + public int Value; } - internal struct Component2 + internal record struct Component2 { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator Component1(Component2 self) => new() { Value = self.Value }; + public static implicit operator Component2(int value) => new() { Value = value }; + public static implicit operator Component3(Component2 self) => new() { Value = self.Value }; + public static implicit operator int (Component2 c) => c.Value; + public int Value; } - internal struct Component3 + internal record struct Component3 { + public static implicit operator Component1(Component3 self) => new() { Value = self.Value }; + public static implicit operator Component2(Component3 self) => new() { Value = self.Value }; + public static implicit operator Component3(int value) => new() { Value = value }; + public static implicit operator int (Component3 c) => c.Value; + public int Value; } } - internal class FennecsBaseContext : IDisposable + internal class FennecsBaseContext(int entityCount) : IDisposable { - public World World { get; } + public World World { get; } = new World(entityCount * 2); - public FennecsBaseContext() - { - World = new World(); - } - - public void Dispose() + public virtual void Dispose() { World.Dispose(); } + public FennecsBaseContext() : this(100000) + { } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs index 0939b51..ae76576 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithOneComponent/Fennecs.cs @@ -8,17 +8,16 @@ namespace 
Ecs.CSharp.Benchmark public partial class CreateEntityWithOneComponent { [Context] private readonly FennecsBaseContext _fennecs; - + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn().Add(); - } + world.Entity() + .Add(new Component1()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs index 8689522..29d900e 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithThreeComponents/Fennecs.cs @@ -3,26 +3,24 @@ using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class CreateEntityWithThreeComponents { - [Context] - private readonly FennecsBaseContext _fennecs; - + [Context] private readonly FennecsBaseContext _fennecs; + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn() - .Add() - .Add() - .Add(); - } + world.Entity() + .Add(new Component1()) + .Add(new Component2()) + .Add(new Component3()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs index 4c2016d..b12b478 100644 --- a/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/CreateEntityWithTwoComponents/Fennecs.cs @@ -8,18 +8,17 @@ namespace Ecs.CSharp.Benchmark public partial class CreateEntityWithTwoComponents { [Context] private readonly FennecsBaseContext _fennecs; - + [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark(Description = "fennecs")] public void Fennecs() { World world = _fennecs.World; - for (int i = 0; i < EntityCount; ++i) - { - world.Spawn(). 
- Add().Add(); - } + world.Entity() + .Add(new Component1()) + .Add(new Component2()) + .Spawn(EntityCount); } } } diff --git a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj index 5c24029..0b4e018 100644 --- a/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj +++ b/source/Ecs.CSharp.Benchmark/Ecs.CSharp.Benchmark.csproj @@ -37,7 +37,7 @@ - + diff --git a/source/Ecs.CSharp.Benchmark/Program.cs b/source/Ecs.CSharp.Benchmark/Program.cs index 1e9cb5a..3a91a4a 100644 --- a/source/Ecs.CSharp.Benchmark/Program.cs +++ b/source/Ecs.CSharp.Benchmark/Program.cs @@ -14,18 +14,26 @@ BenchmarkSwitcher benchmark = BenchmarkSwitcher.FromTypes(new[] { - typeof(CreateEntityWithOneComponent), - typeof(CreateEntityWithTwoComponents), - typeof(CreateEntityWithThreeComponents), - typeof(SystemWithOneComponent), typeof(SystemWithTwoComponents), typeof(SystemWithThreeComponents), - typeof(SystemWithTwoComponentsMultipleComposition) + typeof(SystemWithTwoComponentsMultipleComposition), + + //Moving lighter tests to the back makes the estimated time display more reliable + typeof(CreateEntityWithOneComponent), + typeof(CreateEntityWithTwoComponents), + typeof(CreateEntityWithThreeComponents), }); -IConfig configuration = DefaultConfig.Instance.WithOptions(ConfigOptions.DisableOptimizationsValidator); + +IConfig configuration = DefaultConfig.Instance + .WithOptions(ConfigOptions.DisableOptimizationsValidator) + .WithCapabilityExclusions(); + +#if RANK_RESULTS + configuration = configuration.WithOrderer(new BenchmarkDotNet.Order.DefaultOrderer(BenchmarkDotNet.Order.SummaryOrderPolicy.FastestToSlowest)); +#endif if (args.Length > 0) { diff --git a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs index 3690e37..01080b2 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/Fennecs.cs @@ -1,23 +1,28 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithOneComponent { [Context] private readonly FennecsContext _fennecs; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + public readonly Stream query; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); + query = World.Stream(); for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) @@ -27,34 +32,159 @@ public FennecsContext(int entityCount, int entityPadding) World.Spawn().Add(); } + + query.Query.Batch(Batch.AddConflict.Replace) + .Add(new Component1 + { + Value = 0 + }) + .Submit(); + } + + public override void Dispose() + { + query.Query.Dispose(); + base.Dispose(); } } [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark (Description = "fennecs(For)")] + public void Fennecs_For() + { + _fennecs.query.For(static (ref Component1 v) => { v.Value++; }); + } + + // Disabled for now. 
+ // This API is available in fennecs 0.3.x and later, but is not optimized yet. + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark (Description = "fennecs(Batch)")] + public void Fennecs_Batch() { - _fennecs.query.For((ref Component1 comp0) => comp0.Value++); + int newValue = _fennecs.query.Query[0].Ref().Value + 1; + _fennecs.query.Blit(newValue); } [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] + [Benchmark (Description = "fennecs(Job)")] public void Fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 v) { v.Value++; }, 1024); + _fennecs.query.Job(static (ref Component1 v) => { v.Value++; }); } - - [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark(Description = "fennecs(Blit)")] + public void Fennecs_Raw_Blit() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + // This does exactly what the system does, but it is wholly dependent + // on the precondition of the benchmark. (so... it's taking a shortcut) + // fennecs 0.4.0 or 0.5.0 will provide a literal Blit method that + // works like this for fast updating of large swathes of component + // data. + Component1 newValue = new Component1 + { + // We can safely do this because we will never get called here with + // an empty archetype / zero size memory slab + Value = mem1.Span[0].Value + 1 + }; + mem1.Span.Fill(newValue); + }); + } + + #region Raw Runners + + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void Fennecs_Raw_AVX2() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe + { + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Avx.Store(p1 + i, Avx2.Add(v1, Vector256.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } + } + }); + } + + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void Fennecs_Raw_SSE2() + { + _fennecs.query.Raw(delegate(Memory mem1) + { + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe + { + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Sse2.Store(p1 + i, Sse2.Add(v1, Vector128.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } + } + }); + } + + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void Fennecs_Raw_AdvSimd() { - _fennecs.query.Raw(delegate(Memory vectors) + _fennecs.query.Raw(delegate(Memory mem1) { - foreach (ref var v in vectors.Span) + int count = mem1.Length; + + using MemoryHandle handle1 = mem1.Pin(); + + unsafe { - v.Value++; + int* p1 = (int*)handle1.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + AdvSimd.Store(p1 + i, AdvSimd.Add(v1, Vector128.One)); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i]++; + } } }); } + + #endregion } } diff --git 
a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs index 170429d..ac73ec9 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithOneComponent/_SystemWithOneComponent.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithOneComponent))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs index c657a88..75cc219 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/Fennecs.cs @@ -1,23 +1,28 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithThreeComponents { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + internal readonly Stream stream; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) @@ -38,42 +43,282 @@ public FennecsContext(int entityCount, int entityPadding) } World.Spawn().Add() - .Add(new Component2 { Value = 1 }) - .Add(new Component3 { Value = 1 }); + .Add(new Component2 {Value = 1}) + .Add(new Component3 {Value = 1}); } + + stream = World.Query().Stream(); + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + /// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2, ref Component3 c3) => c1.Value += c2.Value + c3.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1.Value = c1.Value + c2.Value + c3.Value; + }); + } + + + /// + /// Experimental Implicit value type to compare performance in a tight loop. + /// + /// + /// It's very convenient, but is about 20% slower than the For runner with values + /// (it still gets inlined well!) + /// + //[BenchmarkCategory(Categories.Fennecs)] + //[Benchmark(Description = "fennecs(Implicit)")] + public void fennecs_For_Implicit() + { + Stream.For( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1 = c1 + c2 + c3; + }); } + + /// + /// fennecs Job runners are the most scalable runners. 
+ /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2, ref Component3 c3) { c1.Value += c2.Value + c3.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2, ref Component3 c3) => + { + c1.Value = c1.Value + c2.Value + c3.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. + /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. 
+ /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V, Memory c3V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + Span c3S = c3V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + c1S[i].Value = c1S[i].Value + c2S[i].Value + c3S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V, Memory c3V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 v3 = Avx.LoadVector256(p3 + i); + Vector256 sum = Avx2.Add(v1, Avx2.Add(v2, v3)); + + Avx.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i] = p1[i] + p2[i] + p3[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V, Memory c3V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 v3 = Sse2.LoadVector128(p3 + i); + Vector128 sum = Sse2.Add(v1, Sse2.Add(v2, v3)); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + p1[i] = p1[i] + p2[i] + p3[i]; + } + } + } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V, Memory c3V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v, Memory c3v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + using MemoryHandle mem3 = c3V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - var c3vs = c3v.Span; - - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + int* p3 = (int*)mem3.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 v3 = AdvSimd.LoadVector128(p3 + i); + Vector128 sum = AdvSimd.Add(v1, AdvSimd.Add(v2, v3)); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value + c3vs[i].Value; + p1[i] = p1[i] + p2[i] + p3[i]; } - }); + } } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs index 408108b..e2454c2 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithThreeComponents/_SystemWithThreeComponents.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithThreeComponents))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs index 142b48b..5f7f9b6 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/Fennecs.cs @@ -1,73 +1,300 @@ using System; -using System.Runtime.CompilerServices; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable ConvertToCompoundAssignment +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithTwoComponents { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - public Query query; + internal readonly Stream stream; - public FennecsContext(int entityCount, int entityPadding) + public FennecsContext(int entityCount, int entityPadding) : base(entityCount) { - query = World.Query().Build(); + stream = World.Query().Stream(); + for (int i = 0; i < entityCount; ++i) { for (int j = 0; j < entityPadding; ++j) { Entity padding = World.Spawn(); - switch (j % 2) + switch (j % 3) { case 0: padding.Add(); break; - case 1: padding.Add(); break; } } - World.Spawn().Add().Add(new Component2 { Value = 1 }); + World.Spawn().Add() + .Add(new Component2 + { + Value = 1 + }); } + + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + 
/// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + /// + /// fennecs Job runners are the most scalable runners. + /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. 
+ /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. + /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. + /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + // Compound Assignment is not as optimized as explicit assignment + c1S[i].Value = c1S[i].Value + c2S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 sum = Avx2.Add(v1, v2); + + Avx.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 sum = Sse2.Add(v1, v2); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value; + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; } - }); + } + } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 sum = AdvSimd.Add(v1, v2); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs index aeba499..b93d115 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponents/_SystemWithTwoComponents.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponents))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)] diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs index 42f8788..82e01e9 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/Fennecs.cs @@ -1,30 +1,38 @@ using System; +using System.Buffers; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using Ecs.CSharp.Benchmark.Contexts; using Ecs.CSharp.Benchmark.Contexts.Fennecs_Components; using fennecs; +// ReSharper disable ConvertToCompoundAssignment + +// ReSharper disable once CheckNamespace namespace Ecs.CSharp.Benchmark { public partial class SystemWithTwoComponentsMultipleComposition { [Context] private readonly FennecsContext _fennecs; + private Stream Stream => _fennecs.stream; + // ReSharper disable once ClassNeverInstantiated.Local private sealed class FennecsContext : FennecsBaseContext { - private record struct Padding1(); + private struct Padding1; - private record struct Padding2(); + private struct Padding2; - private record struct Padding3(); + private struct Padding3; - private record struct Padding4(); + private struct Padding4; - public Query query; + public readonly Stream stream; - public FennecsContext(int entityCount) + public FennecsContext(int entityCount) : base(entityCount) { - query = World.Query().Build(); for (int i = 0; i < entityCount; ++i) { Entity entity = World.Spawn().Add().Add(new Component2 { Value = 1 }); @@ -44,37 +52,254 @@ public FennecsContext(int entityCount) break; } } + + stream = World.Query().Stream(); + } + + public override void Dispose() + { + stream.Query.Dispose(); + base.Dispose(); } } + /// + /// fennecs For runners are the classic swiss army knife of this ECS. + /// + /// + /// They are the most versatile and offer decent single-threaded baseline performance to boot. 
+ /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_ForEach() + [Benchmark(Description = "fennecs(For)")] + public void fennecs_For() { - _fennecs.query.For((ref Component1 c1, ref Component2 c2) => c1.Value += c2.Value); + Stream.For( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + /// + /// fennecs Job runners are the most scalable runners. + /// + /// + /// + /// They're still an area for improvement :) + /// + /// + /// Job is designed for heavy individual workloads (e.g. update 20 physics worlds on 20 cores), + /// or large numbers of entities in many big archetypes. They only start paying off at around + /// 500,000 components when the individual work steps are simple (e.g. vector multiplications). + /// + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Job() + [Benchmark(Description = $"fennecs(Job)")] + public void fennecs_Job() { - _fennecs.query.Job(delegate(ref Component1 c1, ref Component2 c2) { c1.Value += c2.Value; }, 1024); + Stream.Job( + static (ref Component1 c1, ref Component2 c2) => + { + c1.Value = c1.Value + c2.Value; + }); } + + // fennecs Raw runners guarantee contiguous memory access in the form of Query<>.Raw(MemoryAction<>) + // Raw runners are intended to process data or transfer it via the fastest available means. + // Example use cases: + // - transfer data to/from GPU + // - transfer data to/from Game Engine + // - Disk, Database, or Network I/O + // - SIMD calculations + // - snapshotting / copying / rollback / compression / hashing / diffing / permutation + // - etc. + // + // As example / reference / benchmarks, we vectorize our calculation here using AVX2, SSE2, and AdvSIMD + // Despite the 'unsafe' tags, this is quite safe ;) The Memorys are pinned till end of scope. + // We also keep an Unoptimized Workload around to let RyuJIT show off its magic. (still good!) + + #region Raw Runners + + /// + /// Unoptimized workload for fennecs(Raw) + /// [BenchmarkCategory(Categories.Fennecs)] - [Benchmark] - public void Fennecs_Raw() + [Benchmark(Description = "fennecs(Raw)")] + public void fennecs_Raw() + { + Stream.Raw(Raw_Workload_Unoptimized); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (AVX2) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AVX2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Avx2)] + [Benchmark(Description = "fennecs(AVX2)")] + public void fennecs_Raw_AVX2() + { + Stream.Raw(Raw_Workload_AVX2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (SSE2 / AVX1) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support SSE2. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.Sse2)] + [Benchmark(Description = "fennecs(SSE2)")] + public void fennecs_Raw_SSE2() + { + Stream.Raw(Raw_Workload_SSE2); + } + + /// + /// Vectorized Benchmark Contender for fennecs. (Arm64 AdvSIMD) + /// + /// + /// This benchmark is automatically excluded if the current environment does not support AdvSIMD. + /// + [BenchmarkCategory(Categories.Fennecs, Capabilities.AdvSimd)] + [Benchmark(Description = "fennecs(AdvSIMD)")] + public void fennecs_Raw_AdvSIMD() + { + Stream.Raw(Raw_Workload_AdvSIMD); + } + + /// + /// Unoptimized workload for fennecs(Raw) + /// Treating the Memory Slabs basically as Arrays. 
+ /// + /// + /// However, RyuJIT is able to optimize this workload to a degree, + /// especially if we use an explicit assignment instead of a compound assignment + /// for our addition. + /// + private static void Raw_Workload_Unoptimized(Memory c1V, Memory c2V) + { + Span c1S = c1V.Span; + Span c2S = c2V.Span; + + for (int i = 0; i < c1S.Length; i++) + { + // Compound Assignment is not as optimized as explicit assignment + c1S[i].Value = c1S[i].Value + c2S[i].Value; + } + } + + /// + /// AVX2 workload for fennecs(Raw) + /// We use AVX2 intrinsics to vectorize the workload, executing 8 additions in parallel. + /// (256 bits) + /// + private static void Raw_Workload_AVX2(Memory c1V, Memory c2V) { - _fennecs.query.Raw(delegate(Memory c1v, Memory c2v) + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe { - var c1vs = c1v.Span; - var c2vs = c2v.Span; - for (int i = 0; i < c1vs.Length; ++i) + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector256.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) { - ref Component1 c1 = ref c1vs[i]; - c1.Value += c2vs[i].Value; + Vector256 v1 = Avx.LoadVector256(p1 + i); + Vector256 v2 = Avx.LoadVector256(p2 + i); + Vector256 sum = Avx2.Add(v1, v2); + + Avx.Store(p1 + i, sum); } - }); + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + /// + /// SSE2 workload for fennecs(Raw) + /// We use SSE2 (same as AVX1) intrinsics to vectorize the workload, executing 4 additions in parallel. + /// (128 bits) + /// + private static void Raw_Workload_SSE2(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = Sse2.LoadVector128(p1 + i); + Vector128 v2 = Sse2.LoadVector128(p2 + i); + Vector128 sum = Sse2.Add(v1, v2); + + Sse2.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } } + + /// + /// AdvSIMD workload for fennecs(Raw) + /// We use AdvSIMD intrinsics to vectorize the workload, executing 4 additions in parallel. 
+ /// (128 bits) + /// + private static void Raw_Workload_AdvSIMD(Memory c1V, Memory c2V) + { + int count = c1V.Length; + + using MemoryHandle mem1 = c1V.Pin(); + using MemoryHandle mem2 = c2V.Pin(); + + unsafe + { + int* p1 = (int*)mem1.Pointer; + int* p2 = (int*)mem2.Pointer; + + int vectorSize = Vector128.Count; + int vectorEnd = count - (count % vectorSize); + for (int i = 0; i < vectorEnd; i += vectorSize) + { + Vector128 v1 = AdvSimd.LoadVector128(p1 + i); + Vector128 v2 = AdvSimd.LoadVector128(p2 + i); + Vector128 sum = AdvSimd.Add(v1, v2); + + AdvSimd.Store(p1 + i, sum); + } + + for (int i = vectorEnd; i < count; i++) // remaining elements + { + // Compound Assignment is not as optimized as explicit assignment + p1[i] = p1[i] + p2[i]; + } + } + } + + #endregion } } diff --git a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs index e5fa72a..6c661ea 100644 --- a/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs +++ b/source/Ecs.CSharp.Benchmark/SystemWithTwoComponentsMultipleComposition/_SystemWithTwoComponentsMultipleComposition.cs @@ -2,7 +2,7 @@ namespace Ecs.CSharp.Benchmark { - [BenchmarkCategory(Categories.System)] + [BenchmarkCategory(Categories.System, nameof(SystemWithTwoComponentsMultipleComposition))] [MemoryDiagnoser] #if CHECK_CACHE_MISSES [HardwareCounters(BenchmarkDotNet.Diagnosers.HardwareCounter.CacheMisses)]
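
Notes on the vectorized Raw workloads: every `Raw_Workload_*` method in this patch uses the same structure — process as many full SIMD lanes as possible, then finish the remainder with scalar code. The sketch below distills that pattern for a plain `int[]`; the `SimdPatternSketch` type and `AddInPlace` method are hypothetical illustrations (not part of the patch), but the intrinsics (`Avx.LoadVector256`, `Avx2.Add`, `Avx.Store`) are the same ones the workloads call, and the code assumes `AllowUnsafeBlocks`, which the benchmark project already enables.

```csharp
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class SimdPatternSketch
{
    // Adds `addend` to every element, mirroring the Raw_Workload_* layout:
    // full 256-bit lanes via AVX2 first, then a scalar loop for the tail.
    public static unsafe void AddInPlace(int[] values, int addend)
    {
        if (!Avx2.IsSupported)
        {
            for (int i = 0; i < values.Length; i++) values[i] += addend;
            return;
        }

        fixed (int* p = values)
        {
            int count = values.Length;
            int vectorSize = Vector256<int>.Count;        // 8 ints per 256-bit register
            int vectorEnd = count - (count % vectorSize); // last index covered by full vectors
            Vector256<int> add = Vector256.Create(addend);

            for (int i = 0; i < vectorEnd; i += vectorSize)
            {
                Vector256<int> v = Avx.LoadVector256(p + i);
                Avx.Store(p + i, Avx2.Add(v, add));
            }

            for (int i = vectorEnd; i < count; i++)       // scalar remainder (at most 7 elements)
            {
                p[i] += addend;
            }
        }
    }
}
```

The `vectorEnd = count - (count % vectorSize)` computation ensures the pointer loop only ever touches complete lanes; the scalar tail handles whatever is left, which is exactly the remainder handling repeated in each Raw workload above.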
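
The patch also adds per-suite categories (e.g. `nameof(SystemWithTwoComponents)`) alongside the existing package categories, and wires the capability exclusions into the config in Program.cs. A hedged sketch of how the pieces compose when selecting a subset of benchmarks — `AllCategoriesFilter` is BenchmarkDotNet's stock inclusive filter, while the exact wiring shown here is illustrative and not part of the patch:

```csharp
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Filters;
using Ecs.CSharp.Benchmark;

// Illustrative Program.cs-style wiring (assumption, not from the patch): run only the
// fennecs cases of one suite while keeping the hardware-capability exclusions active.
IConfig configuration = DefaultConfig.Instance
    .WithOptions(ConfigOptions.DisableOptimizationsValidator)
    .WithCapabilityExclusions()             // drops e.g. AVX2 benchmarks on CPUs without AVX2
    .AddFilter(new AllCategoriesFilter(new[]
    {
        Categories.Fennecs,                 // "fennecs"
        nameof(SystemWithTwoComponents)     // per-suite category introduced by this change
    }));
```

Because `CategoryExclusion` is exclusive (it removes any case carrying the matched category) and `AllCategoriesFilter` is inclusive, both can coexist in one config: BenchmarkDotNet only runs a case if it passes every registered `IFilter`.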