diff --git a/BepuPhysics/CollisionDetection/CollidableOverlapFinder.cs b/BepuPhysics/CollisionDetection/CollidableOverlapFinder.cs
index 18f29993..25c7a7c3 100644
--- a/BepuPhysics/CollisionDetection/CollidableOverlapFinder.cs
+++ b/BepuPhysics/CollisionDetection/CollidableOverlapFinder.cs
@@ -73,19 +73,19 @@ public void Handle(int indexA, int indexB)
         int nextJobIndex;
         public CollidableOverlapFinder(NarrowPhase narrowPhase, BroadPhase broadPhase)
         {
-            selfTestContext = new Tree.MultithreadedSelfTest<SelfOverlapHandler>(narrowPhase.Pool);
-            intertreeTestContext = new Tree.MultithreadedIntertreeTest<IntertreeOverlapHandler>(narrowPhase.Pool);
+            selfTestContext = new Tree.MultithreadedSelfTest<SelfOverlapHandler>();
+            intertreeTestContext = new Tree.MultithreadedIntertreeTest<IntertreeOverlapHandler>();
             this.narrowPhase = narrowPhase;
             this.broadPhase = broadPhase;
             workerAction = Worker;
 
             //VERSION 2
-            selfTestContext2 = new Tree.MultithreadedSelfTest<PairCollector>(narrowPhase.Pool);
-            intertreeTestContext2 = new Tree.MultithreadedIntertreeTest<PairCollector>(narrowPhase.Pool);
+            selfTestContext2 = new Tree.MultithreadedSelfTest<PairCollector>();
+            intertreeTestContext2 = new Tree.MultithreadedIntertreeTest<PairCollector>();
 
             //VERSION 3
-            selfTestContext3 = new Tree.MultithreadedSelfTest<PairCollector3>(narrowPhase.Pool);
-            intertreeTestContext3 = new Tree.MultithreadedIntertreeTest<PairCollector3>(narrowPhase.Pool);
+            selfTestContext3 = new Tree.MultithreadedSelfTest<PairCollector3>();
+            intertreeTestContext3 = new Tree.MultithreadedIntertreeTest<PairCollector3>();
         }
 
         void Worker(int workerIndex)
@@ -142,8 +142,8 @@ public override void DispatchOverlaps(float dt, IThreadDispatcher threadDispatch
                     intertreeHandlers[i] = new IntertreeOverlapHandler(broadPhase.ActiveLeaves, broadPhase.StaticLeaves, narrowPhase, i);
                 }
                 Debug.Assert(intertreeHandlers.Length >= threadDispatcher.ThreadCount);
-                selfTestContext.PrepareJobs(ref broadPhase.ActiveTree, selfHandlers, threadDispatcher.ThreadCount);
-                intertreeTestContext.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, intertreeHandlers, threadDispatcher.ThreadCount);
+                selfTestContext.PrepareJobs(ref broadPhase.ActiveTree, selfHandlers, threadDispatcher.ThreadCount, 0, narrowPhase.Pool);
+                intertreeTestContext.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, intertreeHandlers, threadDispatcher.ThreadCount, 0, narrowPhase.Pool);
                 nextJobIndex = -1;
                 var totalJobCount = selfTestContext.JobCount + intertreeTestContext.JobCount;
                 threadDispatcher.DispatchWorkers(workerAction, totalJobCount);
@@ -452,8 +452,8 @@ public void DispatchOverlaps2(float dt, IThreadDispatcher threadDispatcher = nul
                     intertreeTestHandlers[i] = new PairCollector(threadPool, pairs, broadPhase.ActiveLeaves, broadPhase.StaticLeaves);
                 }
                 Debug.Assert(intertreeTestHandlers.Length >= threadDispatcher.ThreadCount);
-                selfTestContext2.PrepareJobs(ref broadPhase.ActiveTree, selfTestHandlers, threadDispatcher.ThreadCount);
-                intertreeTestContext2.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, intertreeTestHandlers, threadDispatcher.ThreadCount);
+                selfTestContext2.PrepareJobs(ref broadPhase.ActiveTree, selfTestHandlers, threadDispatcher.ThreadCount, 0, narrowPhase.Pool);
+                intertreeTestContext2.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, intertreeTestHandlers, threadDispatcher.ThreadCount, 0, narrowPhase.Pool);
                 var testTaskCount = selfTestContext2.JobCount + intertreeTestContext2.JobCount;
                 var effectiveJobCount = int.Max(1, int.Max(previousPairCount, testTaskCount));
                 var taskStack = new TaskStack(narrowPhase.Pool, threadDispatcher, threadDispatcher.ThreadCount);
@@ -559,7 +559,44 @@ public void DispatchOverlaps2(float dt, IThreadDispatcher threadDispatcher = nul
 
-
+        unsafe static void PrepareSelfTestJob3(long id, void* context, int workerIndex, IThreadDispatcher threadDispatcher)
+        {
+            var overlapFinder = (CollidableOverlapFinder)threadDispatcher.ManagedContext;
+            overlapFinder.selfTestContext3.PrepareJobs(ref overlapFinder.broadPhase.ActiveTree, overlapFinder.selfTestHandlers3, threadDispatcher.ThreadCount, workerIndex, threadDispatcher.WorkerPools[workerIndex]);
+        }
+        unsafe static void PrepareIntertreeTestJob3(long id, void* context, int workerIndex, IThreadDispatcher threadDispatcher)
+        {
+            var overlapFinder = (CollidableOverlapFinder)threadDispatcher.ManagedContext;
+            var broadPhase = overlapFinder.broadPhase;
+            overlapFinder.intertreeTestContext3.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, overlapFinder.intertreeTestHandlers3, threadDispatcher.ThreadCount, workerIndex, threadDispatcher.WorkerPools[workerIndex]);
+        }
+        unsafe static void CompletedPreparation3(long id, void* context, int workerIndex, IThreadDispatcher threadDispatcher)
+        {
+            var overlapFinder = (CollidableOverlapFinder)threadDispatcher.ManagedContext;
+            var selfTest = overlapFinder.selfTestContext3;
+            var intertreeTest = overlapFinder.intertreeTestContext3;
+            var broadTaskCount = selfTest.JobCount + intertreeTest.JobCount;
+            //We'll use a continuation to notify us when all broad jobs are complete by stopping the broad stack.
+            ContinuationHandle broadPhaseCompleteContinuation = default;
+            var broadStack = overlapFinder.broadStack;
+            if (broadTaskCount > 0)
+                broadPhaseCompleteContinuation = broadStack->AllocateContinuation(broadTaskCount, workerIndex, threadDispatcher, TaskStack.GetRequestStopTask(broadStack));
+            if (selfTest.JobCount > 0)
+                broadStack->PushFor(&SelfTestJob3, null, 0, selfTest.JobCount, workerIndex, threadDispatcher, continuation: broadPhaseCompleteContinuation);
+            if (intertreeTest.JobCount > 0)
+                broadStack->PushFor(&IntertreeTestJob3, null, 0, intertreeTest.JobCount, workerIndex, threadDispatcher, continuation: broadPhaseCompleteContinuation);
+
+            //Go ahead and flush the narrow phase work that the preparation phase generated, if any. If there is any left, then it's not full sized (because that would have been flushed), but that's fine.
+            //This is mostly free. At this point, we almost certainly have idling workers.
+            overlapFinder.taskAccumulators[workerIndex].FlushToStack(workerIndex, threadDispatcher);
+
+            if (broadTaskCount == 0)
+            {
+                //The broad phase didn't actually have work to do, so we can just stop it now.
+                //Note that this stop was submitted *after* we flushed the stack! That's because the stop is a sync point, and we want to make sure that all the narrow phase work created by the preparation phase is submitted to the narrow phase stack.
+                broadStack->RequestStop();
+            }
+        }
 
         unsafe static void SelfTestJob3(long id, void* context, int workerIndex, IThreadDispatcher threadDispatcher)
         {
@@ -610,36 +647,7 @@ unsafe static void Worker3(int workerIndex, IThreadDispatcher threadDispatcher)
             var overlapFinder = (CollidableOverlapFinder)threadDispatcher.ManagedContext;
             var broadPhase = overlapFinder.broadPhase;
             var broadStack = overlapFinder.broadStack;
-            if (workerIndex == 0)
-            {
-                //The worker 0 is responsible for getting everything set up.
-                //We'll run the job preparation on this thread while the other threads are getting into position.
-                //Note that the job preparation phase can generate narrow phase tests, so those other threads may end up doing something even before we push anything here.
-                var selfTest = overlapFinder.selfTestContext3;
-                var intertreeTest = overlapFinder.intertreeTestContext3;
-                selfTest.PrepareJobs(ref broadPhase.ActiveTree, overlapFinder.selfTestHandlers3, threadDispatcher.ThreadCount);
-                intertreeTest.PrepareJobs(ref broadPhase.ActiveTree, ref broadPhase.StaticTree, overlapFinder.intertreeTestHandlers3, threadDispatcher.ThreadCount);
-                var broadTaskCount = selfTest.JobCount + intertreeTest.JobCount;
-                //We'll use a continuation to notify us when all broad jobs are complete by stopping the broad stack.
-                ContinuationHandle broadPhaseCompleteContinuation = default;
-                if (broadTaskCount > 0)
-                    broadPhaseCompleteContinuation = broadStack->AllocateContinuation(broadTaskCount, workerIndex, threadDispatcher, TaskStack.GetRequestStopTask(broadStack));
-                if (selfTest.JobCount > 0)
-                    broadStack->PushFor(&SelfTestJob3, null, 0, selfTest.JobCount, workerIndex, threadDispatcher, continuation: broadPhaseCompleteContinuation);
-                if (intertreeTest.JobCount > 0)
-                    broadStack->PushFor(&IntertreeTestJob3, null, 0, intertreeTest.JobCount, workerIndex, threadDispatcher, continuation: broadPhaseCompleteContinuation);
-
-                //Go ahead and flush the narrow phase work that the preparation phase generated, if any. If there is any left, then it's not full sized (because that would have been flushed), but that's fine.
-                //This is mostly free. At this point, we almost certainly have idling workers.
-                overlapFinder.taskAccumulators[workerIndex].FlushToStack(workerIndex, threadDispatcher);
-
-                if (broadTaskCount == 0)
-                {
-                    //The broad phase didn't actually have work to do, so we can just stop it now.
-                    //Note that this stop was submitted *after* we flushed the stack! That's because the stop is a sync point, and we want to make sure that all the narrow phase work created by the preparation phase is submitted to the narrow phase stack.
-                    broadStack->RequestStop();
-                }
-            }
             //The worker stays active for the duration of the dispatch that covers both the broad phase and narrow phase.
             //We'll grab tree testing jobs with priority since those generate the narrow phase jobs.
             var waiter = new SpinWait();
@@ -830,6 +838,11 @@ public void DispatchOverlaps3(float dt, IThreadDispatcher threadDispatcher = nul
                 intertreeTestHandlers3[i] = new PairCollector3(threadDispatcher, threadAccumulator, i, broadPhase.ActiveLeaves, broadPhase.StaticLeaves);
             }
             Debug.Assert(intertreeTestHandlers3.Length >= threadDispatcher.ThreadCount);
+            //Submit both self test and intertree test jobs to the broad phase stack; they can run in parallel.
+            //They're pretty cheap, but since we've got the fork infrastructure, we might as well use it.
+            var completedPreparationContinuation = broadStack->AllocateContinuation(2, 0, threadDispatcher, new Task(&CompletedPreparation3));
+            broadStack->PushUnsafely(new Task(&PrepareSelfTestJob3, continuation: completedPreparationContinuation), 0, threadDispatcher);
+            broadStack->PushUnsafely(new Task(&PrepareIntertreeTestJob3, continuation: completedPreparationContinuation), 0, threadDispatcher);
             threadDispatcher.DispatchWorkers(&Worker3, managedContext: this);
 
             narrowTaskStack.Dispose(narrowPhase.Pool, threadDispatcher);
diff --git a/BepuPhysics/Trees/Tree_IntertreeQueriesMT.cs b/BepuPhysics/Trees/Tree_IntertreeQueriesMT.cs
index ba9732b3..561229ae 100644
--- a/BepuPhysics/Trees/Tree_IntertreeQueriesMT.cs
+++ b/BepuPhysics/Trees/Tree_IntertreeQueriesMT.cs
@@ -28,18 +28,16 @@ struct Job
             public Tree TreeB;
             public TOverlapHandler[] OverlapHandlers;
 
-            public MultithreadedIntertreeTest(BufferPool pool)
-            {
-                Pool = pool;
-            }
-
             /// <summary>
             /// Prepares the jobs associated with a self test. Must be called before a dispatch over PairTest.
             /// </summary>
             /// <param name="overlapHandlers">Callbacks used to handle individual overlaps detected by the self test.</param>
             /// <param name="threadCount">Number of threads to prepare jobs for.</param>
-            public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overlapHandlers, int threadCount)
+            /// <param name="workerIndex">Index of the worker executing the preparation job.</param>
+            /// <param name="pool">Pool to allocate from.</param>
+            public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overlapHandlers, int threadCount, int workerIndex, BufferPool pool)
             {
+                Pool = pool;
                 if (treeA.LeafCount == 0 || treeB.LeafCount == 0)
                 {
                     //If either tree has zero leaves, no intertree test is required.
@@ -58,10 +56,11 @@ public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overla
                 this.TreeA = treeA;
                 this.TreeB = treeB;
                 //Collect jobs.
+                ref var handler = ref OverlapHandlers[workerIndex];
                 if (treeA.LeafCount >= 2 && treeB.LeafCount >= 2)
                 {
                     //Both trees have complete nodes; we can use a general case.
-                    GetJobsBetweenDifferentNodes(ref treeA.Nodes[0], ref treeB.Nodes[0], ref OverlapHandlers[0]);
+                    GetJobsBetweenDifferentNodes(ref treeA.Nodes[0], ref treeB.Nodes[0], ref handler);
                 }
                 else if (treeA.LeafCount == 1 && treeB.LeafCount >= 2)
                 {
@@ -72,11 +71,11 @@ public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overla
                     var abIntersects = BoundingBox.IntersectsUnsafe(a.A, b.B);
                     if (aaIntersects)
                     {
-                        DispatchTestForNodes(ref a.A, ref b.A, ref OverlapHandlers[0]);
+                        DispatchTestForNodes(ref a.A, ref b.A, ref handler);
                     }
                     if (abIntersects)
                     {
-                        DispatchTestForNodes(ref a.A, ref b.B, ref OverlapHandlers[0]);
+                        DispatchTestForNodes(ref a.A, ref b.B, ref handler);
                     }
                 }
                 else if (treeA.LeafCount >= 2 && treeB.LeafCount == 1)
@@ -88,11 +87,11 @@ public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overla
                     var baIntersects = BoundingBox.IntersectsUnsafe(a.B, b.A);
                     if (aaIntersects)
                     {
-                        DispatchTestForNodes(ref a.A, ref b.A, ref OverlapHandlers[0]);
+                        DispatchTestForNodes(ref a.A, ref b.A, ref handler);
                     }
                     if (baIntersects)
                     {
-                        DispatchTestForNodes(ref a.B, ref b.A, ref OverlapHandlers[0]);
+                        DispatchTestForNodes(ref a.B, ref b.A, ref handler);
                     }
                 }
                 else
@@ -100,20 +99,21 @@ public void PrepareJobs(ref Tree treeA, ref Tree treeB, TOverlapHandler[] overla
                     Debug.Assert(treeA.LeafCount == 1 && treeB.LeafCount == 1);
                     if (BoundingBox.IntersectsUnsafe(treeA.Nodes[0].A, treeB.Nodes[0].A))
                     {
-                        DispatchTestForNodes(ref treeA.Nodes[0].A, ref treeB.Nodes[0].A, ref OverlapHandlers[0]);
+                        DispatchTestForNodes(ref treeA.Nodes[0].A, ref treeB.Nodes[0].A, ref handler);
                     }
                 }
             }
 
             /// <summary>
-            /// Cleans up after a multithreaded self test.
+            /// Cleans up after a multithreaded self test. Returns resources to the pool used by <see cref="PrepareJobs"/>.
             /// </summary>
             public void CompleteTest()
             {
                 //Note that we don't allocate a job list if there aren't any jobs.
                 if (jobs.Span.Allocated)
                     jobs.Dispose(Pool);
+                Pool = null;
             }
 
             public void ExecuteJob(int jobIndex, int workerIndex)
diff --git a/BepuPhysics/Trees/Tree_SelfQueriesMT.cs b/BepuPhysics/Trees/Tree_SelfQueriesMT.cs
index 21d36181..a03aec38 100644
--- a/BepuPhysics/Trees/Tree_SelfQueriesMT.cs
+++ b/BepuPhysics/Trees/Tree_SelfQueriesMT.cs
@@ -1,281 +1,280 @@
-using BepuUtilities;
-using BepuUtilities.Collections;
-using BepuUtilities.Memory;
-using System;
-using System.Diagnostics;
-using System.Runtime.CompilerServices;
-using System.Threading;
-
-namespace BepuPhysics.Trees
-{
-    partial struct Tree
-    {
-        //TODO:
-        //There are a some issues inherited from the prototype that we'd like to address at some point:
-        //1) Recursion. There's no reason to use recursion here.
-        //2) Duplicate work with the single threaded variant. The current load balancing approach uses a single threaded pass to dive into the tree, and that logic
-        //is basically identical. It would be great to have a zero overhead abstraction that unifies the two. Unclear how useful this is- it's possible that the abstraction
-        //would end up being more complex than just two near-identical implementations.
-        //3) Limited workstealing capacity. While we can dive arbitrarily far in the first pass, it increases the single threaded phase.
-        //If the narrow phase relies on the broadphase for its work balancing (that is, the overlap handler directly triggers narrow phase work),
-        //you may need to dive so deeply to maintain load balance that the single threaded phase starts to limit parallelism meaningfully.
-        //Any constant cost less than ~5us is basically irrelevant, though- if you can collect 128 nodepairs to test in 5us, that would likely be enough to load balance the narrow phase
-        //even on something like 16 cores.
-        //4) If the handler directly executes narrow phase work, overlaps handled during the single threaded collection phase could be nasty. This should be pretty rare for any nontrivial
-        //tree, but it's still something to be aware of in corner cases.
-
-        //To specifically address #3 above, consider explicit workstealing. When a worker is out of directly accessible work (its exhausted its own stack, and no more precollected roots exist),
-        //it could snoop other worker stacks. This would introduce sync requirements on every stack.
-        //1) The stealer would probably start at claim 0 and walk forward. The largest jobs are at the top of the stack, which gives you the most bang for the sync work buck.
-        //It would check the claims state of each stack entry- there would be a integer on each entry marking it as claimed or not. Once a candidate is found, compare exchange to claim it.
-        //It would have to distinguish between 'stolen' blocks and locally claimed blocks. A thief can step over stolen blocks, but if it hits a locally claimed block, it has to stop.
-        //2) While pushing new jobs to the local stack is free, victims must always check to confirm that a stack pop will not consume a job that has been stolen by another thread.
-        //Given that shallow stack accesses will tend to be less work, the local thread should probably prefer claiming chunks of its stack at a time. It can do this simply by
-        //performing a compare exchange on a stack element the desired number of elements up the stack. Since thieves always work step by step without leaving any gaps, the local thread
-        //can block them by claiming at any (unclaimed) point in the stack. All later stack entries can be unaffected. In practice, this means local threads should be able to
-        //avoid doing interlocked operations on the overwhelming majority of pop operations.
-
-        //With such a scheme, you would still want to somehow collect an initial set of jobs to give workers something to munch on, but you don't need lots of jobs per worker anymore.
-        //So, if you had a 128 core machine, you could get away with still having ~256 jobs- which you can probably collect in less than 20us even on lower frequency processors
-        //(like the ones you'd find in a 128 core machine).
-
-        public class MultithreadedSelfTest<TOverlapHandler> where TOverlapHandler : struct, IOverlapHandler
-        {
-            struct Job
-            {
-                public int A;
-                public int B;
-            }
-
-            public BufferPool Pool;
-
-            int NextNodePair;
-            int leafThreshold;
-            private QuickList<Job> jobs;
-            public int JobCount => jobs.Count;
-            public Tree Tree;
-            public TOverlapHandler[] OverlapHandlers;
-
-            public MultithreadedSelfTest(BufferPool pool)
-            {
-                Pool = pool;
-            }
-
-            /// <summary>
-            /// Prepares the jobs associated with a self test. Must be called before a dispatch over PairTest.
-            /// </summary>
-            /// <param name="tree">Tree to test against itself.</param>
-            /// <param name="overlapHandlers">Callbacks used to handle individual overlaps detected by the self test.</param>
-            /// <param name="threadCount">Number of threads to prepare jobs for.</param>
-            public void PrepareJobs(ref Tree tree, TOverlapHandler[] overlapHandlers, int threadCount)
-            {
-                //If there are not multiple children, there's no need to recurse.
-                //This provides a guarantee that there are at least 2 children in each internal node considered by GetOverlapsInNode.
-                if (tree.LeafCount < 2)
-                {
-                    //We clear it out to avoid keeping any old job counts. The count property is used for scheduling, so incorrect values could break the job scheduler.
-                    jobs = new QuickList<Job>();
-                    return;
-                }
-                Debug.Assert(overlapHandlers.Length >= threadCount);
-                const float jobMultiplier = 8f;
-                var targetJobCount = Math.Max(1, jobMultiplier * threadCount);
-                leafThreshold = (int)(tree.LeafCount / targetJobCount);
-                jobs = new QuickList<Job>((int)(targetJobCount * 2), Pool);
-                NextNodePair = -1;
-                this.OverlapHandlers = overlapHandlers;
-                this.Tree = tree;
-                //Collect jobs.
-                CollectJobsInNode(0, tree.LeafCount, ref OverlapHandlers[0]);
-            }
-
-            /// <summary>
-            /// Cleans up after a multithreaded self test.
-            /// </summary>
-            public void CompleteSelfTest()
-            {
-                //Note that a tree with 0 or 1 entries won't have any jobs.
-                if (jobs.Span.Allocated)
-                    jobs.Dispose(Pool);
-            }
-
-            public void ExecuteJob(int jobIndex, int workerIndex)
-            {
-                ref var overlap = ref jobs[jobIndex];
-                if (overlap.A >= 0)
-                {
-                    if (overlap.A == overlap.B)
-                    {
-                        //Same node.
-                        Tree.GetOverlapsInNode(ref Tree.Nodes[overlap.A], ref OverlapHandlers[workerIndex]);
-                    }
-                    else if (overlap.B >= 0)
-                    {
-                        //Different nodes.
-                        Tree.GetOverlapsBetweenDifferentNodes(ref Tree.Nodes[overlap.A], ref Tree.Nodes[overlap.B], ref OverlapHandlers[workerIndex]);
-                    }
-                    else
-                    {
-                        //A is an internal node, B is a leaf.
-                        var leafIndex = Encode(overlap.B);
-                        ref var leaf = ref Tree.Leaves[leafIndex];
-                        ref var childOwningLeaf = ref Unsafe.Add(ref Tree.Nodes[leaf.NodeIndex].A, leaf.ChildIndex);
-                        Tree.TestLeafAgainstNode(leafIndex, ref childOwningLeaf, overlap.A, ref OverlapHandlers[workerIndex]);
-                    }
-                }
-                else
-                {
-                    //A is a leaf, B is internal.
-                    var leafIndex = Encode(overlap.A);
-                    ref var leaf = ref Tree.Leaves[leafIndex];
-                    ref var childOwningLeaf = ref Unsafe.Add(ref Tree.Nodes[leaf.NodeIndex].A, leaf.ChildIndex);
-                    Tree.TestLeafAgainstNode(leafIndex, ref childOwningLeaf, overlap.B, ref OverlapHandlers[workerIndex]);
-
-                    //NOTE THAT WE DO NOT HANDLE THE CASE THAT BOTH A AND B ARE LEAVES HERE.
-                    //The collection routine should take care of that, since it has more convenient access to bounding boxes and because a single test isn't worth an atomic increment.
-                }
-            }
-            /// <summary>
-            /// Executes a single worker of the multithreaded self test.
-            /// </summary>
-            /// <param name="workerIndex">Index of the worker executing this set of tests.</param>
-            public void PairTest(int workerIndex)
-            {
-                Debug.Assert(workerIndex >= 0 && workerIndex < OverlapHandlers.Length);
-                int nextNodePairIndex;
-                //To minimize the number of worker overlap lists, perform direct load balancing by manually grabbing the next indices.
-                while ((nextNodePairIndex = Interlocked.Increment(ref NextNodePair)) < jobs.Count)
-                {
-                    ExecuteJob(nextNodePairIndex, workerIndex);
-                }
-            }
-
-            void DispatchTestForLeaf(int leafIndex, ref NodeChild leafChild, int nodeIndex, int nodeLeafCount, ref TOverlapHandler results)
-            {
-                if (nodeIndex < 0)
-                {
-                    results.Handle(leafIndex, Encode(nodeIndex));
-                }
-                else
-                {
-                    if (nodeLeafCount <= leafThreshold)
-                        jobs.Add(new Job { A = Encode(leafIndex), B = nodeIndex }, Pool);
-                    else
-                        TestLeafAgainstNode(leafIndex, ref leafChild, nodeIndex, ref results);
-                }
-            }
-
-            void TestLeafAgainstNode(int leafIndex, ref NodeChild leafChild, int nodeIndex, ref TOverlapHandler results)
-            {
-                ref var node = ref Tree.Nodes[nodeIndex];
-                ref var a = ref node.A;
-                ref var b = ref node.B;
-                //Despite recursion, leafBounds should remain in L1- it'll be used all the way down the recursion from here.
-                //However, while we likely loaded child B when we loaded child A, there's no guarantee that it will stick around.
-                //Reloading that in the event of eviction would require more work than keeping the derived data on the stack.
-                //TODO: this is some pretty questionable microtuning. It's not often that the post-leaf-found recursion will be long enough to evict L1. Definitely test it.
-                var bIndex = b.Index;
-                var bLeafCount = b.LeafCount;
-                var aIntersects = BoundingBox.IntersectsUnsafe(leafChild, a);
-                var bIntersects = BoundingBox.IntersectsUnsafe(leafChild, b);
-                if (aIntersects)
-                {
-                    DispatchTestForLeaf(leafIndex, ref leafChild, a.Index, a.LeafCount, ref results);
-                }
-                if (bIntersects)
-                {
-                    DispatchTestForLeaf(leafIndex, ref leafChild, bIndex, bLeafCount, ref results);
-                }
-            }
-
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            void DispatchTestForNodes(ref NodeChild a, ref NodeChild b, ref TOverlapHandler results)
-            {
-                if (a.Index >= 0)
-                {
-                    if (b.Index >= 0)
-                    {
-                        if (a.LeafCount + b.LeafCount <= leafThreshold)
-                            jobs.Add(new Job { A = a.Index, B = b.Index }, Pool);
-                        else
-                            GetJobsBetweenDifferentNodes(ref Tree.Nodes[a.Index], ref Tree.Nodes[b.Index], ref results);
-
-                    }
-                    else
-                    {
-                        //leaf B versus node A.
-                        TestLeafAgainstNode(Encode(b.Index), ref b, a.Index, ref results);
-                    }
-                }
-                else if (b.Index >= 0)
-                {
-                    //leaf A versus node B.
-                    TestLeafAgainstNode(Encode(a.Index), ref a, b.Index, ref results);
-                }
-                else
-                {
-                    //Two leaves.
-                    results.Handle(Encode(a.Index), Encode(b.Index));
-                }
-            }
-
-            void GetJobsBetweenDifferentNodes(ref Node a, ref Node b, ref TOverlapHandler results)
-            {
-                //There are no shared children, so test them all.
-
-                ref var aa = ref a.A;
-                ref var ab = ref a.B;
-                ref var ba = ref b.A;
-                ref var bb = ref b.B;
-                var aaIntersects = BoundingBox.IntersectsUnsafe(aa, ba);
-                var abIntersects = BoundingBox.IntersectsUnsafe(aa, bb);
-                var baIntersects = BoundingBox.IntersectsUnsafe(ab, ba);
-                var bbIntersects = BoundingBox.IntersectsUnsafe(ab, bb);
-
-                if (aaIntersects)
-                {
-                    DispatchTestForNodes(ref aa, ref ba, ref results);
-                }
-                if (abIntersects)
-                {
-                    DispatchTestForNodes(ref aa, ref bb, ref results);
-                }
-                if (baIntersects)
-                {
-                    DispatchTestForNodes(ref ab, ref ba, ref results);
-                }
-                if (bbIntersects)
-                {
-                    DispatchTestForNodes(ref ab, ref bb, ref results);
-                }
-
-            }
-
-            void CollectJobsInNode(int nodeIndex, int leafCount, ref TOverlapHandler results)
-            {
-                if (leafCount <= leafThreshold)
-                {
-                    jobs.Add(new Job { A = nodeIndex, B = nodeIndex }, Pool);
-                    return;
-                }
-
-                ref var node = ref Tree.Nodes[nodeIndex];
-                ref var a = ref node.A;
-                ref var b = ref node.B;
-
-                var ab = BoundingBox.IntersectsUnsafe(a, b);
-
-                if (a.Index >= 0)
-                    CollectJobsInNode(a.Index, a.LeafCount, ref results);
-                if (b.Index >= 0)
-                    CollectJobsInNode(b.Index, b.LeafCount, ref results);
-
-                if (ab)
-                {
-                    DispatchTestForNodes(ref a, ref b, ref results);
-                }
-
-            }
-        }
-    }
-}
+using BepuUtilities;
+using BepuUtilities.Collections;
+using BepuUtilities.Memory;
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Threading;
+
+namespace BepuPhysics.Trees
+{
+    partial struct Tree
+    {
+        //TODO:
+        //There are some issues inherited from the prototype that we'd like to address at some point:
+        //1) Recursion. There's no reason to use recursion here.
+        //2) Duplicate work with the single threaded variant. The current load balancing approach uses a single threaded pass to dive into the tree, and that logic
+        //is basically identical. It would be great to have a zero overhead abstraction that unifies the two. Unclear how useful this is- it's possible that the abstraction
+        //would end up being more complex than just two near-identical implementations.
+        //3) Limited workstealing capacity. While we can dive arbitrarily far in the first pass, it increases the single threaded phase.
+        //If the narrow phase relies on the broadphase for its work balancing (that is, the overlap handler directly triggers narrow phase work),
+        //you may need to dive so deeply to maintain load balance that the single threaded phase starts to limit parallelism meaningfully.
+        //Any constant cost less than ~5us is basically irrelevant, though- if you can collect 128 nodepairs to test in 5us, that would likely be enough to load balance the narrow phase
+        //even on something like 16 cores.
+        //4) If the handler directly executes narrow phase work, overlaps handled during the single threaded collection phase could be nasty. This should be pretty rare for any nontrivial
+        //tree, but it's still something to be aware of in corner cases.
+
+        //To specifically address #3 above, consider explicit workstealing. When a worker is out of directly accessible work (it's exhausted its own stack, and no more precollected roots exist),
+        //it could snoop other worker stacks. This would introduce sync requirements on every stack.
+        //1) The stealer would probably start at claim 0 and walk forward. The largest jobs are at the top of the stack, which gives you the most bang for the sync work buck.
+        //It would check the claims state of each stack entry- there would be an integer on each entry marking it as claimed or not. Once a candidate is found, compare exchange to claim it.
+        //It would have to distinguish between 'stolen' blocks and locally claimed blocks. A thief can step over stolen blocks, but if it hits a locally claimed block, it has to stop.
+        //2) While pushing new jobs to the local stack is free, victims must always check to confirm that a stack pop will not consume a job that has been stolen by another thread.
+        //Given that shallow stack accesses will tend to be less work, the local thread should probably prefer claiming chunks of its stack at a time. It can do this simply by
+        //performing a compare exchange on a stack element the desired number of elements up the stack. Since thieves always work step by step without leaving any gaps, the local thread
+        //can block them by claiming at any (unclaimed) point in the stack. All later stack entries can be unaffected. In practice, this means local threads should be able to
+        //avoid doing interlocked operations on the overwhelming majority of pop operations.
+
+        //With such a scheme, you would still want to somehow collect an initial set of jobs to give workers something to munch on, but you don't need lots of jobs per worker anymore.
+        //So, if you had a 128 core machine, you could get away with still having ~256 jobs- which you can probably collect in less than 20us even on lower frequency processors
+        //(like the ones you'd find in a 128 core machine).
+
+        public class MultithreadedSelfTest<TOverlapHandler> where TOverlapHandler : struct, IOverlapHandler
+        {
+            struct Job
+            {
+                public int A;
+                public int B;
+            }
+
+            public BufferPool Pool;
+
+            int NextNodePair;
+            int leafThreshold;
+            private QuickList<Job> jobs;
+            public int JobCount => jobs.Count;
+            public Tree Tree;
+            public TOverlapHandler[] OverlapHandlers;
+
+            /// <summary>
+            /// Prepares the jobs associated with a self test. Must be called before a dispatch over PairTest.
+            /// </summary>
+            /// <param name="tree">Tree to test against itself.</param>
+            /// <param name="overlapHandlers">Callbacks used to handle individual overlaps detected by the self test.</param>
+            /// <param name="threadCount">Number of threads to prepare jobs for.</param>
+            /// <param name="workerIndex">Index of the worker executing the preparation job.</param>
+            /// <param name="pool">Pool to allocate from.</param>
+            public void PrepareJobs(ref Tree tree, TOverlapHandler[] overlapHandlers, int threadCount, int workerIndex, BufferPool pool)
+            {
+                Pool = pool;
+                //If there are not multiple children, there's no need to recurse.
+                //This provides a guarantee that there are at least 2 children in each internal node considered by GetOverlapsInNode.
+                if (tree.LeafCount < 2)
+                {
+                    //We clear it out to avoid keeping any old job counts. The count property is used for scheduling, so incorrect values could break the job scheduler.
+                    jobs = new QuickList<Job>();
+                    return;
+                }
+                Debug.Assert(overlapHandlers.Length >= threadCount);
+                const float jobMultiplier = 8f;
+                var targetJobCount = Math.Max(1, jobMultiplier * threadCount);
+                leafThreshold = (int)(tree.LeafCount / targetJobCount);
+                jobs = new QuickList<Job>((int)(targetJobCount * 2), Pool);
+                NextNodePair = -1;
+                this.OverlapHandlers = overlapHandlers;
+                this.Tree = tree;
+                //Collect jobs.
+                CollectJobsInNode(0, tree.LeafCount, ref OverlapHandlers[workerIndex]);
+            }
+
+            /// <summary>
+            /// Cleans up after a multithreaded self test. Returns resources to the pool used by <see cref="PrepareJobs"/>.
+            /// </summary>
+            public void CompleteSelfTest()
+            {
+                //Note that a tree with 0 or 1 entries won't have any jobs.
+                if (jobs.Span.Allocated)
+                    jobs.Dispose(Pool);
+                Pool = null;
+            }
+
+            public void ExecuteJob(int jobIndex, int workerIndex)
+            {
+                ref var overlap = ref jobs[jobIndex];
+                if (overlap.A >= 0)
+                {
+                    if (overlap.A == overlap.B)
+                    {
+                        //Same node.
+                        Tree.GetOverlapsInNode(ref Tree.Nodes[overlap.A], ref OverlapHandlers[workerIndex]);
+                    }
+                    else if (overlap.B >= 0)
+                    {
+                        //Different nodes.
+                        Tree.GetOverlapsBetweenDifferentNodes(ref Tree.Nodes[overlap.A], ref Tree.Nodes[overlap.B], ref OverlapHandlers[workerIndex]);
+                    }
+                    else
+                    {
+                        //A is an internal node, B is a leaf.
+                        var leafIndex = Encode(overlap.B);
+                        ref var leaf = ref Tree.Leaves[leafIndex];
+                        ref var childOwningLeaf = ref Unsafe.Add(ref Tree.Nodes[leaf.NodeIndex].A, leaf.ChildIndex);
+                        Tree.TestLeafAgainstNode(leafIndex, ref childOwningLeaf, overlap.A, ref OverlapHandlers[workerIndex]);
+                    }
+                }
+                else
+                {
+                    //A is a leaf, B is internal.
+                    var leafIndex = Encode(overlap.A);
+                    ref var leaf = ref Tree.Leaves[leafIndex];
+                    ref var childOwningLeaf = ref Unsafe.Add(ref Tree.Nodes[leaf.NodeIndex].A, leaf.ChildIndex);
+                    Tree.TestLeafAgainstNode(leafIndex, ref childOwningLeaf, overlap.B, ref OverlapHandlers[workerIndex]);
+
+                    //NOTE THAT WE DO NOT HANDLE THE CASE THAT BOTH A AND B ARE LEAVES HERE.
+                    //The collection routine should take care of that, since it has more convenient access to bounding boxes and because a single test isn't worth an atomic increment.
+                }
+            }
+            /// <summary>
+            /// Executes a single worker of the multithreaded self test.
+            /// </summary>
+            /// <param name="workerIndex">Index of the worker executing this set of tests.</param>
+            public void PairTest(int workerIndex)
+            {
+                Debug.Assert(workerIndex >= 0 && workerIndex < OverlapHandlers.Length);
+                int nextNodePairIndex;
+                //To minimize the number of worker overlap lists, perform direct load balancing by manually grabbing the next indices.
+                while ((nextNodePairIndex = Interlocked.Increment(ref NextNodePair)) < jobs.Count)
+                {
+                    ExecuteJob(nextNodePairIndex, workerIndex);
+                }
+            }
+
+            void DispatchTestForLeaf(int leafIndex, ref NodeChild leafChild, int nodeIndex, int nodeLeafCount, ref TOverlapHandler results)
+            {
+                if (nodeIndex < 0)
+                {
+                    results.Handle(leafIndex, Encode(nodeIndex));
+                }
+                else
+                {
+                    if (nodeLeafCount <= leafThreshold)
+                        jobs.Add(new Job { A = Encode(leafIndex), B = nodeIndex }, Pool);
+                    else
+                        TestLeafAgainstNode(leafIndex, ref leafChild, nodeIndex, ref results);
+                }
+            }
+
+            void TestLeafAgainstNode(int leafIndex, ref NodeChild leafChild, int nodeIndex, ref TOverlapHandler results)
+            {
+                ref var node = ref Tree.Nodes[nodeIndex];
+                ref var a = ref node.A;
+                ref var b = ref node.B;
+                //Despite recursion, leafBounds should remain in L1- it'll be used all the way down the recursion from here.
+                //However, while we likely loaded child B when we loaded child A, there's no guarantee that it will stick around.
+                //Reloading that in the event of eviction would require more work than keeping the derived data on the stack.
+                //TODO: this is some pretty questionable microtuning. It's not often that the post-leaf-found recursion will be long enough to evict L1. Definitely test it.
+                var bIndex = b.Index;
+                var bLeafCount = b.LeafCount;
+                var aIntersects = BoundingBox.IntersectsUnsafe(leafChild, a);
+                var bIntersects = BoundingBox.IntersectsUnsafe(leafChild, b);
+                if (aIntersects)
+                {
+                    DispatchTestForLeaf(leafIndex, ref leafChild, a.Index, a.LeafCount, ref results);
+                }
+                if (bIntersects)
+                {
+                    DispatchTestForLeaf(leafIndex, ref leafChild, bIndex, bLeafCount, ref results);
+                }
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            void DispatchTestForNodes(ref NodeChild a, ref NodeChild b, ref TOverlapHandler results)
+            {
+                if (a.Index >= 0)
+                {
+                    if (b.Index >= 0)
+                    {
+                        if (a.LeafCount + b.LeafCount <= leafThreshold)
+                            jobs.Add(new Job { A = a.Index, B = b.Index }, Pool);
+                        else
+                            GetJobsBetweenDifferentNodes(ref Tree.Nodes[a.Index], ref Tree.Nodes[b.Index], ref results);
+
+                    }
+                    else
+                    {
+                        //leaf B versus node A.
+                        TestLeafAgainstNode(Encode(b.Index), ref b, a.Index, ref results);
+                    }
+                }
+                else if (b.Index >= 0)
+                {
+                    //leaf A versus node B.
+                    TestLeafAgainstNode(Encode(a.Index), ref a, b.Index, ref results);
+                }
+                else
+                {
+                    //Two leaves.
+                    results.Handle(Encode(a.Index), Encode(b.Index));
+                }
+            }
+
+            void GetJobsBetweenDifferentNodes(ref Node a, ref Node b, ref TOverlapHandler results)
+            {
+                //There are no shared children, so test them all.
+
+                ref var aa = ref a.A;
+                ref var ab = ref a.B;
+                ref var ba = ref b.A;
+                ref var bb = ref b.B;
+                var aaIntersects = BoundingBox.IntersectsUnsafe(aa, ba);
+                var abIntersects = BoundingBox.IntersectsUnsafe(aa, bb);
+                var baIntersects = BoundingBox.IntersectsUnsafe(ab, ba);
+                var bbIntersects = BoundingBox.IntersectsUnsafe(ab, bb);
+
+                if (aaIntersects)
+                {
+                    DispatchTestForNodes(ref aa, ref ba, ref results);
+                }
+                if (abIntersects)
+                {
+                    DispatchTestForNodes(ref aa, ref bb, ref results);
+                }
+                if (baIntersects)
+                {
+                    DispatchTestForNodes(ref ab, ref ba, ref results);
+                }
+                if (bbIntersects)
+                {
+                    DispatchTestForNodes(ref ab, ref bb, ref results);
+                }
+
+            }
+
+            void CollectJobsInNode(int nodeIndex, int leafCount, ref TOverlapHandler results)
+            {
+                if (leafCount <= leafThreshold)
+                {
+                    jobs.Add(new Job { A = nodeIndex, B = nodeIndex }, Pool);
+                    return;
+                }
+
+                ref var node = ref Tree.Nodes[nodeIndex];
+                ref var a = ref node.A;
+                ref var b = ref node.B;
+
+                var ab = BoundingBox.IntersectsUnsafe(a, b);
+
+                if (a.Index >= 0)
+                    CollectJobsInNode(a.Index, a.LeafCount, ref results);
+                if (b.Index >= 0)
+                    CollectJobsInNode(b.Index, b.LeafCount, ref results);
+
+                if (ab)
+                {
+                    DispatchTestForNodes(ref a, ref b, ref results);
+                }
+
+            }
+        }
+    }
+}
diff --git a/Demos/SpecializedTests/IntertreeThreadingTests.cs b/Demos/SpecializedTests/IntertreeThreadingTests.cs
index 898fd42b..a6264582 100644
--- a/Demos/SpecializedTests/IntertreeThreadingTests.cs
+++ b/Demos/SpecializedTests/IntertreeThreadingTests.cs
@@ -90,13 +90,13 @@ unsafe static void TestTrees(BufferPool pool, IThreadDispatcher threadDispatcher
             treeA.Validate();
             treeB.Validate();
 
-            var context = new Tree.MultithreadedIntertreeTest<OverlapHandler>(pool);
+            var context = new Tree.MultithreadedIntertreeTest<OverlapHandler>();
             var handlers = new OverlapHandler[threadDispatcher.ThreadCount];
             for (int i = 0; i < threadDispatcher.ThreadCount; ++i)
             {
                 handlers[i].Pairs = new List<(int a, int b)>();
             }
-            context.PrepareJobs(ref treeA, ref treeB, handlers, threadDispatcher.ThreadCount);
+            context.PrepareJobs(ref treeA, ref treeB, handlers, threadDispatcher.ThreadCount, 0, pool);
             threadDispatcher.DispatchWorkers(context.PairTest, context.JobCount);
             context.CompleteTest();
             List<(int a, int b)> multithreadedResults = new List<(int, int)>();
diff --git a/Demos/SpecializedTests/TreeTest.cs b/Demos/SpecializedTests/TreeTest.cs
index 8910c25e..7a914904 100644
--- a/Demos/SpecializedTests/TreeTest.cs
+++ b/Demos/SpecializedTests/TreeTest.cs
@@ -65,7 +65,7 @@ public static void Test()
             var threadDispatcher = new ThreadDispatcher(Environment.ProcessorCount);
 
             var refineContext = new Tree.RefitAndRefineMultithreadedContext();
-            var selfTestContext = new Tree.MultithreadedSelfTest<OverlapHandler>(pool);
+            var selfTestContext = new Tree.MultithreadedSelfTest<OverlapHandler>();
             var overlapHandlers = new OverlapHandler[threadDispatcher.ThreadCount];
             Action<int> pairTestAction = selfTestContext.PairTest;
             var removedLeafHandles = new QuickList<int>(leafCount, pool);
@@ -129,7 +129,7 @@ public static void Test()
                 {
                     overlapHandlers[k] = new OverlapHandler();
                 }
-                selfTestContext.PrepareJobs(ref tree, overlapHandlers, threadDispatcher.ThreadCount);
+                selfTestContext.PrepareJobs(ref tree, overlapHandlers, threadDispatcher.ThreadCount, 0, pool);
                 threadDispatcher.DispatchWorkers(pairTestAction);
                 selfTestContext.CompleteSelfTest();
                 tree.Validate();
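---

Migration note for external callers: `MultithreadedSelfTest<T>` and `MultithreadedIntertreeTest<T>` no longer capture a `BufferPool` at construction; `PrepareJobs` now receives the pool and the index of the worker running the preparation. That is what lets `DispatchOverlaps3` run preparation itself as tasks on arbitrary workers, each using its own per-worker pool (`threadDispatcher.WorkerPools[workerIndex]`). A minimal sketch of the updated calling pattern, modeled on the Demos changes above (`OverlapHandler`, `tree`, `handlers`, `pool`, and `threadDispatcher` are the demo's own; worker index 0 stands in for whichever thread runs the preparation):

```csharp
// Construct without binding a pool.
var selfTest = new Tree.MultithreadedSelfTest<OverlapHandler>();
// The pool travels with the call now; the worker index selects which handler
// accumulates any overlaps found during the single threaded job collection pass.
selfTest.PrepareJobs(ref tree, handlers, threadDispatcher.ThreadCount, 0, pool);
// Workers pull jobs off the shared counter until none remain.
threadDispatcher.DispatchWorkers(selfTest.PairTest, selfTest.JobCount);
// Returns the job list to the pool passed to PrepareJobs and clears the Pool reference.
selfTest.CompleteSelfTest();
```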