Edit Mesh: multi-thread auto-smooth & custom normal calculations

Support multi-threading for bm_mesh_loops_calc_normals.

This is done by operating on vertex-loops instead of face-loops.

Single-threaded operation still loops over faces, since iterating
over vertices adds some overhead in the case of custom normals:
the order used for accessing loops must match the order of
iterating over each face's loops.

In isolated timing tests of bm_mesh_loops_calc_normals on high-poly
models, this gives a 3.5x to 10x speedup,
with larger gains for meshes that use custom normals.

NOTE: this is part one of two patches for multi-threaded auto-smooth;
tagging edges as sharp is still single-threaded.
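As an illustration of why the vertex-based split is thread-safe: every loop belongs to exactly one vertex, so when each task owns a single vertex, no two threads ever write the same loop normal. A minimal sketch of the dispatch, mirroring the code in this patch (the callback name, `data` and `settings` are placeholders; the body is elided):

static void calc_normals_for_vert_fn(void *userdata,
                                     MempoolIterData *mp_v,
                                     const TaskParallelTLS *__restrict tls)
{
  BMVert *v = (BMVert *)mp_v;
  (void)userdata, (void)tls; /* Unused in this sketch. */
  if (v->e == NULL) {
    return; /* An isolated vertex has no loops to handle. */
  }
  /* ... walk each loop fan around `v`, writing only those loops' normals ... */
}

/* Each task owns one vertex, so writes to loop normals never overlap. */
BM_iter_parallel(bm, BM_VERTS_OF_MESH, calc_normals_for_vert_fn, &data, &settings);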

Reviewed By: mont29

Ref D11928
Campbell Barton
2021-07-14 13:22:58 +10:00
parent 3fb47956c0
commit 4ba06ad0a8
3 changed files with 742 additions and 273 deletions


@@ -399,6 +399,12 @@ void BKE_lnor_spacearr_init(MLoopNorSpaceArray *lnors_spacearr,
const char data_type);
void BKE_lnor_spacearr_clear(MLoopNorSpaceArray *lnors_spacearr);
void BKE_lnor_spacearr_free(MLoopNorSpaceArray *lnors_spacearr);
void BKE_lnor_spacearr_tls_init(MLoopNorSpaceArray *lnors_spacearr,
MLoopNorSpaceArray *lnors_spacearr_tls);
void BKE_lnor_spacearr_tls_join(MLoopNorSpaceArray *lnors_spacearr,
MLoopNorSpaceArray *lnors_spacearr_tls);
MLoopNorSpace *BKE_lnor_space_create(MLoopNorSpaceArray *lnors_spacearr);
void BKE_lnor_space_define(MLoopNorSpace *lnor_space,
const float lnor[3],


@@ -530,6 +530,36 @@ void BKE_lnor_spacearr_init(MLoopNorSpaceArray *lnors_spacearr,
lnors_spacearr->data_type = data_type;
}
/**
* Utility for multi-threaded calculation that ensures
* `lnors_spacearr_tls` doesn't share memory with `lnors_spacearr`,
* which would make it unsafe to use from multiple threads.
*
* \note This works as long as threads never operate on the same loops at once.
*/
void BKE_lnor_spacearr_tls_init(MLoopNorSpaceArray *lnors_spacearr,
MLoopNorSpaceArray *lnors_spacearr_tls)
{
*lnors_spacearr_tls = *lnors_spacearr;
lnors_spacearr_tls->mem = BLI_memarena_new(BLI_MEMARENA_STD_BUFSIZE, __func__);
}
/**
* Utility for multi-threaded calculation
* that merges `lnors_spacearr_tls` into `lnors_spacearr`.
*/
void BKE_lnor_spacearr_tls_join(MLoopNorSpaceArray *lnors_spacearr,
MLoopNorSpaceArray *lnors_spacearr_tls)
{
BLI_assert(lnors_spacearr->data_type == lnors_spacearr_tls->data_type);
BLI_assert(lnors_spacearr->mem != lnors_spacearr_tls->mem);
lnors_spacearr->num_spaces += lnors_spacearr_tls->num_spaces;
BLI_memarena_merge(lnors_spacearr->mem, lnors_spacearr_tls->mem);
BLI_memarena_free(lnors_spacearr_tls->mem);
lnors_spacearr_tls->mem = nullptr;
BKE_lnor_spacearr_clear(lnors_spacearr_tls);
}
void BKE_lnor_spacearr_clear(MLoopNorSpaceArray *lnors_spacearr)
{
lnors_spacearr->num_spaces = 0;

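Taken together, these two utilities implement a fork/join pattern for the loop-normal space arena. A minimal usage sketch (`shared_arr` is a placeholder name for the main-thread array; the actual task wiring appears in the BMesh file below):

MLoopNorSpaceArray tls_arr;
/* Fork: the worker gets a copy of the shared array with its own private arena. */
BKE_lnor_spacearr_tls_init(&shared_arr, &tls_arr);
/* ... the worker allocates spaces from `tls_arr` via BKE_lnor_space_create() ... */
/* Join: fold the worker's space count and arena back into the shared array. */
BKE_lnor_spacearr_tls_join(&shared_arr, &tls_arr);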

@@ -526,81 +526,43 @@ bool BM_loop_check_cyclic_smooth_fan(BMLoop *l_curr)
}
/**
* BMesh version of BKE_mesh_normals_loop_split() in `mesh_evaluate.cc`
* Will use the clnors_data array first, and fall back to cd_loop_clnors_offset
* (use NULL and -1 to not use clnors).
* Called for all face loops.
*
* \note This sets #BM_ELEM_TAG which is used in tool code (e.g. T84426).
* We could add a low-level API flag for this, see #BM_ELEM_API_FLAG_ENABLE and friends.
* - All loops must have #BM_ELEM_TAG cleared.
* - Loop indices must be valid.
*
* \note When custom normals are present, the order of loops can be important.
* Loops with lower indices must be passed before loops with higher indices (for each vertex).
* This is needed since the first loop sets the reference point for the custom normal offsets.
*
* \return The number of loops that were handled (for early exit when all have been handled).
*/
-static void bm_mesh_loops_calc_normals(BMesh *bm,
+static int bm_mesh_loops_calc_normals_for_loop(BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
-float (*r_lnos)[3],
-MLoopNorSpaceArray *r_lnors_spacearr,
const short (*clnors_data)[2],
const int cd_loop_clnors_offset,
-const bool do_rebuild)
+const bool has_clnors,
+/* Cache. */
+BLI_Stack *edge_vectors,
+/* Iterate. */
+BMLoop *l_curr,
+/* Result. */
+float (*r_lnos)[3],
+MLoopNorSpaceArray *r_lnors_spacearr)
{
BMIter fiter;
BMFace *f_curr;
const bool has_clnors = clnors_data || (cd_loop_clnors_offset != -1);
BLI_assert((bm->elem_index_dirty & (BM_FACE | BM_LOOP)) == 0);
BLI_assert((vcos == NULL) || ((bm->elem_index_dirty & BM_VERT) == 0));
UNUSED_VARS_NDEBUG(bm);
MLoopNorSpaceArray _lnors_spacearr = {NULL};
int handled = 0;
/* Temp normal stack. */
BLI_SMALLSTACK_DECLARE(normal, float *);
/* Temp clnors stack. */
BLI_SMALLSTACK_DECLARE(clnors, short *);
/* Temp edge vectors stack, only used when computing lnor spacearr. */
BLI_Stack *edge_vectors = NULL;
{
char htype = 0;
if (vcos) {
htype |= BM_VERT;
}
/* Face/Loop indices are set inline below. */
BM_mesh_elem_index_ensure(bm, htype);
}
if (!r_lnors_spacearr && has_clnors) {
/* We need to compute lnor spacearr if some custom lnor data are given to us! */
r_lnors_spacearr = &_lnors_spacearr;
}
if (r_lnors_spacearr) {
BKE_lnor_spacearr_init(r_lnors_spacearr, bm->totloop, MLNOR_SPACEARR_BMLOOP_PTR);
edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
}
/* Clear all loops' tags (means none are to be skipped for now). */
int index_face, index_loop = 0;
BM_ITER_MESH_INDEX (f_curr, &fiter, bm, BM_FACES_OF_MESH, index_face) {
BMLoop *l_curr, *l_first;
BM_elem_index_set(f_curr, index_face); /* set_inline */
l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
do {
BM_elem_index_set(l_curr, index_loop++); /* set_inline */
BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
} while ((l_curr = l_curr->next) != l_first);
}
bm->elem_index_dirty &= ~(BM_FACE | BM_LOOP);
/* We now know edges that can be smoothed (they are tagged),
* and edges that will be hard (they aren't).
* Now, time to generate the normals.
*/
BM_ITER_MESH (f_curr, &fiter, bm, BM_FACES_OF_MESH) {
BMLoop *l_curr, *l_first;
l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
do {
if (do_rebuild && !BM_ELEM_API_FLAG_TEST(l_curr, BM_LNORSPACE_UPDATE) &&
!(bm->spacearr_dirty & BM_SPACEARR_DIRTY_ALL)) {
continue;
}
/* A smooth edge, we have to check for cyclic smooth fan case.
* If we find a new, never-processed cyclic smooth fan, we can do it now using that loop/edge
* as 'entry point', otherwise we can skip it. */
@@ -620,7 +582,7 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
* this vertex just takes its poly normal.
*/
const int l_curr_index = BM_elem_index_get(l_curr);
-const float *no = fnos ? fnos[BM_elem_index_get(f_curr)] : f_curr->no;
+const float *no = fnos ? fnos[BM_elem_index_get(l_curr->f)] : l_curr->f->no;
copy_v3_v3(r_lnos[l_curr_index], no);
/* If needed, generate this (simple!) lnor space. */
@@ -657,6 +619,7 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
BKE_lnor_space_custom_data_to_normal(lnor_space, *clnor, r_lnos[l_curr_index]);
}
}
handled = 1;
}
/* We *do not need* to check/tag loops as already computed!
* Due to the fact a loop only links to one of its two edges,
@@ -692,8 +655,7 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
const float *co_pivot = vcos ? vcos[BM_elem_index_get(v_pivot)] : v_pivot->co;
-MLoopNorSpace *lnor_space = r_lnors_spacearr ? BKE_lnor_space_create(r_lnors_spacearr) :
-NULL;
+MLoopNorSpace *lnor_space = r_lnors_spacearr ? BKE_lnor_space_create(r_lnors_spacearr) : NULL;
BLI_assert((edge_vectors == NULL) || BLI_stack_is_empty(edge_vectors));
@@ -758,8 +720,7 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
(const void *)BM_ELEM_CD_GET_VOID_P(
lfan_pivot, cd_loop_clnors_offset);
if (clnors_nbr) {
-clnors_invalid |= ((*clnor_ref)[0] != (*clnor)[0] ||
-(*clnor_ref)[1] != (*clnor)[1]);
+clnors_invalid |= ((*clnor_ref)[0] != (*clnor)[0] || (*clnor_ref)[1] != (*clnor)[1]);
}
else {
clnor_ref = clnor;
@@ -777,14 +738,15 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
if (r_lnors_spacearr) {
/* Assign current lnor space to current 'vertex' loop. */
-BKE_lnor_space_add_loop(
-r_lnors_spacearr, lnor_space, lfan_pivot_index, lfan_pivot, false);
+BKE_lnor_space_add_loop(r_lnors_spacearr, lnor_space, lfan_pivot_index, lfan_pivot, false);
if (e_next != e_org) {
/* We store here all edges-normalized vectors processed. */
BLI_stack_push(edge_vectors, vec_next);
}
}
handled += 1;
if (!BM_elem_flag_test(e_next, BM_ELEM_TAG) || (e_next == e_org)) {
/* Next edge is sharp, we have finished with this fan of faces around this vert! */
break;
@@ -861,6 +823,277 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
BM_elem_flag_enable(l_curr->v, BM_ELEM_TAG);
}
}
return handled;
}
static int bm_loop_index_cmp(const void *a, const void *b)
{
BLI_assert(BM_elem_index_get((BMLoop *)a) != BM_elem_index_get((BMLoop *)b));
if (BM_elem_index_get((BMLoop *)a) < BM_elem_index_get((BMLoop *)b)) {
return -1;
}
return 1;
}
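This comparator is only needed when a vertex turns out to have multiple smooth fans; for reference, the deferred sort further down is simply:

/* Sort the remaining loops so lower indices are processed first,
 * matching the order required when custom normals are present. */
loops_of_vert = BLI_linklist_sort(loops_of_vert, bm_loop_index_cmp);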
/**
* Operate on all of a vertex's loops.
* Operating on vertices is needed for multi-threading,
* as it guarantees that each thread works on an isolated set of loops.
*/
static void bm_mesh_loops_calc_normals_for_vert_with_clnors(BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
float (*r_lnos)[3],
const short (*clnors_data)[2],
const int cd_loop_clnors_offset,
const bool do_rebuild,
/* TLS */
MLoopNorSpaceArray *r_lnors_spacearr,
BLI_Stack *edge_vectors,
/* Iterate over. */
BMVert *v)
{
/* Respecting face order is necessary so the initial starting loop is consistent
* with looping over loops of all faces.
*
* Logically we could sort the loops by their index and loop over them;
* however, it's faster to take the lowest index from an unordered list,
* since it's common that smooth vertices only ever need to pick one loop,
* which then handles all the others.
*
* Sorting is only performed when multiple fans are found. */
const bool has_clnors = true;
LinkNode *loops_of_vert = NULL;
int loops_of_vert_count = 0;
/* The loop with the lowest index. */
{
LinkNode *link_best;
uint index_best = UINT_MAX;
BMEdge *e_curr_iter = v->e;
do { /* Edges of vertex. */
BMLoop *l_curr = e_curr_iter->l;
if (l_curr == NULL) {
continue;
}
do { /* Radial loops. */
if (l_curr->v != v) {
continue;
}
if (do_rebuild && !BM_ELEM_API_FLAG_TEST(l_curr, BM_LNORSPACE_UPDATE) &&
!(bm->spacearr_dirty & BM_SPACEARR_DIRTY_ALL)) {
continue;
}
BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
BLI_linklist_prepend_alloca(&loops_of_vert, l_curr);
loops_of_vert_count += 1;
const uint index_test = (uint)BM_elem_index_get(l_curr);
if (index_best > index_test) {
index_best = index_test;
link_best = loops_of_vert;
}
} while ((l_curr = l_curr->radial_next) != e_curr_iter->l);
} while ((e_curr_iter = BM_DISK_EDGE_NEXT(e_curr_iter, v)) != v->e);
if (UNLIKELY(loops_of_vert == NULL)) {
return;
}
/* Immediately pop the best element.
* The order doesn't matter, so swap the links, as it's simpler than keeping
* a reference to `link_best`. */
if (link_best != loops_of_vert) {
SWAP(void *, link_best->link, loops_of_vert->link);
}
}
bool loops_of_vert_is_sorted = false;
/* Keep track of the number of loops that have been assigned. */
int loops_of_vert_handled = 0;
while (loops_of_vert != NULL) {
BMLoop *l_best = loops_of_vert->link;
loops_of_vert = loops_of_vert->next;
BLI_assert(l_best->v == v);
loops_of_vert_handled += bm_mesh_loops_calc_normals_for_loop(bm,
vcos,
fnos,
clnors_data,
cd_loop_clnors_offset,
has_clnors,
edge_vectors,
l_best,
r_lnos,
r_lnors_spacearr);
/* Check if an early exit is possible without an exhaustive inspection of every loop:
* the case where one loop's fan extends out to all remaining loops.
* This is common for smooth vertices. */
BLI_assert(loops_of_vert_handled <= loops_of_vert_count);
if (loops_of_vert_handled == loops_of_vert_count) {
break;
}
/* Note on sorting, in some cases it will be faster to scan for the lowest index each time.
* However in the worst case this is `O(N^2)`, so use a single sort call instead. */
if (!loops_of_vert_is_sorted) {
if (loops_of_vert && loops_of_vert->next) {
loops_of_vert = BLI_linklist_sort(loops_of_vert, bm_loop_index_cmp);
loops_of_vert_is_sorted = true;
}
}
}
}
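To make the early exit concrete: for a fully smooth vertex with, say, four loops, the first call to bm_mesh_loops_calc_normals_for_loop walks the whole fan and returns 4, so loops_of_vert_handled equals loops_of_vert_count and the function returns without ever sorting. Only when sharp edges split the vertex into several fans is the remainder sorted by index and processed fan by fan.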
/**
* A simplified version of #bm_mesh_loops_calc_normals_for_vert_with_clnors
* that can operate on loops in any order.
*/
static void bm_mesh_loops_calc_normals_for_vert_without_clnors(
BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
float (*r_lnos)[3],
const bool do_rebuild,
/* TLS */
MLoopNorSpaceArray *r_lnors_spacearr,
BLI_Stack *edge_vectors,
/* Iterate over. */
BMVert *v)
{
const bool has_clnors = false;
const short(*clnors_data)[2] = NULL;
const int cd_loop_clnors_offset = -1;
BMEdge *e_curr_iter;
/* Unfortunately a loop is needed just to clear loop-tags. */
e_curr_iter = v->e;
do { /* Edges of vertex. */
BMLoop *l_curr = e_curr_iter->l;
if (l_curr == NULL) {
continue;
}
do { /* Radial loops. */
if (l_curr->v != v) {
continue;
}
BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
} while ((l_curr = l_curr->radial_next) != e_curr_iter->l);
} while ((e_curr_iter = BM_DISK_EDGE_NEXT(e_curr_iter, v)) != v->e);
e_curr_iter = v->e;
do { /* Edges of vertex. */
BMLoop *l_curr = e_curr_iter->l;
if (l_curr == NULL) {
continue;
}
do { /* Radial loops. */
if (l_curr->v != v) {
continue;
}
if (do_rebuild && !BM_ELEM_API_FLAG_TEST(l_curr, BM_LNORSPACE_UPDATE) &&
!(bm->spacearr_dirty & BM_SPACEARR_DIRTY_ALL)) {
continue;
}
bm_mesh_loops_calc_normals_for_loop(bm,
vcos,
fnos,
clnors_data,
cd_loop_clnors_offset,
has_clnors,
edge_vectors,
l_curr,
r_lnos,
r_lnors_spacearr);
} while ((l_curr = l_curr->radial_next) != e_curr_iter->l);
} while ((e_curr_iter = BM_DISK_EDGE_NEXT(e_curr_iter, v)) != v->e);
}
/**
* BMesh version of BKE_mesh_normals_loop_split() in `mesh_evaluate.cc`
* Will use the clnors_data array first, and fall back to cd_loop_clnors_offset
* (use NULL and -1 to not use clnors).
*
* \note This sets #BM_ELEM_TAG which is used in tool code (e.g. T84426).
* We could add a low-level API flag for this, see #BM_ELEM_API_FLAG_ENABLE and friends.
*/
static void bm_mesh_loops_calc_normals__single_threaded(BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
float (*r_lnos)[3],
MLoopNorSpaceArray *r_lnors_spacearr,
const short (*clnors_data)[2],
const int cd_loop_clnors_offset,
const bool do_rebuild)
{
BMIter fiter;
BMFace *f_curr;
const bool has_clnors = clnors_data || (cd_loop_clnors_offset != -1);
MLoopNorSpaceArray _lnors_spacearr = {NULL};
BLI_Stack *edge_vectors = NULL;
{
char htype = 0;
if (vcos) {
htype |= BM_VERT;
}
/* Face/Loop indices are set inline below. */
BM_mesh_elem_index_ensure(bm, htype);
}
if (!r_lnors_spacearr && has_clnors) {
/* We need to compute lnor spacearr if some custom lnor data are given to us! */
r_lnors_spacearr = &_lnors_spacearr;
}
if (r_lnors_spacearr) {
BKE_lnor_spacearr_init(r_lnors_spacearr, bm->totloop, MLNOR_SPACEARR_BMLOOP_PTR);
edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
}
/* Clear all loops' tags (means none are to be skipped for now). */
int index_face, index_loop = 0;
BM_ITER_MESH_INDEX (f_curr, &fiter, bm, BM_FACES_OF_MESH, index_face) {
BMLoop *l_curr, *l_first;
BM_elem_index_set(f_curr, index_face); /* set_inline */
l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
do {
BM_elem_index_set(l_curr, index_loop++); /* set_inline */
BM_elem_flag_disable(l_curr, BM_ELEM_TAG);
} while ((l_curr = l_curr->next) != l_first);
}
bm->elem_index_dirty &= ~(BM_FACE | BM_LOOP);
/* We now know edges that can be smoothed (they are tagged),
* and edges that will be hard (they aren't).
* Now, time to generate the normals.
*/
BM_ITER_MESH (f_curr, &fiter, bm, BM_FACES_OF_MESH) {
BMLoop *l_curr, *l_first;
l_curr = l_first = BM_FACE_FIRST_LOOP(f_curr);
do {
if (do_rebuild && !BM_ELEM_API_FLAG_TEST(l_curr, BM_LNORSPACE_UPDATE) &&
!(bm->spacearr_dirty & BM_SPACEARR_DIRTY_ALL)) {
continue;
}
bm_mesh_loops_calc_normals_for_loop(bm,
vcos,
fnos,
clnors_data,
cd_loop_clnors_offset,
has_clnors,
edge_vectors,
l_curr,
r_lnos,
r_lnors_spacearr);
} while ((l_curr = l_curr->next) != l_first);
}
@@ -872,6 +1105,206 @@ static void bm_mesh_loops_calc_normals(BMesh *bm,
}
}
typedef struct BMLoopsCalcNormalsWithCoordsData {
/* Read-only data. */
const float (*fnos)[3];
const float (*vcos)[3];
BMesh *bm;
const short (*clnors_data)[2];
const int cd_loop_clnors_offset;
const bool do_rebuild;
/* Output. */
float (*r_lnos)[3];
MLoopNorSpaceArray *r_lnors_spacearr;
} BMLoopsCalcNormalsWithCoordsData;
typedef struct BMLoopsCalcNormalsWithCoords_TLS {
BLI_Stack *edge_vectors;
/** Copied from #BMLoopsCalcNormalsWithCoordsData.r_lnors_spacearr when it's not NULL. */
MLoopNorSpaceArray *lnors_spacearr;
MLoopNorSpaceArray lnors_spacearr_buf;
} BMLoopsCalcNormalsWithCoords_TLS;
static void bm_mesh_loops_calc_normals_for_vert_init_fn(const void *__restrict userdata,
void *__restrict chunk)
{
const BMLoopsCalcNormalsWithCoordsData *data = userdata;
BMLoopsCalcNormalsWithCoords_TLS *tls_data = chunk;
if (data->r_lnors_spacearr) {
tls_data->edge_vectors = BLI_stack_new(sizeof(float[3]), __func__);
BKE_lnor_spacearr_tls_init(data->r_lnors_spacearr, &tls_data->lnors_spacearr_buf);
tls_data->lnors_spacearr = &tls_data->lnors_spacearr_buf;
}
else {
tls_data->lnors_spacearr = NULL;
}
}
static void bm_mesh_loops_calc_normals_for_vert_reduce_fn(const void *__restrict userdata,
void *__restrict UNUSED(chunk_join),
void *__restrict chunk)
{
const BMLoopsCalcNormalsWithCoordsData *data = userdata;
BMLoopsCalcNormalsWithCoords_TLS *tls_data = chunk;
if (data->r_lnors_spacearr) {
BKE_lnor_spacearr_tls_join(data->r_lnors_spacearr, tls_data->lnors_spacearr);
}
}
static void bm_mesh_loops_calc_normals_for_vert_free_fn(const void *__restrict userdata,
void *__restrict chunk)
{
const BMLoopsCalcNormalsWithCoordsData *data = userdata;
BMLoopsCalcNormalsWithCoords_TLS *tls_data = chunk;
if (data->r_lnors_spacearr) {
BLI_stack_free(tls_data->edge_vectors);
}
}
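A note on the lifecycle assumed here (standard BLI_task TLS semantics): func_init runs for each worker's chunk before it processes any vertex, func_reduce is invoked serially on the calling thread while chunks are joined (which is why merging directly into data->r_lnors_spacearr is safe), and func_free releases per-chunk resources afterwards.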
static void bm_mesh_loops_calc_normals_for_vert_with_clnors_fn(
void *userdata, MempoolIterData *mp_v, const TaskParallelTLS *__restrict tls)
{
BMVert *v = (BMVert *)mp_v;
if (v->e == NULL) {
return;
}
BMLoopsCalcNormalsWithCoordsData *data = userdata;
BMLoopsCalcNormalsWithCoords_TLS *tls_data = tls->userdata_chunk;
bm_mesh_loops_calc_normals_for_vert_with_clnors(data->bm,
data->vcos,
data->fnos,
data->r_lnos,
data->clnors_data,
data->cd_loop_clnors_offset,
data->do_rebuild,
/* Thread local. */
tls_data->lnors_spacearr,
tls_data->edge_vectors,
/* Iterate over. */
v);
}
static void bm_mesh_loops_calc_normals_for_vert_without_clnors_fn(
void *userdata, MempoolIterData *mp_v, const TaskParallelTLS *__restrict tls)
{
BMVert *v = (BMVert *)mp_v;
if (v->e == NULL) {
return;
}
BMLoopsCalcNormalsWithCoordsData *data = userdata;
BMLoopsCalcNormalsWithCoords_TLS *tls_data = tls->userdata_chunk;
bm_mesh_loops_calc_normals_for_vert_without_clnors(data->bm,
data->vcos,
data->fnos,
data->r_lnos,
data->do_rebuild,
/* Thread local. */
tls_data->lnors_spacearr,
tls_data->edge_vectors,
/* Iterate over. */
v);
}
static void bm_mesh_loops_calc_normals__multi_threaded(BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
float (*r_lnos)[3],
MLoopNorSpaceArray *r_lnors_spacearr,
const short (*clnors_data)[2],
const int cd_loop_clnors_offset,
const bool do_rebuild)
{
const bool has_clnors = clnors_data || (cd_loop_clnors_offset != -1);
MLoopNorSpaceArray _lnors_spacearr = {NULL};
{
char htype = BM_LOOP;
if (vcos) {
htype |= BM_VERT;
}
if (fnos) {
htype |= BM_FACE;
}
/* Face/Loop indices are set inline below. */
BM_mesh_elem_index_ensure(bm, htype);
}
if (!r_lnors_spacearr && has_clnors) {
/* We need to compute lnor spacearr if some custom lnor data are given to us! */
r_lnors_spacearr = &_lnors_spacearr;
}
if (r_lnors_spacearr) {
BKE_lnor_spacearr_init(r_lnors_spacearr, bm->totloop, MLNOR_SPACEARR_BMLOOP_PTR);
}
/* We now know edges that can be smoothed (they are tagged),
* and edges that will be hard (they aren't).
* Now, time to generate the normals.
*/
TaskParallelSettings settings;
BLI_parallel_mempool_settings_defaults(&settings);
BMLoopsCalcNormalsWithCoords_TLS tls = {NULL};
settings.userdata_chunk = &tls;
settings.userdata_chunk_size = sizeof(tls);
settings.func_init = bm_mesh_loops_calc_normals_for_vert_init_fn;
settings.func_reduce = bm_mesh_loops_calc_normals_for_vert_reduce_fn;
settings.func_free = bm_mesh_loops_calc_normals_for_vert_free_fn;
BMLoopsCalcNormalsWithCoordsData data = {
.bm = bm,
.vcos = vcos,
.fnos = fnos,
.r_lnos = r_lnos,
.r_lnors_spacearr = r_lnors_spacearr,
.clnors_data = clnors_data,
.cd_loop_clnors_offset = cd_loop_clnors_offset,
.do_rebuild = do_rebuild,
};
BM_iter_parallel(bm,
BM_VERTS_OF_MESH,
has_clnors ? bm_mesh_loops_calc_normals_for_vert_with_clnors_fn :
bm_mesh_loops_calc_normals_for_vert_without_clnors_fn,
&data,
&settings);
if (r_lnors_spacearr) {
if (r_lnors_spacearr == &_lnors_spacearr) {
BKE_lnor_spacearr_free(r_lnors_spacearr);
}
}
}
static void bm_mesh_loops_calc_normals(BMesh *bm,
const float (*vcos)[3],
const float (*fnos)[3],
float (*r_lnos)[3],
MLoopNorSpaceArray *r_lnors_spacearr,
const short (*clnors_data)[2],
const int cd_loop_clnors_offset,
const bool do_rebuild)
{
if (bm->totloop < BM_OMP_LIMIT) {
bm_mesh_loops_calc_normals__single_threaded(
bm, vcos, fnos, r_lnos, r_lnors_spacearr, clnors_data, cd_loop_clnors_offset, do_rebuild);
}
else {
bm_mesh_loops_calc_normals__multi_threaded(
bm, vcos, fnos, r_lnos, r_lnors_spacearr, clnors_data, cd_loop_clnors_offset, do_rebuild);
}
}
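The dispatch keeps behavior identical for callers: meshes under the existing BM_OMP_LIMIT element threshold take the face-ordered single-threaded path (avoiding both threading overhead and the vertex-iteration overhead noted in the commit message), while larger meshes take the vertex-parallel path.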
/* This threshold is a bit touchy (usual float precision issue), this value seems OK. */
#define LNOR_SPACE_TRIGO_THRESHOLD (1.0f - 1e-4f)