[ENH] Refactor dual contouring with parallel and sequential triangulation modules (#26)

Leguark · web-flow · commit ebceb193fd16 · 2025-09-25T10:48:25.000+02:00
# Refactor Dual Contouring with Parallel Processing Support

This PR refactors the dual contouring implementation to support parallel processing for multiple surfaces. The changes include:

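- Added parallel processing capability built on `torch.multiprocessing`; the standard `multiprocessing` module is imported as a fallback when PyTorch is not installed, but the parallel path is only selected for the PyTorch backend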
- Split triangulation logic into separate modules for better organization:
  - `_parallel_triangulation.py` for multi-process surface processing
  - `_sequential_triangulation.py` for single-process processing
- Implemented worker initialization that limits each worker process to a single thread (OpenMP, MKL, OpenBLAS, NumExpr and PyTorch) to prevent thread oversubscription
- Added logic to automatically determine when parallel processing is beneficial: PyTorch backend, tensors on CPU, and at least 4 surfaces to export
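- Fixed gradient data handling: on the PyTorch backend, gradients are cast to the dtype of the edge-normals tensor before assignment, so the two tensors always have compatible dtypes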
- Improved error handling and fallback to sequential processing when needed

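Currently, the parallel path is hard-disabled in `compute_dual_contouring` (the branch is guarded by `if use_parallel and False`, so there is no runtime switch to enable it) because benchmarks do not yet show a significant speedup; the infrastructure is in place for future optimization.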
diff --git a/gempy_engine/API/dual_contouring/_dual_contouring.py b/gempy_engine/API/dual_contouring/_dual_contouring.py
@@ -1,105 +1,68 @@
-import numpy
-import warnings
+import os
 from typing import List
 
-import numpy as np
-
-from gempy_engine.config import AvailableBackends
 from ... import optional_dependencies
-
 from ...core.backend_tensor import BackendTensor
 from ...core.data.dual_contouring_data import DualContouringData
 from ...core.data.dual_contouring_mesh import DualContouringMesh
 from ...core.utils import gempy_profiler_decorator
-from ...modules.dual_contouring.dual_contouring_interface import triangulate_dual_contouring, generate_dual_contouring_vertices
-from ...modules.dual_contouring.fancy_triangulation import triangulate
+from ...modules.dual_contouring._parallel_triangulation import _should_use_parallel_processing, _process_surface_batch, _init_worker
+from ...modules.dual_contouring._sequential_triangulation import _sequential_triangulation
+
+# Multiprocessing imports
+try:
+    import torch.multiprocessing as mp
+    MULTIPROCESSING_AVAILABLE = True
+except ImportError:
+    import multiprocessing as mp
+    MULTIPROCESSING_AVAILABLE = False
+
+
+
 
 
 @gempy_profiler_decorator
 def compute_dual_contouring(dc_data_per_stack: DualContouringData, left_right_codes=None, debug: bool = False) -> List[DualContouringMesh]:
     valid_edges_per_surface = dc_data_per_stack.valid_edges.reshape((dc_data_per_stack.n_surfaces_to_export, -1, 12))
 
-    # ? Is  there a way to cut also the vertices?
+    # Check if we should use parallel processing
+    use_parallel = _should_use_parallel_processing(dc_data_per_stack.n_surfaces_to_export, BackendTensor.engine_backend)
+    parallel_results = None
+    
+    if use_parallel and False: # ! (Miguel Sep 25) I do not see a speedup
+        print(f"Using parallel processing for {dc_data_per_stack.n_surfaces_to_export} surfaces")
+        parallel_results = _parallel_process_surfaces(dc_data_per_stack, left_right_codes, debug)
+        
 
+    # Fall back to sequential processing
+    print(f"Using sequential processing for {dc_data_per_stack.n_surfaces_to_export} surfaces")
     stack_meshes: List[DualContouringMesh] = []
 
-    last_surface_edge_idx = 0
     for i in range(dc_data_per_stack.n_surfaces_to_export):
         # @off
-        valid_edges          : np.ndarray = valid_edges_per_surface[i]
-        next_surface_edge_idx: int        = valid_edges.sum() + last_surface_edge_idx
-        slice_object         : slice      = slice(last_surface_edge_idx, next_surface_edge_idx)
-        last_surface_edge_idx: int        = next_surface_edge_idx
-
-        dc_data_per_surface = DualContouringData(
-            xyz_on_edge              = dc_data_per_stack.xyz_on_edge,
-            valid_edges              = valid_edges,
-            xyz_on_centers           = dc_data_per_stack.xyz_on_centers,
-            dxdydz                   = dc_data_per_stack.dxdydz,
-            exported_fields_on_edges = dc_data_per_stack.exported_fields_on_edges,
-            n_surfaces_to_export     = dc_data_per_stack.n_surfaces_to_export,
-            tree_depth               = dc_data_per_stack.tree_depth
-
-        )
-        vertices: np.ndarray = generate_dual_contouring_vertices(
-            dc_data_per_stack = dc_data_per_surface,
-            slice_surface     = slice_object,
-            debug             = debug
-        )
-        
-        if left_right_codes is None:
-            # * Legacy triangulation
-            indices = triangulate_dual_contouring(dc_data_per_surface)
+        if parallel_results is not None:
+            _, vertices_numpy = _sequential_triangulation(
+                dc_data_per_stack,
+                debug, 
+                i, 
+                left_right_codes, 
+                valid_edges_per_surface,
+                compute_indices=False 
+            )
+            indices_numpy = parallel_results[i]
         else:
-            # * Fancy triangulation 👗
-            
-            # * Average gradient for the edges
-            edges_normals = BackendTensor.t.zeros((valid_edges.shape[0], 12, 3), dtype=BackendTensor.dtype_obj)
-            edges_normals[:] = np.nan
-            edges_normals[valid_edges] = dc_data_per_stack.gradients[slice_object]
-                
-            # if LEGACY:=True:
-            if BackendTensor.engine_backend != AvailableBackends.PYTORCH:
-                with warnings.catch_warnings():
-                    warnings.simplefilter("ignore", category=RuntimeWarning)
-                    voxel_normal  = np.nanmean(edges_normals, axis=1)
-                    voxel_normal  = voxel_normal[(~np.isnan(voxel_normal).any(axis=1))]  # drop nans
-                    pass 
-            else:
-                # Assuming edges_normals is a PyTorch tensor
-                nan_mask = BackendTensor.t.isnan(edges_normals)
-                valid_count = (~nan_mask).sum(dim=1)
-
-                # Replace NaNs with 0 for sum calculation
-                safe_normals = edges_normals.clone()
-                safe_normals[nan_mask] = 0
-
-                # Compute the sum of non-NaN elements
-                sum_normals = BackendTensor.t.sum(safe_normals, 1)
-
-                # Calculate the mean, avoiding division by zero
-                voxel_normal = sum_normals / valid_count.clamp(min=1)
-
-                # Remove rows where all elements were NaN (and hence valid_count is 0)
-                voxel_normal = voxel_normal[valid_count > 0].reshape(-1, 3)
-                
-
-            valid_voxels = dc_data_per_surface.valid_voxels
-            indices = triangulate(
-                left_right_array = left_right_codes[valid_voxels],
-                valid_edges      = dc_data_per_surface.valid_edges[valid_voxels],
-                tree_depth       = dc_data_per_surface.tree_depth,
-                voxel_normals     = voxel_normal 
+            indices_numpy, vertices_numpy = _sequential_triangulation(
+                dc_data_per_stack,
+                debug, 
+                i, 
+                left_right_codes, 
+                valid_edges_per_surface,
+                compute_indices=True
             )
-            indices = BackendTensor.t.concatenate(indices, axis=0)
-            
-        # @on
-        vertices_numpy = BackendTensor.t.to_numpy(vertices)
-        indices_numpy = BackendTensor.t.to_numpy(indices)
-        
+
         if TRIMESH_LAST_PASS := True:
             vertices_numpy, indices_numpy = _last_pass(vertices_numpy, indices_numpy)
-        
+
         stack_meshes.append(
             DualContouringMesh(
                 vertices_numpy,
@@ -110,6 +73,55 @@ def compute_dual_contouring(dc_data_per_stack: DualContouringData, left_right_co
     return stack_meshes
 
 
+
+
+def _parallel_process_surfaces(dc_data_per_stack, left_right_codes, debug, num_workers=None, chunk_size=2):
+    """Process surfaces in parallel using multiprocessing."""
+    if num_workers is None:
+        num_workers = max(1, min(os.cpu_count() // 2, dc_data_per_stack.n_surfaces_to_export // 2))
+
+    # Prepare data for serialization
+    dc_data_dict = {
+            'xyz_on_edge'             : dc_data_per_stack.xyz_on_edge,
+            'valid_edges'             : dc_data_per_stack.valid_edges,
+            'xyz_on_centers'          : dc_data_per_stack.xyz_on_centers,
+            'dxdydz'                  : dc_data_per_stack.dxdydz,
+            'exported_fields_on_edges': dc_data_per_stack.exported_fields_on_edges,
+            'n_surfaces_to_export'    : dc_data_per_stack.n_surfaces_to_export,
+            'tree_depth'              : dc_data_per_stack.tree_depth,
+            # 'gradients': getattr(dc_data_per_stack, 'gradients', None)
+    }
+
+    # Create surface index chunks
+    surface_indices = list(range(dc_data_per_stack.n_surfaces_to_export))
+    chunks = [surface_indices[i:i + chunk_size] for i in range(0, len(surface_indices), chunk_size)]
+
+    try:
+        # Use spawn context for better PyTorch compatibility
+        ctx = mp.get_context("spawn") if MULTIPROCESSING_AVAILABLE else mp
+
+        with ctx.Pool(processes=num_workers, initializer=_init_worker) as pool:
+            # Submit all chunks
+            async_results = []
+            for chunk in chunks:
+                result = pool.apply_async(
+                    _process_surface_batch,
+                    (chunk, dc_data_dict, left_right_codes, debug)
+                )
+                async_results.append(result)
+
+            # Collect results
+            all_results = []
+            for async_result in async_results:
+                batch_results = async_result.get()
+                all_results.extend(batch_results)
+
+        return all_results
+
+    except Exception as e:
+        print(f"Parallel processing failed: {e}. Falling back to sequential processing.")
+        return None
+
 def _last_pass(vertices, indices):
     # Check if trimesh is available
     try:
@@ -118,4 +130,4 @@ def _last_pass(vertices, indices):
         mesh.fill_holes()
         return mesh.vertices, mesh.faces
     except ImportError:
-        return vertices, indices
+        return vertices, indices
diff --git a/gempy_engine/modules/dual_contouring/_parallel_triangulation.py b/gempy_engine/modules/dual_contouring/_parallel_triangulation.py
@@ -0,0 +1,163 @@
+import numpy as np
+import os
+import warnings
+
+from gempy_engine.config import AvailableBackends
+from ...core.backend_tensor import BackendTensor
+from ...core.data.dual_contouring_data import DualContouringData
+from ...modules.dual_contouring.dual_contouring_interface import triangulate_dual_contouring
+from ...modules.dual_contouring.fancy_triangulation import triangulate
+
+# Multiprocessing imports
+try:
+    import torch.multiprocessing as mp
+    MULTIPROCESSING_AVAILABLE = True
+except ImportError:
+    import multiprocessing as mp
+    MULTIPROCESSING_AVAILABLE = False
+
+
+def _should_use_parallel_processing(n_surfaces: int, backend: AvailableBackends) -> bool:
+    """Determine if parallel processing should be used."""
+    # Only use parallel processing for PyTorch CPU backend with sufficient surfaces
+    if backend == AvailableBackends.PYTORCH and MULTIPROCESSING_AVAILABLE:
+        # Check if we're on CPU (not GPU)
+        try:
+            import torch
+            if torch.cuda.is_available():
+                # If CUDA is available, check if default tensor type is CPU
+                dummy = BackendTensor.t.zeros(1)
+                is_cpu = dummy.device.type == 'cpu' if hasattr(dummy, 'device') else True
+            else:
+                is_cpu = True
+
+            # Use parallel processing if we have CPU tensors and enough surfaces to justify overhead
+            return is_cpu and n_surfaces >= 4
+        except ImportError:
+            return False
+    return False
+
+
+def _init_worker():
+    """Initialize worker process to avoid thread oversubscription."""
+    # Set environment variables for NumPy/OpenMP/MKL
+    os.environ['OMP_NUM_THREADS'] = '1'
+    os.environ['MKL_NUM_THREADS'] = '1'
+    os.environ['OPENBLAS_NUM_THREADS'] = '1'
+    os.environ['NUMEXPR_NUM_THREADS'] = '1'
+
+    # For PyTorch, set environment variables before import
+    os.environ['TORCH_NUM_THREADS'] = '1'
+    os.environ['TORCH_NUM_INTEROP_THREADS'] = '1'
+
+    # Now import torch in the worker process
+    try:
+        import torch
+        # These calls might still work if torch hasn't done any parallel work yet in this process
+        try:
+            torch.set_num_threads(1)
+            torch.set_num_interop_threads(1)
+        except RuntimeError:
+            # If the above fails, the environment variables should handle it
+            pass
+    except ImportError:
+        pass
+
+
+def _process_surface_batch(surface_indices_batch, dc_data_dict, left_right_codes, debug):
+    """Process a batch of surfaces in a worker process."""
+    _init_worker()
+
+    # Reconstruct dc_data_per_stack from dictionary
+    dc_data_per_stack = DualContouringData(**dc_data_dict)
+    valid_edges_per_surface = dc_data_per_stack.valid_edges.reshape((dc_data_per_stack.n_surfaces_to_export, -1, 12))
+
+    batch_results = []
+
+    for i in surface_indices_batch:
+        result = _process_single_surface(
+            i, dc_data_per_stack, valid_edges_per_surface, left_right_codes, debug
+        )
+        batch_results.append(result)
+
+    return batch_results
+
+def _process_single_surface(i, dc_data_per_stack, valid_edges_per_surface, left_right_codes, debug):
+    """Process a single surface and return vertices and indices."""
+    try:
+        valid_edges = valid_edges_per_surface[i]
+
+        # Calculate edge indices for this surface
+        last_surface_edge_idx = sum(valid_edges_per_surface[j].sum() for j in range(i))
+        next_surface_edge_idx = valid_edges.sum() + last_surface_edge_idx
+        slice_object = slice(last_surface_edge_idx, next_surface_edge_idx)
+
+        dc_data_per_surface = DualContouringData(
+            xyz_on_edge=dc_data_per_stack.xyz_on_edge,
+            valid_edges=valid_edges,
+            xyz_on_centers=dc_data_per_stack.xyz_on_centers,
+            dxdydz=dc_data_per_stack.dxdydz,
+            exported_fields_on_edges=dc_data_per_stack.exported_fields_on_edges,
+            n_surfaces_to_export=dc_data_per_stack.n_surfaces_to_export,
+            tree_depth=dc_data_per_stack.tree_depth
+        )
+
+        if left_right_codes is None:
+            # Legacy triangulation
+            indices = triangulate_dual_contouring(dc_data_per_surface)
+        else:
+            edges_normals = BackendTensor.t.zeros((valid_edges.shape[0], 12, 3), dtype=BackendTensor.dtype_obj)
+            if BackendTensor.engine_backend == AvailableBackends.PYTORCH:
+                edges_normals[:] = float('nan')  # Use Python float nan instead of np.nan
+            else:
+                edges_normals[:] = np.nan
+
+            # Get gradient data
+            gradient_data = dc_data_per_stack.gradients[slice_object]
+
+            # Fix dtype mismatch by ensuring compatible dtypes
+            if BackendTensor.engine_backend == AvailableBackends.PYTORCH:
+                if hasattr(gradient_data, 'dtype') and hasattr(edges_normals, 'dtype'):
+                    if gradient_data.dtype != edges_normals.dtype:
+                        gradient_data = gradient_data.to(edges_normals.dtype)
+
+            edges_normals[valid_edges] = gradient_data
+
+            if BackendTensor.engine_backend != AvailableBackends.PYTORCH:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", category=RuntimeWarning)
+                    voxel_normal = np.nanmean(edges_normals, axis=1)
+                    voxel_normal = voxel_normal[(~np.isnan(voxel_normal).any(axis=1))]
+            else:
+                # PyTorch tensor operations
+                nan_mask = BackendTensor.t.isnan(edges_normals)
+                valid_count = (~nan_mask).sum(dim=1)
+                safe_normals = edges_normals.clone()
+                safe_normals[nan_mask] = 0
+                sum_normals = BackendTensor.t.sum(safe_normals, 1)
+                voxel_normal = sum_normals / valid_count.clamp(min=1)
+                voxel_normal = voxel_normal[valid_count > 0].reshape(-1, 3)
+
+            valid_voxels = dc_data_per_surface.valid_voxels
+            left_right_per_surface = left_right_codes[valid_voxels]
+            valid_voxels_per_surface = dc_data_per_surface.valid_edges[valid_voxels]
+            tree_depth_per_surface = dc_data_per_surface.tree_depth
+
+            indices = triangulate(
+                left_right_array=left_right_per_surface,
+                valid_edges=valid_voxels_per_surface,
+                tree_depth=tree_depth_per_surface,
+                voxel_normals=voxel_normal
+            )
+            indices = BackendTensor.t.concatenate(indices, axis=0)
+
+        # vertices_numpy = BackendTensor.t.to_numpy(vertices)
+        indices_numpy = BackendTensor.t.to_numpy(indices)
+        return indices_numpy
+
+    except Exception as e:
+        print(f"ERROR in _process_single_surface for surface {i}: {e}")
+        import traceback
+        traceback.print_exc()
+        raise
+
diff --git a/gempy_engine/modules/dual_contouring/_sequential_triangulation.py b/gempy_engine/modules/dual_contouring/_sequential_triangulation.py