From c7415fd922a21e9a1e00910f000af9767ff8a1ac Mon Sep 17 00:00:00 2001
From: "Qingyuan (Andy) Li" <andyli27@mit.edu>
Date: Thu, 20 Nov 2025 01:48:49 -0500
Subject: [PATCH 1/6] first-pass at dino aggregated frame descriptors, tested
 on kmd demo

---
 roman/align/results.py              | 24 ++++++++---
 roman/align/submap_align.py         |  3 ++
 roman/map/fastsam_wrapper.py        | 66 +++++++++++++++++++++++------
 roman/map/map.py                    | 60 +++++++++++++-------------
 roman/map/mapper.py                 |  6 ++-
 roman/map/observation.py            |  2 +-
 roman/map/run.py                    | 18 ++++----
 roman/object/segment.py             |  4 +-
 roman/params/fastsam_params.py      |  6 +++
 roman/params/submap_align_params.py | 12 +++---
 roman/utils.py                      |  1 +
 11 files changed, 136 insertions(+), 66 deletions(-)

diff --git a/roman/align/results.py b/roman/align/results.py
index c007f8a..d232c66 100644
--- a/roman/align/results.py
+++ b/roman/align/results.py
@@ -21,6 +21,7 @@ class SubmapAlignResults:
     clipper_angle_mat: np.array
     clipper_dist_mat: np.array
     clipper_num_associations: np.array
+    similarity_mat: np.array
     submap_yaw_diff_mat: np.array
     associated_objs_mat: np.array
     T_ij_mat: np.array
@@ -52,17 +53,24 @@ def time_to_secs_nsecs(t, as_dict=False):
         return {'seconds': seconds, 'nanoseconds': nanoseconds}
 
 def plot_align_results(results: SubmapAlignResults, dpi=500):
-
+    
+    show_sim = results.similarity_mat is not None
+    
     # if no ground truth, can only show number of associations
     if None in results.submap_io.input_gt_pose_yaml:
-        fig, ax = plt.subplots(1, 1, figsize=(4,4), dpi=dpi)
-        mp = ax.imshow(results.clipper_num_associations, cmap='viridis', vmin=0)
+        fig, ax = plt.subplots(2 if show_sim else 1, 1, figsize=(8 if show_sim else 4,4), dpi=dpi)
+        mp = ax[0, 0].imshow(results.clipper_num_associations, cmap='viridis', vmin=0)
         fig.colorbar(mp, fraction=0.04, pad=0.04)
         ax.set_title("Number of Associations")
+        
+        if show_sim:
+            mp = ax[1, 0].imshow(results.similarity_mat, cmap='viridis', vmin=0)
+            fig.colorbar(mp, fraction=0.04, pad=0.04)
+            ax[1, 0].set_title("Similarity Score")
+        
         fig.suptitle(f"{results.submap_io.run_name}: {results.submap_io.robot_names[0]}, {results.submap_io.robot_names[1]}")
         return
 
-
     fig, ax = plt.subplots(3, 2, figsize=(8, 12), dpi=dpi)
     fig.subplots_adjust(wspace=.3)
     fig.suptitle(f"{results.submap_io.run_name}: {results.submap_io.robot_names[0]}, {results.submap_io.robot_names[1]}")
@@ -95,6 +103,11 @@ def plot_align_results(results: SubmapAlignResults, dpi=500):
     mp = ax[2, 0].imshow(results.clipper_num_associations, cmap='viridis', vmin=0)
     fig.colorbar(mp, fraction=0.04, pad=0.04)
     ax[2, 0].set_title("Number of Associations")
+    
+    if show_sim:
+        mp = ax[2, 1].imshow(results.similarity_mat, cmap='viridis', vmin=0)
+        fig.colorbar(mp, fraction=0.04, pad=0.04)
+        ax[2, 1].set_title("Similarity Score")
 
     for i in range(len(ax)):
         for j in range(len(ax[i])):
@@ -102,7 +115,8 @@ def plot_align_results(results: SubmapAlignResults, dpi=500):
             ax[i,j].set_ylabel("submap index (robot 1)")
             ax[i,j].grid(False)
 
-    fig.delaxes(ax[2, 1])
+    if not show_sim:
+        fig.delaxes(ax[2, 1])
 
 def save_submap_align_results(results: SubmapAlignResults, submaps, roman_maps: List[ROMANMap]):
     plot_align_results(results)
diff --git a/roman/align/submap_align.py b/roman/align/submap_align.py
index 178f744..c53280d 100644
--- a/roman/align/submap_align.py
+++ b/roman/align/submap_align.py
@@ -74,6 +74,7 @@ def submap_align(sm_params: SubmapAlignParams, sm_io: SubmapAlignInputOutput):
     clipper_angle_mat = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
     clipper_dist_mat = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
     clipper_num_associations = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
+    similarity_mat = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
     robots_nearby_mat = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
     clipper_percent_associations = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
     submap_yaw_diff_mat = np.zeros((len(submaps[0]), len(submaps[1])))*np.nan
@@ -184,6 +185,7 @@ def submap_align(sm_params: SubmapAlignParams, sm_io: SubmapAlignInputOutput):
                 clipper_dist_mat[i, j] = np.nan
 
             clipper_num_associations[i, j] = len(associations)
+            similarity_mat[i, j] = submap_sim
             clipper_percent_associations[i, j] = len(associations) / np.mean([len(submap_i), len(submap_j)]) if np.mean([len(submap_i), len(submap_j)]) > 0 else 0.0
             
             T_ij_mat[i, j] = T_ij
@@ -198,6 +200,7 @@ def submap_align(sm_params: SubmapAlignParams, sm_io: SubmapAlignInputOutput):
         clipper_angle_mat=clipper_angle_mat,
         clipper_dist_mat=clipper_dist_mat,
         clipper_num_associations=clipper_num_associations,
+        similarity_mat=similarity_mat if sm_params.submap_descriptor is not None else None,
         submap_yaw_diff_mat=submap_yaw_diff_mat,
         T_ij_mat=T_ij_mat,
         T_ij_hat_mat=T_ij_hat_mat,
diff --git a/roman/map/fastsam_wrapper.py b/roman/map/fastsam_wrapper.py
index 56399e2..4247048 100644
--- a/roman/map/fastsam_wrapper.py
+++ b/roman/map/fastsam_wrapper.py
@@ -143,6 +143,7 @@ def from_params(cls, params: FastSAMParams, depth_cam_params: CameraParams):
             allow_tblr_edges=[True, True, True, True],
             area_bounds=[img_area / (params.min_mask_len_div**2), img_area / (params.max_mask_len_div**2)],
             semantics=params.semantics,
+            frame_descriptor=params.frame_descriptor,
             triangle_ignore_masks=params.triangle_ignore_masks
         )
 
@@ -159,6 +160,7 @@ def setup_filtering(self,
         allow_tblr_edges = [True, True, True, True],
         keep_mask_minimal_intersection=0.3,
         semantics: str = None,
+        frame_descriptor: str = None,
         triangle_ignore_masks=None
     ):
         """
@@ -203,6 +205,9 @@ def setup_filtering(self,
         else:
             raise ValueError(f"Invalid semantics option: {semantics}. Choose from 'clip', 'dino', or 'none'.")
         self.semantic_patches_shape = None
+        self.frame_descriptor_type = frame_descriptor
+        if frame_descriptor is not None:
+            assert self.semantics.lower() == 'dino', "Frame descriptor only supported with DINO semantics."
         
         if triangle_ignore_masks is not None:
             self.constant_ignore_mask = np.zeros((self.depth_cam_params.height, self.depth_cam_params.width), dtype=np.uint8)
@@ -262,16 +267,16 @@ def setup_rgbd_params(
             self.erosion_element = None
         self.plane_filter_params = plane_filter_params
 
-    def run(self, t, pose, img, depth_data=None, plot=False):
+    def run(self, t, pose, img, depth_data=None):
         """
         Takes and image and returns filtered FastSAM masks as Observations.
 
         Args:
             img (cv image): camera image
-            plot (bool, optional): Returns plots if true. Defaults to False.
 
         Returns:
             self.observations (list): list of Observations
+            frame_descriptor (np.ndarray): semantic descriptor of the frame if frame_descriptor is not None, else None
         """
         self.observations = []
         
@@ -294,19 +299,26 @@ def run(self, t, pose, img, depth_data=None, plot=False):
         
         # run fastsam
         masks = self._process_img(img, ignore_mask=ignore_mask, keep_mask=keep_mask)
-        
+                
         if self.semantics == 'dino':
             # Process the image for DINO
             dino_shape = 768
             img_bgr = cv.cvtColor(img, cv.COLOR_BGR2RGB)
             preprocessed = self.semantics_preprocess(images=img_bgr, return_tensors="pt").to(self.device)
             dino_output = self.semantics_model(**preprocessed)
-            dino_features = self.get_per_pixel_features(
+            dino_output_patches = self.get_output_patches(
                 model_output=dino_output.last_hidden_state, 
                 img_shape=img.shape, 
                 feature_dim=dino_shape
             )
-
+            dino_features = self.get_per_pixel_features(
+                model_output_patches=dino_output_patches,
+                img_shape=img.shape
+            )
+            
+        frame_descriptor = None
+        if self.frame_descriptor_type is not None:
+            frame_descriptor = self.get_frame_descriptor(dino_output_patches)
         
         for mask in masks:
             
@@ -406,7 +418,7 @@ def run(self, t, pose, img, depth_data=None, plot=False):
                     processed_img = self.semantics_preprocess(Image.fromarray(img_bbox, mode='RGB')).to(self.device)
                     clip_embedding = self.semantics_model.encode_image(processed_img.unsqueeze(dim=0))
                     clip_embedding = clip_embedding.squeeze().cpu().detach().numpy()
-                    self.observations.append(Observation(t, pose, mask, mask_downsampled, ptcld, clip_embedding=clip_embedding))
+                    self.observations.append(Observation(t, pose, mask, mask_downsampled, ptcld, semantic_descriptor=clip_embedding))
             elif self.semantics == 'dino':
                 assert mask.shape[0] == dino_features.shape[0] and mask.shape[1] == dino_features.shape[1], \
                     "Mask and DINO features must have the same shape."
@@ -414,12 +426,12 @@ def run(self, t, pose, img, depth_data=None, plot=False):
                 dino_mask = dino_mask.cpu().detach().numpy()
                 mean_dino = np.mean(dino_mask, axis=0) # dino_shape
                 mean_dino = mean_dino / np.linalg.norm(mean_dino) # normalize
-                self.observations.append(Observation(t, pose, mask, mask_downsampled, ptcld, clip_embedding=mean_dino))
+                self.observations.append(Observation(t, pose, mask, mask_downsampled, ptcld, semantic_descriptor=mean_dino))
                 
             else:
                 self.observations.append(Observation(t, pose, mask, mask_downsampled, ptcld))
                 
-        return self.observations
+        return self.observations, frame_descriptor
     
     def apply_rotation(self, img, unrotate=False):
         if self.rotate_img is None:
@@ -575,11 +587,10 @@ def _process_img(self, image_bgr, ignore_mask=None, keep_mask=None):
             return []
 
         return segmask
-
-    def get_per_pixel_features(self, model_output: ArrayLike, img_shape: ArrayLike, 
-            feature_dim: int) -> ArrayLike:
+    
+    def get_output_patches(self, model_output: ArrayLike, img_shape: ArrayLike, feature_dim: int) -> ArrayLike:
         """
-        Extract (Dino) per-pixel features
+        Extract (Dino) output patches
 
         Args:
             model_output (ArrayLike): Last hidden state of (Dino) model
@@ -600,6 +611,19 @@ def get_per_pixel_features(self, model_output: ArrayLike, img_shape: ArrayLike,
             
         model_output_patches = model_output_flat_patches.reshape(self.semantic_patches_shape)
 
+        return model_output_patches # 1 x h x w x feature_dim
+
+    def get_per_pixel_features(self, model_output_patches: ArrayLike, img_shape: ArrayLike) -> ArrayLike:
+        """
+        Extract (Dino) per-pixel features
+
+        Args:
+            model_output_patches (ArrayLike): Reshaped (Dino) output patches
+            img_shape (ArrayLike): Original image shape
+
+        Returns:
+            ArrayLike: Reshaped (Dino) output
+        """
         # interpolate the feature map to match the size of the original image
         per_pixel_features = torch.nn.functional.interpolate(
             model_output_patches.permute(0, 3, 1, 2), # permute to be batch, channels, height, width
@@ -612,3 +636,21 @@ def get_per_pixel_features(self, model_output: ArrayLike, img_shape: ArrayLike,
 
         return per_pixel_features # h x w x feature_dim
         
+    def get_frame_descriptor(self, dino_features: torch.Tensor) -> np.ndarray:   
+        with torch.no_grad(): # prevent memory leak
+            dino_features_flat = dino_features.view(-1, dino_features.shape[-1])
+            if self.frame_descriptor_type == 'dino-gap':
+                frame_descriptor = torch.sum(dino_features_flat, dim=0)
+            elif self.frame_descriptor_type == 'dino-gmp':
+                frame_descriptor = torch.max(dino_features_flat, dim=0).values
+            elif self.frame_descriptor_type == 'dino-gem':
+                cubed_descriptor = torch.mean(dino_features_flat ** 3, dim=0)
+                frame_descriptor = torch.sign(cubed_descriptor) * \
+                                   (torch.abs(cubed_descriptor).clamp(min=1e-12) ** (1.0 / 3)) # avoid NaN from negative or zero root
+            else:
+                raise ValueError(f"frame descriptor must be one of 'dino-gap', 'dino-gmp', or 'dino-gem'.")
+                
+            frame_descriptor /= torch.norm(frame_descriptor)
+                                            
+        return frame_descriptor.cpu().detach().numpy()
+            
\ No newline at end of file
diff --git a/roman/map/map.py b/roman/map/map.py
index d903495..f28cdb7 100644
--- a/roman/map/map.py
+++ b/roman/map/map.py
@@ -22,6 +22,7 @@ class ROMANMap:
     segments: List[Segment]
     trajectory: List[np.ndarray]
     times: np.ndarray
+    descriptors: List[np.ndarray] = None
     poses_are_flu: bool = True
 
     def __post_init__(self):
@@ -36,6 +37,7 @@ def minimal_data(self):
             segments=[seg.minimal_data() for seg in self.segments],
             trajectory=self.trajectory,
             times=self.times,
+            descriptors=self.descriptors,
             poses_are_flu=self.poses_are_flu
         )
         
@@ -53,19 +55,9 @@ def make_picklable(self):
     def from_pickle(cls, pickle_file: str):
         # extract pickled data
         with open(os.path.expanduser(pickle_file), 'rb') as f:
-            pickle_data = pickle.load(f)
-            
-            if type(pickle_data) == cls:
-                roman_map = pickle_data
-            else:
-                assert len(pickle_data) == 3
-                mapper, poses, times = pickle_data
-                roman_map = cls(
-                    segments=mapper.get_segment_map(),
-                    trajectory=poses,
-                    times=times
-                )
-        return roman_map
+            roman_map = pickle.load(f)
+            assert(type(roman_map) == cls)     
+            return roman_map
         
     @classmethod
     def concatenate(cls, roman_maps: list):
@@ -82,6 +74,8 @@ def concatenate(cls, roman_maps: list):
                 segments=reference.segments + other.segments,
                 trajectory=reference.trajectory + other.trajectory,
                 times=reference.times + other.times,
+                descriptors = reference.descriptors + other.descriptors if \
+                              reference.descriptors is not None and other.descriptors is not None else None,
                 poses_are_flu=reference.poses_are_flu
             )
         
@@ -122,12 +116,20 @@ def position_gt(self):
     def has_gt(self):
         return self.pose_flu_gt is not None
     
+    @property
+    def first_seen(self):
+        return min([seg.first_seen for seg in self.segments])
+    
+    @property
+    def last_seen(self):
+        return max([seg.last_seen for seg in self.segments])
+    
     @property
     def segments_as_global_points(self):
         # self.pose_gravity_aligned returns T_odom_center
         # which is transformation from center frame to odom frame
         # so this transforms segments back to the global (odom) frame
-        T_odom_center = self.pose_gravity_aligned_gt if self.pose_flu_gt is not None else self.pose_gravity_aligned
+        T_odom_center = self.pose_gravity_aligned_gt if self.has_gt else self.pose_gravity_aligned
         return transform(T_odom_center, np.vstack([seg.center.T for seg in self.segments])) # (1, 3) -> (N, 3)
 
     def __len__(self):
@@ -174,19 +176,9 @@ def load_roman_map(map_file: str) -> ROMANMap:
     """
     # extract pickled data
     with open(os.path.expanduser(map_file), 'rb') as f:
-        pickle_data = pickle.load(f)
-        
-        if type(pickle_data) == ROMANMap:
-            roman_map = pickle_data
-        else:
-            assert len(pickle_data) == 3
-            mapper, poses, times = pickle_data
-            roman_map = ROMANMap(
-                segments=mapper.get_segment_map(),
-                trajectory=poses,
-                times=times
-            )
-    return roman_map
+        roman_map = pickle.load(f)
+        assert type(roman_map) == ROMANMap
+        return roman_map
 
 def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams, 
                            gt_flu_pose_data: PoseData=None) -> List[Submap]:
@@ -233,10 +225,8 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
                 pose_flu_gt=gt_flu_pose_data.pose(submap_time_roman_map) if gt_flu_pose_data is not None else None
             )
 
-            # sm.pose is the pose of center w.r.t. odom, which is T_odom_center (center is child, odom is parent frame)
-            # T_odom_center is also transformation from center frame to odom frame
-            # so its inverse, T_center_odom, is transformation from odom frame to center frame
-            # which transforms the segments into the center frame (centered w.r.t submap centroid) since they are in the odom frame
+            # sm.pose is the pose of center w.r.t. odom, which is T_odom_center, inverse is T_center_odom
+            # transforms the segments into the center frame (centered w.r.t submap centroid) since they are in the odom frame
             T_center_odom = np.linalg.inv(sm.pose_gravity_aligned)
             for seg in sm.segments:
                 seg.transform(T_center_odom)
@@ -291,6 +281,14 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
         # compute mean semantic for each submap
         for submap in submaps:
             submap.descriptor = np.mean([seg.semantic_descriptor for seg in submap.segments], axis=0).flatten()
+    elif submap_params.submap_descriptor == 'mean_frame_descriptor':
+        assert roman_map.descriptors is not None, "ROMAN map must have frame descriptors to compute submap descriptors from them."
+        map_times_np = np.array(roman_map.times)
+        descriptors_np = np.vstack(roman_map.descriptors)
+        for submap in submaps:
+            frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+            submap.descriptor = descriptors_np[frame_mask].mean(axis=0)
+            
     return submaps
 
 
diff --git a/roman/map/mapper.py b/roman/map/mapper.py
index 13c3e06..5b1f567 100644
--- a/roman/map/mapper.py
+++ b/roman/map/mapper.py
@@ -43,14 +43,17 @@ def __init__(self, params: MapperParams, camera_params: CameraParams):
         self.last_pose = None
         self.poses_flu_history = []
         self.times_history = []
+        self.frame_descriptors_history = []
         self._T_camera_flu = np.eye(4)
 
-    def update(self, t: float, pose: np.array, observations: List[Observation]):
+    def update(self, t: float, pose: np.array, observations: List[Observation], frame_descriptor: np.ndarray):
 
         # have T_WC, want T_WB
         # T_WB = T_WC @ T_CB
         self.poses_flu_history.append(pose @ self._T_camera_flu)
         self.times_history.append(t)
+        if frame_descriptor is not None:
+            self.frame_descriptors_history.append(frame_descriptor)
         
         # store last pose
         self.last_pose = pose.copy()
@@ -320,6 +323,7 @@ def get_roman_map(self) -> ROMANMap:
             segments=segment_map,
             trajectory=self.poses_flu_history,
             times=self.times_history,
+            descriptors=self.frame_descriptors_history if self.frame_descriptors_history else None,
             poses_are_flu=True
         )
     
diff --git a/roman/map/observation.py b/roman/map/observation.py
index 4b10c6f..e025153 100644
--- a/roman/map/observation.py
+++ b/roman/map/observation.py
@@ -20,7 +20,7 @@ class Observation():
     mask: np.ndarray = None
     mask_downsampled: np.ndarray = None
     point_cloud: np.ndarray = None  # n-by-3 matrix. Each row is a 3D point.
-    clip_embedding: np.ndarray = None
+    semantic_descriptor: np.ndarray = None
     voxel_grid: Dict[float, VoxelGrid] = field(default_factory=dict)
     _transformed_points: np.ndarray = None
     _pcd: o3d.geometry.PointCloud = None
diff --git a/roman/map/run.py b/roman/map/run.py
index 4030e59..0d018b4 100644
--- a/roman/map/run.py
+++ b/roman/map/run.py
@@ -112,10 +112,10 @@ def update(self, t: float):
         img_output = None
         update_t0 = time.time()
 
-        img_time, observations, pose_odom_camera, img = self.update_fastsam(t)
+        img_time, observations, pose_odom_camera, img, frame_descriptor = self.update_fastsam(t)
         update_t1 = time.time()
         if observations is not None and pose_odom_camera is not None and img is not None:
-            img_output = self.update_segment_track(img_time, observations, pose_odom_camera, img)
+            img_output = self.update_segment_track(img_time, observations, pose_odom_camera, img, frame_descriptor)
         
         update_t2 = time.time()
         self.processing_times.map_times.append(update_t2 - update_t1)
@@ -137,12 +137,12 @@ def update_fastsam(self, t):
             else:
                 depth_data = self.depth_data.img(img_t)
         except NoDataNearTimeException:
-            return None, None, None, None
+            return None, None, None, None, None
         
-        observations = self.fastsam.run(img_t, T_odom_camera, img, depth_data=depth_data)
-        return img_t, observations, T_odom_camera, img
+        observations, frame_descriptor = self.fastsam.run(img_t, T_odom_camera, img, depth_data=depth_data)
+        return img_t, observations, T_odom_camera, img, frame_descriptor
 
-    def update_segment_track(self, t, observations, pose_odom_camera, img): 
+    def update_segment_track(self, t, observations, pose_odom_camera, img, frame_descriptor): 
 
         # collect reprojected masks
         reprojected_bboxs = []
@@ -155,11 +155,13 @@ def update_segment_track(self, t, observations, pose_odom_camera, img):
                     reprojected_bboxs.append((segment.id, bbox))
 
         if len(observations) > 0:
-            self.mapper.update(t, pose_odom_camera, observations)
+            self.mapper.update(t, pose_odom_camera, observations, frame_descriptor)
         else:
             self.mapper.poses_flu_history.append(pose_odom_camera @ self.mapper._T_camera_flu)
             self.mapper.times_history.append(t)
-
+            if frame_descriptor is not None:
+                self.mapper.frame_descriptors_history.append(frame_descriptor)
+            
         if self.viz_map or self.viz_observations or self.viz_3d:
             img_ret = self.draw(t, img, pose_odom_camera, observations, reprojected_bboxs)
 
diff --git a/roman/object/segment.py b/roman/object/segment.py
index 1019dc0..a529195 100644
--- a/roman/object/segment.py
+++ b/roman/object/segment.py
@@ -107,8 +107,8 @@ def update(self, observation: Observation, integrate_points=True):
         # Integrate point measurements
         if integrate_points:
             self._integrate_points_from_observation(observation)
-            if observation.clip_embedding is not None:
-                self._add_semantic_descriptor(observation.clip_embedding)
+            if observation.semantic_descriptor is not None:
+                self._add_semantic_descriptor(observation.semantic_descriptor)
 
         self.num_sightings += 1
         self.observations.append(observation.copy(include_mask=False))
diff --git a/roman/params/fastsam_params.py b/roman/params/fastsam_params.py
index 8b7336f..09e6ee1 100644
--- a/roman/params/fastsam_params.py
+++ b/roman/params/fastsam_params.py
@@ -40,6 +40,7 @@ class FastSAMParams:
         plane_filter_params (tuple): parameters for plane filtering
         rotate_img (str): how to rotate the image ('CW', 'CCW', '180')
         semantics (str): which semantics to use for observations ('clip', 'dino', or 'none')
+        frame_descriptor (str): type of frame descriptor to use ('dino-gem', 'dino-gap','dino-gmp', or 'none')
         yolo_imgsz (Tuple[int, int]): size of the YOLO image
         depth_scale (float): depth scale factor for processing depth images
         max_depth (float): maximum depth before rejecting observation points
@@ -71,11 +72,16 @@ class FastSAMParams:
     plane_filter_params: tuple = tuple([3.0, 1.0, 0.2])
     rotate_img: str = None
     semantics: str = 'dino'
+    frame_descriptor: str = 'dino-gem'
     yolo_imgsz: Tuple[int, int] = (256, 256)
     depth_scale: float = 1e3
     max_depth: float = 7.5
     triangle_ignore_masks: List[Tuple[Tuple[int,int], Tuple[int,int], Tuple[int,int]]] = None
     
+    def __post_init__(self):
+        if self.frame_descriptor.lower() == 'none':
+            self.frame_descriptor = None
+    
     @classmethod
     def from_yaml(cls, yaml_path: str, run: str = None):
         with open(yaml_path) as f:
diff --git a/roman/params/submap_align_params.py b/roman/params/submap_align_params.py
index 14a75ac..67cebf0 100644
--- a/roman/params/submap_align_params.py
+++ b/roman/params/submap_align_params.py
@@ -35,23 +35,23 @@ class SubmapAlignParams:
     use_avg_time_as_segment_ref_time: bool = True  # If true, use (first_seen + last_seen) / 2 for each segment reference time 
                                                    # (only applicable if force_fill_submaps == True or submap_pruning_method == 'time')
 
-    force_fill_submaps: bool = False        # If true, force all submaps to be filled with segments
+    force_fill_submaps: bool = False         # If true, force all submaps to be filled with segments
     submap_max_size: int = 40               # Maximum number of segments in a submap (to save computation)
 
     # the following is applicable only if force_fill_submaps is true
     submap_overlap: int = int(0.5 * submap_max_size)  # Number of overlapping segments between submaps
 
     # the following are applicable only if force_fill_submaps is false -----------
-    submap_radius: float = 15.0             # Radius of submap in meters. If set to None, segments 
+    submap_radius: float = 15.0              # Radius of submap in meters. If set to None, segments 
                                             #    are never excluded from submaps based on distance 
                                             #    (though they may still be pruned)
-    submap_center_dist: float = 10.0        # Distance between submap centers in meters
-    submap_center_time: float = 50.0        # time threshold between segments and submap center times
+    submap_center_dist: float = 10.0         # Distance between submap centers in meters
+    submap_center_time: float = 50.0         # time threshold between segments and submap center times
     submap_pruning_method: str = 'distance' # Metric for pruning segments in a submap: 
                                             #    ('time', 'distance') -> max gets pruned
     # ----------------------------------------------------------------------------
-    submap_descriptor: str = None           # Type of submap descriptor. Either 'none' or 'mean_semantic'.
-    submap_descriptor_thresh: float = 0.8   # ROMAN object matching will only be run if submap 
+    submap_descriptor: str = None           # Type of submap descriptor. Either 'none', 'mean_semantic', or 'mean_frame_descriptor'.
+    submap_descriptor_thresh: float = 0.8    # ROMAN object matching will only be run if submap 
                                             #    descriptor cosine similarity is above this threshold.
 
 
diff --git a/roman/utils.py b/roman/utils.py
index 6bef1ff..38c8774 100644
--- a/roman/utils.py
+++ b/roman/utils.py
@@ -12,6 +12,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+import torch
 from typing import List
 from scipy.spatial.transform import Rotation as Rot
 from os.path import expandvars, expanduser

From 6513434b6c0126211546c71fd16495beff29ab83 Mon Sep 17 00:00:00 2001
From: "Qingyuan (Andy) Li" <andyli27@mit.edu>
Date: Fri, 21 Nov 2025 17:41:39 -0500
Subject: [PATCH 2/6] add stacked descriptors and maximum piecewise similarity

---
 roman/align/submap_align.py         |  3 +-
 roman/map/map.py                    | 78 +++++++++++++++++++++++------
 roman/params/submap_align_params.py | 67 ++++++++++++-------------
 3 files changed, 98 insertions(+), 50 deletions(-)

diff --git a/roman/align/submap_align.py b/roman/align/submap_align.py
index c53280d..e3d1df4 100644
--- a/roman/align/submap_align.py
+++ b/roman/align/submap_align.py
@@ -129,8 +129,7 @@ def submap_align(sm_params: SubmapAlignParams, sm_io: SubmapAlignInputOutput):
                 submap_yaw_diff_mat[i, j] = np.abs(np.rad2deg(relative_yaw_angle))
                 
             if sm_params.submap_descriptor is not None:
-                submap_sim = np.sum(submap_i.descriptor * submap_j.descriptor) / \
-                    (np.linalg.norm(submap_i.descriptor) * np.linalg.norm(submap_j.descriptor))
+                submap_sim = Submap.similarity(submap_i, submap_j)
             else:
                 submap_sim = np.inf # always try to register object maps if no descriptor is used
                 
diff --git a/roman/map/map.py b/roman/map/map.py
index f28cdb7..a80aaeb 100644
--- a/roman/map/map.py
+++ b/roman/map/map.py
@@ -1,12 +1,11 @@
 import numpy as np
-from numpy.linalg import norm
 from scipy.spatial.transform import Rotation as Rot
 
 import os
 import pickle
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import List
+from typing import List, Tuple
 import json
 
 from robotdatapy.data.pose_data import PoseData
@@ -134,6 +133,28 @@ def segments_as_global_points(self):
 
     def __len__(self):
         return len(self.segments)
+    
+    @classmethod
+    def similarity(cls, submap1, submap2):
+        desc1 = submap1.descriptor
+        desc2 = submap2.descriptor
+        
+        if len(desc1.shape) == len(desc2.shape) == 1:
+            # 1-d cosine similarity
+            norm_prod = np.linalg.norm(desc1) * np.linalg.norm(desc2)
+            if np.isclose(norm_prod, 0.0, atol=1e-9, rtol=0.0): return 0.0
+            return np.dot(desc1, desc2) / norm_prod
+        
+        elif len(desc1.shape) == len(desc2.shape) == 2:
+            # maximum piecewise cosine similarity
+            sims = []
+            desc1 = desc1.reshape(desc1.shape[0], 1, desc1.shape[1])  # (N1, 1, D)
+            desc2 = desc2.reshape(1, desc2.shape[0], desc2.shape[1])  # (1, N2, D)
+            norm_prods = np.linalg.norm(desc1, axis=2) * np.linalg.norm(desc2, axis=2)  # (N1, N2)
+            sims = np.sum(desc1 * desc2, axis=2) / norm_prods  # (N1, N2, D) -> (N1, N2)
+            sims[np.isclose(norm_prods, 0.0, atol=1e-9, rtol=0.0)] = 0.0
+            return np.max(sims)
+
 
 @dataclass
 class SubmapParams:
@@ -145,10 +166,10 @@ class SubmapParams:
     distance: float = 10.0
     time_threshold: float = np.inf
     pruning_method: str = 'time'
-    use_avg_time_as_segment_ref_time: bool = True
     object_center_ref: str = 'mean'
     use_minimal_data: bool = True
     submap_descriptor: str = None
+    frame_descriptor_dist: float = None
 
     @classmethod
     def from_submap_align_params(cls, submap_align_params: SubmapAlignParams):
@@ -160,8 +181,8 @@ def from_submap_align_params(cls, submap_align_params: SubmapAlignParams):
             distance=submap_align_params.submap_center_dist,
             time_threshold=submap_align_params.submap_center_time,
             pruning_method=submap_align_params.submap_pruning_method,
-            use_avg_time_as_segment_ref_time=submap_align_params.use_avg_time_as_segment_ref_time,
-            submap_descriptor=submap_align_params.submap_descriptor
+            submap_descriptor=submap_align_params.submap_descriptor,
+            frame_descriptor_dist=submap_align_params.frame_descriptor_dist,
         )
 
 def load_roman_map(map_file: str) -> ROMANMap:
@@ -203,14 +224,14 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
     if submap_params.force_fill_submaps:
         submaps = []
 
-        segments_sorted_by_time = sorted(roman_map.segments, key=lambda seg: seg.reference_time(use_avg_time=submap_params.use_avg_time_as_segment_ref_time))
+        segments_sorted_by_time = sorted(roman_map.segments, key=lambda seg: seg.reference_time())
 
         for i in range(0, len(segments_sorted_by_time), submap_params.max_size - submap_params.overlap):
 
             sm_segments = segments_sorted_by_time[i:i + submap_params.max_size]
             if len(sm_segments) == 0: continue
 
-            segment_times = [seg.reference_time(use_avg_time=submap_params.use_avg_time_as_segment_ref_time) for seg in sm_segments]
+            segment_times = [seg.reference_time() for seg in sm_segments]
             submap_time = np.average(segment_times)
             submap_roman_map_index = np.argmin(np.abs(roman_map.times - submap_time))
 
@@ -259,7 +280,8 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
             )
 
             for seg in roman_map.segments:
-                if (submap_params.radius is None or (norm(seg.center.flatten() - sm.pose_flu[:3,3]) < submap_params.radius)) \
+                if (submap_params.radius is None or \
+                        (np.linalg.norm(seg.center.flatten() - sm.pose_flu[:3,3]) < submap_params.radius)) \
                         and meets_time_constraints(seg):
                     sm.segments.append(deepcopy(seg))
 
@@ -269,26 +291,54 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
 
             if submap_params.max_size is not None:
                 if submap_params.pruning_method == 'time': # time-based pruning
-                    pruning_key = lambda seg: abs(seg.reference_time(use_avg_time=submap_params.use_avg_time_as_segment_ref_time) - submaps[i].time)
+                    pruning_key = lambda seg: abs(seg.reference_time() - submaps[i].time)
                 else: # distance-based pruning
-                    pruning_key = lambda seg: norm(seg.center.flatten())
+                    pruning_key = lambda seg: np.linalg.norm(seg.center.flatten())
 
                 segments_sorted_by_key = sorted(sm.segments, key=pruning_key)
                 sm.segments = segments_sorted_by_key[:submap_params.max_size]
 
     submaps = [submap for submap in submaps if len(submap.segments) > 0]
+    
     if submap_params.submap_descriptor == 'mean_semantic':
         # compute mean semantic for each submap
         for submap in submaps:
             submap.descriptor = np.mean([seg.semantic_descriptor for seg in submap.segments], axis=0).flatten()
-    elif submap_params.submap_descriptor == 'mean_frame_descriptor':
+            
+    elif submap_params.submap_descriptor is not None:
+        # using frame descriptors
         assert roman_map.descriptors is not None, "ROMAN map must have frame descriptors to compute submap descriptors from them."
         map_times_np = np.array(roman_map.times)
         descriptors_np = np.vstack(roman_map.descriptors)
-        for submap in submaps:
-            frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
-            submap.descriptor = descriptors_np[frame_mask].mean(axis=0)
             
+        if submap_params.submap_descriptor == 'mean_frame_descriptor':
+            for submap in submaps:
+                frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+                submap.descriptor = descriptors_np[frame_mask].mean(axis=0)
+                
+        elif submap_params.submap_descriptor == 'stacked_frame_descriptors':
+            if submap_params.frame_descriptor_dist is None:
+                for submap in submaps:
+                    frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+                    submap.descriptor = descriptors_np[frame_mask]
+            else:
+                dist_thresh = submap_params.frame_descriptor_dist
+                map_pos_np = np.array([pose[:3,3] for pose in roman_map.trajectory])
+                
+                for submap in submaps:
+                    frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+                    frame_descriptors = descriptors_np[frame_mask]
+                    frame_pos = map_pos_np[frame_mask]
+                    
+                    stacked_descriptors = []
+                    last_pose = None
+                    for fd, fp in zip(frame_descriptors, frame_pos):
+                        if (last_pose is None or np.linalg.norm(fp - last_pose) >= dist_thresh):
+                            stacked_descriptors.append(fd)
+                            last_pose = fp
+                        
+                    submap.descriptor = np.vstack(stacked_descriptors)
+                            
     return submaps
 
 
diff --git a/roman/params/submap_align_params.py b/roman/params/submap_align_params.py
index 67cebf0..8d44498 100644
--- a/roman/params/submap_align_params.py
+++ b/roman/params/submap_align_params.py
@@ -13,7 +13,7 @@
 import numpy as np
 
 from dataclasses import dataclass, field
-from typing import List
+from typing import List, Tuple, Union
 import os
 import yaml
 
@@ -25,43 +25,42 @@
 @dataclass
 class SubmapAlignParams:
 
-    dim: int = 3                            # 2 or 3. 2D or 3D object map registration
-    method: str = 'roman'                   # by default, use semantic + pca + volume + gravity
-                                            # same as in ROMAN paper.
-                                            # See get_object_registration for other methods
-    fusion_method: str = 'geometric_mean'   # How to fuse similarity scores. (geometric_mean, 
-                                            # arithmetic_mean, product)
-    
-    use_avg_time_as_segment_ref_time: bool = True  # If true, use (first_seen + last_seen) / 2 for each segment reference time 
-                                                   # (only applicable if force_fill_submaps == True or submap_pruning_method == 'time')
-
-    force_fill_submaps: bool = False         # If true, force all submaps to be filled with segments
-    submap_max_size: int = 40               # Maximum number of segments in a submap (to save computation)
+    dim: int = 3                                            # 2 or 3. 2D or 3D object map registration
+    method: str = 'roman'                                   # by default, use semantic + pca + volume + gravity
+                                                            #   same as in ROMAN paper.
+                                                            #   See get_object_registration for other methods
+    fusion_method: str = 'geometric_mean'                   # How to fuse similarity scores. (geometric_mean, 
+                                                            #   arithmetic_mean, product)
 
-    # the following is applicable only if force_fill_submaps is true
-    submap_overlap: int = int(0.5 * submap_max_size)  # Number of overlapping segments between submaps
+    force_fill_submaps: bool = False                        # If true, force all submaps to be filled with segments
+    submap_max_size: int = 40                               # Maximum number of segments in a submap (to save computation)
 
-    # the following are applicable only if force_fill_submaps is false -----------
-    submap_radius: float = 15.0              # Radius of submap in meters. If set to None, segments 
-                                            #    are never excluded from submaps based on distance 
-                                            #    (though they may still be pruned)
-    submap_center_dist: float = 10.0         # Distance between submap centers in meters
-    submap_center_time: float = 50.0         # time threshold between segments and submap center times
-    submap_pruning_method: str = 'distance' # Metric for pruning segments in a submap: 
-                                            #    ('time', 'distance') -> max gets pruned
-    # ----------------------------------------------------------------------------
-    submap_descriptor: str = None           # Type of submap descriptor. Either 'none', 'mean_semantic', or 'mean_frame_descriptor'.
-    submap_descriptor_thresh: float = 0.8    # ROMAN object matching will only be run if submap 
-                                            #    descriptor cosine similarity is above this threshold.
+    # the following is applicable only if force_fill_submaps is true -----------------------------------
+    submap_overlap: int = int(0.5 * submap_max_size)        # Number of overlapping segments between submaps
 
+    # the following are applicable only if force_fill_submaps is false ---------------------------------
+    submap_radius: float = 15.0                             # Radius of submap in meters. If set to None, segments 
+                                                            #    are never excluded from submaps based on distance 
+                                                            #    (though they may still be pruned)
+    submap_center_dist: float = 10.0                        # Distance between submap centers in meters
+    submap_center_time: float = 50.0                        # time threshold between segments and submap center times
+    submap_pruning_method: str = 'distance'                 # Metric for pruning segments in a submap: 
+                                                            #    ('time', 'distance') -> max gets pruned
+    # --------------------------------------------------------------------------------------------------
+    submap_descriptor: Union[str, None] = None              # Type of submap descriptor. Either 'none', 'mean_semantic', 'mean_frame_descriptor', 
+                                                            #    or 'stacked_frame_descriptors'.
+    frame_descriptor_dist: float = None                     # If submap_descriptor=='stacked_frame_descriptors', dist threshold to sequentially
+                                                            #    add a new frame descriptors to each submap descriptor
+    submap_descriptor_thresh: float = 0.8                   # ROMAN object matching will only be run if submap 
+                                                            #    descriptor cosine similarity is above this threshold.
 
-    single_robot_lc: bool = False           # If true, do not try and perform loop closures with submaps
-                                            # nearby in time
-    single_robot_lc_time_thresh: float = 50.0   # Time threshold for single robot loop closure
-    force_rm_lc_roll_pitch: bool = True     # If true, remove parts of rotation about x or y axes
-    force_rm_upside_down: bool = True       # If true, assumes upside down submap rotations are incorrect
-    use_object_bottom_middle: bool = False  # If true, uses the bottom middle of the object as a reference
-                                            # point for registration rather than the center of the object
+    single_robot_lc: bool = False                           # If true, do not try and perform loop closures with submaps
+                                                            #   nearby in time
+    single_robot_lc_time_thresh: float = 50.0               # Time threshold for single robot loop closure
+    force_rm_lc_roll_pitch: bool = True                     # If true, remove parts of rotation about x or y axes
+    force_rm_upside_down: bool = True                       # If true, assumes upside down submap rotations are incorrect
+    use_object_bottom_middle: bool = False                  # If true, uses the bottom middle of the object as a reference
+                                                            #   point for registration rather than the center of the object
     
     # registration params
     sigma: float = 0.4

From 1dd897898a5fbe4d3b38b2ded00d99a8c1499aaf Mon Sep 17 00:00:00 2001
From: "Qingyuan (Andy) Li" <andyli27@mit.edu>
Date: Fri, 21 Nov 2025 22:36:12 -0500
Subject: [PATCH 3/6] tested demo params

---
 params/demo/fastsam.yaml        | 3 +--
 params/demo/submap_align.yaml   | 4 +++-
 params/demo_no_gpu/fastsam.yaml | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/params/demo/fastsam.yaml b/params/demo/fastsam.yaml
index 4a75fc1..c4074f7 100644
--- a/params/demo/fastsam.yaml
+++ b/params/demo/fastsam.yaml
@@ -16,5 +16,4 @@ semantics: 'dino'
 yolo_imgsz: [256, 256]
 depth_scale: 1000.0
 max_depth: 7.5
-
-frame_descriptor: 'dino-gmp'
\ No newline at end of file
+frame_descriptor: 'dino-gem'
\ No newline at end of file
diff --git a/params/demo/submap_align.yaml b/params/demo/submap_align.yaml
index af2482b..f154d85 100644
--- a/params/demo/submap_align.yaml
+++ b/params/demo/submap_align.yaml
@@ -14,4 +14,6 @@ cosine_min: 0.5
 cosine_max: 0.7
 semantics_dim: 768
 
-submap_descriptor: 'mean_frame_descriptor'
\ No newline at end of file
+submap_descriptor: 'stacked_frame_descriptors'
+frame_descriptor_dist: 10.0
+submap_descriptor_thresh: 0.8
\ No newline at end of file
diff --git a/params/demo_no_gpu/fastsam.yaml b/params/demo_no_gpu/fastsam.yaml
index 18510b5..fae3e41 100644
--- a/params/demo_no_gpu/fastsam.yaml
+++ b/params/demo_no_gpu/fastsam.yaml
@@ -15,4 +15,5 @@ plane_filter_params: [3.0, 1.0, 0.2]
 semantics: 'none'
 yolo_imgsz: [256, 256]
 depth_scale: 1000.0
-max_depth: 7.5
\ No newline at end of file
+max_depth: 7.5
+frame_descriptor: 'none'
\ No newline at end of file

From fc45e469fe37430da4f150efe605a1553464ca62 Mon Sep 17 00:00:00 2001
From: "Qingyuan (Andy) Li" <andyli27@mit.edu>
Date: Sat, 22 Nov 2025 21:14:40 -0500
Subject: [PATCH 4/6] pr changes

---
 roman/align/results.py       | 7 ++++---
 roman/map/fastsam_wrapper.py | 3 ++-
 roman/map/map.py             | 1 -
 roman/map/mapper.py          | 3 +++
 roman/map/run.py             | 9 ++-------
 roman/utils.py               | 1 -
 6 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/roman/align/results.py b/roman/align/results.py
index d232c66..245f67c 100644
--- a/roman/align/results.py
+++ b/roman/align/results.py
@@ -58,13 +58,14 @@ def plot_align_results(results: SubmapAlignResults, dpi=500):
     
     # if no ground truth, can only show number of associations
     if None in results.submap_io.input_gt_pose_yaml:
-        fig, ax = plt.subplots(2 if show_sim else 1, 1, figsize=(8 if show_sim else 4,4), dpi=dpi)
+        fig, ax = plt.subplots(2 if show_sim else 1, 1, figsize=(8 if show_sim else 4, 4), dpi=dpi)
+        ax = np.array(ax).reshape(-1, 1)
         mp = ax[0, 0].imshow(results.clipper_num_associations, cmap='viridis', vmin=0)
         fig.colorbar(mp, fraction=0.04, pad=0.04)
         ax.set_title("Number of Associations")
         
         if show_sim:
-            mp = ax[1, 0].imshow(results.similarity_mat, cmap='viridis', vmin=0)
+            mp = ax[1, 0].imshow(results.similarity_mat, cmap='viridis', vmin=0.0, vmax=1.0)
             fig.colorbar(mp, fraction=0.04, pad=0.04)
             ax[1, 0].set_title("Similarity Score")
         
@@ -105,7 +106,7 @@ def plot_align_results(results: SubmapAlignResults, dpi=500):
     ax[2, 0].set_title("Number of Associations")
     
     if show_sim:
-        mp = ax[2, 1].imshow(results.similarity_mat, cmap='viridis', vmin=0)
+        mp = ax[2, 1].imshow(results.similarity_mat, cmap='viridis', vmin=0.0, vmax=1.0)
         fig.colorbar(mp, fraction=0.04, pad=0.04)
         ax[2, 1].set_title("Similarity Score")
 
diff --git a/roman/map/fastsam_wrapper.py b/roman/map/fastsam_wrapper.py
index a19c72c..6f2785a 100644
--- a/roman/map/fastsam_wrapper.py
+++ b/roman/map/fastsam_wrapper.py
@@ -272,7 +272,7 @@ def run(self, t, pose, img, depth_data=None):
         
         # run fastsam
         masks = self._process_img(img, ignore_mask=ignore_mask, keep_mask=keep_mask)
-                
+        
         if self.semantics == 'dino':
             # Process the image for DINO
             dino_shape = 768
@@ -288,6 +288,7 @@ def run(self, t, pose, img, depth_data=None):
                 model_output_patches=dino_output_patches,
                 img_shape=img.shape
             )
+            dino_features = self.unapply_rotation(dino_features)
             
         frame_descriptor = None
         if self.frame_descriptor_type is not None:
diff --git a/roman/map/map.py b/roman/map/map.py
index a80aaeb..7a2b5ce 100644
--- a/roman/map/map.py
+++ b/roman/map/map.py
@@ -147,7 +147,6 @@ def similarity(cls, submap1, submap2):
         
         elif len(desc1.shape) == len(desc2.shape) == 2:
             # maximum piecewise cosine similarity
-            sims = []
             desc1 = desc1.reshape(desc1.shape[0], 1, desc1.shape[1])  # (N1, 1, D)
             desc2 = desc2.reshape(1, desc2.shape[0], desc2.shape[1])  # (1, N2, D)
             norm_prods = np.linalg.norm(desc1, axis=2) * np.linalg.norm(desc2, axis=2)  # (N1, N2)
diff --git a/roman/map/mapper.py b/roman/map/mapper.py
index bdb7d6e..3ee08af 100644
--- a/roman/map/mapper.py
+++ b/roman/map/mapper.py
@@ -53,6 +53,9 @@ def update(self, t: float, pose: np.array, observations: List[Observation], fram
         if frame_descriptor is not None:
             self.frame_descriptors_history.append(frame_descriptor)
         
+        if len(observations) == 0: # nothing to update
+            return
+
         # store last pose
         self.last_pose = pose.copy()
         
diff --git a/roman/map/run.py b/roman/map/run.py
index 0d018b4..c37e3b5 100644
--- a/roman/map/run.py
+++ b/roman/map/run.py
@@ -154,13 +154,8 @@ def update_segment_track(self, t, observations, pose_odom_camera, img, frame_des
                 if bbox is not None:
                     reprojected_bboxs.append((segment.id, bbox))
 
-        if len(observations) > 0:
-            self.mapper.update(t, pose_odom_camera, observations, frame_descriptor)
-        else:
-            self.mapper.poses_flu_history.append(pose_odom_camera @ self.mapper._T_camera_flu)
-            self.mapper.times_history.append(t)
-            if frame_descriptor is not None:
-                self.mapper.frame_descriptors_history.append(frame_descriptor)
+        # update mapper with new frame
+        self.mapper.update(t, pose_odom_camera, observations, frame_descriptor)
             
         if self.viz_map or self.viz_observations or self.viz_3d:
             img_ret = self.draw(t, img, pose_odom_camera, observations, reprojected_bboxs)
diff --git a/roman/utils.py b/roman/utils.py
index 38c8774..6bef1ff 100644
--- a/roman/utils.py
+++ b/roman/utils.py
@@ -12,7 +12,6 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-import torch
 from typing import List
 from scipy.spatial.transform import Rotation as Rot
 from os.path import expandvars, expanduser

From a3c005a494b6d9ffc1afa1af637e73c13cfaa52d Mon Sep 17 00:00:00 2001
From: "Qingyuan (Andy) Li" <andyli27@mit.edu>
Date: Thu, 27 Nov 2025 03:45:42 -0500
Subject: [PATCH 5/6] refactor submap descriptor extraction for roman_ros

---
 roman/map/map.py | 73 +++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/roman/map/map.py b/roman/map/map.py
index 7a2b5ce..20ac7d9 100644
--- a/roman/map/map.py
+++ b/roman/map/map.py
@@ -199,6 +199,40 @@ def load_roman_map(map_file: str) -> ROMANMap:
         roman_map = pickle.load(f)
         assert type(roman_map) == ROMANMap
         return roman_map
+    
+def extract_submap_descriptors(submaps: List[Submap], descriptor_times: List[float], descriptors: List[np.ndarray], \
+                               poses: List[np.ndarray], submap_params: SubmapParams):
+    assert descriptors is not None, "ROMAN map must have frame descriptors to compute submap descriptors from them."
+    map_times_np = np.array(descriptor_times)
+    descriptors_np = np.vstack(descriptors)
+        
+    if submap_params.submap_descriptor == 'mean_frame_descriptor':
+        for submap in submaps:
+            frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+            submap.descriptor = descriptors_np[frame_mask].mean(axis=0)
+            
+    elif submap_params.submap_descriptor == 'stacked_frame_descriptors':
+        if submap_params.frame_descriptor_dist is None:
+            for submap in submaps:
+                frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+                submap.descriptor = descriptors_np[frame_mask]
+        else:
+            dist_thresh = submap_params.frame_descriptor_dist
+            map_pos_np = np.array([pose[:3,3] for pose in poses])
+            
+            for submap in submaps:
+                frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
+                frame_descriptors = descriptors_np[frame_mask]
+                frame_pos = map_pos_np[frame_mask]
+                
+                stacked_descriptors = []
+                last_pose = None
+                for fd, fp in zip(frame_descriptors, frame_pos):
+                    if (last_pose is None or np.linalg.norm(fp - last_pose) >= dist_thresh):
+                        stacked_descriptors.append(fd)
+                        last_pose = fp
+                    
+                submap.descriptor = np.vstack(stacked_descriptors)
 
 def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams, 
                            gt_flu_pose_data: PoseData=None) -> List[Submap]:
@@ -305,38 +339,13 @@ def submaps_from_roman_map(roman_map: ROMANMap, submap_params: SubmapParams,
             submap.descriptor = np.mean([seg.semantic_descriptor for seg in submap.segments], axis=0).flatten()
             
     elif submap_params.submap_descriptor is not None:
-        # using frame descriptors
-        assert roman_map.descriptors is not None, "ROMAN map must have frame descriptors to compute submap descriptors from them."
-        map_times_np = np.array(roman_map.times)
-        descriptors_np = np.vstack(roman_map.descriptors)
-            
-        if submap_params.submap_descriptor == 'mean_frame_descriptor':
-            for submap in submaps:
-                frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
-                submap.descriptor = descriptors_np[frame_mask].mean(axis=0)
-                
-        elif submap_params.submap_descriptor == 'stacked_frame_descriptors':
-            if submap_params.frame_descriptor_dist is None:
-                for submap in submaps:
-                    frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
-                    submap.descriptor = descriptors_np[frame_mask]
-            else:
-                dist_thresh = submap_params.frame_descriptor_dist
-                map_pos_np = np.array([pose[:3,3] for pose in roman_map.trajectory])
-                
-                for submap in submaps:
-                    frame_mask = (map_times_np >= submap.first_seen) & (map_times_np <= submap.last_seen)
-                    frame_descriptors = descriptors_np[frame_mask]
-                    frame_pos = map_pos_np[frame_mask]
-                    
-                    stacked_descriptors = []
-                    last_pose = None
-                    for fd, fp in zip(frame_descriptors, frame_pos):
-                        if (last_pose is None or np.linalg.norm(fp - last_pose) >= dist_thresh):
-                            stacked_descriptors.append(fd)
-                            last_pose = fp
-                        
-                    submap.descriptor = np.vstack(stacked_descriptors)
+        extract_submap_descriptors(
+            submaps=submaps,
+            descriptor_times=roman_map.times,
+            descriptors=roman_map.descriptors,
+            poses=roman_map.trajectory,
+            submap_params=submap_params
+        )
                             
     return submaps
 

From 8c35f84fe368babb04cd9a1036eab65e52524aa3 Mon Sep 17 00:00:00 2001
From: Mason Peterson <mbpeterson70@gmail.com>
Date: Mon, 1 Dec 2025 16:19:02 -0500
Subject: [PATCH 6/6] update setup.py version/info

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index ccb3170..9a4ff3a 100644
--- a/setup.py
+++ b/setup.py
@@ -9,11 +9,11 @@
 
 setup(
     name='roman',
-    version='0.1.3',
+    version='0.1.4',
     description=project_description,
-    url='url',
-    author='Mason Peterson, Lucas Jia, and Yulun Tian',
-    author_email='masonbp@mit.edu, yixuany@mit.edu, yut034@ucsd.edu',
+    url='https://acl.mit.edu/roman',
+    author='Mason Peterson, Lucas Jia, Yulun Tian, and Andy Li',
+    author_email='masonbp@mit.edu, yixuany@mit.edu, yut034@ucsd.edu, andyli27@mit.edu',
     license='MIT',
     packages=find_packages(),
     install_requires=[