-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
840 lines (785 loc) · 45.8 KB
/
index.html
File metadata and controls
840 lines (785 loc) · 45.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>SAM2Act</title>
<meta name="description" content="SAM2Act">
<meta name="keywords" content="Foundation Model, Affordance Prediction">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="./static/css/bulma.min.css">
<link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
<link rel="stylesheet" href="./static/css/bulma-slider.min.css">
<!-- <link rel="stylesheet" href="./static/css/fontawesome.all.min.css"> -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="./static/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<!-- <script defer src="./static/js/fontawesome.all.min.js"></script> -->
<script src="./static/js/bulma-carousel.min.js"></script>
<script src="./static/js/bulma-slider.min.js"></script>
<script src="./static/js/index.js"></script>
<!-- <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
<script>hljs.highlightAll();</script> -->
<!-- NOTE(review): the original polyfill.io domain was compromised in 2024;
     serve the ES6 polyfill from Cloudflare's mirror instead. -->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6">
</script>
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/tex-mml-chtml.min.js">
</script>
<script type="text/javascript">
// Attach an error logger to every <video> element whose src is swapped
// dynamically by the dropdown handlers below, so a broken video path shows
// up in the console instead of failing silently. Runs once from body onload.
function init() {
  const monitoredVideoIds = [
    "simulation-menu-video-sam2act",
    "simulation-menu-video-baseline",
    "comparison-menu-video-sam2act",
    "comparison-menu-video-rvt2",
    "memorybench-menu-video-method",
  ];
  for (const videoId of monitoredVideoIds) {
    const video = document.getElementById(videoId);
    // Capture phase (third arg true), matching the original registrations.
    video.addEventListener("error", () => {
      console.log("Error loading video: ", video.src);
    }, true);
  }
}
// Load the RLBench clip for the task/baseline currently selected in the
// "Results on RLBench 18 Tasks" dropdowns into both players, then restart
// playback at 1.75x. Called from body onload and from both <select> onchange.
function updateTaskVideo() {
  const task = document.getElementById("simulation-menu-task-name").value;
  const baseline = document.getElementById("simulation-menu-baseline-name").value;
  // SAM2Act (ours) player.
  const video_sam2act = document.getElementById("simulation-menu-video-sam2act");
  // Fixed: original line ended with a stray double semicolon (";;").
  video_sam2act.src = `static/videos/simulation/sam2act/${task}_fixed.mp4`;
  video_sam2act.playbackRate = 1.75;
  video_sam2act.play();
  // Baseline player ("rvt" = RVT-2, "same" = SAM-E, per the dropdown values).
  const video_baseline = document.getElementById("simulation-menu-video-baseline");
  video_baseline.src = `static/videos/simulation/${baseline}/${task}_fixed.mp4`;
  video_baseline.playbackRate = 1.75;
  video_baseline.play();
}
// Load the in-distribution real-world clip for the selected task/episode
// into both the SAM2Act and RVT-2 players and restart playback at 1.75x.
// Called from body onload and the comparison-section dropdowns' onchange.
function updateComparisonVideo() {
  const task = document.getElementById("comparison-menu-task-name").value;
  const episode = document.getElementById("comparison-menu-episode-name").value;
  // Both players follow the same URL scheme; only the model token differs.
  const restart = (elementId, model) => {
    const player = document.getElementById(elementId);
    player.src = `static/videos/comparison/${task}/vid_${model}_in_distribution_${episode}.mp4`;
    player.playbackRate = 1.75;
    player.play();
  };
  restart("comparison-menu-video-sam2act", "sam2act");
  restart("comparison-menu-video-rvt2", "rvt2");
}
// Load the MemoryBench clip matching the selected task, method, and episode
// into the single MemoryBench player and restart playback at 1.75x.
// Called from body onload and all three MemoryBench dropdowns' onchange.
function updateMemoryBenchVideo() {
  const taskName = document.getElementById("memorybench-menu-task-name").value;
  const methodName = document.getElementById("memorybench-menu-method-name").value;
  const episodeName = document.getElementById("memorybench-menu-episode-name").value;
  const player = document.getElementById("memorybench-menu-video-method");
  player.src = `static/videos/memorybench/${methodName}/${taskName}_${episodeName}_fixed.mp4`;
  player.playbackRate = 1.75;
  player.play();
}
// Uniform random integer in [0, max). Used to pick a Colosseum episode.
function getRandomInt(max) {
  const scaled = Math.random() * max;
  return Math.floor(scaled);
}
// Pick a random Colosseum evaluation episode and load its rendered clip into
// the Colosseum demo player at 1.75x. Triggered by the "Random Episode"
// button and once from body onload.
function updateColosseumVideo() {
// Episode clips published under static/videos/colosseum/. Commented-out
// entries are episodes without a rendered video, kept for reference.
const available_episodes = [
"basketball_in_hoop_0_s0",
"basketball_in_hoop_0_s1",
"basketball_in_hoop_0_s2",
// "close_box_1_s0",
"close_box_1_s1",
// "close_box_1_s2",
"close_laptop_lid_2_s0",
"close_laptop_lid_2_s1",
// "close_laptop_lid_2_s2",
// "hockey_5_s0",
// "hockey_5_s1",
// "hockey_5_s2",
// "setup_chess_14_s0",
// "setup_chess_14_s1",
// "setup_chess_14_s2",
"insert_onto_square_peg_6_s0",
"insert_onto_square_peg_6_s1",
"insert_onto_square_peg_6_s2",
"stack_cups_13_s0",
// "stack_cups_13_s1",
// "stack_cups_13_s2",
// "straighten_rope_10_s0",
// "straighten_rope_10_s1",
"straighten_rope_10_s2",
"meat_on_grill_8_s0",
"meat_on_grill_8_s1",
"meat_on_grill_8_s2",
"turn_oven_on_12_s0",
"turn_oven_on_12_s1",
"turn_oven_on_12_s2",
// "move_hanger_9_s0",
// "move_hanger_9_s1",
"move_hanger_9_s2",
// "empty_dishwasher_3_s0",
// "empty_dishwasher_3_s1",
// "empty_dishwasher_3_s2",
// "wipe_desk_11_s0",
// "wipe_desk_11_s1",
"wipe_desk_11_s2",
"place_wine_at_rack_location_7_s0",
"place_wine_at_rack_location_7_s1",
"place_wine_at_rack_location_7_s2",
// "get_ice_from_fridge_4_s0",
// "get_ice_from_fridge_4_s1",
"get_ice_from_fridge_4_s2"
];
// Uniform random pick via the getRandomInt helper defined above.
const ep_index = getRandomInt(available_episodes.length);
const uri = `static/videos/colosseum/${available_episodes[ep_index]}_fixed.mp4`;
const video = document.getElementById("colosseum-menu-video-episode");
video.src = uri;
// 1.75x playback matches every other player on this page.
video.playbackRate = 1.75;
video.play();
}
// Load the out-of-distribution real-world clip for the selected task/episode
// into both the SAM2Act and RVT-2 players and restart playback at 1.75x.
// Called from body onload and the out-distribution dropdowns' onchange.
function updateComparisonOutDistVideo() {
  const task = document.getElementById("outdistribution-menu-task-name").value;
  const episode = document.getElementById("outdistribution-menu-episode-name").value;
  const restart = (elementId, modelToken) => {
    const player = document.getElementById(elementId);
    player.src = `static/videos/out_of_distribution/${task}/vid_${modelToken}_turn_on_out_${episode}.mp4`;
    player.playbackRate = 1.75;
    player.play();
  };
  restart("outdistribution-menu-video-sam2act", "sam2act");
  // NOTE: the on-disk filenames use "rvt" (not "rvt2") — see the default
  // <source> in the markup below, which uses vid_rvt_turn_on_out_ep0.mp4.
  restart("outdistribution-menu-video-rvt2", "rvt");
}
</script>
</head>
<body onload="init(); updateTaskVideo(); updateComparisonVideo(); updateMemoryBenchVideo(); updateColosseumVideo(); updateComparisonOutDistVideo();">
<!-- Title / Authors Info -->
<section class="hero">
<div class="hero-body" style="padding-bottom: 0 !important;">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title" style="margin-bottom: 0 !important; display: flex; align-items: center; justify-content: center;">
<img src="static/images/img_logo.png" alt="Emoji" style="height: 60px; vertical-align: middle; margin-right: 10px;">
SAM2Act:
</h1>
<h2 class="title is-2 publication-title">
Integrating Visual Foundation Model with
<br>
A Memory Architecture for Robotic Manipulation
</h2>
<div class="is-size-5 publication-authors">
<span class="author-block">
<a target="_blank" href="https://hq-fang.github.io/">Haoquan Fang</a><sup>1</sup>,
</span>
<span class="author-block">
<a target="_blank" href="https://www.markusgrotz.com/">Markus Grotz</a><sup>1</sup>,
</span>
<span class="author-block">
<a target="_blank" href="https://wpumacay.github.io/">Wilbert Pumacay</a><sup>2</sup>,
</span>
<span class="author-block">
<a target="_blank" href="https://helen9975.github.io/">Yi Ru Wang</a><sup>1</sup>,
</span>
<br>
<span class="author-block">
<a target="_blank" href="https://homes.cs.washington.edu/~fox/">Dieter Fox</a><sup>1, 3</sup>*,
</span>
<span class="author-block">
<a target="_blank" href="https://ranjaykrishna.com/">Ranjay Krishna</a><sup>1, 4</sup>*,
</span>
<span class="author-block">
<a target="_blank" href="https://duanjiafei.com/">Jiafei Duan</a><sup>1, 4</sup>*
</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block"><sup>1</sup>University of Washington</span>
<span class="author-block"><sup>2</sup>Universidad Católica San Pablo</span>
<br>
<span class="author-block"><sup>3</sup>NVIDIA</span>
<span class="author-block"><sup>4</sup>Allen Institute for AI</span>
</div>
<div class="is-size-6 publication-authors">
<span class="author-block">*Equal Advising</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">
<p style="color:rgb(147, 36, 36);">International Conference on Machine Learning (ICML) 2025</p>
</span>
<br>
<span class="author-block">
<p style="color:rgb(147, 36, 36);">RemembeRL Workshop @ CoRL 2025, <b>Best Paper Award</b></p>
</span>
</div>
<div class="column has-text-centered">
<!-- ArXiv link -->
<span class="link-block">
<a target="_blank" href="https://arxiv.org/abs/2501.18564" class="external-link button is-normal is-rounded is-dark">
<!-- <span class="icon"><i class="ai ai-arxiv"></i></span> -->
<span class="icon"><img src="static/images/arxiv_icon_small.svg" style="height: 16px;" alt=""></span>
<span>arXiv</span>
</a>
</span>
<!-- Code Link. -->
<span class="link-block">
<a target="_blank" href="https://github.com/sam2act/sam2act" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="fab fa-github"></i></span>
<span>Code</span>
</a>
</span>
<!-- MemoryBench Link. -->
<span class="link-block">
<a target="_blank" href="https://huggingface.co/datasets/hqfang/memorybench" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><img src="static/images/hf_icon.svg" style="height: 24px;"/></span>
<span>MemoryBench</span>
</a>
</span>
<!-- X Post Link. -->
<span class="link-block">
<a target="_blank" href="https://x.com/DJiafei/status/1884954101697699940" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="fab fa-x-twitter"></i></span>
<span>Post</span>
</a>
</span>
<!-- AI Index Report Link. -->
<span class="link-block">
<a target="_blank" href="https://hai-production.s3.amazonaws.com/files/hai_ai_index_report_2025.pdf#page=150" class="external-link button is-normal is-rounded is-dark">
<span class="icon"><i class="fa-solid fa-file"></i></span>
<span>AI Index Report</span>
</a>
</span>
<div class="is-size-6 publication-authors">
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<!-- Video Teaser -->
<video id="teaser" autoplay muted loop playsinline height="100">
<source src="static/videos/vid_intro.mp4" type="video/mp4">
</video>
<!-- /Video Teaser -->
<br/>
<br/>
<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Robotic manipulation systems operating in diverse, dynamic environments must exhibit three critical
abilities: multitask interaction, generalization to unseen scenarios, and spatial memory. While significant
progress has been made in robotic manipulation, existing approaches often fall short in generalization to
complex environmental variations and addressing memory-dependent tasks. To bridge this gap, we introduce
<b>SAM2Act</b>, a multi-view robotic transformer-based policy that leverages multi-resolution upsampling
with visual representations from a large-scale foundation model. SAM2Act achieves a state-of-the-art average
success rate of <b>86.8% across 18 tasks</b> in the RLBench benchmark, and demonstrates robust generalization
on <i>The Colosseum</i> benchmark, with only a <b>4.3% performance gap</b> under diverse environmental
perturbations. Building on this foundation, we propose <b>SAM2Act+</b>, a memory-based architecture inspired
by SAM2, which incorporates a memory bank, an encoder, and an attention mechanism to enhance spatial memory.
To address the need for evaluating memory-dependent tasks, we introduce <i>MemoryBench</i>, a novel benchmark
designed to assess spatial memory and action recall in robotic manipulation. SAM2Act+ achieves
<b>competitive performance on <i>MemoryBench</i></b>, significantly outperforming existing approaches and
pushing the boundaries of memory-based robotic systems.
</p>
</div>
</div>
</div>
<!--/ Abstract. -->
</div>
</section>
<section class="hero is-light is-small">
<div class="hero-body">
<div class="container">
<div class="column has-text-centered">
<h3 class="title is-5">Real World Results</h3>
</div>
<div id="results-carousel" class="carousel results-carousel">
<div class="item item-steve">
<video id="steve" autoplay muted loop playsinline>
<source src="static/videos/real_world/vid_rw_press_buttons.mp4" type="video/mp4">
</video>
</div>
<div class="item item-fullbody">
<video id="fullbody" autoplay muted loop playsinline>
<source src="static/videos/real_world/vid_rw_press_button_block.mp4" type="video/mp4">
</video>
</div>
<div class="item item-shiba">
<video id="shiba" autoplay muted loop playsinline>
<source src="static/videos/real_world/vid_rw_stack_block.mp4" type="video/mp4">
</video>
</div>
<div class="item item-ender">
<video id="ender" autoplay muted loop playsinline>
<source src="static/videos/real_world/vid_rw_turn_lamp.mp4" type="video/mp4">
</video>
</div>
</div>
<br>
<br>
<div class="column has-text-centered">
<h3 class="title is-5">Memory Tasks</h3>
</div>
<!-- ids must be document-unique: this second carousel had a duplicate
     id="results-carousel"; the carousel JS attaches via the class. -->
<div id="memory-results-carousel" class="carousel results-carousel">
<div class="item item-patrick">
<video id="patrick" autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/button_and_drawer_simple.mp4" type="video/mp4">
</video>
</div>
<div class="item item-fullcuerpo">
<video id="fullcuerpo" autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/play_with_cube_sequence.mp4" type="video/mp4">
</video>
</div>
<div class="item item-shibar">
<video id="shibar" autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/play_with_two_cubes.mp4" type="video/mp4">
</video>
</div>
<!-- repeated items (for carousel looping) carry no ids to avoid duplicates -->
<div class="item item-patrick">
<video autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/button_and_drawer_simple.mp4" type="video/mp4">
</video>
</div>
<div class="item item-fullcuerpo">
<video autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/play_with_cube_sequence.mp4" type="video/mp4">
</video>
</div>
<div class="item item-shibar">
<video autoplay muted loop playsinline>
<source src="static/videos/memory_tasks/play_with_two_cubes.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</section>
<section class="section">
<div class="container is-max-desktop">
<h2 class="title is-3">Summary</h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_summary.png" style="max-width: 100%; height: auto;" alt="Summary diagram of SAM2Act and SAM2Act+ contributions">
</figure>
<br>
<div class="content has-text-justified">
We introduce SAM2Act, a multi-view robotics transformer-based policy that enhances feature
representation by integrating multi-resolution upsampling with visual embeddings from a large-scale
foundation model. Built on the RVT-2 multi-view transformer, SAM2Act achieves strong multitask
success and generalization. Building on this foundation, we introduce SAM2Act+, which incorporates
a memory-based architecture inspired by SAM2's approach. Using a memory bank, an encoder,
and an attention mechanism, SAM2Act+ enables episodic recall to solve spatial memory-dependent
manipulation tasks.
</div>
</p>
</div>
</div>
<h2 class="title is-3">Overview of SAM2Act and SAM2Act+</h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_pipeline.png" style="max-width: 100%; height: auto;" alt="Architecture pipeline of SAM2Act and SAM2Act+">
</figure>
<br>
<div class="content has-text-justified">
Our method, SAM2Act, enables precise 3D manipulation with strong generalization across
environmental and object-level variations. Building upon the RVT-2 framework, SAM2Act introduces
key architectural innovations that enhance visual feature representation and task-specific reasoning.
The architecture reconstructs a point cloud of the scene, renders it from virtual cameras at orthogonal
views, and employs a two-stage multi-view transformer (coarse-to-fine) to predict action heatmaps.
The coarse branch generates zoom-in heatmaps to localize regions of interest, while the fine branch
refines these into precise action heatmaps.
<br><br>
SAM2Act leverages the pre-trained SAM2 encoder
to extract multi-resolution image embeddings, which are further refined through the multi-resolution
upsampling technique to predict accurate translation heatmaps with minimal information loss. To
address tasks requiring spatial memory, SAM2Act+ extends the SAM2Act architecture by incorporating
memory-based components. These include Memory Bank, Memory Encoder, and Memory Attention,
enabling the model to encode historical actions and condition current observations. This
memory-based policy enhances the agent's ability to predict actions based on past contextual information,
significantly improving performance in tasks that require sequential decision-making.
</div>
</p>
</div>
</div>
<h2 class="title is-3"><i>MemoryBench</i></h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_memorybench.png" style="max-width: 100%; height: auto;" alt="Overview of the three MemoryBench spatial-memory tasks">
</figure>
<br>
<div class="content has-text-justified">
Unlike standard RLBench tasks, many of which involve long-horizon scenarios, our tasks are
specifically designed to require spatial memory. Without such memory, the agent would be forced to
rely on random actions. To create these tasks, we intentionally violate the Markov assumption, which
states that in a Markov Decision Process (MDP), the next observation depends solely on the current
observation and action:
<p>$$ P\bigl(o_{t+1} \mid o_1, a_1, \dots, o_t, a_t\bigr) \;=\; P\bigl(o_{t+1} \mid o_t, a_t\bigr). $$</p>
This assumption implies that knowing only \( o_t \) and \( a_t \) is sufficient to predict \( o_{t+1} \). However, in our
tasks, we design scenarios where two distinct action histories lead to the same observation \( o_t \), but
require different subsequent actions. This forces the agent to recall which action history led to \( o_t \) to
perform the correct next action. Furthermore, we standardized the language instructions to prevent
unintentional leakage of spatial information that could aid the model in memory-based tasks. These
principles guided the development of our spatial memory-based tasks.
</div>
</p>
</div>
</div>
<h2 class="title is-3">Experiments and Results</h2>
<h2 class="title is-4">RLBench 18 Tasks</h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_results_rlbench.png" alt="RLBench 18-task success-rate results table">
</figure>
<br>
<div class="content has-text-justified">
Overall, SAM2Act achieves an average success rate of <b>86.8%</b>±0.5, surpassing the previous best
(RVT-2) by <b>5.4%</b>. A closer look at individual tasks reveals that SAM2Act ranks <b>first in 9 out of 18
tasks</b> and remains <b>highly competitive in 7 others</b>, coming within one successful attempt or 4% of
the best performance. These tasks include Close Jar, Drag Stick, Meat Off Grill, Place Wine, Screw
Bulb, Sweep to Dustpan, and Turn Tap. The largest margin of improvement occurs in Insert Peg,
where SAM2Act <b>exceeds RVT-2 by 44%</b> (approximately <b>2.1×</b>), and in Sort Shape, where it
outperforms RVT-2 by 29%. Both tasks require precise manipulation, underscoring the effectiveness
of SAM2Act's multi-resolution upsampling strategy. These results establish SAM2Act as a <b>leading
policy for complex 3D tasks</b>, highlighting its ability to handle high-precision manipulations - an area
where prior methods have struggled.
</div>
</p>
</div>
</div>
<h2 class="title is-4"><i>The Colosseum</i></h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_results_colosseum.png" alt="The Colosseum generalization results under 13 perturbation categories">
</figure>
<br>
<div class="content has-text-justified">
The results evaluated in the above figure were obtained by training and testing models within the
same environment. However, to truly assess <b>generalization performance</b>, policies must remain
robust against both environmental and object-level perturbations. We therefore trained SAM2Act
and the baseline methods on 20 tasks from The Colosseum benchmark and tested them under 13
different perturbation categories over three runs. <b>SAM2Act exhibits the smallest performance
drop compared to the baselines</b>, with an average decrease of <b>4.3%</b> (standard deviation of 3.59%).
Notably, it proves particularly robust to environmental perturbations – such as changes in lighting,
table color/texture, the addition of distractors, and even camera pose – while also maintaining
competitive performance under object-level perturbations.
</div>
</p>
</div>
</div>
<h2 class="title is-4"><i>MemoryBench</i></h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_results_memorybench.png" alt="MemoryBench success-rate results for SAM2Act+, SAM2Act, and RVT-2">
</figure>
<br>
<div class="content has-text-justified">
In the figure above, we evaluate SAM2Act+ against SoTA 3D BC model, RVT-2 on MemoryBench, training all
models in a single-task setting to isolate memory-related challenges (e.g., opening the wrong drawer
rather than unrelated mid-task failures). This setup ensures that performance differences stem from
memory capabilities. For a random agent, the expected success rates are determined by the number
of possible choices per task: 33% for reopen_drawer (three drawers), 25% for put_block_back
(four patches), and 50% for rearrange_block (two blocks). However, variations in task complexity,
fixed training data, and imbalanced task distributions lead to slight deviations from these baselines.
Our proposed memory-based model, SAM2Act+, demonstrates <b>a strong understanding of spatial
memory</b>, achieving an average success rate of <b>94.3%</b> across all tasks. It <b>outperforms SAM2Act
(without memory) by a huge margin of 39.3%</b> on MemoryBench, highlighting the significant
impact of explicit memory modeling. Note that we made an update to <i>open_drawer</i> task, see more
in our paper's appendix.
</div>
</p>
</div>
</div>
<h2 class="title is-4">Real-robot</h2>
<div class="columns is-centered has-text-centered">
<div class="column">
<p>
<figure>
<img src="static/images/img_results_real.png" style="max-width: 60%; height: auto;" alt="Real-robot experiment results table">
</figure>
<br>
<div class="content has-text-justified">
The table above presents our real-world experiment results, where our method achieves a 75% task success
rate, compared to 43% for RVT-2. SAM2Act significantly outperforms the baseline in high-precision
tasks (60% vs 0%). SAM2Act+ (indicated with *) excels in memory-based tasks, such as (d) Push the same button, which
requires recalling the button's previous location. Here, SAM2Act achieves 70% success, while RVT-2,
relying on random guessing, scores 40%. We also test models' generalization against perturbations
like lighting changes, distractors, and position variations.
</div>
</p>
</div>
</div>
</div>
</section>
<div class="columns is-centered has-text-centered">
<h2 class="title is-4">More Video Results ⬇️</h2>
</div>
<!-- Simulation Results -->
<section class="section">
<div class="container is-max-widescreen">
<h2 class="title">Results on RLBench 18 Tasks</h2>
Task
<div class="select is-small">
<select id="simulation-menu-task-name" onchange="updateTaskVideo()">
<option value="close_jar" selected="selected">Close Jar</option>
<option value="reach_and_drag">Drag Stick</option>
<option value="insert_onto_square_peg">Insert Peg</option>
<option value="meat_off_grill">Meat off Grill</option>
<option value="open_drawer">Open Drawer</option>
<option value="place_cups">Place Cups</option>
<option value="place_wine_at_rack_location">Place Wine</option>
<option value="push_buttons">Push Buttons</option>
<option value="put_groceries_in_cupboard">Put in Cupboard</option>
<option value="put_item_in_drawer">Put in Drawer</option>
<option value="put_money_in_safe">Put in Safe</option>
<option value="light_bulb_in">Screw Bulb</option>
<option value="slide_block_to_color_target">Slide Block</option>
<option value="place_shape_in_shape_sorter">Sort Shape</option>
<option value="stack_blocks">Stack Blocks</option>
<option value="stack_cups">Stack Cups</option>
<option value="sweep_to_dustpan_of_size">Sweep to Dustpan</option>
<option value="turn_tap">Turn Tap</option>
</select>
</div>
Baseline
<div class="select is-small">
<select id="simulation-menu-baseline-name" onchange="updateTaskVideo()">
<option value="rvt" selected="selected">RVT-2</option>
<option value="same">SAM-E</option>
</select>
</div>
<br>
<br>
<div class="columns is-centered">
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">Ours</h2>
<video id="simulation-menu-video-sam2act" autoplay muted loop playsinline>
<source src="static/videos/simulation/sam2act/close_jar_fixed.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">Baseline</h2>
<video id="simulation-menu-video-baseline" autoplay muted loop playsinline>
<source src="static/videos/simulation/rvt/close_jar_fixed.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Colosseum Results -->
<section class="section">
<div class="container is-max-widescreen">
<h2 class="title">Results on <i>The Colosseum</i></h2>
<button onclick="updateColosseumVideo()">Random Episode</button>
<br>
<br>
<div class="columns is-centered">
<div class="column">
<div class="columns is-centered">
<div class="column content">
<video id="colosseum-menu-video-episode" autoplay muted loop playsinline>
<source src="static/videos/colosseum/basketball_in_hoop_0_s0_fixed.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- MemoryBench Results -->
<section class="section">
<div class="container is-max-widescreen">
<h2 class="title">Results on <i>MemoryBench</i></h2>
Task
<div class="select is-small">
<select id="memorybench-menu-task-name" onchange="updateMemoryBenchVideo()">
<option value="button_and_drawer_simple" selected="selected">Reopen Drawer</option>
<option value="play_with_cube_sequence">Put Block Back</option>
<option value="play_with_two_cubes_sequence">Rearrange Block</option>
</select>
</div>
Method
<div class="select is-small">
<select id="memorybench-menu-method-name" onchange="updateMemoryBenchVideo()">
<option value="sam2act+" selected="selected">SAM2Act+</option>
<option value="sam2act">SAM2Act</option>
<option value="rvt">RVT</option>
</select>
</div>
Episode
<div class="select is-small">
<select id="memorybench-menu-episode-name" onchange="updateMemoryBenchVideo()">
<option value="s0" selected="selected">Episode 1</option>
<option value="s1">Episode 2</option>
<option value="s2">Episode 3</option>
<option value="s3">Episode 4</option>
</select>
</div>
<br>
<br>
<div class="columns is-centered">
<div class="column">
<div class="columns is-centered">
<div class="column content">
<video id="memorybench-menu-video-method" autoplay muted loop playsinline>
<source src="static/videos/memorybench/sam2act+/button_and_drawer_simple_s0_fixed.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Comparison -->
<section class="section">
<div class="container is-max-widescreen">
<h2 class="title">In-distribution Real-world Results</h2>
Task
<div class="select is-small">
<select id="comparison-menu-task-name" onchange="updateComparisonVideo()">
<option value="turn_lamp" selected="selected">Turn lamp</option>
<option value="push_buttons">Push buttons</option>
<option value="stack_cube">Stack cube</option>
</select>
</div>
Episode
<div class="select is-small">
<select id="comparison-menu-episode-name" onchange="updateComparisonVideo()">
<option value="ep0" selected="selected">Episode 1</option>
<option value="ep1">Episode 2</option>
<option value="ep2">Episode 3</option>
</select>
</div>
<br>
<br>
<div class="columns is-centered">
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">SAM2Act</h2>
<video id="comparison-menu-video-sam2act" autoplay muted loop playsinline>
<source src="static/videos/comparison/turn_lamp/vid_sam2act_in_distribution_ep0.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">RVT-2</h2>
<video id="comparison-menu-video-rvt2" autoplay muted loop playsinline>
<source src="static/videos/comparison/turn_lamp/vid_rvt2_in_distribution_ep0.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Comparison Out-of-distribution -->
<section class="section">
<div class="container is-max-widescreen">
<h2 class="title">Out-distribution Real-world Results</h2>
Task
<div class="select is-small">
<select id="outdistribution-menu-task-name" onchange="updateComparisonOutDistVideo()">
<option value="turn_lamp" selected="selected">Turn lamp</option>
<option value="push_buttons">Push buttons</option>
<option value="stack_cubes">Stack Cubes</option>
<!--<option value="stack_cube">Stack cube</option>-->
</select>
</div>
Episode
<div class="select is-small">
<select id="outdistribution-menu-episode-name" onchange="updateComparisonOutDistVideo()">
<option value="ep0" selected="selected">Episode 1</option>
<option value="ep1">Episode 2</option>
<option value="ep2">Episode 3</option>
</select>
</div>
<br>
<br>
<div class="columns is-centered">
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">SAM2Act</h2>
<video id="outdistribution-menu-video-sam2act" autoplay muted loop playsinline>
<source src="static/videos/out_of_distribution/turn_lamp/vid_sam2act_turn_on_out_ep0.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
<div class="column">
<div class="columns is-centered">
<div class="column content">
<h2 class="title is-4">RVT-2</h2>
<video id="outdistribution-menu-video-rvt2" autoplay muted loop playsinline>
<source src="static/videos/out_of_distribution/turn_lamp/vid_rvt_turn_on_out_ep0.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
</section>
<section class="section" id="BibTeX">
<div class="container is-max-desktop">
<h2 class="title">BibTeX</h2>
<pre><code>@misc{fang2025sam2act,
title={SAM2Act: Integrating Visual Foundation Model with A Memory Architecture for Robotic Manipulation},
author={Haoquan Fang and Markus Grotz and Wilbert Pumacay and Yi Ru Wang and Dieter Fox and Ranjay Krishna and Jiafei Duan},
year={2025},
eprint={2501.18564},
archivePrefix={arXiv},
primaryClass={cs.RO},
url={https://arxiv.org/abs/2501.18564},
}</code></pre>
</div>
</section>
<footer class="footer" style="padding-bottom: 3rem !important;">
<div class="container">
<div class="columns is-centered">
<div class="column">
<div class="content has-text-centered">
<p>
Website borrowed from <a href="https://github.com/nerfies/nerfies.github.io">NeRFies</a> under a <a
href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0
International</a>
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>