Skip to content

Commit 2683f20

Browse files
committed
Add paper
1 parent d327965 commit 2683f20

File tree

3 files changed

+39
-15
lines changed

3 files changed

+39
-15
lines changed

source/_data/SymbioticLab.bib

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2125,19 +2125,35 @@ @PhDThesis{amberljc:dissertation
21252125
}
21262126
21272127
@Article{tetriserve:arxiv25,
2128-
author = {Runyu Lu and Shiqi He and Wenxuan Tan and Shenggui Li and Ruofan Wu and Jeff J. Ma and Ang Chen and Mosharaf Chowdhury},
2129-
title = {{TetriServe}: Efficient {DiT} Serving for Heterogeneous Image Generation},
2130-
year = {2025},
2131-
month = {Oct},
2132-
volume = {abs/2510.01565},
2133-
archivePrefix = {arXiv},
2134-
eprint = {2510.01565},
2135-
url = {https://arxiv.org/abs/2510.01565},
2136-
publist_confkey = {arXiv:2510.01565},
2137-
publist_link = {paper || https://arxiv.org/abs/2510.01565},
2138-
publist_topic = {Systems + AI},
2139-
publist_abstract = {
2140-
Diffusion Transformer (DiT) models excel at generating high-quality images through iterative denoising steps, but serving them under strict Service Level Objectives (SLOs) is challenging due to their high computational cost, particularly at large resolutions. Existing serving systems use fixed degree sequence parallelism, which is inefficient for heterogeneous workloads with mixed resolutions and deadlines, leading to poor GPU utilization and low SLO attainment. In this paper, we propose step-level sequence parallelism to dynamically adjust the parallel degree of individual requests according to their deadlines. We present TetriServe, a DiT serving system that implements this strategy for highly efficient image generation. Specifically, TetriServe introduces a novel round-based scheduling mechanism that improves SLO attainment: (1) discretizing time into fixed rounds to make deadline-aware scheduling tractable, (2) adapting parallelism at the step level and minimize GPU hour consumption, and (3) jointly packing requests to minimize late completions. Extensive evaluation on state-of-the-art DiT models shows that TetriServe achieves up to 32% higher SLO attainment compared to existing solutions without degrading image quality.
2141-
}
2128+
author = {Runyu Lu and Shiqi He and Wenxuan Tan and Shenggui Li and Ruofan Wu and Jeff J. Ma and Ang Chen and Mosharaf Chowdhury},
2129+
title = {{TetriServe}: Efficient {DiT} Serving for Heterogeneous Image Generation},
2130+
year = {2025},
2131+
month = {Oct},
2132+
volume = {abs/2510.01565},
2133+
archivePrefix = {arXiv},
2134+
eprint = {2510.01565},
2135+
url = {https://arxiv.org/abs/2510.01565},
2136+
publist_confkey = {arXiv:2510.01565},
2137+
publist_link = {paper || https://arxiv.org/abs/2510.01565},
2138+
publist_topic = {Systems + AI},
2139+
publist_abstract = {
2140+
Diffusion Transformer (DiT) models excel at generating high-quality images through iterative denoising steps, but serving them under strict Service Level Objectives (SLOs) is challenging due to their high computational cost, particularly at large resolutions. Existing serving systems use fixed degree sequence parallelism, which is inefficient for heterogeneous workloads with mixed resolutions and deadlines, leading to poor GPU utilization and low SLO attainment. In this paper, we propose step-level sequence parallelism to dynamically adjust the parallel degree of individual requests according to their deadlines. We present TetriServe, a DiT serving system that implements this strategy for highly efficient image generation. Specifically, TetriServe introduces a novel round-based scheduling mechanism that improves SLO attainment: (1) discretizing time into fixed rounds to make deadline-aware scheduling tractable, (2) adapting parallelism at the step level and minimizing GPU hour consumption, and (3) jointly packing requests to minimize late completions. Extensive evaluation on state-of-the-art DiT models shows that TetriServe achieves up to 32% higher SLO attainment compared to existing solutions without degrading image quality.
2141+
}
21422142
}
21432143
2144+
@InProceedings{mlenergy-benchmark:neuripsdb25,
2145+
title = {The {ML.ENERGY} Benchmark: Toward Automated Inference Energy Measurement and Optimization},
2146+
author = {Jae-Won Chung and Jiachen Liu and Jeff J. Ma and Ruofan Wu and Oh Jun Kweon and Yuxuan Xia and Zhiyu Wu and Mosharaf Chowdhury},
2147+
year = {2025},
2148+
month = {Dec},
2149+
booktitle = {NeurIPS D\&B},
2150+
publist_topic = {Systems + AI},
2151+
publist_topic = {Energy-Efficient Systems},
2152+
publist_confkey = {NeurIPS'25 D&B},
2153+
publist_link = {paper || mlenergy-benchmark-neuripsdb25.pdf},
2154+
publist_link = {code || https://github.com/ml-energy/benchmark},
2155+
publist_badge = {Spotlight Paper (acceptance rate: 2.81%)},
2156+
publist_abstract = {
2157+
As the adoption of Generative AI in real-world services grows explosively, energy has emerged as a critical bottleneck resource. However, energy remains a metric that is often overlooked, under-explored, or poorly understood in the context of building ML systems. We present the ML.ENERGY Benchmark, a benchmark suite and tool for measuring inference energy consumption under realistic service environments, and the corresponding ML.ENERGY Leaderboard, which have served as a valuable resource for those hoping to understand and optimize the energy consumption of their generative AI services. In this paper, we explain four key design principles for benchmarking ML energy we have acquired over time, and then describe how they are implemented in the ML.ENERGY Benchmark. We then highlight results from the early 2025 iteration of the benchmark, including energy measurements of 40 widely used model architectures across 6 different tasks, case studies of how ML design choices impact energy consumption, and how automated optimization recommendations can lead to significant (sometimes more than 40\%) energy savings without changing what is being computed by the model. The ML.ENERGY Benchmark is open-source and can be easily extended to various customized models and application scenarios.
2158+
}
2159+
}
Binary file not shown.

source/publications/index.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -446,11 +446,19 @@ venues:
446446
date: 2024-12-09
447447
url: https://neurips.cc/Conferences/2024
448448
acceptance: 25.8%
449+
'NeurIPS D&B':
450+
category: Conferences
451+
occurrences:
452+
- key: NeurIPS'25 D&B
453+
name: The Thirty-ninth Conference on Neural Information Processing Systems Track on Datasets and Benchmarks
454+
date: 2025-12-02
455+
url: https://neurips.cc/Conferences/2025
456+
acceptance: 24.91%
449457
{% endpublist %}
450458

451459
---
452460

453461
{% note default %}
454462
#### Copyright notice
455463
The documents listed above have been provided by the contributing authors as a means to ensure timely dissemination of scholarly and technical work on a noncommercial basis. Copyright and all rights therein are maintained by the authors or by other copyright holders, notwithstanding that they have offered their works here electronically. It is understood that all persons copying this information will adhere to the terms and constraints invoked by each author’s copyright. These works may not be reposted without the explicit permission of the copyright holder.
456-
{% endnote %}
464+
{% endnote %}

0 commit comments

Comments
 (0)