From cf8734ee20c7d497070d91b45c308b98b3d41c28 Mon Sep 17 00:00:00 2001
From: annakong23 <ssk07020@naver.com>
Date: Sun, 5 Oct 2025 13:24:59 +0900
Subject: [PATCH 1/5] fix typo and spacing

#1006
---
 .../nlp/sequence_models_tutorial.py           | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py
index 6cea3987f..ff0c1df28 100644
--- a/beginner_source/nlp/sequence_models_tutorial.py
+++ b/beginner_source/nlp/sequence_models_tutorial.py
@@ -14,7 +14,7 @@
 순환 신경망은 일종의 상태를 유지하는 네트워크입니다. 
 예를 들면, 출력은 다음 입력의 일부로 사용될 수 있습니다. 
 정보는 네트워크가 시퀀스를 통과할 때 전파될 수 있습니다. 
-LSTM의 경우에, 시퀀스의 각 요소에 대응하는 *은닉 상태(hidden state)* :math:`h_t` 가 존재하며,
+LSTM의 경우에, 시퀀스의 각 요소에 대응하는 *은닉 상태(hidden state)* :math:`h_t`가 존재하며,
 이는 원칙적으로 시퀀스의 앞부분에 있는 임의 포인트의 정보를 포함할 수 있습니다. 
 우리는 은닉 상태를 이용하여 언어 모델에서의 단어,
 품사 태그 등 무수히 많은 것들을 예측할 수 있습니다. 
@@ -100,25 +100,25 @@
 # `여기 <https://tutorials.pytorch.kr/beginner/nlp/word_embeddings_tutorial.html>`__.
 # 에서 관련 내용을 읽을 수 있습니다.
 #
-# 모델은 다음과 같습니다. 단어가 :math:`w_i \in V` 일 때, 
-# 입력 문장을 :math:`w_1, \dots, w_M` 라고 합시다. 또한, 
-# :math:`T` 를 우리의 태그 집합라고 하고, :math:`w_i` 의 단어 태그를 :math:`y_i` 라고 합니다. 
-# 단어 :math:`w_i` 에 대한 예측된 태그를 :math:`\hat{y}_i` 로 표시합니다. 
+# 모델은 다음과 같습니다. 단어가 :math:`w_i \in V`일 때, 
+# 입력 문장을 :math:`w_1, \dots, w_M`라고 합시다. 또한, 
+# :math:`T`를 우리의 태그 집합이라고 하고, :math:`w_i`의 단어 태그를 :math:`y_i`라고 합니다. 
+# 단어 :math:`w_i`에 대한 예측된 태그를 :math:`\hat{y}_i`로 표시합니다. 
 # 
 #
-# 이것은 :math:`\hat{y}_i \in T` 일 때, 출력이 :math:`\hat{y}_1, \dots, \hat{y}_M` 시퀀스인
+# 이것은 :math:`\hat{y}_i \in T`일 때, 출력이 :math:`\hat{y}_1, \dots, \hat{y}_M` 시퀀스인
 # 구조 예측 모델입니다. 
 #
 # 예측을 하기 위해, LSTM에 문장을 전달합니다. 한 시간 단계
-# :math:`i` 의 은닉 상태는 :math:`h_i` 로 표시합니다. 또한 각 태그에
+# :math:`i`의 은닉 상태는 :math:`h_i`로 표시합니다. 또한 각 태그에
 # 고유한 인덱스를 할당합니다 (단어 임베딩 섹션에서 word\_to\_ix 를 사용한 것과 유사합니다.)
-# 그러면 :math:`\hat{y}_i`  예측 규칙은 다음과 같습니다. 
+# 그러면 :math:`\hat{y}_i` 예측 규칙은 다음과 같습니다. 
 #
 # .. math::  \hat{y}_i = \text{argmax}_j \  (\log \text{Softmax}(Ah_i + b))_j
 #
 # 즉, 은닉 상태의 아핀 맵(affine map)에 대해 로그 소프트맥스(log softmax)를 취하고,
 # 예측된 태그는 이 벡터에서 가장 큰 값을 가지는 태그가 됩니다. 
-# 이것은 곧 :math:`A` 의 타깃 공간의 차원이 :math:`|T|` 라는 것을 
+# 이것은 곧 :math:`A`의 타깃 공간의 차원이 :math:`|T|`라는 것을 
 # 의미한다는 것을 알아두세요.
 #
 #
@@ -131,7 +131,7 @@ def prepare_sequence(seq, to_ix):
 
 training_data = [
     # 태그는 다음과 같습니다: DET - 한정사;NN - 명사;V - 동사
-    # 예를 들어, "The" 라는 단어는 한정사입니다.
+    # 예를 들어, "The"라는 단어는 한정사입니다.
     ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
     ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
 ]
@@ -233,17 +233,17 @@ def forward(self, sentence):
 # 단어의 문자에서 파생된 표현으로 단어 임베딩을 증가시켜보겠습니다. 
 # 접사(affixes)와 같은 문자 수준의 정보는 품사에 큰 영향을 미치기 때문에, 
 # 상당한 도움이 될 것으로 예상합니다. 
-# 예를 들어, 접사 *-ly* 가 있는 단어는
+# 예를 들어, 접사 *-ly*가 있는 단어는
 # 영어에서 거의 항상 부사로 태그가 지정됩니다.
 #
-# 이것을 하기 위해서, :math:`c_w` 를 단어 :math:`w` 의 C를 단어 w의 문자 수준 표현이라고 하고, 
-# 전과 같이 :math:`x_w` 를 단어임베딩이라고 합시다. 
-# 그렇다면 우리의 시퀀스 모델에 대한 입력은 :math:`x_w` 와
-# :math:`c_w` 의 연결이라고 할 수 있습니다. 만약에 :math:`x_w` 가 차원 5를 가지고, :math:`c_w`
+# 이것을 하기 위해서, :math:`c_w`를 단어 :math:`w`의 C를 단어 w의 문자 수준 표현이라고 하고, 
+# 전과 같이 :math:`x_w`를 단어임베딩이라고 합시다. 
+# 그렇다면 우리의 시퀀스 모델에 대한 입력은 :math:`x_w`와
+# :math:`c_w`의 연결이라고 할 수 있습니다. 만약에 :math:`x_w`가 차원 5를 가지고, :math:`c_w`
 # 차원 3을 가지면 LSTM은 차원 8의 입력을 받아들여야 합니다. 
 #
 # 문자 수준의 표현을 얻기 위해서, 단어의 문자에 대해서 LSTM을 수행하고
-# :math:`c_w` 를 LSTM의 최종 은닉 상태가 되도록 합니다. 
+# :math:`c_w`를 LSTM의 최종 은닉 상태가 되도록 합니다. 
 # 힌트:
 #
 # * 새 모델에는 두 개의 LSTM이 있을 것입니다. 

From 3868e64f1571d5e8fbab663eeffeb35fd21b10ee Mon Sep 17 00:00:00 2001
From: annakong23 <ssk07020@naver.com>
Date: Sun, 5 Oct 2025 16:56:35 +0900
Subject: [PATCH 2/5] Style: fix spacing issues based on code review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

beginner_source/nlp/sequence_models_tutorial.py 오탈자 수정
Fixes #1006
---
 .../nlp/sequence_models_tutorial.py           | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py
index ff0c1df28..7f61e5231 100644
--- a/beginner_source/nlp/sequence_models_tutorial.py
+++ b/beginner_source/nlp/sequence_models_tutorial.py
@@ -14,7 +14,7 @@
 순환 신경망은 일종의 상태를 유지하는 네트워크입니다. 
 예를 들면, 출력은 다음 입력의 일부로 사용될 수 있습니다. 
 정보는 네트워크가 시퀀스를 통과할 때 전파될 수 있습니다. 
-LSTM의 경우에, 시퀀스의 각 요소에 대응하는 *은닉 상태(hidden state)* :math:`h_t`가 존재하며,
+LSTM의 경우에, 시퀀스의 각 요소에 대응하는 *은닉 상태(hidden state)* :math:`h_t` 가 존재하며,
 이는 원칙적으로 시퀀스의 앞부분에 있는 임의 포인트의 정보를 포함할 수 있습니다. 
 우리는 은닉 상태를 이용하여 언어 모델에서의 단어,
 품사 태그 등 무수히 많은 것들을 예측할 수 있습니다. 
@@ -100,25 +100,25 @@
 # `여기 <https://tutorials.pytorch.kr/beginner/nlp/word_embeddings_tutorial.html>`__.
 # 에서 관련 내용을 읽을 수 있습니다.
 #
-# 모델은 다음과 같습니다. 단어가 :math:`w_i \in V`일 때, 
-# 입력 문장을 :math:`w_1, \dots, w_M`라고 합시다. 또한, 
-# :math:`T`를 우리의 태그 집합이라고 하고, :math:`w_i`의 단어 태그를 :math:`y_i`라고 합니다. 
-# 단어 :math:`w_i`에 대한 예측된 태그를 :math:`\hat{y}_i`로 표시합니다. 
+# 모델은 다음과 같습니다. 단어가 :math:`w_i \in V` 일 때, 
+# 입력 문장을 :math:`w_1, \dots, w_M` 라고 합시다. 또한, 
+# :math:`T` 를 우리의 태그 집합이라고 하고, :math:`w_i` 의 단어 태그를 :math:`y_i` 라고 합니다. 
+# 단어 :math:`w_i` 에 대한 예측된 태그를 :math:`\hat{y}_i` 로 표시합니다. 
 # 
 #
-# 이것은 :math:`\hat{y}_i \in T`일 때, 출력이 :math:`\hat{y}_1, \dots, \hat{y}_M` 시퀀스인
+# 이것은 :math:`\hat{y}_i \in T` 일 때, 출력이 :math:`\hat{y}_1, \dots, \hat{y}_M` 시퀀스인
 # 구조 예측 모델입니다. 
 #
 # 예측을 하기 위해, LSTM에 문장을 전달합니다. 한 시간 단계
-# :math:`i`의 은닉 상태는 :math:`h_i`로 표시합니다. 또한 각 태그에
+# :math:`i` 의 은닉 상태는 :math:`h_i` 로 표시합니다. 또한 각 태그에
 # 고유한 인덱스를 할당합니다 (단어 임베딩 섹션에서 word\_to\_ix 를 사용한 것과 유사합니다.)
-# 그러면 :math:`\hat{y}_i` 예측 규칙은 다음과 같습니다. 
+# 그러면 :math:`\hat{y}_i`  예측 규칙은 다음과 같습니다. 
 #
 # .. math::  \hat{y}_i = \text{argmax}_j \  (\log \text{Softmax}(Ah_i + b))_j
 #
 # 즉, 은닉 상태의 아핀 맵(affine map)에 대해 로그 소프트맥스(log softmax)를 취하고,
 # 예측된 태그는 이 벡터에서 가장 큰 값을 가지는 태그가 됩니다. 
-# 이것은 곧 :math:`A`의 타깃 공간의 차원이 :math:`|T|`라는 것을 
+# 이것은 곧 :math:`A` 의 타깃 공간의 차원이 :math:`|T|` 라는 것을 
 # 의미한다는 것을 알아두세요.
 #
 #
@@ -131,7 +131,7 @@ def prepare_sequence(seq, to_ix):
 
 training_data = [
     # 태그는 다음과 같습니다: DET - 한정사;NN - 명사;V - 동사
-    # 예를 들어, "The"라는 단어는 한정사입니다.
+    # 예를 들어, "The" 라는 단어는 한정사입니다.
     ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
     ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
 ]
@@ -233,17 +233,17 @@ def forward(self, sentence):
 # 단어의 문자에서 파생된 표현으로 단어 임베딩을 증가시켜보겠습니다. 
 # 접사(affixes)와 같은 문자 수준의 정보는 품사에 큰 영향을 미치기 때문에, 
 # 상당한 도움이 될 것으로 예상합니다. 
-# 예를 들어, 접사 *-ly*가 있는 단어는
+# 예를 들어, 접사 *-ly* 가 있는 단어는
 # 영어에서 거의 항상 부사로 태그가 지정됩니다.
 #
-# 이것을 하기 위해서, :math:`c_w`를 단어 :math:`w`의 C를 단어 w의 문자 수준 표현이라고 하고, 
-# 전과 같이 :math:`x_w`를 단어임베딩이라고 합시다. 
-# 그렇다면 우리의 시퀀스 모델에 대한 입력은 :math:`x_w`와
-# :math:`c_w`의 연결이라고 할 수 있습니다. 만약에 :math:`x_w`가 차원 5를 가지고, :math:`c_w`
+# 이것을 하기 위해서, :math:`c_w` 를 단어 :math:`w` 의 C를 단어 w의 문자 수준 표현이라고 하고, 
+# 전과 같이 :math:`x_w` 를 단어임베딩이라고 합시다. 
+# 그렇다면 우리의 시퀀스 모델에 대한 입력은 :math:`x_w` 와
+# :math:`c_w` 의 연결이라고 할 수 있습니다. 만약에 :math:`x_w` 가 차원 5를 가지고, :math:`c_w`
 # 차원 3을 가지면 LSTM은 차원 8의 입력을 받아들여야 합니다. 
 #
 # 문자 수준의 표현을 얻기 위해서, 단어의 문자에 대해서 LSTM을 수행하고
-# :math:`c_w`를 LSTM의 최종 은닉 상태가 되도록 합니다. 
+# :math:`c_w` 를 LSTM의 최종 은닉 상태가 되도록 합니다. 
 # 힌트:
 #
 # * 새 모델에는 두 개의 LSTM이 있을 것입니다. 

From 1df88071440bdb7b71c76f1b0f50340359c71487 Mon Sep 17 00:00:00 2001
From: annakong23 <ssk07020@naver.com>
Date: Sat, 11 Oct 2025 18:37:57 +0900
Subject: [PATCH 3/5] Translate advanced_source/sharding.rst to Korean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

advanced_source/sharding.rst 번역
Fixes #1024
---
 advanced_source/sharding.rst | 173 ++++++++++++++++-------------------
 1 file changed, 78 insertions(+), 95 deletions(-)

diff --git a/advanced_source/sharding.rst b/advanced_source/sharding.rst
index 7dfeeb88b..c62b4f21c 100644
--- a/advanced_source/sharding.rst
+++ b/advanced_source/sharding.rst
@@ -1,22 +1,21 @@
-Exploring TorchRec sharding
+TorchRec 샤딩 방식 살펴보기
 ===========================
 
-This tutorial will mainly cover the sharding schemes of embedding tables
-via ``EmbeddingPlanner`` and ``DistributedModelParallel`` API and
-explore the benefits of different sharding schemes for the embedding
-tables by explicitly configuring them.
+이 튜토리얼에서는 ``EmbeddingPlanner``와 ``DistributedModelParallel`` API를 통해 
+임베딩 테이블의 샤딩(Sharding) 방식을 다루며, 각기 다른 샤딩 구성을 명시적으로 설정해 봄으로써 
+샤딩 방식에 따른 성능상의 이점을 탐구합니다.
 
-Installation
+설치
 ------------
 
-Requirements: - python >= 3.7
+필수 요구사항: - python >= 3.7
 
-We highly recommend CUDA when using torchRec. If using CUDA: - cuda >=
-11.0
+TorchRec을 사용할 때는 CUDA 환경을 사용하는 것을 강력히 권장합니다. 
+CUDA를 사용할 경우: - cuda >= 11.0
 
 .. code:: python
 
-    # install conda to make installying pytorch with cudatoolkit 11.3 easier. 
+    # conda를 설치하면 cudatoolkit 11.3과 함께 pytorch를 쉽게 설치할 수 있습니다.
     !sudo rm Miniconda3-py37_4.9.2-Linux-x86_64.sh Miniconda3-py37_4.9.2-Linux-x86_64.sh.*
     !sudo wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh
     !sudo chmod +x Miniconda3-py37_4.9.2-Linux-x86_64.sh
@@ -24,38 +23,36 @@ We highly recommend CUDA when using torchRec. If using CUDA: - cuda >=
 
 .. code:: python
 
-    # install pytorch with cudatoolkit 11.3
+    # PyTorch 설치 (cudatoolkit 11.3 포함)
     !sudo conda install pytorch cudatoolkit=11.3 -c pytorch-nightly -y
 
-Installing torchRec will also install
-`FBGEMM <https://github.com/pytorch/fbgemm>`__, a collection of CUDA
-kernels and GPU enabled operations to run
+torchRec을 설치하면 자동으로 `FBGEMM <https://github.com/pytorch/fbgemm>`__ 도 함께 설치됩니다.
+FBGEMM은 CUDA 커널과 GPU 연산이 포함된 연산 라이브러리 모음입니다.
 
 .. code:: python
 
-    # install torchrec
+    # torchrec 설치
     !pip3 install torchrec-nightly
 
-Install multiprocess which works with ipython to for multi-processing
-programming within colab
+Colab 환경에서 다중 프로세싱을 사용하기 위해 multiprocess 패키지를 설치해야 합니다.
+이 패키지는 IPython 환경에서 멀티프로세싱 프로그래밍이 가능하게 합니다.
 
 .. code:: python
 
     !pip3 install multiprocess
 
-The following steps are needed for the Colab runtime to detect the added
-shared libraries. The runtime searches for shared libraries in /usr/lib,
-so we copy over the libraries which were installed in /usr/local/lib/.
-**This is a very necessary step, only in the colab runtime**.
+Colab 런타임 환경 설정:
+Colab에서는 런타임이 /usr/lib 폴더에서 공유 라이브러리를 탐색하기 때문에, 
+/usr/local/lib/ 에 설치된 라이브러리를 복사해야 합니다.
+**이 과정은 Colab 런타임에서만 필요한, 꼭 거쳐야 하는 단계입니다.**
 
 .. code:: python
 
     !sudo cp /usr/local/lib/lib* /usr/lib/
 
-**Restart your runtime at this point for the newly installed packages
-to be seen.** Run the step below immediately after restarting so that
-python knows where to look for packages. **Always run this step after
-restarting the runtime.**
+**이 시점에서 새로 설치된 패키지를 인식하도록 런타임을 재시작하세요.**
+재시작 직후 아래 단계를 실행하여 Python이 패키지의 위치를 알 수 있도록 합니다.
+**런타임을 재시작한 후 항상 이 단계를 실행해야 합니다.**
 
 .. code:: python
 
@@ -63,16 +60,13 @@ restarting the runtime.**
     sys.path = ['', '/env/python', '/usr/local/lib/python37.zip', '/usr/local/lib/python3.7', '/usr/local/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/site-packages', './.local/lib/python3.7/site-packages']
 
 
-Distributed Setup
+분산 설정 (Distributed Setup)
------------------
+-----------------------------
 
-Due to the notebook enviroment, we cannot run
-`SPMD <https://en.wikipedia.org/wiki/SPMD>`_ program here but we
-can do multiprocessing inside the notebook to mimic the setup. Users
-should be responsible for setting up their own
-`SPMD <https://en.wikipedia.org/wiki/SPMD>`_ launcher when using
-Torchrec. We setup our environment so that torch distributed based
-communication backend can work.
+노트북 환경에서는 `SPMD <https://en.wikipedia.org/wiki/SPMD>`_ 프로그램을 직접 실행할 
+수 없기 때문에, 여기서는 멀티프로세싱을 활용하여 이를 유사하게 구현합니다. TorchRec을 사용할 
+때는 사용자가 직접 `SPMD <https://en.wikipedia.org/wiki/SPMD>`_ 실행 환경을 설정해야 합니다. 이 예시에서는 PyTorch의 
+분산 통신(Distributed Communication) 백엔드가 정상적으로 동작할 수 있도록 환경 설정을 구성합니다.
 
 .. code:: python
 
@@ -83,32 +77,30 @@ communication backend can work.
     os.environ["MASTER_ADDR"] = "localhost"
     os.environ["MASTER_PORT"] = "29500"
 
-Constructing our embedding model
+임베딩 모델 구성 (Constructing our embedding model)
 --------------------------------
 
-Here we use TorchRec offering of
-`EmbeddingBagCollection <https://github.com/facebookresearch/torchrec/blob/main/torchrec/modules/embedding_modules.py#L59>`_
-to construct our embedding bag model with embedding tables.
-
-Here, we create an EmbeddingBagCollection (EBC) with four embedding
-bags. We have two types of tables: large tables and small tables
-differentiated by their row size difference: 4096 vs 1024. Each table is
-still represented by 64 dimension embedding.
-
-We configure the ``ParameterConstraints`` data structure for the tables,
-which provides hints for the model parallel API to help decide the
-sharding and placement strategy for the tables. In TorchRec, we support
-\* ``table-wise``: place the entire table on one device; \*
-``row-wise``: shard the table evenly by row dimension and place one
-shard on each device of the communication world; \* ``column-wise``:
-shard the table evenly by embedding dimension, and place one shard on
-each device of the communication world; \* ``table-row-wise``: special
-sharding optimized for intra-host communication for available fast
-intra-machine device interconnect, e.g. NVLink; \* ``data_parallel``:
-replicate the tables for every device;
-
-Note how we initially allocate the EBC on device "meta". This will tell
-EBC to not allocate memory yet.
+여기에서는
+`EmbeddingBagCollection <https://github.com/facebookresearch/torchrec/blob/main/torchrec/modules/embedding_modules.py#L59>`_을
+사용하여, 여러 개의 임베딩 테이블로 구성된 임베딩 백(embedding bag) 모델을 구축합니다.
+
+이번 예시에서는 4개의 임베딩 백(embedding bag) 으로 구성된 EmbeddingBagCollection (EBC)를 생성합니다.
+테이블은 두 가지 크기로 구분됩니다:
+큰 테이블과 작은 테이블로, 각각 행 크기 4096과 1024로 구분됩니다.
+모든 테이블의 임베딩 차원은 동일하게 64차원으로 설정합니다.
+
+또한, 각 테이블에 대해 ``ParameterConstraints`` 데이터 구조를 설정합니다.
+이 구조는 모델 병렬화 API가 테이블의 샤딩 및 배치 전략을 결정하는 데 도움이 되는 힌트를 제공합니다.
+TorchRec에서는 다음과 같은 샤딩 방식을 지원합니다:
+\* ``table-wise``: 전체 테이블을 하나의 디바이스에 배치; \*
+``row-wise``: 테이블을 행 단위로 균등 분할하여 통신 그룹의 각 디바이스에 하나씩 배치; \* 
+``column-wise``:
+임베딩 차원을 기준으로 균등 분할하여 각 디바이스에 하나씩 배치; \* 
+``table-row-wise``: NVLink와 같은 빠른 디바이스 간 연결을 활용해, 호스트 내부 통신에 최적화된 특수 샤딩 방식; \* 
+``data_parallel``: 모든 디바이스에 테이블 전체를 복제;
+
+EBC를 처음 생성할 때 “meta” 디바이스에 할당하는 점에 주의하세요. 
+이는 아직 실제 메모리를 할당하지 않고, 이후에 필요한 시점에 할당하도록 지시하는 설정입니다.
 
 .. code:: python
 
@@ -159,17 +151,15 @@ EBC to not allocate memory yet.
         tables=large_tables + small_tables
     )
 
-DistributedModelParallel in multiprocessing
+멀티프로세싱에서의 DistributedModelParallel
 -------------------------------------------
 
-Now, we have a single process execution function for mimicking one
-rank's work during `SPMD <https://en.wikipedia.org/wiki/SPMD>`_
-execution.
+이제, `SPMD <https://en.wikipedia.org/wiki/SPMD>`_ 실행 중에 각 프로세스(rank)가 수행하는 작업을 
+모방하기 위한 단일 프로세스 실행 함수를 정의합니다.
 
-This code will shard the model collectively with other processes and
-allocate memories accordingly. It first sets up process groups and do
-embedding table placement using planner and generate sharded model using
-``DistributedModelParallel``.
+이 코드에서는 다른 프로세스들과 함께 모델을 공동으로 샤딩하고, 그에 따라 메모리를 적절히 할당합니다.
+먼저 프로세스 그룹을 설정한 뒤, 플래너를 사용해 임베딩 테이블의 배치를 수행하고,
+그 결과를 바탕으로 ``DistributedModelParallel``을 통해 샤딩된 모델을 생성합니다.
 
 .. code:: python
 
@@ -193,7 +183,7 @@ embedding table placement using planner and generate sharded model using
             rank: int,
             world_size: int,
             backend: str,
-            # pyre-fixme[11]: Annotation `ProcessGroup` is not defined as a type.
+            # pyre-fixme[11]: `ProcessGroup`이 타입(type)으로 정의되어 있지 않습니다.
         ) -> dist.ProcessGroup:
             os.environ["RANK"] = f"{rank}"
             os.environ["WORLD_SIZE"] = f"{world_size}"
@@ -225,11 +215,10 @@ embedding table placement using planner and generate sharded model using
         return sharded_model
 
 
-Multiprocessing Execution
+멀티프로세싱 실행 (Multiprocessing Execution)
-~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Now let's execute the code in multi-processes representing multiple GPU
-ranks.
+이제 여러 개의 GPU rank를 나타내는 다중 프로세스 환경에서 코드를 실행해 보겠습니다.
 
 .. code:: python
 
@@ -259,15 +248,14 @@ ranks.
           p.join()
           assert 0 == p.exitcode
 
-Table Wise Sharding
+테이블 단위 샤딩 (Table-Wise Sharding)
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Now let's execute the code in two processes for 2 GPUs. We can see in
-the plan print that how our tables are sharded across GPUs. Each node
-will have one large table and one small which shows our planner tries
-for load balance for the embedding tables. Table-wise is the de-factor
-go-to sharding schemes for many small-medium size tables for load
-balancing over the devices.
+이제 두 개의 GPU를 사용하여 2개의 프로세스로 코드를 실행해 보겠습니다. 출력된 plan을 보면, 
+각 테이블이 GPU 간에 어떻게 샤딩되었는지를 확인할 수 있습니다. 각 노드는 큰 테이블 하나와 
+작은 테이블 하나씩을 가지며, 이는 플래너가 임베딩 테이블의 로드 밸런싱을 고려하여 분배했음을 
+보여줍니다. Table-wise 샤딩은 여러 개의 소형~중형 규모 테이블을 디바이스 간에 균형 있게 
+분산시키기 위한 가장 일반적이고 기본적인 샤딩 방식입니다.
 
 .. code:: python
 
@@ -279,17 +267,15 @@ balancing over the devices.
     rank:1,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[0], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 64], placement=rank:0/cuda:0)])), 'large_table_1': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 64], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[0], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 64], placement=rank:0/cuda:0)])), 'small_table_1': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 64], placement=rank:1/cuda:1)]))}}
     rank:0,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[0], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 64], placement=rank:0/cuda:0)])), 'large_table_1': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 64], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[0], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 64], placement=rank:0/cuda:0)])), 'small_table_1': ParameterSharding(sharding_type='table_wise', compute_kernel='batched_fused', ranks=[1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 64], placement=rank:1/cuda:1)]))}}
 
-Explore other sharding modes
+다른 샤딩 방식 살펴보기 (Explore other sharding modes)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-We have initially explored what table-wise sharding would look like and
-how it balances the tables placement. Now we explore sharding modes with
-finer focus on load balance: row-wise. Row-wise is specifically
-addressing large tables which a single device cannot hold due to the
-memory size increase from large embedding row numbers. It can address
-the placement of the super large tables in your models. Users can see
-that in the ``shard_sizes`` section in the printed plan log, the tables
-are halved by row dimension to be distributed onto two GPUs.
+앞서 table-wise 샤딩이 어떻게 작동하고 테이블 배치를 균형 있게 수행하는지를 살펴보았습니다. 이제는 
+로드 밸런싱(load balance)에 더 초점을 맞춘 다른 샤딩 방식, 즉 row-wise 샤딩을 살펴보겠습니다.
+Row-wise 샤딩은 특히 임베딩 행의 수가 매우 많아 단일 디바이스 메모리에 전체 테이블을 담을 수 없는 
+큰 테이블을 처리하기 위한 방식입니다. 이 방법은 모델 내의 초대형 테이블을 효율적으로 분산 배치할 수 
+있게 해줍니다. 출력된 플랜 로그의 ``shard_sizes`` 섹션을 보면, 테이블이 행 단위로 절반씩 나뉘어 
+두 개의 GPU에 분산된 것을 확인할 수 있습니다.
 
 .. code:: python
 
@@ -301,11 +287,9 @@ are halved by row dimension to be distributed onto two GPUs.
     rank:1,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2048, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[2048, 0], shard_sizes=[2048, 64], placement=rank:1/cuda:1)])), 'large_table_1': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2048, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[2048, 0], shard_sizes=[2048, 64], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[512, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[512, 0], shard_sizes=[512, 64], placement=rank:1/cuda:1)])), 'small_table_1': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[512, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[512, 0], shard_sizes=[512, 64], placement=rank:1/cuda:1)]))}}
     rank:0,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2048, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[2048, 0], shard_sizes=[2048, 64], placement=rank:1/cuda:1)])), 'large_table_1': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[2048, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[2048, 0], shard_sizes=[2048, 64], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[512, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[512, 0], shard_sizes=[512, 64], placement=rank:1/cuda:1)])), 'small_table_1': ParameterSharding(sharding_type='row_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[512, 64], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[512, 0], shard_sizes=[512, 64], placement=rank:1/cuda:1)]))}}
 
-Column-wise on the other hand, address the load imbalance problems for
-tables with large embedding dimensions. We will split the table
-vertically. Users can see that in the ``shard_sizes`` section in the
-printed plan log, the tables are halved by embedding dimension to be
-distributed onto two GPUs.
+반면, column-wise 샤딩은 임베딩 차원이 큰 테이블에서 발생하는 로드 불균형 문제를 해결하기 위한 방식입니다.
+이 경우 테이블을 세로 방향(임베딩 차원 기준)으로 분할합니다. 출력된 플랜 로그의 ``shard_sizes`` 섹션을 
+보면, 테이블이 임베딩 차원 기준으로 절반씩 나뉘어 두 개의 GPU에 분산된 것을 확인할 수 있습니다.
 
 .. code:: python
 
@@ -317,12 +301,11 @@ distributed onto two GPUs.
     rank:0,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[4096, 32], placement=rank:1/cuda:1)])), 'large_table_1': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[4096, 32], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[1024, 32], placement=rank:1/cuda:1)])), 'small_table_1': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[1024, 32], placement=rank:1/cuda:1)]))}}
     rank:1,sharding plan: {'': {'large_table_0': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[4096, 32], placement=rank:1/cuda:1)])), 'large_table_1': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[4096, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[4096, 32], placement=rank:1/cuda:1)])), 'small_table_0': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[1024, 32], placement=rank:1/cuda:1)])), 'small_table_1': ParameterSharding(sharding_type='column_wise', compute_kernel='batched_fused', ranks=[0, 1], sharding_spec=EnumerableShardingSpec(shards=[ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1024, 32], placement=rank:0/cuda:0), ShardMetadata(shard_offsets=[0, 32], shard_sizes=[1024, 32], placement=rank:1/cuda:1)]))}}
 
-For ``table-row-wise``, unfortuately we cannot simulate it due to its
-nature of operating under multi-host setup. We will present a python
-`SPMD <https://en.wikipedia.org/wiki/SPMD>`_ example in the future
-to train models with ``table-row-wise``.
+``table-row-wise`` 방식은 멀티 호스트(multi-host) 환경에서 동작하도록 설계되어 있기 때문에,
+현재는 이를 시뮬레이션할 수 없습니다. 앞으로는 Python 기반의 `SPMD <https://en.wikipedia.org/wiki/SPMD>`_ 예제를 통해
+``table-row-wise`` 방식을 사용하여 모델을 학습하는 방법을 소개할 예정입니다.
 
-With data parallel, we will repeat the tables for all devices.
+data-parallel 방식에서는 모든 디바이스에 동일한 테이블을 복제하여 사용합니다.
 
 .. code:: python
 

From c357a5d1c696bde17719b0f4e9cdaee9b4cc9f64 Mon Sep 17 00:00:00 2001
From: annakong23 <ssk07020@naver.com>
Date: Sat, 11 Oct 2025 18:44:48 +0900
Subject: [PATCH 4/5] Update: adjust spacing

---
 advanced_source/sharding.rst | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/advanced_source/sharding.rst b/advanced_source/sharding.rst
index c62b4f21c..68fb8b739 100644
--- a/advanced_source/sharding.rst
+++ b/advanced_source/sharding.rst
@@ -1,7 +1,7 @@
 TorchRec 샤딩 방식 살펴보기
 ===========================
 
-이 튜토리얼에서는 ``EmbeddingPlanner``와 ``DistributedModelParallel`` API를 통해 
+이 튜토리얼에서는 ``EmbeddingPlanner`` 와 ``DistributedModelParallel`` API를 통해 
 임베딩 테이블의 샤딩(Sharding) 방식을 다루며, 각기 다른 샤딩 구성을 명시적으로 설정해 봄으로써 
 샤딩 방식에 따른 성능상의 이점을 탐구합니다.
 
@@ -10,7 +10,7 @@ TorchRec 샤딩 방식 살펴보기
 
 필수 요구사항: - python >= 3.7
 
-TorchRec을 사용할 때는 CUDA 환경을 사용하는 것을 강력히 권장합니다. 
+TorchRec을 사용할 때는 CUDA 환경을 사용하는 것을 강력히 권장합니다.
 CUDA를 사용할 경우: - cuda >= 11.0
 
 .. code:: python
@@ -80,9 +80,7 @@ Colab에서는 런타임이 /usr/lib 폴더에서 공유 라이브러리를 탐
 임베딩 모델 구성 (Constructing our embedding model)
---------------------------------
+---------------------------------------------------
 
-여기에서는
-`EmbeddingBagCollection <https://github.com/facebookresearch/torchrec/blob/main/torchrec/modules/embedding_modules.py#L59>`_을
-사용하여, 여러 개의 임베딩 테이블로 구성된 임베딩 백(embedding bag) 모델을 구축합니다.
+여기에서는 `EmbeddingBagCollection <https://github.com/facebookresearch/torchrec/blob/main/torchrec/modules/embedding_modules.py#L59>`_ 을 사용하여, 여러 개의 임베딩 테이블로 구성된 임베딩 백(embedding bag) 모델을 구축합니다.
 
 이번 예시에서는 4개의 임베딩 백(embedding bag) 으로 구성된 EmbeddingBagCollection (EBC)를 생성합니다.
 테이블은 두 가지 크기로 구분됩니다:
@@ -159,7 +157,7 @@ EBC를 처음 생성할 때 “meta” 디바이스에 할당하는 점에 주
 
 이 코드에서는 다른 프로세스들과 함께 모델을 공동으로 샤딩하고, 그에 따라 메모리를 적절히 할당합니다.
 먼저 프로세스 그룹을 설정한 뒤, 플래너를 사용해 임베딩 테이블의 배치를 수행하고,
-그 결과를 바탕으로 ``DistributedModelParallel``을 통해 샤딩된 모델을 생성합니다.
+그 결과를 바탕으로 ``DistributedModelParallel`` 을 통해 샤딩된 모델을 생성합니다.
 
 .. code:: python
 

From 5021b060790c891e1567e8cdaa65e47d6c37d6eb Mon Sep 17 00:00:00 2001
From: annakong23 <ssk07020@naver.com>
Date: Mon, 13 Oct 2025 00:38:39 +0900
Subject: [PATCH 5/5] Revert typo fix to match original branch state

---
 beginner_source/nlp/sequence_models_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/nlp/sequence_models_tutorial.py b/beginner_source/nlp/sequence_models_tutorial.py
index 7f61e5231..6cea3987f 100644
--- a/beginner_source/nlp/sequence_models_tutorial.py
+++ b/beginner_source/nlp/sequence_models_tutorial.py
@@ -102,7 +102,7 @@
 #
 # 모델은 다음과 같습니다. 단어가 :math:`w_i \in V` 일 때, 
 # 입력 문장을 :math:`w_1, \dots, w_M` 라고 합시다. 또한, 
-# :math:`T` 를 우리의 태그 집합이라고 하고, :math:`w_i` 의 단어 태그를 :math:`y_i` 라고 합니다. 
+# :math:`T` 를 우리의 태그 집합라고 하고, :math:`w_i` 의 단어 태그를 :math:`y_i` 라고 합니다. 
 # 단어 :math:`w_i` 에 대한 예측된 태그를 :math:`\hat{y}_i` 로 표시합니다. 
 # 
 #