From 73581d13904577d93190dbb4fe940b13a1280eb4 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Tue, 28 May 2024 16:44:51 -0700
Subject: [PATCH 1/8] add api doc

---
 Api/CausalLMPipeline.md    | 89 ++++++++++++++++++++++++++++++++++++++
 Api/CausalLanguageModel.md | 75 ++++++++++++++++++++++++++++++++
 Api/README.md              |  1 +
 Api/Tokenizer.md           |  6 +++
 Api/Usage.md               | 15 +++++++
 5 files changed, 186 insertions(+)
 create mode 100644 Api/CausalLMPipeline.md
 create mode 100644 Api/CausalLanguageModel.md
 create mode 100644 Api/README.md
 create mode 100644 Api/Tokenizer.md
 create mode 100644 Api/Usage.md

diff --git a/Api/CausalLMPipeline.md b/Api/CausalLMPipeline.md
new file mode 100644
index 0000000..1101b6e
--- /dev/null
+++ b/Api/CausalLMPipeline.md
+# What is a causal language model pipeline?
+
+The causal language model pipeline is a utility class which wraps a tokenizer and a causal language model and provides a uniform interface for the various decoding methods used to generate text. The pipeline is designed to be easy to use and requires only a few lines of code to generate text.
+
+# Usage
+```C#
+ITokenizer tokenizer;
+nn.Module model;
+
+var pipeline = new CausalLMPipeline(tokenizer, model);
+var prompt = "Once upon a time";
+// top-p sampling
+var output = pipeline.Generate(
+    prompt: prompt,
+    maxLen: 100,
+    temperature: 0.7f,
+    topP: 0.9f,
+    stopSequences: null,
+    device: "cuda",
+    bos: true, // add bos token to the prompt
+    eos: false, // do not add eos token to the prompt
+    echo: true // echo the prompt in the generated text
+);
+```
+
+# The API in CasualLMPipeline
+## Sample (Top P sample)
+```C#
+public (
+    Tensor, // output token ids [batch_size, sequence_length]
+    Tensor  // output logits [batch_size, sequence_length, vocab_size]
+) Generate(
+    Tensor inputIds,      // input token ids [batch_size, sequence_length]
+    Tensor attentionMask, // attention mask [batch_size, sequence_length]
+    float temperature = 0.7f,
+    float topP = 0.9f,
+    int maxLen = 128,
+    int[][]? stopTokenSequence = null,
+    bool echo = false); // echo the input token ids in the output token ids
+```
+
+>[!NOTE]
+> Greedy search and beam search are not implemented in the pipeline yet. They will be added in the future.
+
+## Greedy Search
+```C#
+public (
+    Tensor, // output token ids [batch_size, sequence_length]
+    Tensor  // output logits [batch_size, sequence_length, vocab_size]
+) GreedySearch(
+    Tensor inputIds,      // input token ids [batch_size, sequence_length]
+    Tensor attentionMask, // attention mask [batch_size, sequence_length]
+    int maxLen = 128,
+    int[][]? stopTokenSequence = null,
+    bool echo = false); // echo the input token ids in the output token ids
+```
+
+## Beam Search
+```C#
+public (
+    Tensor, // output token ids [batch_size, sequence_length]
+    Tensor  // output logits [batch_size, sequence_length, vocab_size]
+) BeamSearch(
+    Tensor inputIds,      // input token ids [batch_size, sequence_length]
+    Tensor attentionMask, // attention mask [batch_size, sequence_length]
+    int maxLen = 128,
+    int[][]? stopTokenSequence = null,
+    int beamSize = 5,
+    float lengthPenalty = 1.0f,
+    bool echo = false); // echo the input token ids in the output token ids
+```
+
+## The extension method for `CausalLMPipeline`
+
+The extension `Generate` method provides an even easier way to generate text, without the need to build the input tensors yourself. The method takes a prompt string and other optional parameters and returns the generated text.
+
+```C#
+public static string Generate(
+    this CasualLMPipeline pipeline,
+    string prompt,
+    int maxLen = 128,
+    float temperature = 0.7f,
+    float topP = 0.9f,
+    string[]? stopSequences = null,
+    string device = "cpu",
+    bool bos = true,
+    bool eos = false,
+    bool echo = false)
+```
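A minimal sketch of how this extension method could be layered on top of the tensor-based `Generate` is shown below. It assumes the pipeline exposes its tokenizer and that the tokenizer offers `Encode`/`Decode` members plus bos/eos token ids; those member names are illustrative assumptions, not part of the contract above.

```C#
// Sketch only: a possible implementation of the string-based extension method.
// `pipeline.Tokenizer`, `Encode`, `Decode`, `BosId` and `EosId` are assumed members.
public static string Generate(
    this CasualLMPipeline pipeline,
    string prompt,
    int maxLen = 128,
    float temperature = 0.7f,
    float topP = 0.9f,
    string[]? stopSequences = null,
    string device = "cpu",
    bool bos = true,
    bool eos = false,
    bool echo = false)
{
    // string -> token ids, optionally wrapped with bos/eos
    var ids = pipeline.Tokenizer.Encode(prompt).ToList();
    if (bos) ids.Insert(0, pipeline.Tokenizer.BosId);
    if (eos) ids.Add(pipeline.Tokenizer.EosId);

    // token ids -> [1, sequence_length] tensors on the requested device
    var inputIds = torch.tensor(ids.Select(i => (long)i).ToArray()).unsqueeze(0).to(torch.device(device));
    var attentionMask = torch.ones_like(inputIds);

    // stop strings -> stop token sequences
    var stopTokens = stopSequences?.Select(s => pipeline.Tokenizer.Encode(s).ToArray()).ToArray();

    // delegate to the tensor-based Generate and decode the generated ids back to text
    var (outputIds, _) = pipeline.Generate(inputIds, attentionMask, temperature, topP, maxLen, stopTokens, echo);
    var tokens = outputIds[0].data<long>().ToArray().Select(i => (int)i).ToList();
    return pipeline.Tokenizer.Decode(tokens);
}
```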
diff --git a/Api/CausalLanguageModel.md b/Api/CausalLanguageModel.md
new file mode 100644
index 0000000..02af51a
--- /dev/null
+++ b/Api/CausalLanguageModel.md
+# What is a Causal Language Model?
+
+A causal language model is a type of language model that predicts the next token in a sequence of tokens. The model generates text one token at a time, with each token conditioned on the tokens that came before it. This type of model is useful for generating text, such as in chatbots, machine translation, and text summarization. [see more](https://huggingface.co/docs/transformers/tasks/language_modeling)
+
+
+# The Causal Language Model Contract
+In the remaining sections, we will describe the contract for a causal language model.
+
+## `CasualLMModelInput`
+```C#
+public class CasualLMModelInput
+{
+    // [batch_size, sequence_length]
+    public Tensor input_ids { get; set; }
+
+    // optional: [batch_size, sequence_length]
+    public Tensor? attention_mask { get; set; }
+
+    // optional: [batch_size, sequence_length]
+    public Tensor? position_ids { get; set; }
+
+    // optional: kv cache for attention layers
+    public IKVCache? kv_cache { get; set; }
+
+    // optional: [batch_size, sequence_length, hidden_size]
+    // if provided, the model will use these embeddings instead of computing them from input_ids
+    public Tensor? inputs_embeds { get; set; }
+
+    // whether to use the kv cache when calculating attention
+    public bool use_cache { get; set; }
+
+    // whether to return attentions in the model output
+    public bool output_attentions { get; set; }
+
+    // whether to return hidden states in the model output,
+    // e.g. for calculating loss
+    public bool output_hidden_states { get; set; }
+}
+```
+
+## `CasualLMModelOutput`
+```C#
+public class CasualLMModelOutput
+{
+    // [batch_size, sequence_length, vocab_size]
+    // The predicted logits for each token in the input sequence.
+    public Tensor logits { get; set; }
+
+    // optional: [batch_size, sequence_length, hidden_size]
+    public Tensor last_hidden_state { get; set; }
+
+    // optional: all hidden states
+    public Tensor[]? hidden_states { get; set; }
+
+    // optional: all attentions
+    public Tensor[]? attentions { get; set; }
+
+    // optional: kv cache for attention layers
+    public IKVCache? cache { get; set; }
+}
+```
+
+Once both `CasualLMModelInput` and `CasualLMModelOutput` are defined, the causal language model can be implemented as follows (use Phi-3 as an example):
+
+```C#
+public class Phi3ForCasualLM : nn.Module
+```
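To make the shape of such an implementation concrete, here is a minimal sketch of what the class body could look like. The `Phi3Model` backbone, `Phi3Config` and the member names are illustrative assumptions, not the actual implementation.

```C#
// Sketch only: a minimal skeleton of a model implementing the contract.
// Phi3Model / Phi3Config and the member names are illustrative assumptions.
public class Phi3ForCasualLM : nn.Module<CasualLMModelInput, CasualLMModelOutput>
{
    private readonly Phi3Model model;   // transformer backbone (assumed type)
    private readonly Linear lm_head;    // projects hidden states to vocab logits

    public Phi3ForCasualLM(Phi3Config config)
        : base(nameof(Phi3ForCasualLM))
    {
        this.model = new Phi3Model(config);
        this.lm_head = nn.Linear(config.HiddenSize, config.VocabSize, hasBias: false);
        this.RegisterComponents();
    }

    public override CasualLMModelOutput forward(CasualLMModelInput input)
    {
        // run the backbone to get the last hidden state, then project to per-token logits
        var hiddenState = this.model.forward(input);
        var logits = this.lm_head.forward(hiddenState);
        return new CasualLMModelOutput
        {
            logits = logits,
            last_hidden_state = hiddenState,
        };
    }
}
```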
+
+
+# What language model has been implemented using this contract in this repo?
+- `Phi3ForCasualLM`
+- `Phi2ForCasualLM`
+
+# What language model has been implemented using this pattern, but not exactly the same contract class in the other repo?
+- `LLaMAForCasualLM` (for both llama2 and llama3)
diff --git a/Api/README.md b/Api/README.md
new file mode 100644
index 0000000..c630bbc
--- /dev/null
+++ b/Api/README.md
+This folder contains the design doc for GenAI Model API
diff --git a/Api/Tokenizer.md b/Api/Tokenizer.md
new file mode 100644
index 0000000..cd0a3e5
--- /dev/null
+++ b/Api/Tokenizer.md
+# What is a tokenizer?
+
+A tokenizer is a class that splits a string into tokens and encodes them into numerical (int) values.
+
+# The Tokenizer Contract
+We can simply use the tokenizer contract from ml.net
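As a point of reference, below is a minimal sketch of the tokenizer surface the pipeline relies on. The real contract is expected to come from the ML.NET tokenizer library, so the member names here are illustrative only.

```C#
// Sketch only: the minimal tokenizer surface the pipeline needs.
// Member names are illustrative; the actual contract should come from ml.net.
public interface ITokenizer
{
    // text -> token ids
    IReadOnlyList<int> Encode(string input);

    // token ids -> text
    string Decode(IEnumerable<int> tokenIds);

    // special token ids used when wrapping a prompt
    int BosId { get; }
    int EosId { get; }
}
```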
diff --git a/Api/Usage.md b/Api/Usage.md
new file mode 100644
index 0000000..769e5c4
--- /dev/null
+++ b/Api/Usage.md
+This document shows how to use the causal language model API for text generation.
+
+## Usage
+```C#
+var pathToPhi3 = "path/to/phi3";
+var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
+var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
+
+var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
+
+var prompt = "Once upon a time";
+var output = pipeline.Generate(
+    prompt: prompt,
+    maxLen: 100);
+```

From ff04b4ec1cde69655491afa5d9522099bbf0176c Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Wed, 29 May 2024 11:17:20 -0700
Subject: [PATCH 2/8] update

---
 Api/README.md |  9 ++++++++-
 Api/Usage.md  | 21 +++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/Api/README.md b/Api/README.md
index c630bbc..eddbc1e 100644
--- a/Api/README.md
+++ b/Api/README.md
-This folder contains the design doc for GenAI Model API
+This folder contains the design doc for GenAI Model package
+
+- [Usage](./Usage.md)
+
+## Contracts && API
+- [Causal Language Model](./CausalLanguageModel.md)
+- [Tokenizer](./TokenClassification.md)
+- [Causal Language Model Pipeline](./CausalLMPipeline.md)
diff --git a/Api/Usage.md b/Api/Usage.md
index 769e5c4..2e56d64 100644
--- a/Api/Usage.md
+++ b/Api/Usage.md
 This document shows how to use the causal language model API for text generation.
 
-## Usage
+### Use CausalLMPipeline to generate text
+
+`CausalLMPipeline` provides the most vanilla way to generate text from a language model, which means the prompt will be fed into the model as is, without applying any chat template.
+
 ```C#
 var pathToPhi3 = "path/to/phi3";
 var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
 var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
 
 var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
 
-var prompt = "Once upon a time";
+var prompt = "<|user|>Once upon a time<|end|>";
 var output = pipeline.Generate(
     prompt: prompt,
     maxLen: 100);
 ```
+
+### Consume model from semantic kernel
+In most cases, developers would like to consume the model in a uniform way. In this case, we can provide an extension method to semantic kernel which adds CausalLMPipeline as `ChatCompletionService`.
+
+```C#
+var pathToPhi3 = "path/to/phi3";
+var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
+var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
+var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
+var kernel = Kernel.CreateBuilder()
+    .AddCausalLMPipelineAsChatCompletionService(pipeline)
+    .Build();
+```

From 08edc420d461ae88730d09eafc8c23fb94d89a9a Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Wed, 29 May 2024 13:17:17 -0700
Subject: [PATCH 3/8] update

---
 Api/Benchmark && Evaluation.md | 14 ++++++++
 Api/CausalLMPipeline.md        | 60 +++++++++++++++++++++++++---------
 Api/README.md                  |  3 +-
 Api/Usage.md                   | 41 ++++++++++++++++++++++-
 4 files changed, 100 insertions(+), 18 deletions(-)
 create mode 100644 Api/Benchmark && Evaluation.md

diff --git a/Api/Benchmark && Evaluation.md b/Api/Benchmark && Evaluation.md
new file mode 100644
index 0000000..edf69ea
--- /dev/null
+++ b/Api/Benchmark && Evaluation.md
+It's critical to evaluate the performance of the GenAI model once it's available. The evaluation && benchmark will be two-fold:
+- evaluation on various eval datasets: this is to make sure our implementation is correct and the model works as expected compared to the python-implemented model.
+- benchmark on inference speed: this is to make sure the model can be used in real-time applications.
+
+This document will cover the topic of how to evaluate the model on various eval datasets.
+
+## How we evaluate the model
+To get results that are most comparable with other llms, we evaluate the model in the same way as the [Open LLM leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard), which uses [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) as the evaluation framework.
+
+For the details of which evaluation datasets are used, please refer to the [Open LLM leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard).
+
+Because `lm-evaluation-harness` is written in python, there is no way to directly use it in .NET. Therefore we use the following steps as a workaround:
+- in C#, start an openai chat completion service server with the model we want to evaluate.
+- in python, use `lm-evaluation-harness` to evaluate the model using openai mode.
diff --git a/Api/CausalLMPipeline.md b/Api/CausalLMPipeline.md
index 1101b6e..6ba8f38 100644
--- a/Api/CausalLMPipeline.md
+++ b/Api/CausalLMPipeline.md
 The causal language model pipeline is a utility class which wraps a tokenizer and a causal language model and provides a uniform interface for the various decoding methods used to generate text. The pipeline is designed to be easy to use and requires only a few lines of code to generate text.
+# Contract
+```C#
+public abstract class CasualLMPipeline
+{
+    public virtual (
+        Tensor, // output token ids [batch_size, sequence_length]
+        Tensor  // output logits [batch_size, sequence_length, vocab_size]
+    ) Generate(
+        Tensor inputIds,      // input token ids [batch_size, sequence_length]
+        Tensor attentionMask, // attention mask [batch_size, sequence_length]
+        float temperature = 0.7f,
+        float topP = 0.9f,
+        int maxLen = 128,
+        int[][]? stopTokenSequence = null,
+        bool echo = false); // echo the input token ids in the output token ids
+}
+
+public class CasualLMPipeline<TTokenizer, TCasualLM> : CasualLMPipeline
+    where TTokenizer : ITokenizer
+    where TCasualLM : nn.Module
+{
+    public CasualLMPipeline Create(LLama2Tokenizer tokenizer, Phi3ForCasualLM model);
+
+}
+```
+
 # Usage
 ```C#
-ITokenizer tokenizer;
-nn.Module model;
+LLama2Tokenizer tokenizer;
+Phi3ForCasualLM model;
 
-var pipeline = new CausalLMPipeline(tokenizer, model);
+var pipeline = CausalLMPipeline.Create(tokenizer, model);
 var prompt = "Once upon a time";
 // top-p sampling
 var output = pipeline.Generate(
     prompt: prompt,
     maxLen: 100,
     temperature: 0.7f,
     topP: 0.9f,
     stopSequences: null,
     device: "cuda",
     bos: true, // add bos token to the prompt
     eos: false, // do not add eos token to the prompt
     echo: true // echo the prompt in the generated text
 );
 ```
 
-# The API in CasualLMPipeline
-## Sample (Top P sample)
+# Sampling methods
+The `CaualLMPipeline` provides a uniform interface for the various decoding methods used to generate text. This saves us the effort of implementing different decoding methods for each model.
+
+## Sampling
 ```C#
-public (
-    Tensor, // output token ids [batch_size, sequence_length]
-    Tensor  // output logits [batch_size, sequence_length, vocab_size]
-) Generate(
-    Tensor inputIds,      // input token ids [batch_size, sequence_length]
-    Tensor attentionMask, // attention mask [batch_size, sequence_length]
-    float temperature = 0.7f,
-    float topP = 0.9f,
-    int maxLen = 128,
-    int[][]? stopTokenSequence = null,
-    bool echo = false); // echo the input token ids in the output token ids
+public virtual (
+    Tensor, // output token ids [batch_size, sequence_length]
+    Tensor  // output logits [batch_size, sequence_length, vocab_size]
+    ) Generate(
+    Tensor inputIds,      // input token ids [batch_size, sequence_length]
+    Tensor attentionMask, // attention mask [batch_size, sequence_length]
+    float temperature = 0.7f,
+    float topP = 0.9f,
+    int maxLen = 128,
+    int[][]? stopTokenSequence = null,
+    bool echo = false); // echo the input token ids in the output token ids
 ```
 
 >[!NOTE]
diff --git a/Api/README.md b/Api/README.md
index eddbc1e..cb6b014 100644
--- a/Api/README.md
+++ b/Api/README.md
 This folder contains the design doc for GenAI Model package
 
-- [Usage](./Usage.md)
+- [Usage](./Usage.md): how to use the model from GenAI Model package
+- [Benchmark && Evaluation](./Benchmark%20&&%20Evaluation.md): how to evaluate the model from GenAI Model package
 
 ## Contracts && API
 - [Causal Language Model](./CausalLanguageModel.md)
diff --git a/Api/Usage.md b/Api/Usage.md
index 2e56d64..6cc81dc 100644
--- a/Api/Usage.md
+++ b/Api/Usage.md
 var kernel = Kernel.CreateBuilder()
     .AddCausalLMPipelineAsChatCompletionService(pipeline)
     .Build();
-```
+```
+
+### Consume model from AutoGen
+Similarly, developers would also like to consume the language model like an agent.
+```C#
+var pathToPhi3 = "path/to/phi3";
+var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
+var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
+var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
+var agent = new Phi3MiniAgent(pipeline, name: "assistant");
+
+var reply = await agent.SendAsync("Tell me a joke");
+```
+
+### Consume model like an OpenAI chat completion service
+
+> [!NOTE]
+> This feature is very useful for evaluation and benchmarking as well.
+
+If the model is deployed as a service, developers can consume the model similar to OpenAI chat completion service.
+```C#
+// server.cs
+var pathToPhi3 = "path/to/phi3";
+var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
+var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
+var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
+var agent = new Phi3MiniAgent(pipeline, name: "assistant");
+
+// AutoGen.Net allows you to run the agent as an OpenAI chat completion endpoint
+var host = Host.CreateDefaultBuilder()
+    .ConfigureWebHostDefaults(app =>
+    {
+        app.UseAgentAsOpenAIChatCompletionEndpoint(agent);
+    })
+    .Build();
+
+await host.RunAsync();
+```
+
+On the client side, the consumption code will be no different from consuming an openai chat completion service.
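To make that concrete, a client could talk to the locally hosted endpoint with nothing more than an HTTP call in the standard chat completion format. The base address, route and model name below are placeholders for whatever the host exposes.

```C#
// Sketch only: calling the locally hosted, OpenAI-compatible endpoint.
// The base address and model name are placeholders.
using var client = new HttpClient { BaseAddress = new Uri("http://localhost:5000") };

var request = new
{
    model = "phi-3-mini",
    messages = new[] { new { role = "user", content = "Tell me a joke" } },
};

// request/response bodies follow the standard OpenAI chat completion schema
var response = await client.PostAsJsonAsync("/v1/chat/completions", request);
response.EnsureSuccessStatusCode();
Console.WriteLine(await response.Content.ReadAsStringAsync());
```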
From 51ce3229d6ea2fb6283a16673b976ef8b890772e Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Wed, 29 May 2024 13:25:04 -0700
Subject: [PATCH 4/8] udpate

---
 Api/README.md                           |  9 ---------
 {Api => Doc}/Benchmark && Evaluation.md |  0
 {Api => Doc}/CausalLMPipeline.md        |  0
 {Api => Doc}/CausalLanguageModel.md     |  0
 Doc/PartialLoading.md                   |  3 +++
 Doc/README.md                           | 14 ++++++++++++++
 {Api => Doc}/Tokenizer.md               |  0
 {Api => Doc}/Usage.md                   |  0
 8 files changed, 17 insertions(+), 9 deletions(-)
 delete mode 100644 Api/README.md
 rename {Api => Doc}/Benchmark && Evaluation.md (100%)
 rename {Api => Doc}/CausalLMPipeline.md (100%)
 rename {Api => Doc}/CausalLanguageModel.md (100%)
 create mode 100644 Doc/PartialLoading.md
 create mode 100644 Doc/README.md
 rename {Api => Doc}/Tokenizer.md (100%)
 rename {Api => Doc}/Usage.md (100%)

diff --git a/Api/README.md b/Api/README.md
deleted file mode 100644
index cb6b014..0000000
--- a/Api/README.md
+++ /dev/null
-This folder contains the design doc for GenAI Model package
-
-- [Usage](./Usage.md): how to use the model from GenAI Model package
-- [Benchmark && Evaluation](./Benchmark%20&&%20Evaluation.md): how to evaluate the model from GenAI Model package
-
-## Contracts && API
-- [Causal Language Model](./CausalLanguageModel.md)
-- [Tokenizer](./TokenClassification.md)
-- [Causal Language Model Pipeline](./CausalLMPipeline.md)
diff --git a/Api/Benchmark && Evaluation.md b/Doc/Benchmark && Evaluation.md
similarity index 100%
rename from Api/Benchmark && Evaluation.md
rename to Doc/Benchmark && Evaluation.md
diff --git a/Api/CausalLMPipeline.md b/Doc/CausalLMPipeline.md
similarity index 100%
rename from Api/CausalLMPipeline.md
rename to Doc/CausalLMPipeline.md
diff --git a/Api/CausalLanguageModel.md b/Doc/CausalLanguageModel.md
similarity index 100%
rename from Api/CausalLanguageModel.md
rename to Doc/CausalLanguageModel.md
diff --git a/Doc/PartialLoading.md b/Doc/PartialLoading.md
new file mode 100644
index 0000000..e98c2ba
--- /dev/null
+++ b/Doc/PartialLoading.md
+Partial loading is a technique for running inference with a very large model on a machine with limited GPU memory. The idea is to load only part of the model into GPU memory and run inference on the loaded part. Once the inference is done, the loaded part is released from GPU memory and the next part is loaded. This process is repeated until the whole model has been processed.
+
+The technique is available in both llama.cpp and [huggingface accelerate](https://huggingface.co/blog/accelerate-large-models). The GenAI model package should also support this technique.
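A rough sketch of the idea in TorchSharp-style code: keep the weights on the CPU and move one block at a time onto the GPU for its part of the forward pass. The `blocks` collection and `inputEmbeddings` tensor are assumed inputs; this only illustrates the load/compute/release loop, not a proposed API.

```C#
// Sketch only: run a forward pass while keeping at most one transformer block
// on the GPU at a time. `blocks` and `inputEmbeddings` are assumed to exist.
var gpu = torch.device("cuda");
var cpu = torch.device("cpu");

var hidden = inputEmbeddings.to(gpu);
foreach (var block in blocks)
{
    block.to(gpu);                    // load this block's weights into GPU memory
    hidden = block.forward(hidden);   // run inference on the loaded part
    block.to(cpu);                    // release GPU memory before loading the next block
}
```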
diff --git a/Doc/README.md b/Doc/README.md
new file mode 100644
index 0000000..dc4961a
--- /dev/null
+++ b/Doc/README.md
+This folder contains the design doc for GenAI Model package
+
+- [Usage](./Usage.md): how to use the model from GenAI Model package
+- [Benchmark && Evaluation](./Benchmark%20&&%20Evaluation.md): how to evaluate the model from GenAI Model package
+
+## Contracts && API
+- [Causal Language Model](./CausalLanguageModel.md)
+- [Tokenizer](./TokenClassification.md)
+- [Causal Language Model Pipeline](./CausalLMPipeline.md)
+
+## Need further investigation
+- [Partial loading](./PartialLoading.md): load only part of model to GPU when gpu memory is limited
+- Improve loading speed: I notice that the model loading speed from disk to memory is slower in torchsharp than what it is in huggingface. Need to investigate the reason and improve the loading speed
+- Quantization: quantize the model to reduce the model size and improve the inference speed
diff --git a/Api/Tokenizer.md b/Doc/Tokenizer.md
similarity index 100%
rename from Api/Tokenizer.md
rename to Doc/Tokenizer.md
diff --git a/Api/Usage.md b/Doc/Usage.md
similarity index 100%
rename from Api/Usage.md
rename to Doc/Usage.md

From 490b31ef19050d38768f5ada832625eb9545c339 Mon Sep 17 00:00:00 2001
From: Xiaoyun Zhang
Date: Thu, 30 May 2024 15:04:06 -0700
Subject: [PATCH 5/8] Update PartialLoading.md

---
 Doc/PartialLoading.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/Doc/PartialLoading.md b/Doc/PartialLoading.md
index e98c2ba..c5964c6 100644
--- a/Doc/PartialLoading.md
+++ b/Doc/PartialLoading.md
 Partial loading is a technique for running inference with a very large model on a machine with limited GPU memory. The idea is to load only part of the model into GPU memory and run inference on the loaded part. Once the inference is done, the loaded part is released from GPU memory and the next part is loaded. This process is repeated until the whole model has been processed.
 
-The technique is available in both llama.cpp and [huggingface accelerate](https://huggingface.co/blog/accelerate-large-models). The GenAI model package should also support this technique.
+The technique is available in both llama.cpp and [huggingface accelerate](https://huggingface.co/blog/accelerate-large-models). The GenAI model package should also support this technique.
+
+## Update on 2024/05/30
+Experiment over partial loading is done in PR #10. The main take-aways are:
+- partial loading can gain acceleration from 1.03X to over 30X, even without fully loading the model to GPU.
+- the main bottleneck is still memory traffic between CPU and GPU.
+- larger blocks should have higher priority when deciding which blocks to 'pin' to GPU memory.
+
+The result can be found in [record](../record.md)
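The last take-away could translate into a simple placement heuristic, sketched below: sort the blocks by parameter count and keep the largest ones resident on the GPU until a budget is exhausted. The `blocks` collection and the budget value are illustrative assumptions.

```C#
// Sketch only: greedily pin the largest blocks to GPU memory first,
// until an illustrative parameter budget is used up.
long budget = 2_000_000_000;   // how many parameters we can afford to keep on the GPU
var gpu = torch.device("cuda");
var pinned = new List<nn.Module>();

foreach (var block in blocks.OrderByDescending(b => b.parameters().Sum(p => p.numel())))
{
    var size = block.parameters().Sum(p => p.numel());
    if (size > budget) continue;   // does not fit in what is left, keep it on the CPU

    block.to(gpu);                 // larger blocks get pinned first
    pinned.Add(block);
    budget -= size;
}
```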
From 72ee5bbbebc469a123a421bf8a08370e61653019 Mon Sep 17 00:00:00 2001
From: XiaoYun Zhang
Date: Mon, 3 Jun 2024 11:10:42 -0700
Subject: [PATCH 6/8] update

---
 Doc/CausalLMPipeline.md                      | 13 +++++++------
 Doc/{PartialLoading.md => DynamicLoading.md} |  2 +-
 Doc/Package Structure.md                     |  7 +++++++
 Doc/README.md                                | 10 ++++++----
 Doc/Usage.md                                 | 11 +++++++----
 5 files changed, 28 insertions(+), 15 deletions(-)
 rename Doc/{PartialLoading.md => DynamicLoading.md} (93%)
 create mode 100644 Doc/Package Structure.md

diff --git a/Doc/CausalLMPipeline.md b/Doc/CausalLMPipeline.md
index 6ba8f38..aeab451 100644
--- a/Doc/CausalLMPipeline.md
+++ b/Doc/CausalLMPipeline.md
 The causal language model pipeline is a utility class which wraps a tokenizer and a causal language model and provides a uniform interface for the various decoding methods used to generate text. The pipeline is designed to be easy to use and requires only a few lines of code to generate text.
+In Microsoft.ML.GenAI, we will provide a generic `CausalLMPipeline` class plus a typed `CausalLMPipeline` class which specifies the type parameters for the tokenizer and the causal language model. The typed `CausalLMPipeline` class makes it easier to develop consuming methods for semantic kernel; see [here](./Usage.md#consume-model-from-semantic-kernel) for more details.
 # Contract
 ```C#
-public abstract class CasualLMPipeline
+public abstract class CausalLMPipeline
 {
     public virtual (
         Tensor, // output token ids [batch_size, sequence_length]
         Tensor  // output logits [batch_size, sequence_length, vocab_size]
     ) Generate(
         Tensor inputIds,      // input token ids [batch_size, sequence_length]
         Tensor attentionMask, // attention mask [batch_size, sequence_length]
         float temperature = 0.7f,
         float topP = 0.9f,
         int maxLen = 128,
         int[][]? stopTokenSequence = null,
         bool echo = false); // echo the input token ids in the output token ids
 }
 
-public class CasualLMPipeline<TTokenizer, TCasualLM> : CasualLMPipeline
+public class CasualLMPipeline<TTokenizer, TCausalLM> : CausalLMPipeline
     where TTokenizer : ITokenizer
-    where TCasualLM : nn.Module
+    where TCausalLM : nn.Module
 {
-    public CasualLMPipeline Create(LLama2Tokenizer tokenizer, Phi3ForCasualLM model);
+    public CausalLMPipeline Create(LLama2Tokenizer tokenizer, Phi3ForCasualLM model);
 
 }
 ```
 
 # Usage
 ```C#
 LLama2Tokenizer tokenizer;
-Phi3ForCasualLM model;
+Phi3ForCausalLM model;
 
 var pipeline = CausalLMPipeline.Create(tokenizer, model);
 var prompt = "Once upon a time";
 // top-p sampling
 var output = pipeline.Generate(
     prompt: prompt,
 # Sampling methods
-The `CaualLMPipeline` provides a uniform interface for the various decoding methods used to generate text. This saves us the effort of implementing different decoding methods for each model.
+The `CausalLMPipeline` provides a uniform interface for the various decoding methods used to generate text. This saves us the effort of implementing different decoding methods for each model.
 
 ## Sampling
 ```C#
 public virtual (
diff --git a/Doc/PartialLoading.md b/Doc/DynamicLoading.md
similarity index 93%
rename from Doc/PartialLoading.md
rename to Doc/DynamicLoading.md
index c5964c6..4fe3afe 100644
--- a/Doc/PartialLoading.md
+++ b/Doc/DynamicLoading.md
-Partial loading is a technique for running inference with a very large model on a machine with limited GPU memory. The idea is to load only part of the model into GPU memory and run inference on the loaded part. Once the inference is done, the loaded part is released from GPU memory and the next part is loaded. This process is repeated until the whole model has been processed.
+Dynamic loading is a technique for running inference with a very large model on a machine with limited GPU memory. The idea is to load only part of the model into GPU memory and run inference on the loaded part. Once the inference is done, the loaded part is released from GPU memory and the next part is loaded. This process is repeated until the whole model has been processed.
 
 The technique is available in both llama.cpp and [huggingface accelerate](https://huggingface.co/blog/accelerate-large-models). The GenAI model package should also support this technique.
 
 ## Update on 2024/05/30
diff --git a/Doc/Package Structure.md b/Doc/Package Structure.md
new file mode 100644
index 0000000..9410b7f
--- /dev/null
+++ b/Doc/Package Structure.md
+The GenAI project will be a collection of popular open source AI models. It will be organized in the following structure:
+
+- Microsoft.ML.GenAI.Core: the core library for the GenAI project; it contains the fundamental contracts or classes like `CausalLanguageModel` and `CausalLMPipeline`
+- Microsoft.ML.GenAI.{ModelName}: the implementation of a specific model, which includes the model configuration, the causal lm model implementation (like `Phi3ForCausalLM`) and the tokenizer implementation if any. In the first stage, we plan to provide the following models:
+  - Microsoft.ML.GenAI.Phi: the implementation of the Phi-series models
+  - Microsoft.ML.GenAI.LLaMA: the implementation of the LLaMA-series models
+  - Microsoft.ML.GenAI.StableDiffusion: the implementation of the Stable Diffusion model
diff --git a/Doc/README.md b/Doc/README.md
index dc4961a..dfcad9c 100644
--- a/Doc/README.md
+++ b/Doc/README.md
 This folder contains the design doc for GenAI Model package
 
+### Basic
+- [Package Structure](./Package%20Structure.md): the structure of GenAI Model package
 - [Usage](./Usage.md): how to use the model from GenAI Model package
 - [Benchmark && Evaluation](./Benchmark%20&&%20Evaluation.md): how to evaluate the model from GenAI Model package
 
-## Contracts && API
+### Contracts && API
 - [Causal Language Model](./CausalLanguageModel.md)
-- [Tokenizer](./TokenClassification.md)
+- [Tokenizer](./Tokenizer.md)
 - [Causal Language Model Pipeline](./CausalLMPipeline.md)
 
-## Need further investigation
+### Need further investigation
-- [Partial loading](./PartialLoading.md): load only part of model to GPU when gpu memory is limited
+- [Dynamic loading](./DynamicLoading.md): load only part of model to GPU when gpu memory is limited
 - Improve loading speed: I notice that the model loading speed from disk to memory is slower in torchsharp than what it is in huggingface. Need to investigate the reason and improve the loading speed
 - Quantization: quantize the model to reduce the model size and improve the inference speed
diff --git a/Doc/Usage.md b/Doc/Usage.md
index 6cc81dc..b4e9fa7 100644
--- a/Doc/Usage.md
+++ b/Doc/Usage.md
 var pathToPhi3 = "path/to/phi3";
 var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3);
 var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3);
-var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
+CausalLMPipeline pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel);
 
 var prompt = "<|user|>Once upon a time<|end|>";
 var output = pipeline.Generate(
     prompt: prompt,
 
 ### Consume model from semantic kernel
In var pathToPhi3 = "path/to/phi3"; var tokenizer = LLama2Tokenizer.FromPretrained(pathToPhi3); var phi3CausalModel = Phi3ForCasualLM.FromPretrained(pathToPhi3); -var pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel); +CausalLMPipeline pipeline = new CausalLMPipeline(tokenizer, phi3CausalModel); var kernel = Kernel.CreateBuilder() - .AddCausalLMPipelineAsChatCompletionService(pipeline) + // the type of the tokenizer and the model are explicitly specified + // here for clarity, but the compiler can infer them + // The typed pipeline prevent developers from passing an arbitrary CausalLMPipeline + .AddPhi3AsChatCompletionService(pipeline) .Build(); ``` @@ -46,7 +49,7 @@ var reply = await agent.SendAsync("Tell me a joke"); ### Consume model like an OpenAI chat completion service > [!NOTE] -> This feature is very useful for evaluation and benchmarking as well. +> This feature is very useful for evaluation and benchmarking. Because most of the benchmarking frameworks are implemented in python, but support consuming openai-like api. If the model is deployed as a service, developers can consume the model similar to OpenAI chat completion service. ```C# From f786eedce7bf7b185dae7caa2b7d141679ab9218 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 3 Jun 2024 11:18:44 -0700 Subject: [PATCH 7/8] update --- Doc/CausalLanguageModel.md | 18 +++++++++--------- Doc/README.md | 4 ++-- Doc/Usage.md | 5 ++++- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/Doc/CausalLanguageModel.md b/Doc/CausalLanguageModel.md index 02af51a..c423417 100644 --- a/Doc/CausalLanguageModel.md +++ b/Doc/CausalLanguageModel.md @@ -6,9 +6,9 @@ A causal language model is a type of language model that predicts the next token # The Causal Language Model Contract In the remaining sections, we will describe the contract for a causal language model. -## `CasualLMModelInput` +## `CausalLMModelInput` ```C# -public CasualLMModelInput +public CausalLMModelInput { // [batch_size, sequence_length] public Tensor input_ids { get; set; } @@ -38,9 +38,9 @@ public CasualLMModelInput } ``` -## `CasualLMModelOutput` +## `CausalLMModelOutput` ```C# -public class CasualLMModelOutput +public class CausalLMModelOutput { // [batch_size, sequence_length, vocab_size] // The predicted logits for each token in the input sequence. @@ -60,16 +60,16 @@ public class CasualLMModelOutput } ``` -Once both `CasualLMModelInput` and `CasualLMModelOutput` are defined, the causal language model can be implemented as follows (use Phi-3 as an example): +Once both `CausalLMModelInput` and `CausalLMModelOutput` are defined, the causal language model can be implemented as follows (use Phi-3 as an example): ```C# -public class Phi3ForCasualLM : nn.Module +public class Phi3ForCausalLM : nn.Module ``` # What language model has been implemented using this contract in this repo? -- `Phi3ForCasualLM` -- `Phi2ForCasualLM` +- `Phi3ForCausalLM` +- `Phi2ForCausalLM` # What language model has been implemented using this pattern, but not exactly the same contract class in the other repo? 
-- `LLaMAForCasualLM` (for both llama2 and llama3) +- `LLaMAForCausalLM` (for both llama2 and llama3) diff --git a/Doc/README.md b/Doc/README.md index dfcad9c..b382510 100644 --- a/Doc/README.md +++ b/Doc/README.md @@ -6,9 +6,9 @@ This folder contains the design doc for GenAI Model package - [Benchmark && Evaluation](./Benchmark%20&&%20Evaluation.md): how to evaluate the model from GenAI Model package ### Contracts && API -- [Causal Language Model](./CausalLanguageModel.md) +- [CausalLMPipeline](./CausalLMPipeline.md) +- [CausalLMModelInput and CausalLMModelOutput](./CausalLanguageModel.md) - [Tokenizer](./Tokenizer.md) -- [Causal Language Model Pipeline](./CausalLMPipeline.md) ### Need further investigation - [Dynamic loading](./DynamicLoading.md): load only part of model to GPU when gpu memory is limited diff --git a/Doc/Usage.md b/Doc/Usage.md index b4e9fa7..611d5fb 100644 --- a/Doc/Usage.md +++ b/Doc/Usage.md @@ -30,6 +30,9 @@ var kernel = Kernel.CreateBuilder() // the type of the tokenizer and the model are explicitly specified // here for clarity, but the compiler can infer them // The typed pipeline prevent developers from passing an arbitrary CausalLMPipeline + // The reason why we don't want to allow developers to pass an arbitrary CausalLMPipeline is because + // - the model and the tokenizer must be compatible + // - the chat template must be compatible with the model. e.g. In `AddPhi3AsChatCompletionService`, the chat template is fixed to "<|user|>{prompt}<|end|>" .AddPhi3AsChatCompletionService(pipeline) .Build(); ``` @@ -49,7 +52,7 @@ var reply = await agent.SendAsync("Tell me a joke"); ### Consume model like an OpenAI chat completion service > [!NOTE] -> This feature is very useful for evaluation and benchmarking. Because most of the benchmarking frameworks are implemented in python, but support consuming openai-like api. +> This feature is very useful for evaluation and benchmarking. Because most of the benchmarking frameworks are implemented in python, but support consuming openai-like api. Therefore we can use this feature to evaluate the model using the same benchmarking framework as other models and get comparable results. If the model is deployed as a service, developers can consume the model similar to OpenAI chat completion service. ```C# From c7ce28ea4c7bf6e283ac7181443ac2fbad87e6fa Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Mon, 3 Jun 2024 11:21:33 -0700 Subject: [PATCH 8/8] update readme --- Doc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/README.md b/Doc/README.md index b382510..9baba5c 100644 --- a/Doc/README.md +++ b/Doc/README.md @@ -11,6 +11,6 @@ This folder contains the design doc for GenAI Model package - [Tokenizer](./Tokenizer.md) ### Need further investigation -- [Dynamic loading](./DynamicLoading.md): load only part of model to GPU when gpu memory is limited +- [Dynamic loading](./DynamicLoading.md): load only part of model to GPU when gpu memory is limited. We explore the result w/o dynamic loading in [record](../record.md) - Improve loading speed: I notice that the model loading speed from disk to memory is slower in torchsharp than what it is in huggingface. Need to investigate the reason and improve the loading speed - Quantization: quantize the model to reduce the model size and improve the inference speed \ No newline at end of file