
Commit 9e8a665

Riandy, cmodi-meta, and WuhanMonkey authored
Upgrade SDK to v0.1.0 (#18)
* Update SDK to 0.0.63
* Update SDK to 0.0.64.rc1
* Update InferenceServiceLocalImpl.kt
* Upgrade SDK to 0.1.0.rc2
  Stainless ref: a625d927ae60b454c19142c50c14f41e39c8e953
  Patch needed:
  - ParamType.kt
  - addContentBatchOfInterleavedContentItems() in BatchInferenceCompletionParams.kt
* Add local streaming support
* Local streaming with custom tool calling + stats
* Upgrade SDK to 0.1.0.rc5
  Stainless commit: d2701cb9f0b15c13dab4935e3c7d513410b068c3
  Manual patches:
  - BatchInferenceCompletionParams.kt --> addContentBatchOfInterleavedContentItems() needs to use addContentBatch(); the generated method does not exist
  - InferenceEmbeddingsParams.kt --> same as above; replace the error with addContent()
  - ToolRuntimeListToolsParams.kt --> the forEachQueryParam method does not exist; replace it with ._additionalProperties()?.forEach()
* Add multi custom tool calling with local streaming
* Update SDK to 0.0.64.rc1 (the version string accidentally gained a .draft suffix; reverting it. Everything is validated working against 0.0.64.rc1)
* Fix local response util
* Add tool call content type in ContentDelta
* Upgrade SDK to 0.1.0.rc10
  Stainless commit: 21fbaab559158253523ca3043f6dfddd4de887ad
  Manual patches:
  - BatchInferenceCompletionParams.kt --> addContentBatchOfInterleavedContentItems() needs to use addContentBatch(); the generated method does not exist
  - InferenceEmbeddingsParams.kt --> same as above; replace the error with addContent()
  - ToolRuntimeListToolsParams.kt --> the forEachQueryParam method does not exist; replace it with ._additionalProperties()?.forEach()
  Some tests had to be disabled in the following files due to type mismatches in the autogenerated code:
  - ToolRuntimeListToolsParamsTest.kt
  - TelemetryQueryTracesParamsTest.kt
* Upgrade SDK to 0.1.0.rc13 + agent patch
  Stainless commit: 9d827a56d9b8e61dac3d49e1b7fbcbd909ebd0e3
  Plus the agent patch from Ashwin's changes on AgentTurnCreateResponse. Manual patches are labelled // MANUAL PATCH
* 0.1.0.rc14
* Fix ParamType.kt type errors
* rc14 manual patch
* Update README.md to include the Agent and Image Reasoning changes
* Another manual patch:
  - Delete the ToolCall class under ContentDelta; it conflicts with the standalone ToolCall class
  - Other fixes marked as // MANUAL PATCH
* Fix broken ResponseUtil for the Local module
* SDK updates: latest from Ashwin's fixes. Stainless commit: c85d35f1b56791b27f6e786d36f6ae84e97782e3
* Local module patch
* Another patch fixing ToolCall errors
* Fix ResponseUtils

---------

Co-authored-by: cmodi-meta <98582575+cmodi-meta@users.noreply.github.com>
Co-authored-by: Chester Hu <hcp199242@gmail.com>
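To make the recurring ToolRuntimeListToolsParams.kt patch concrete, here is a rough sketch of the described swap. Everything around the swap (the helper function and the plain map standing in for the SDK's internal query-param builder) is an assumption for illustration; only the method replacement itself comes from the commit message.

```kotlin
import com.llama.llamastack.models.ToolRuntimeListToolsParams

// Sketch of the manual patch described above; a plain map stands in for the
// SDK's internal query-param builder, whose exact type varies per SDK drop.
fun collectQueryParams(params: ToolRuntimeListToolsParams): Map<String, String> {
    val queryParams = mutableMapOf<String, String>()
    // The generated forEachQueryParam() does not exist in this SDK drop, so the
    // patch iterates the additional-properties map directly instead:
    params._additionalProperties()?.forEach { (key, value) ->
        queryParams[key] = value.toString()
    }
    return queryParams
}
```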
1 parent 8c5a265 · commit 9e8a665

File tree

496 files changed: +46717 −38872 lines

README.md

Lines changed: 146 additions & 2 deletions
@@ -105,8 +105,152 @@ client = LlamaStackClientOkHttpClient
 </tr>
 </table>
 
+### Agents
 
-### Run Inference
+The Llama Stack agent can run multi-turn inference using both custom and built-in tools.
+
+Create the agent configuration:
+```
+val agentConfig =
+    AgentConfig.builder()
+        .enableSessionPersistence(false)
+        .instructions("You are a helpful assistant")
+        .maxInferIters(100)
+        .model("meta-llama/Llama-3.2-3B-Instruct")
+        .samplingParams(
+            SamplingParams.builder()
+                .strategy(
+                    SamplingParams.Strategy.ofGreedySamplingStrategy(
+                        SamplingParams.Strategy.GreedySamplingStrategy.builder()
+                            .type(SamplingParams.Strategy.GreedySamplingStrategy.Type.GREEDY)
+                            .build()
+                    )
+                )
+                .build()
+        )
+        .toolChoice(AgentConfig.ToolChoice.AUTO)
+        .toolPromptFormat(AgentConfig.ToolPromptFormat.PYTHON_LIST)
+        .clientTools(
+            listOf(
+                CustomTools.getCreateCalendarEventTool() // custom local tool
+            )
+        )
+        .build()
+```
+
+Create the agent:
+```
+val agentService = client!!.agents() // client is a LlamaStackClientLocalClient
+val agentCreateResponse = agentService.create(
+    AgentCreateParams.builder()
+        .agentConfig(agentConfig)
+        .build(),
+)
+val agentId = agentCreateResponse.agentId()
+```
+
+Create the session:
+```
+val sessionService = agentService.session()
+val agentSessionCreateResponse = sessionService.create(
+    AgentSessionCreateParams.builder()
+        .agentId(agentId)
+        .sessionName("test-session")
+        .build()
+)
+
+val sessionId = agentSessionCreateResponse.sessionId()
+```
+
+Create a turn:
+```
+val turnService = agentService.turn()
+val agentTurnCreateResponseStream = turnService.createStreaming(
+    AgentTurnCreateParams.builder()
+        .agentId(agentId)
+        .messages(
+            listOf(
+                AgentTurnCreateParams.Message.ofUserMessage(
+                    UserMessage.builder()
+                        .content(InterleavedContent.ofString("What is the capital of France?"))
+                        .role(UserMessage.Role.USER)
+                        .build()
+                )
+            )
+        )
+        .sessionId(sessionId)
+        .build()
+)
+```
+
+Handle the stream chunk callback:
+```
+agentTurnCreateResponseStream.use {
+    agentTurnCreateResponseStream.asSequence().forEach {
+        val agentResponsePayload = it.agentTurnResponseStreamChunk()?.event()?.payload()
+        if (agentResponsePayload != null) {
+            when {
+                agentResponsePayload.isTurnStart() -> {
+                    // Handle Turn Start Payload
+                }
+                agentResponsePayload.isStepStart() -> {
+                    // Handle Step Start Payload
+                }
+                agentResponsePayload.isStepProgress() -> {
+                    // Handle Step Progress Payload
+                }
+                agentResponsePayload.isStepComplete() -> {
+                    // Handle Step Complete Payload
+                }
+                agentResponsePayload.isTurnComplete() -> {
+                    // Handle Turn Complete Payload
+                }
+            }
+        }
+    }
+}
+```
+
+More examples can be found in our demo app (TO-ADD Agent section).
+
+
+### Run Image Reasoning
+
+The Kotlin SDK also supports single-image inference, where the image can be an HTTP URL or an image captured on your local device.
+
+Create an image inference turn with the agent:
+
+```
+val agentTurnCreateResponseStream =
+    turnService.createStreaming(
+        AgentTurnCreateParams.builder()
+            .agentId(agentId)
+            .messages(
+                listOf(
+                    AgentTurnCreateParams.Message.ofUserMessage(
+                        UserMessage.builder()
+                            .content(InterleavedContent.ofString("What is in the image?"))
+                            .role(UserMessage.Role.USER)
+                            .build()
+                    ),
+                    AgentTurnCreateParams.Message.ofUserMessage(
+                        UserMessage.builder()
+                            .content(
+                                InterleavedContent.ofImageContentItem(
+                                    InterleavedContent.ImageContentItem.builder()
+                                        .image(imageUrl)
+                                        .type(InterleavedContent.ImageContentItem.Type.IMAGE)
+                                        .build()
+                                )
+                            )
+                            .role(UserMessage.Role.USER)
+                            .build()
+                    )
+                )
+            )
+            .sessionId(sessionId)
+            .build()
+    )
+```
+
+Note that an image captured on device needs to be Base64-encoded before it is sent to the model. Check out our demo app example here (TO-ADD Image Reasoning section).
+
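For reference, the Base64 step in a consuming Android app could look like this minimal sketch; the helper name and the data-URL shape are assumptions, not part of this commit:

```kotlin
import android.graphics.Bitmap
import android.util.Base64
import java.io.ByteArrayOutputStream

// Minimal sketch (assumed helper): encode a captured Bitmap as a Base64 data URL
// so it can be passed where the example above passes imageUrl.
fun bitmapToDataUrl(bitmap: Bitmap): String {
    val stream = ByteArrayOutputStream()
    bitmap.compress(Bitmap.CompressFormat.JPEG, 90, stream) // JPEG-encode the capture
    val base64 = Base64.encodeToString(stream.toByteArray(), Base64.NO_WRAP)
    return "data:image/jpeg;base64,$base64" // data-URL form is an assumption
}
```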
+### Run Simple Inference
 With the Kotlin library managing all the major operational logic, minimal to no changes are needed to run simple chat inference locally or remotely:
 
 ```
@@ -135,7 +279,7 @@ val result = client!!.inference().chatCompletionStreaming(
 // See the Android demo app for a detailed implementation example.
 ```
 
-### Setup Custom Tool Calling
+### Setup Simple Inference with Custom Tool Calling
 
 See the Android demo app for more details: [Custom Tool Calling](https://github.com/meta-llama/llama-stack-apps/tree/main/examples/android_app#tool-calling)
build.gradle.kts

Lines changed: 1 addition & 1 deletion
@@ -4,5 +4,5 @@ plugins {
 
 allprojects {
     group = "com.llama.llamastack"
-    version = "0.0.58"
+    version = "0.1.0.rc14.manual-patch"
 }
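A consuming app then pins the bumped version. A minimal sketch of the consumer's dependency block, assuming the artifact id llama-stack-client-kotlin (the group id comes from this file; the artifact id is an assumption):

```kotlin
// Consumer's build.gradle.kts (sketch; artifact id is an assumption)
dependencies {
    implementation("com.llama.llamastack:llama-stack-client-kotlin:0.1.0.rc14.manual-patch")
}
```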

llama-stack-client-kotlin-client-local/src/main/kotlin/com/llama/llamastack/client/local/InferenceServiceLocalImpl.kt

Lines changed: 73 additions & 4 deletions
@@ -4,6 +4,8 @@ package com.llama.llamastack.client.local
 
 import com.llama.llamastack.client.local.util.PromptFormatLocal
 import com.llama.llamastack.client.local.util.buildInferenceChatCompletionResponse
+import com.llama.llamastack.client.local.util.buildInferenceChatCompletionResponseFromStream
+import com.llama.llamastack.client.local.util.buildLastInferenceChatCompletionResponsesFromStream
 import com.llama.llamastack.core.RequestOptions
 import com.llama.llamastack.core.http.StreamResponse
 import com.llama.llamastack.models.EmbeddingsResponse
@@ -27,19 +29,32 @@ constructor(
     private var modelName: String = ""
 
     private var sequenceLengthKey: String = "seq_len"
+    private var stopToken: String = ""
+
+    private val streamingResponseList = mutableListOf<InferenceChatCompletionResponse>()
+    private var isStreaming: Boolean = false
+
+    private val waitTime: Long = 100
 
     override fun onResult(p0: String?) {
         if (PromptFormatLocal.getStopTokens(modelName).any { it == p0 }) {
+            stopToken = p0!!
             onResultComplete = true
             return
         }
 
         if (p0.equals("\n\n") || p0.equals("\n")) {
             if (resultMessage.isNotEmpty()) {
                 resultMessage += p0
+                if (p0 != null && isStreaming) {
+                    streamingResponseList.add(buildInferenceChatCompletionResponseFromStream(p0))
+                }
             }
         } else {
             resultMessage += p0
+            if (p0 != null && isStreaming) {
+                streamingResponseList.add(buildInferenceChatCompletionResponseFromStream(p0))
+            }
         }
     }
 
@@ -55,7 +70,8 @@ constructor(
         params: InferenceChatCompletionParams,
         requestOptions: RequestOptions
     ): InferenceChatCompletionResponse {
-        resultMessage = ""
+        isStreaming = false
+        clearElements()
         val mModule = clientOptions.llamaModule
         modelName = params.modelId()
         val formattedPrompt =
@@ -74,19 +90,67 @@ constructor(
         mModule.generate(formattedPrompt, seqLength, this, false)
 
         while (!onResultComplete && !onStatsComplete) {
-            Thread.sleep(100)
+            Thread.sleep(waitTime)
         }
         onResultComplete = false
         onStatsComplete = false
 
-        return buildInferenceChatCompletionResponse(resultMessage, statsMetric)
+        return buildInferenceChatCompletionResponse(resultMessage, statsMetric, stopToken)
     }
 
+    private val streamResponse =
+        object : StreamResponse<InferenceChatCompletionResponse> {
+            override fun asSequence(): Sequence<InferenceChatCompletionResponse> {
+                return sequence {
+                    while (!onResultComplete || streamingResponseList.isNotEmpty()) {
+                        if (streamingResponseList.isNotEmpty()) {
+                            yield(streamingResponseList.removeAt(0))
+                        } else {
+                            Thread.sleep(waitTime)
+                        }
+                    }
+                    while (!onStatsComplete) {
+                        Thread.sleep(waitTime)
+                    }
+                    val chatCompletionResponses =
+                        buildLastInferenceChatCompletionResponsesFromStream(
+                            resultMessage,
+                            statsMetric,
+                            stopToken,
+                        )
+                    for (ccr in chatCompletionResponses) {
+                        yield(ccr)
+                    }
+                }
+            }
+
+            override fun close() {
+                isStreaming = false
+            }
+        }
+
     override fun chatCompletionStreaming(
         params: InferenceChatCompletionParams,
         requestOptions: RequestOptions
     ): StreamResponse<InferenceChatCompletionResponse> {
-        TODO("Not yet implemented")
+        isStreaming = true
+        streamingResponseList.clear()
+        resultMessage = ""
+        val mModule = clientOptions.llamaModule
+        modelName = params.modelId()
+        val formattedPrompt =
+            PromptFormatLocal.getTotalFormattedPrompt(params.messages(), modelName)
+
+        val seqLength =
+            params._additionalQueryParams().values(sequenceLengthKey).lastOrNull()?.toInt()
+                ?: ((formattedPrompt.length * 0.75) + 64).toInt()
+
+        println("Chat Completion Prompt is: $formattedPrompt with seqLength of $seqLength")
+        onResultComplete = false
+        val thread = Thread { mModule.generate(formattedPrompt, seqLength, this, false) }
+        thread.start()
+
+        return streamResponse
     }
 
     override fun completion(
@@ -109,4 +173,9 @@ constructor(
     ): EmbeddingsResponse {
         TODO("Not yet implemented")
     }
+
+    fun clearElements() {
+        resultMessage = ""
+        stopToken = ""
+    }
 }
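With the TODO removed, local chat completion can now be consumed as a lazy stream. A minimal consumption sketch, where the `client` and `params` setup are assumptions mirroring the README examples and the single-argument overload is assumed:

```kotlin
// Sketch: assumes `client` is a configured LlamaStackClientLocalClient and
// `params` is an InferenceChatCompletionParams built as in the README examples.
val stream = client.inference().chatCompletionStreaming(params)
stream.use { s ->
    s.asSequence().forEach { chunk ->
        // Each chunk wraps a token from the on-device generate() callback; the
        // final chunks carry the responses built from resultMessage and the stats.
        println(chunk)
    }
}
```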

llama-stack-client-kotlin-client-local/src/main/kotlin/com/llama/llamastack/client/local/LlamaStackClientClientLocalImpl.kt

Lines changed: 20 additions & 8 deletions
@@ -16,10 +16,30 @@ constructor(
 
     override fun inference(): InferenceService = inference
 
+    override fun vectorIo(): VectorIoService {
+        TODO("Not yet implemented")
+    }
+
+    override fun vectorDbs(): VectorDbService {
+        TODO("Not yet implemented")
+    }
+
     override fun async(): LlamaStackClientClientAsync {
         TODO("Not yet implemented")
     }
 
+    override fun toolgroups(): ToolgroupService {
+        TODO("Not yet implemented")
+    }
+
+    override fun tools(): ToolService {
+        TODO("Not yet implemented")
+    }
+
+    override fun toolRuntime(): ToolRuntimeService {
+        TODO("Not yet implemented")
+    }
+
     override fun telemetry(): TelemetryService {
         TODO("Not yet implemented")
     }
@@ -64,10 +84,6 @@ constructor(
         TODO("Not yet implemented")
     }
 
-    override fun memory(): MemoryService {
-        TODO("Not yet implemented")
-    }
-
     override fun postTraining(): PostTrainingService {
         TODO("Not yet implemented")
     }
@@ -88,10 +104,6 @@ constructor(
         TODO("Not yet implemented")
     }
 
-    override fun memoryBanks(): MemoryBankService {
-        TODO("Not yet implemented")
-    }
-
     override fun shields(): ShieldService {
         TODO("Not yet implemented")
     }
