@@ -92,7 +92,7 @@ def __init__(
         embedding: bool = False,
         offload_kqv: bool = True,
         flash_attn: bool = False,
-        op_offloat: Optional[bool] = None,
+        op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
@@ -174,7 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
-            op_offloat: offload host tensor operations to device
+            op_offload: offload host tensor operations to device
             swa_full: use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
@@ -343,8 +343,8 @@ def __init__(
         self.context_params.offload_kqv = offload_kqv
         self.context_params.flash_attn = flash_attn
 
-        if op_offloat is not None:
-            self.context_params.op_offloat = op_offloat
+        if op_offload is not None:
+            self.context_params.op_offload = op_offload
 
         if swa_full is not None:
             self.context_params.swa_full = swa_full
@@ -2097,7 +2097,7 @@ def __getstate__(self):
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
-            op_offloat=self.context_params.op_offloat,
+            op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
             # Sampling Params
             no_perf=self.context_params.no_perf,
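For reference, a minimal usage sketch of the renamed keyword. Per the diff above, `op_offload` is forwarded to `self.context_params.op_offload`; the model path below is a placeholder, not taken from this commit:

```python
from llama_cpp import Llama

# Sketch: pass the correctly spelled op_offload flag to the constructor;
# it is forwarded to self.context_params.op_offload (see the diff above).
# "model.gguf" is a placeholder path.
llm = Llama(
    model_path="model.gguf",
    op_offload=True,  # offload host tensor operations to the device
)
```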