Draft
Conversation
Comment on lines
+129
to
+132
| .AddUniformVariables({{num_workgroups}, // sequence_length = total workgroups | ||
| {num_workgroups}, // present_sequence_length = same (flat) | ||
| {0u}, // start_token = 0 | ||
| {1u}}); // n_reps = 1 |
Contributor
There was a problem hiding this comment.
Suggested change
| .AddUniformVariables({{num_workgroups}, // sequence_length = total workgroups | |
| {num_workgroups}, // present_sequence_length = same (flat) | |
| {0u}, // start_token = 0 | |
| {1u}}); // n_reps = 1 | |
| .AddUniformVariables({{num_workgroups}, // sequence_length = total workgroups | |
| {num_workgroups}, // present_sequence_length = same (flat) | |
| {0u}, // start_token = 0 | |
| {1u}}); // n_reps = 1 |
| bool enable_pix_capture{false}; // PIX capture is disabled by default | ||
| bool enable_int64{false}; // int64 ops are not enabled by default | ||
| uint32_t multi_rotary_cache_concat_offset{0}; // offset for concatenated multi rotary cache (0 = disabled) | ||
| bool turbo_quant{false}; // enable TurboQuant KV cache compression |
Contributor
There was a problem hiding this comment.
Suggested change
| bool turbo_quant{false}; // enable TurboQuant KV cache compression | |
| bool turbo_quant{false}; // enable TurboQuant KV cache compression |
|
|
||
| static const float TQ_CENTROIDS[16] = { | ||
| -0.2377f, -0.1809f, -0.1419f, -0.1104f, -0.0829f, -0.0578f, -0.0342f, -0.0113f, | ||
| 0.0113f, 0.0342f, 0.0578f, 0.0829f, 0.1104f, 0.1419f, 0.1809f, 0.2377f}; |
Contributor
There was a problem hiding this comment.
Suggested change
| 0.0113f, 0.0342f, 0.0578f, 0.0829f, 0.1104f, 0.1419f, 0.1809f, 0.2377f}; | |
| 0.0113f, 0.0342f, 0.0578f, 0.0829f, 0.1104f, 0.1419f, 0.1809f, 0.2377f}; |
|
|
||
| static const float TQ_BOUNDARIES[15] = { | ||
| -0.2093f, -0.1614f, -0.1261f, -0.0966f, -0.0704f, -0.0460f, -0.0227f, | ||
| 0.0000f, 0.0227f, 0.0460f, 0.0704f, 0.0966f, 0.1261f, 0.1614f, 0.2093f}; |
Contributor
There was a problem hiding this comment.
Suggested change
| 0.0000f, 0.0227f, 0.0460f, 0.0704f, 0.0966f, 0.1261f, 0.1614f, 0.2093f}; | |
| 0.0000f, 0.0227f, 0.0460f, 0.0704f, 0.0966f, 0.1261f, 0.1614f, 0.2093f}; |
Comment on lines
+431
to
+432
| static_cast<int64_t>(cfg.max_cache), | ||
| cache_dim}; |
Contributor
There was a problem hiding this comment.
Suggested change
| static_cast<int64_t>(cfg.max_cache), | |
| cache_dim}; | |
| static_cast<int64_t>(cfg.max_cache), | |
| cache_dim}; |
Comment on lines
+45
to
+53
| query = helper.make_tensor_value_info( | ||
| "query", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size] | ||
| ) | ||
| key = helper.make_tensor_value_info( | ||
| "key", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size] | ||
| ) | ||
| value = helper.make_tensor_value_info( | ||
| "value", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size] | ||
| ) |
Contributor
There was a problem hiding this comment.
Suggested change
| query = helper.make_tensor_value_info( | |
| "query", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size] | |
| ) | |
| key = helper.make_tensor_value_info( | |
| "key", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size] | |
| ) | |
| value = helper.make_tensor_value_info( | |
| "value", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size] | |
| ) | |
| query = helper.make_tensor_value_info("query", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size]) | |
| key = helper.make_tensor_value_info("key", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size]) | |
| value = helper.make_tensor_value_info("value", TensorProto.FLOAT16, [batch_size, "seq_len", kv_hidden_size]) |
Comment on lines
+64
to
+69
| seqlens_k = helper.make_tensor_value_info( | ||
| "seqlens_k", TensorProto.INT32, [batch_size] | ||
| ) | ||
| total_sequence_length = helper.make_tensor_value_info( | ||
| "total_sequence_length", TensorProto.INT32, [1] | ||
| ) |
Contributor
There was a problem hiding this comment.
Suggested change
| seqlens_k = helper.make_tensor_value_info( | |
| "seqlens_k", TensorProto.INT32, [batch_size] | |
| ) | |
| total_sequence_length = helper.make_tensor_value_info( | |
| "total_sequence_length", TensorProto.INT32, [1] | |
| ) | |
| seqlens_k = helper.make_tensor_value_info("seqlens_k", TensorProto.INT32, [batch_size]) | |
| total_sequence_length = helper.make_tensor_value_info("total_sequence_length", TensorProto.INT32, [1]) |
Comment on lines
+72
to
+74
| output = helper.make_tensor_value_info( | ||
| "output", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size] | ||
| ) |
Contributor
There was a problem hiding this comment.
Suggested change
| output = helper.make_tensor_value_info( | |
| "output", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size] | |
| ) | |
| output = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [batch_size, "seq_len", hidden_size]) |
Comment on lines
+90
to
+95
| "query", # 0 | ||
| "key", # 1 | ||
| "value", # 2 | ||
| "past_key", # 3 | ||
| "past_value", # 4 | ||
| "seqlens_k", # 5 |
Contributor
There was a problem hiding this comment.
Suggested change
| "query", # 0 | |
| "key", # 1 | |
| "value", # 2 | |
| "past_key", # 3 | |
| "past_value", # 4 | |
| "seqlens_k", # 5 | |
| "query", # 0 | |
| "key", # 1 | |
| "value", # 2 | |
| "past_key", # 3 | |
| "past_value", # 4 | |
| "seqlens_k", # 5 |
Comment on lines
+99
to
+101
| "output", # 0 | ||
| "present_key", # 1 | ||
| "present_value", # 2 |
Contributor
There was a problem hiding this comment.
Suggested change
| "output", # 0 | |
| "present_key", # 1 | |
| "present_value", # 2 | |
| "output", # 0 | |
| "present_key", # 1 | |
| "present_value", # 2 |
| @@ -0,0 +1,199 @@ | |||
| // Copyright (c) Microsoft Corporation. All rights reserved. | |||
| @@ -0,0 +1,1032 @@ | |||
| // Copyright (c) Microsoft Corporation. All rights reserved. | |||
| @@ -0,0 +1,148 @@ | |||
| # Copyright (c) Microsoft Corporation. All rights reserved. | |||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Description
WIP TurboQuant implementation drafted with Claude; it uses a Hadamard matrix for the rotation instead of a general rotation matrix, which deviates from the paper in this respect.
Early numbers
Without Turbo Quant
C:\onnxruntime-genai\examples\c>C:\onnxruntime-genai\examples\c\build\RelWithDebInfo\model_benchmark.exe -i "C:\models\phi4-onnx" -l 1024
Batch size: 1, prompt tokens: 1024, tokens to generate: 128
Prompt processing (time to first token):
avg (us): 572028
avg (tokens/s): 1790.12
p50 (us): 571558
stddev (us): 1542.95
n: 5 * 1024 token(s)
Token generation:
avg (us): 10015.9
avg (tokens/s): 99.8415
p50 (us): 9654.5
stddev (us): 3461.49
n: 635 * 1 token(s)
Token sampling:
avg (us): 35.9
avg (tokens/s): 27855.2
p50 (us): 38
stddev (us): 6.61135
n: 5 * 1 token(s)
E2E generation (entire generation loop):
avg (ms): 1844.11
p50 (ms): 1845.6
stddev (ms): 4.43472
n: 5
Peak working set size (bytes): 2098737152
With Turbo Quant
C:\onnxruntime-genai\examples\c>C:\onnxruntime-genai\examples\c\build\RelWithDebInfo\model_benchmark.exe -i "C:\models\phi4-onnx" -l 1024
Batch size: 1, prompt tokens: 1024, tokens to generate: 128
Prompt processing (time to first token):
avg (us): 589068
avg (tokens/s): 1738.34
p50 (us): 588342
stddev (us): 2264.58
n: 5 * 1024 token(s)
Token generation:
avg (us): 10817.2
avg (tokens/s): 92.4455
p50 (us): 10443
stddev (us): 3579.67
n: 635 * 1 token(s)
Token sampling:
avg (us): 37.1
avg (tokens/s): 26954.2
p50 (us): 38.8
stddev (us): 5.39954
n: 5 * 1 token(s)
E2E generation (entire generation loop):
avg (ms): 1962.92
p50 (ms): 1960.86
stddev (ms): 3.43245
n: 5
Peak working set size (bytes): 1856163840
Saves about 200 MB of working-set memory for a 1K-token prompt, but slows down both prompt processing (time to first token) and per-token generation.
The implementation passes needle-in-a-haystack and RULER-style retrieval tests, but shows clear quality degradation on simple prompts such as "Hi" or "write me a poem": the "Hi" response contained a spurious tool call, and the poem was noticeably more repetitive.
Next step: root-cause why model quality degrades.
Motivation and Context