diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3ea1326b4608..3bf3d34f4826 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -150,6 +150,8 @@ Optimizations
 * GITHUB#15160: Increased the size used for blocks of postings from 128 to 256.
   This gives a noticeable speedup to many queries. (Adrien Grand)
 
+* GITHUB#14863: Perform scoring for 4, 7, 8 bit quantized vectors off-heap. (Kaival Parikh)
+
 Bug Fixes
 ---------------------
 * GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
index a8eb1b945cee..4c8253fdab9f 100644
--- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java
@@ -54,11 +54,13 @@ static void compressBytes(byte[] raw, byte[] compressed) {
   private byte[] bytesA;
   private byte[] bytesB;
   private byte[] halfBytesA;
+  private byte[] halfBytesAPacked;
   private byte[] halfBytesB;
   private byte[] halfBytesBPacked;
   private float[] floatsA;
   private float[] floatsB;
-  private int expectedhalfByteDotProduct;
+  private int expectedHalfByteDotProduct;
+  private int expectedHalfByteSquareDistance;
 
   @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
   int size;
@@ -74,16 +76,23 @@ public void init() {
     random.nextBytes(bytesB);
     // random half byte arrays for binary methods
     // this means that all values must be between 0 and 15
-    expectedhalfByteDotProduct = 0;
+    expectedHalfByteDotProduct = 0;
+    expectedHalfByteSquareDistance = 0;
     halfBytesA = new byte[size];
     halfBytesB = new byte[size];
     for (int i = 0; i < size; ++i) {
       halfBytesA[i] = (byte) random.nextInt(16);
       halfBytesB[i] = (byte) random.nextInt(16);
-      expectedhalfByteDotProduct += halfBytesA[i] * halfBytesB[i];
+      expectedHalfByteDotProduct += halfBytesA[i] * halfBytesB[i];
+
+      int diff = halfBytesA[i] - halfBytesB[i];
+      expectedHalfByteSquareDistance += diff * diff;
     }
     // pack the half byte arrays
     if (size % 2 == 0) {
+      halfBytesAPacked = new byte[(size + 1) >> 1];
+      compressBytes(halfBytesA, halfBytesAPacked);
+
       halfBytesBPacked = new byte[(size + 1) >> 1];
       compressBytes(halfBytesB, halfBytesBPacked);
     }
@@ -108,6 +117,74 @@ public float binaryCosineVector() {
     return VectorUtil.cosine(bytesA, bytesB);
   }
 
+  @Benchmark
+  public int binarySquareScalar() {
+    return VectorUtil.squareDistance(bytesA, bytesB);
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int binarySquareVector() {
+    return VectorUtil.squareDistance(bytesA, bytesB);
+  }
+
+  @Benchmark
+  public int binaryHalfByteSquareScalar() {
+    int v = VectorUtil.int4SquareDistance(halfBytesA, halfBytesB);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int binaryHalfByteSquareVector() {
+    int v = VectorUtil.int4SquareDistance(halfBytesA, halfBytesB);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
+  @Benchmark
+  public int binaryHalfByteSquareSinglePackedScalar() {
+    int v = VectorUtil.int4SquareDistanceSinglePacked(halfBytesA, halfBytesBPacked);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int binaryHalfByteSquareSinglePackedVector() {
+    int v = VectorUtil.int4SquareDistanceSinglePacked(halfBytesA, halfBytesBPacked);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
+  @Benchmark
+  public int binaryHalfByteSquareBothPackedScalar() {
+    int v = VectorUtil.int4SquareDistanceBothPacked(halfBytesAPacked, halfBytesBPacked);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
+  @Benchmark
+  @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
+  public int binaryHalfByteSquareBothPackedVector() {
+    int v = VectorUtil.int4SquareDistanceBothPacked(halfBytesAPacked, halfBytesBPacked);
+    if (v != expectedHalfByteSquareDistance) {
+      throw new RuntimeException("Expected " + expectedHalfByteSquareDistance + " but got " + v);
+    }
+    return v;
+  }
+
   @Benchmark
   public int binaryDotProductScalar() {
     return VectorUtil.dotProduct(bytesA, bytesB);
@@ -131,14 +208,22 @@ public int binaryDotProductUint8Vector() {
   }
 
   @Benchmark
-  public int binarySquareScalar() {
-    return VectorUtil.squareDistance(bytesA, bytesB);
+  public int binaryHalfByteDotProductScalar() {
+    int v = VectorUtil.int4DotProduct(halfBytesA, halfBytesB);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
+    }
+    return v;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
-  public int binarySquareVector() {
-    return VectorUtil.squareDistance(bytesA, bytesB);
+  public int binaryHalfByteDotProductVector() {
+    int v = VectorUtil.int4DotProduct(halfBytesA, halfBytesB);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
+    }
+    return v;
   }
 
   @Benchmark
@@ -153,37 +238,39 @@ public int binarySquareUint8Vector() {
   }
 
   @Benchmark
-  public int binaryHalfByteScalar() {
-    return VectorUtil.int4DotProduct(halfBytesA, halfBytesB);
+  public int binaryHalfByteDotProductSinglePackedScalar() {
+    int v = VectorUtil.int4DotProductSinglePacked(halfBytesA, halfBytesBPacked);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
+    }
+    return v;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
-  public int binaryHalfByteVector() {
-    return VectorUtil.int4DotProduct(halfBytesA, halfBytesB);
+  public int binaryHalfByteDotProductSinglePackedVector() {
+    int v = VectorUtil.int4DotProductSinglePacked(halfBytesA, halfBytesBPacked);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
+    }
+    return v;
   }
 
   @Benchmark
-  public int binaryHalfByteScalarPacked() {
-    if (size % 2 != 0) {
-      throw new RuntimeException("Size must be even for this benchmark");
-    }
-    int v = VectorUtil.int4DotProductPacked(halfBytesA, halfBytesBPacked);
-    if (v != expectedhalfByteDotProduct) {
-      throw new RuntimeException("Expected " + expectedhalfByteDotProduct + " but got " + v);
+  public int binaryHalfByteDotProductBothPackedScalar() {
+    int v = VectorUtil.int4DotProductBothPacked(halfBytesAPacked, halfBytesBPacked);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
     }
     return v;
   }
 
   @Benchmark
   @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
-  public int binaryHalfByteVectorPacked() {
-    if (size % 2 != 0) {
-      throw new RuntimeException("Size must be even for this benchmark");
-    }
-    int v = VectorUtil.int4DotProductPacked(halfBytesA, halfBytesBPacked);
-    if (v != expectedhalfByteDotProduct) {
-      throw new RuntimeException("Expected " + expectedhalfByteDotProduct + " but got " + v);
+  public int binaryHalfByteDotProductBothPackedVector() {
+    int v = VectorUtil.int4DotProductBothPacked(halfBytesAPacked, halfBytesBPacked);
+    if (v != expectedHalfByteDotProduct) {
+      throw new RuntimeException("Expected " + expectedHalfByteDotProduct + " but got " + v);
     }
     return v;
   }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java
index 808d7b3cc882..123c18e00c08 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/hnsw/FlatVectorScorerUtil.java
@@ -37,4 +37,9 @@ private FlatVectorScorerUtil() {}
   public static FlatVectorsScorer getLucene99FlatVectorsScorer() {
     return IMPL.getLucene99FlatVectorsScorer();
   }
+
+  /** Returns a FlatVectorsScorer for the Lucene99 scalar quantized vectors format. */
+  public static FlatVectorsScorer getLucene99ScalarQuantizedVectorsScorer() {
+    return IMPL.getLucene99ScalarQuantizedVectorsScorer();
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java
index 117521ddcc2a..80afaf5c685a 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorScorer.java
@@ -23,6 +23,7 @@
 import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
 import org.apache.lucene.index.KnnVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.util.FloatToFloatFunction;
 import org.apache.lucene.util.VectorUtil;
 import org.apache.lucene.util.hnsw.RandomVectorScorer;
 import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
@@ -245,7 +246,7 @@ public float score(int vectorOrdinal) throws IOException {
       values.getSlice().seek((long) vectorOrdinal * (values.getVectorByteLength() + Float.BYTES));
       values.getSlice().readBytes(compressedVector, 0, compressedVector.length);
       float vectorOffset = values.getScoreCorrectionConstant(vectorOrdinal);
-      int dotProduct = VectorUtil.int4DotProductPacked(targetBytes, compressedVector);
+      int dotProduct = VectorUtil.int4DotProductSinglePacked(targetBytes, compressedVector);
       // For the current implementation of scalar quantization, all dotproducts should
       // be >= 0;
       assert dotProduct >= 0;
@@ -301,11 +302,6 @@ public void setScoringOrdinal(int node) throws IOException {
     }
   }
 
-  @FunctionalInterface
-  private interface FloatToFloatFunction {
-    float apply(float f);
-  }
-
   private static final class ScalarQuantizedRandomVectorScorerSupplier
       implements RandomVectorScorerSupplier {
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java
index 0f339ecbe0a8..76c73980aef8 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsFormat.java
@@ -18,10 +18,10 @@
 package org.apache.lucene.codecs.lucene99;
 
 import java.io.IOException;
-import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
 import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
 import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
+import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
 import org.apache.lucene.codecs.hnsw.FlatVectorsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
@@ -68,7 +68,7 @@ public class Lucene99ScalarQuantizedVectorsFormat extends FlatVectorsFormat {
 
   final byte bits;
   final boolean compress;
-  final Lucene99ScalarQuantizedVectorScorer flatVectorScorer;
+  final FlatVectorsScorer flatVectorScorer;
 
   /** Constructs a format using default graph construction parameters */
   public Lucene99ScalarQuantizedVectorsFormat() {
@@ -115,8 +115,7 @@ public Lucene99ScalarQuantizedVectorsFormat(
     this.bits = (byte) bits;
     this.confidenceInterval = confidenceInterval;
     this.compress = compress;
-    this.flatVectorScorer =
-        new Lucene99ScalarQuantizedVectorScorer(DefaultFlatVectorScorer.INSTANCE);
+    this.flatVectorScorer = FlatVectorScorerUtil.getLucene99ScalarQuantizedVectorsScorer();
   }
 
   public static float calculateDefaultConfidenceInterval(int vectorDimension) {
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java
index 89c813a4b93b..7f08c673a7f1 100644
--- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java
+++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java
@@ -164,24 +164,35 @@ public int uint8DotProduct(byte[] a, byte[] b) {
   }
 
   @Override
-  public int int4DotProduct(byte[] a, boolean apacked, byte[] b, boolean bpacked) {
-    assert (apacked && bpacked) == false;
-    if (apacked || bpacked) {
-      byte[] packed = apacked ? a : b;
-      byte[] unpacked = apacked ? b : a;
-      int total = 0;
-      for (int i = 0; i < packed.length; i++) {
-        byte packedByte = packed[i];
-        byte unpacked1 = unpacked[i];
-        byte unpacked2 = unpacked[i + packed.length];
-        total += (packedByte & 0x0F) * unpacked2;
-        total += ((packedByte & 0xFF) >> 4) * unpacked1;
-      }
-      return total;
-    }
+  public int int4DotProduct(byte[] a, byte[] b) {
     return dotProduct(a, b);
   }
 
+  @Override
+  public int int4DotProductSinglePacked(byte[] unpacked, byte[] packed) {
+    int total = 0;
+    for (int i = 0; i < packed.length; i++) {
+      byte packedByte = packed[i];
+      byte unpacked1 = unpacked[i];
+      byte unpacked2 = unpacked[i + packed.length];
+      total += (packedByte & 0x0F) * unpacked2;
+      total += ((packedByte & 0xFF) >> 4) * unpacked1;
+    }
+    return total;
+  }
+
+  @Override
+  public int int4DotProductBothPacked(byte[] a, byte[] b) {
+    int total = 0;
+    for (int i = 0; i < a.length; i++) {
+      byte aByte = a[i];
+      byte bByte = b[i];
+      total += (aByte & 0x0F) * (bByte & 0x0F);
+      total += ((aByte & 0xFF) >> 4) * ((bByte & 0xFF) >> 4);
+    }
+    return total;
+  }
+
   @Override
   public float cosine(byte[] a, byte[] b) {
     // Note: this will not overflow if dim < 2^18, since max(byte * byte) = 2^14.
@@ -210,6 +221,42 @@ public int squareDistance(byte[] a, byte[] b) {
     return squareSum;
   }
 
+  @Override
+  public int int4SquareDistance(byte[] a, byte[] b) {
+    return squareDistance(a, b);
+  }
+
+  @Override
+  public int int4SquareDistanceSinglePacked(byte[] unpacked, byte[] packed) {
+    int total = 0;
+    for (int i = 0; i < packed.length; i++) {
+      byte packedByte = packed[i];
+      byte unpacked1 = unpacked[i];
+      byte unpacked2 = unpacked[i + packed.length];
+
+      int diff1 = (packedByte & 0x0F) - unpacked2;
+      int diff2 = ((packedByte & 0xFF) >> 4) - unpacked1;
+
+      total += diff1 * diff1 + diff2 * diff2;
+    }
+    return total;
+  }
+
+  @Override
+  public int int4SquareDistanceBothPacked(byte[] a, byte[] b) {
+    int total = 0;
+    for (int i = 0; i < a.length; i++) {
+      byte aByte = a[i];
+      byte bByte = b[i];
+
+      int diff1 = (aByte & 0x0F) - (bByte & 0x0F);
+      int diff2 = ((aByte & 0xFF) >> 4) - ((bByte & 0xFF) >> 4);
+
+      total += diff1 * diff1 + diff2 * diff2;
+    }
+    return total;
+  }
+
   @Override
   public int uint8SquareDistance(byte[] a, byte[] b) {
     // Note: this will not overflow if dim < 2^16, since max(ubyte * ubyte) = 2^16.
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java
index c5e9301e9bc4..21977fa3dc77 100644
--- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java
+++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorizationProvider.java
@@ -19,6 +19,7 @@
 
 import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
 import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
+import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorScorer;
 import org.apache.lucene.store.IndexInput;
 
 /** Default provider returning scalar implementations. */
@@ -40,6 +41,11 @@ public FlatVectorsScorer getLucene99FlatVectorsScorer() {
     return DefaultFlatVectorScorer.INSTANCE;
   }
 
+  @Override
+  public FlatVectorsScorer getLucene99ScalarQuantizedVectorsScorer() {
+    return new Lucene99ScalarQuantizedVectorScorer(DefaultFlatVectorScorer.INSTANCE);
+  }
+
   @Override
   public PostingDecodingUtil newPostingDecodingUtil(IndexInput input) {
     return new PostingDecodingUtil(input);
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java
index 7190f983b4ce..7242a2501a19 100644
--- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java
+++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java
@@ -36,18 +36,40 @@ public interface VectorUtilSupport {
   /** Returns the dot product computed over signed bytes. */
   int dotProduct(byte[] a, byte[] b);
 
+  /** Returns the dot product computed over unsigned half-bytes, both uncompressed. */
+  int int4DotProduct(byte[] a, byte[] b);
+
+  /** Returns the dot product computed over unsigned half-bytes, one compressed. */
+  int int4DotProductSinglePacked(byte[] unpacked, byte[] packed);
+
+  /** Returns the dot product computed over unsigned half-bytes, both compressed. */
+  int int4DotProductBothPacked(byte[] a, byte[] b);
+
   /** Returns the dot product computed as though the bytes were unsigned. */
   int uint8DotProduct(byte[] a, byte[] b);
 
-  /** Returns the dot product over the computed bytes, assuming the values are int4 encoded. */
-  int int4DotProduct(byte[] a, boolean apacked, byte[] b, boolean bpacked);
-
   /** Returns the cosine similarity between the two byte vectors. */
   float cosine(byte[] a, byte[] b);
 
   /** Returns the sum of squared differences of the two byte vectors. */
   int squareDistance(byte[] a, byte[] b);
 
+  /**
+   * Returns the sum of squared differences between two unsigned half-byte vectors, both
+   * uncompressed.
+   */
+  int int4SquareDistance(byte[] a, byte[] b);
+
+  /**
+   * Returns the sum of squared differences between two unsigned half-byte vectors, one compressed.
+   */
+  int int4SquareDistanceSinglePacked(byte[] unpacked, byte[] packed);
+
+  /**
+   * Returns the sum of squared differences between two unsigned half-byte vectors, both compressed.
+   */
+  int int4SquareDistanceBothPacked(byte[] a, byte[] b);
+
   /** Returns the sum of squared differences of the two unsigned byte vectors. */
   int uint8SquareDistance(byte[] a, byte[] b);
 
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java
index 24864318af5a..cf9c56c59774 100644
--- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java
+++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java
@@ -109,6 +109,9 @@ public static VectorizationProvider getInstance() {
   /** Returns a FlatVectorsScorer that supports the Lucene99 format. */
   public abstract FlatVectorsScorer getLucene99FlatVectorsScorer();
 
+  /** Returns a FlatVectorsScorer that supports the Lucene99 scalar quantized format. */
+  public abstract FlatVectorsScorer getLucene99ScalarQuantizedVectorsScorer();
+
   /** Create a new {@link PostingDecodingUtil} for the given {@link IndexInput}. */
   public abstract PostingDecodingUtil newPostingDecodingUtil(IndexInput input) throws IOException;
 
diff --git a/lucene/core/src/java/org/apache/lucene/util/FloatToFloatFunction.java b/lucene/core/src/java/org/apache/lucene/util/FloatToFloatFunction.java
new file mode 100644
index 000000000000..9068a5438361
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/FloatToFloatFunction.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util;
+
+/**
+ * Simple interface to map one float to another (useful in scaling scores).
+ *
+ * @lucene.internal
+ */
+@FunctionalInterface
+public interface FloatToFloatFunction {
+  float apply(float f);
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
index 38b9cf6d67a5..db1f6fee083b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
@@ -113,6 +113,37 @@ public static int squareDistance(byte[] a, byte[] b) {
     return IMPL.squareDistance(a, b);
   }
 
+  /** Returns the sum of squared differences between two uint4 (values between [0,15]) vectors. */
+  public static int int4SquareDistance(byte[] a, byte[] b) {
+    if (a.length != b.length) {
+      throw new IllegalArgumentException("vector dimensions differ: " + a.length + "!=" + b.length);
+    }
+    return IMPL.int4SquareDistance(a, b);
+  }
+
+  /**
+   * Returns the sum of squared differences between two uint4 (values between [0,15]) vectors. The
+   * second vector is considered "packed" (i.e. every byte representing two values).
+   */
+  public static int int4SquareDistanceSinglePacked(byte[] unpacked, byte[] packed) {
+    if (packed.length != ((unpacked.length + 1) >> 1)) {
+      throw new IllegalArgumentException(
+          "vector dimensions differ: " + unpacked.length + "!= 2 * " + packed.length);
+    }
+    return IMPL.int4SquareDistanceSinglePacked(unpacked, packed);
+  }
+
+  /**
+   * Returns the sum of squared differences between two uint4 (values between [0,15]) vectors. Both
+   * vectors are considered "packed" (i.e. every byte representing two values).
+   */
+  public static int int4SquareDistanceBothPacked(byte[] a, byte[] b) {
+    if (a.length != b.length) {
+      throw new IllegalArgumentException("vector dimensions differ: " + a.length + "!=" + b.length);
+    }
+    return IMPL.int4SquareDistanceBothPacked(a, b);
+  }
+
   /** Returns the sum of squared differences of the two vectors where each byte is unsigned */
   public static int uint8SquareDistance(byte[] a, byte[] b) {
     if (a.length != b.length) {
@@ -189,15 +220,22 @@ public static int uint8DotProduct(byte[] a, byte[] b) {
     return IMPL.uint8DotProduct(a, b);
   }
 
+  /**
+   * Dot product computed over uint4 (values between [0,15]) bytes.
+   *
+   * @param a bytes containing a vector
+   * @param b bytes containing another vector, of the same dimension
+   * @return the value of the dot product of the two vectors
+   */
   public static int int4DotProduct(byte[] a, byte[] b) {
     if (a.length != b.length) {
       throw new IllegalArgumentException("vector dimensions differ: " + a.length + "!=" + b.length);
     }
-    return IMPL.int4DotProduct(a, false, b, false);
+    return IMPL.int4DotProduct(a, b);
   }
 
   /**
-   * Dot product computed over int4 (values between [0,15]) bytes. The second vector is considered
+   * Dot product computed over uint4 (values between [0,15]) bytes. The second vector is considered
    * "packed" (i.e. every byte representing two values). The following packing is assumed:
    *
    * <pre class="prettyprint lang-java">
@@ -211,12 +249,28 @@ public static int int4DotProduct(byte[] a, byte[] b) {
    * @param packed the packed vector, of length {@code (unpacked.length + 1) / 2}
    * @return the value of the dot product of the two vectors
    */
-  public static int int4DotProductPacked(byte[] unpacked, byte[] packed) {
+  public static int int4DotProductSinglePacked(byte[] unpacked, byte[] packed) {
     if (packed.length != ((unpacked.length + 1) >> 1)) {
       throw new IllegalArgumentException(
           "vector dimensions differ: " + unpacked.length + "!= 2 * " + packed.length);
     }
-    return IMPL.int4DotProduct(unpacked, false, packed, true);
+    return IMPL.int4DotProductSinglePacked(unpacked, packed);
+  }
+
+  /**
+   * Dot product computed over uint4 (values between [0,15]) bytes. Both vectors are considered
+   * "packed" (i.e. every byte representing two values).
+   *
+   * @param a bytes containing a packed vector
+   * @param b bytes containing another packed vector, of the same dimension
+   * @return the value of the dot product of the two vectors
+   */
+  public static int int4DotProductBothPacked(byte[] a, byte[] b) {
+    if (a.length != b.length) {
+      throw new IllegalArgumentException(
+          "vector dimensions differ: " + a.length + " != " + b.length);
+    }
+    return IMPL.int4DotProductBothPacked(a, b);
   }
 
   /**
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentScalarQuantizedVectorScorer.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentScalarQuantizedVectorScorer.java
new file mode 100644
index 000000000000..12b95f6c2ff2
--- /dev/null
+++ b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/Lucene99MemorySegmentScalarQuantizedVectorScorer.java
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.internal.vectorization;
+
+import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED;
+import static org.apache.lucene.codecs.hnsw.ScalarQuantizedVectorScorer.quantizeQuery;
+
+import java.io.IOException;
+import java.lang.foreign.MemorySegment;
+import java.lang.foreign.ValueLayout;
+import java.nio.ByteOrder;
+import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer;
+import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
+import org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorScorer;
+import org.apache.lucene.index.KnnVectorValues;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.store.MemorySegmentAccessInput;
+import org.apache.lucene.util.FloatToFloatFunction;
+import org.apache.lucene.util.VectorUtil;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+import org.apache.lucene.util.hnsw.UpdateableRandomVectorScorer;
+import org.apache.lucene.util.quantization.QuantizedByteVectorValues;
+import org.apache.lucene.util.quantization.ScalarQuantizer;
+
+/**
+ * Scores scalar-quantized vectors by reading them straight out of the {@link MemorySegment}s of a
+ * {@link MemorySegmentAccessInput}. Vector values that are not quantized, or whose slice is not a
+ * {@link MemorySegmentAccessInput}, are handed to a {@link Lucene99ScalarQuantizedVectorScorer}.
+ */
+class Lucene99MemorySegmentScalarQuantizedVectorScorer implements FlatVectorsScorer {
+  static final Lucene99MemorySegmentScalarQuantizedVectorScorer INSTANCE =
+      new Lucene99MemorySegmentScalarQuantizedVectorScorer();
+
+  private static final FlatVectorsScorer DELEGATE =
+      new Lucene99ScalarQuantizedVectorScorer(DefaultFlatVectorScorer.INSTANCE);
+
+  private Lucene99MemorySegmentScalarQuantizedVectorScorer() {}
+
+  @Override
+  public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues)
+      throws IOException {
+    if (vectorValues instanceof QuantizedByteVectorValues quantized
+        && quantized.getSlice() instanceof MemorySegmentAccessInput input) {
+      return new RandomVectorScorerSupplierImpl(similarityFunction, quantized, input);
+    }
+    return DELEGATE.getRandomVectorScorerSupplier(similarityFunction, vectorValues);
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target)
+      throws IOException {
+    if (vectorValues instanceof QuantizedByteVectorValues quantized
+        && quantized.getSlice() instanceof MemorySegmentAccessInput input) {
+      return new RandomVectorScorerImpl(similarityFunction, quantized, input, target);
+    }
+    return DELEGATE.getRandomVectorScorer(similarityFunction, vectorValues, target);
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target)
+      throws IOException {
+    return DELEGATE.getRandomVectorScorer(similarityFunction, vectorValues, target);
+  }
+
+  @Override
+  public String toString() {
+    return "Lucene99MemorySegmentScalarQuantizedVectorScorer()";
+  }
+
+  /**
+   * Shared scoring logic. Each stored node is read as {@code vectorByteSize} vector bytes followed
+   * by a little-endian float offset; subclasses supply the query side of every kernel.
+   */
+  private abstract static class RandomVectorScorerBase
+      extends RandomVectorScorer.AbstractRandomVectorScorer {
+
+    private final ScalarQuantizer quantizer;
+    private final float constMultiplier;
+    private final MemorySegmentAccessInput input;
+    private final int vectorByteSize;
+    private final int nodeSize;
+    private final Scorer scorer;
+    private final FloatToFloatFunction scaler;
+    // lazily allocated buffer for nodes that cannot be sliced out of the input; reused by getNode
+    private byte[] scratch;
+
+    RandomVectorScorerBase(
+        VectorSimilarityFunction similarityFunction,
+        QuantizedByteVectorValues values,
+        MemorySegmentAccessInput input) {
+      super(values);
+
+      this.quantizer = values.getScalarQuantizer();
+      this.constMultiplier = this.quantizer.getConstantMultiplier();
+      this.input = input;
+      this.vectorByteSize = values.getVectorByteLength();
+      this.nodeSize = this.vectorByteSize + Float.BYTES;
+
+      // a byte length that differs from the dimension selects the "compressed" (packed) kernels
+      this.scorer =
+          switch (similarityFunction) {
+            case EUCLIDEAN -> {
+              if (this.quantizer.getBits() <= 4) {
+                if (this.vectorByteSize != values.dimension()) {
+                  yield this::compressedInt4Euclidean;
+                }
+                yield this::int4Euclidean;
+              }
+              yield this::euclidean;
+            }
+            case DOT_PRODUCT, COSINE, MAXIMUM_INNER_PRODUCT -> {
+              if (this.quantizer.getBits() <= 4) {
+                if (this.vectorByteSize != values.dimension()) {
+                  yield this::compressedInt4DotProduct;
+                }
+                yield this::int4DotProduct;
+              }
+              yield this::dotProduct;
+            }
+          };
+
+      this.scaler =
+          switch (similarityFunction) {
+            case EUCLIDEAN -> VectorUtil::normalizeDistanceToUnitInterval;
+            case DOT_PRODUCT, COSINE -> VectorUtil::normalizeToUnitInterval;
+            case MAXIMUM_INNER_PRODUCT -> VectorUtil::scaleMaxInnerProductScore;
+          };
+
+      checkInvariants();
+    }
+
+    /** Rejects an input that is too short to hold {@code maxOrd()} nodes. */
+    final void checkInvariants() {
+      if (input.length() < (long) nodeSize * maxOrd()) {
+        throw new IllegalArgumentException("input length is less than expected vector data");
+      }
+    }
+
+    final void checkOrdinal(int ord) {
+      if (ord < 0 || ord >= maxOrd()) {
+        throw new IllegalArgumentException("illegal ordinal: " + ord);
+      }
+    }
+
+    ScalarQuantizer getQuantizer() {
+      return quantizer;
+    }
+
+    private static final ValueLayout.OfInt INT_UNALIGNED_LE =
+        JAVA_INT_UNALIGNED.withOrder(ByteOrder.LITTLE_ENDIAN);
+
+    /**
+     * Returns the vector bytes and the float offset stored for {@code ord}. A node that cannot be
+     * sliced out of the input is read into {@code scratch} instead, so the returned vector may be
+     * overwritten by the next call.
+     */
+    Node getNode(int ord) throws IOException {
+      checkOrdinal(ord);
+      long byteOffset = (long) ord * nodeSize;
+      MemorySegment node = input.segmentSliceOrNull(byteOffset, nodeSize);
+      if (node == null) {
+        if (scratch == null) {
+          scratch = new byte[nodeSize];
+        }
+        input.readBytes(byteOffset, scratch, 0, nodeSize);
+        node = MemorySegment.ofArray(scratch);
+      }
+      // asSlice accepts heap and native segments alike; reinterpret rejects heap segments
+      return new Node(
+          node.asSlice(0, vectorByteSize),
+          Float.intBitsToFloat(node.get(INT_UNALIGNED_LE, vectorByteSize)));
+    }
+
+    /** Scores the node at {@code ord}, applying the offset corrections and the final scaling. */
+    float scoreBody(int ord, float queryOffset) throws IOException {
+      Node node = getNode(ord); // getNode validates ord
+      return scaler.apply(scorer.score(node.vector) * constMultiplier + node.offset + queryOffset);
+    }
+
+    // raw integer kernels between the current query and a stored vector
+    abstract int euclidean(MemorySegment doc);
+
+    abstract int int4Euclidean(MemorySegment doc);
+
+    abstract int compressedInt4Euclidean(MemorySegment doc);
+
+    abstract int dotProduct(MemorySegment doc);
+
+    abstract int int4DotProduct(MemorySegment doc);
+
+    abstract int compressedInt4DotProduct(MemorySegment doc);
+
+    record Node(MemorySegment vector, float offset) {}
+
+    @FunctionalInterface
+    private interface Scorer {
+      int score(MemorySegment doc) throws IOException;
+    }
+  }
+
+  /** Scores stored vectors against a float query that is quantized once at construction. */
+  private static class RandomVectorScorerImpl extends RandomVectorScorerBase {
+    private final byte[] targetBytes;
+    private final float queryOffset;
+
+    RandomVectorScorerImpl(
+        VectorSimilarityFunction similarityFunction,
+        QuantizedByteVectorValues values,
+        MemorySegmentAccessInput input,
+        float[] target) {
+      super(similarityFunction, values, input);
+      this.targetBytes = new byte[target.length];
+      this.queryOffset = quantizeQuery(target, targetBytes, similarityFunction, getQuantizer());
+    }
+
+    @Override
+    public float score(int node) throws IOException {
+      return scoreBody(node, queryOffset);
+    }
+
+    @Override
+    int euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.uint8SquareDistance(targetBytes, doc);
+    }
+
+    @Override
+    int int4Euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4SquareDistance(targetBytes, doc);
+    }
+
+    @Override
+    int compressedInt4Euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4SquareDistanceSinglePacked(targetBytes, doc);
+    }
+
+    @Override
+    int dotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.uint8DotProduct(targetBytes, doc);
+    }
+
+    @Override
+    int int4DotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4DotProduct(targetBytes, doc);
+    }
+
+    @Override
+    int compressedInt4DotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4DotProductSinglePacked(targetBytes, doc);
+    }
+  }
+
+  /** Creates scorers whose query is itself one of the stored vectors. */
+  private record RandomVectorScorerSupplierImpl(
+      VectorSimilarityFunction similarityFunction,
+      QuantizedByteVectorValues values,
+      MemorySegmentAccessInput input)
+      implements RandomVectorScorerSupplier {
+
+    @Override
+    public UpdateableRandomVectorScorer scorer() {
+      return new UpdateableRandomVectorScorerImpl(similarityFunction, values, input);
+    }
+
+    @Override
+    public RandomVectorScorerSupplier copy() {
+      return new RandomVectorScorerSupplierImpl(similarityFunction, values, input);
+    }
+  }
+
+  /** Scores stored vectors against the one selected through {@code setScoringOrdinal}. */
+  private static class UpdateableRandomVectorScorerImpl extends RandomVectorScorerBase
+      implements UpdateableRandomVectorScorer {
+    private MemorySegment query;
+    private float queryOffset;
+
+    UpdateableRandomVectorScorerImpl(
+        VectorSimilarityFunction similarityFunction,
+        QuantizedByteVectorValues values,
+        MemorySegmentAccessInput input) {
+      super(similarityFunction, values, input);
+    }
+
+    @Override
+    public void setScoringOrdinal(int ord) throws IOException {
+      Node node = getNode(ord); // getNode validates ord
+      MemorySegment vector = node.vector;
+      if (vector.isNative() == false) {
+        // A heap segment is a view of the scratch buffer, which the getNode call in scoreBody
+        // refills; copy it so the query is not replaced by the document being scored.
+        vector = MemorySegment.ofArray(vector.toArray(ValueLayout.JAVA_BYTE));
+      }
+      query = vector;
+      queryOffset = node.offset;
+    }
+
+    @Override
+    public float score(int node) throws IOException {
+      return scoreBody(node, queryOffset);
+    }
+
+    @Override
+    int euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.uint8SquareDistance(query, doc);
+    }
+
+    @Override
+    int int4Euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4SquareDistance(query, doc);
+    }
+
+    @Override
+    int compressedInt4Euclidean(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4SquareDistanceBothPacked(query, doc);
+    }
+
+    @Override
+    int dotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.uint8DotProduct(query, doc);
+    }
+
+    @Override
+    int int4DotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4DotProduct(query, doc);
+    }
+
+    @Override
+    int compressedInt4DotProduct(MemorySegment doc) {
+      return PanamaVectorUtilSupport.int4DotProductBothPacked(query, doc);
+    }
+  }
+}
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
index a77c4846ca2a..ba612f750040 100644
--- a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
+++ b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
@@ -360,7 +360,7 @@ public byte tail(int index) {
 
   @Override
   public int dotProduct(byte[] a, byte[] b) {
-    return dotProductBody(new ArrayLoader(a), new ArrayLoader(b));
+    return dotProductBody(new ArrayLoader(a), new ArrayLoader(b), true);
   }
 
   @Override
@@ -369,15 +369,19 @@ public int uint8DotProduct(byte[] a, byte[] b) {
   }
 
   public static int dotProduct(byte[] a, MemorySegment b) {
-    return dotProductBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+    return dotProductBody(new ArrayLoader(a), new MemorySegmentLoader(b), true);
   }
 
   public static int dotProduct(MemorySegment a, MemorySegment b) {
-    return dotProductBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+    return dotProductBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b), true);
   }
 
-  private static int dotProductBody(ByteVectorLoader a, ByteVectorLoader b) {
-    return dotProductBody(a, b, true);
+  public static int uint8DotProduct(byte[] a, MemorySegment b) {
+    return dotProductBody(new ArrayLoader(a), new MemorySegmentLoader(b), false);
+  }
+
+  public static int uint8DotProduct(MemorySegment a, MemorySegment b) {
+    return dotProductBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b), false);
   }
 
   private static int dotProductBody(ByteVectorLoader a, ByteVectorLoader b, boolean signed) {
@@ -479,178 +483,198 @@ private static int dotProductBody128(
     return acc.reduceLanes(ADD);
   }
 
+  private static class Int4Constants {
+    static final VectorSpecies<Byte> BYTE_SPECIES;
+    static final VectorSpecies<Short> SHORT_SPECIES;
+    static final int CHUNK;
+
+    static {
+      if (VECTOR_BITSIZE >= 512) {
+        BYTE_SPECIES = ByteVector.SPECIES_256;
+        SHORT_SPECIES = ShortVector.SPECIES_512;
+        CHUNK = 4096;
+      } else if (VECTOR_BITSIZE == 256) {
+        BYTE_SPECIES = ByteVector.SPECIES_128;
+        SHORT_SPECIES = ShortVector.SPECIES_256;
+        CHUNK = 2048;
+      } else {
+        BYTE_SPECIES = ByteVector.SPECIES_64;
+        SHORT_SPECIES = ShortVector.SPECIES_128;
+        CHUNK = 1024;
+      }
+    }
+  }
+
   @Override
-  public int int4DotProduct(byte[] a, boolean apacked, byte[] b, boolean bpacked) {
-    assert (apacked && bpacked) == false;
+  public int int4DotProduct(byte[] a, byte[] b) {
+    return int4DotProductBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
+
+  public static int int4DotProduct(byte[] a, MemorySegment b) {
+    return int4DotProductBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+  }
+
+  public static int int4DotProduct(MemorySegment a, MemorySegment b) {
+    return int4DotProductBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int int4DotProductBody(ByteVectorLoader a, ByteVectorLoader b) {
     int i = 0;
     int res = 0;
-    if (apacked || bpacked) {
-      byte[] packed = apacked ? a : b;
-      byte[] unpacked = apacked ? b : a;
-      if (packed.length >= 32) {
-        if (VECTOR_BITSIZE >= 512) {
-          i += ByteVector.SPECIES_256.loopBound(packed.length);
-          res += dotProductBody512Int4Packed(unpacked, packed, i);
-        } else if (VECTOR_BITSIZE == 256) {
-          i += ByteVector.SPECIES_128.loopBound(packed.length);
-          res += dotProductBody256Int4Packed(unpacked, packed, i);
-        } else {
-          i += ByteVector.SPECIES_64.loopBound(packed.length);
-          res += dotProductBody128Int4Packed(unpacked, packed, i);
-        }
-      }
-      // scalar tail
-      for (; i < packed.length; i++) {
-        byte packedByte = packed[i];
-        byte unpacked1 = unpacked[i];
-        byte unpacked2 = unpacked[i + packed.length];
-        res += (packedByte & 0x0F) * unpacked2;
-        res += ((packedByte & 0xFF) >> 4) * unpacked1;
-      }
-    } else {
-      if (VECTOR_BITSIZE >= 512 || VECTOR_BITSIZE == 256) {
-        return dotProduct(a, b);
-      } else if (a.length >= 32) {
-        i += ByteVector.SPECIES_128.loopBound(a.length);
-        res += int4DotProductBody128(a, b, i);
-      }
-      // scalar tail
-      for (; i < a.length; i++) {
-        res += b[i] * a[i];
-      }
+    if (a.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(a.length());
+      res += int4DotProductBody(a, b, i);
+    }
+    // scalar tail
+    for (; i < a.length(); i++) {
+      res += a.tail(i) * b.tail(i);
     }
-
     return res;
   }
 
-  private int dotProductBody512Int4Packed(byte[] unpacked, byte[] packed, int limit) {
+  private static int int4DotProductBody(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     int sum = 0;
-    // iterate in chunks of 1024 items to ensure we don't overflow the short accumulator
-    for (int i = 0; i < limit; i += 4096) {
-      ShortVector acc0 = ShortVector.zero(ShortVector.SPECIES_512);
-      ShortVector acc1 = ShortVector.zero(ShortVector.SPECIES_512);
-      int innerLimit = Math.min(limit - i, 4096);
-      for (int j = 0; j < innerLimit; j += ByteVector.SPECIES_256.length()) {
-        // packed
-        var vb8 = ByteVector.fromArray(ByteVector.SPECIES_256, packed, i + j);
+    // iterate in chunks to ensure we don't overflow the short accumulator
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
         // unpacked
-        var va8 = ByteVector.fromArray(ByteVector.SPECIES_256, unpacked, i + j + packed.length);
+        ByteVector vb8 = b.load(Int4Constants.BYTE_SPECIES, i + j);
+        Vector<Short> vb16 = vb8.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
 
-        // upper
-        ByteVector prod8 = vb8.and((byte) 0x0F).mul(va8);
-        Vector<Short> prod16 = prod8.convertShape(ZERO_EXTEND_B2S, ShortVector.SPECIES_512, 0);
-        acc0 = acc0.add(prod16);
+        // unpacked
+        ByteVector va8 = a.load(Int4Constants.BYTE_SPECIES, i + j);
+        Vector<Short> va16 = va8.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
 
-        // lower
-        ByteVector vc8 = ByteVector.fromArray(ByteVector.SPECIES_256, unpacked, i + j);
-        ByteVector prod8a = vb8.lanewise(LSHR, 4).mul(vc8);
-        Vector<Short> prod16a = prod8a.convertShape(ZERO_EXTEND_B2S, ShortVector.SPECIES_512, 0);
-        acc1 = acc1.add(prod16a);
+        acc = acc.add(vb16.mul(va16));
       }
-      IntVector intAcc0 = acc0.convertShape(S2I, IntVector.SPECIES_512, 0).reinterpretAsInts();
-      IntVector intAcc1 = acc0.convertShape(S2I, IntVector.SPECIES_512, 1).reinterpretAsInts();
-      IntVector intAcc2 = acc1.convertShape(S2I, IntVector.SPECIES_512, 0).reinterpretAsInts();
-      IntVector intAcc3 = acc1.convertShape(S2I, IntVector.SPECIES_512, 1).reinterpretAsInts();
-      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reduceLanes(ADD);
+      Vector<Integer> intAcc0 = acc.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).reinterpretAsInts().reduceLanes(ADD);
     }
     return sum;
   }
 
-  private int dotProductBody256Int4Packed(byte[] unpacked, byte[] packed, int limit) {
+  @Override
+  public int int4DotProductSinglePacked(byte[] unpacked, byte[] packed) {
+    return int4DotProductSinglePackedBody(new ArrayLoader(unpacked), new ArrayLoader(packed));
+  }
+
+  public static int int4DotProductSinglePacked(byte[] unpacked, MemorySegment packed) {
+    return int4DotProductSinglePackedBody(
+        new ArrayLoader(unpacked), new MemorySegmentLoader(packed));
+  }
+
+  private static int int4DotProductSinglePackedBody(
+      ByteVectorLoader unpacked, ByteVectorLoader packed) {
+    int i = 0;
+    int res = 0;
+    if (packed.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(packed.length());
+      res += int4DotProductSinglePackedBody(unpacked, packed, i);
+    }
+    // scalar tail
+    for (; i < packed.length(); i++) {
+      byte packedByte = packed.tail(i);
+      byte unpacked1 = unpacked.tail(i);
+      byte unpacked2 = unpacked.tail(i + packed.length());
+      res += (packedByte & 0x0F) * unpacked2;
+      res += ((packedByte & 0xFF) >> 4) * unpacked1;
+    }
+    return res;
+  }
+
+  private static int int4DotProductSinglePackedBody(
+      ByteVectorLoader unpacked, ByteVectorLoader packed, int limit) {
     int sum = 0;
-    // iterate in chunks of 1024 items to ensure we don't overflow the short accumulator
-    for (int i = 0; i < limit; i += 2048) {
-      ShortVector acc0 = ShortVector.zero(ShortVector.SPECIES_256);
-      ShortVector acc1 = ShortVector.zero(ShortVector.SPECIES_256);
-      int innerLimit = Math.min(limit - i, 2048);
-      for (int j = 0; j < innerLimit; j += ByteVector.SPECIES_128.length()) {
+    // iterate in chunks to ensure we don't overflow the short accumulator
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc0 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      ShortVector acc1 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
         // packed
-        var vb8 = ByteVector.fromArray(ByteVector.SPECIES_128, packed, i + j);
-        // unpacked
-        var va8 = ByteVector.fromArray(ByteVector.SPECIES_128, unpacked, i + j + packed.length);
+        ByteVector vb8 = packed.load(Int4Constants.BYTE_SPECIES, i + j);
 
         // upper
+        ByteVector va8 = unpacked.load(Int4Constants.BYTE_SPECIES, i + j + packed.length());
         ByteVector prod8 = vb8.and((byte) 0x0F).mul(va8);
-        Vector<Short> prod16 = prod8.convertShape(ZERO_EXTEND_B2S, ShortVector.SPECIES_256, 0);
+        Vector<Short> prod16 = prod8.convertShape(ZERO_EXTEND_B2S, Int4Constants.SHORT_SPECIES, 0);
         acc0 = acc0.add(prod16);
 
         // lower
-        ByteVector vc8 = ByteVector.fromArray(ByteVector.SPECIES_128, unpacked, i + j);
+        ByteVector vc8 = unpacked.load(Int4Constants.BYTE_SPECIES, i + j);
         ByteVector prod8a = vb8.lanewise(LSHR, 4).mul(vc8);
-        Vector<Short> prod16a = prod8a.convertShape(ZERO_EXTEND_B2S, ShortVector.SPECIES_256, 0);
+        Vector<Short> prod16a =
+            prod8a.convertShape(ZERO_EXTEND_B2S, Int4Constants.SHORT_SPECIES, 0);
         acc1 = acc1.add(prod16a);
       }
-      IntVector intAcc0 = acc0.convertShape(S2I, IntVector.SPECIES_256, 0).reinterpretAsInts();
-      IntVector intAcc1 = acc0.convertShape(S2I, IntVector.SPECIES_256, 1).reinterpretAsInts();
-      IntVector intAcc2 = acc1.convertShape(S2I, IntVector.SPECIES_256, 0).reinterpretAsInts();
-      IntVector intAcc3 = acc1.convertShape(S2I, IntVector.SPECIES_256, 1).reinterpretAsInts();
-      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reduceLanes(ADD);
+      Vector<Integer> intAcc0 = acc0.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc0.convert(S2I, 1);
+      Vector<Integer> intAcc2 = acc1.convert(S2I, 0);
+      Vector<Integer> intAcc3 = acc1.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reinterpretAsInts().reduceLanes(ADD);
     }
     return sum;
   }
 
-  /** vectorized dot product body (128 bit vectors) */
-  private int dotProductBody128Int4Packed(byte[] unpacked, byte[] packed, int limit) {
-    int sum = 0;
-    // iterate in chunks of 1024 items to ensure we don't overflow the short accumulator
-    for (int i = 0; i < limit; i += 1024) {
-      ShortVector acc0 = ShortVector.zero(ShortVector.SPECIES_128);
-      ShortVector acc1 = ShortVector.zero(ShortVector.SPECIES_128);
-      int innerLimit = Math.min(limit - i, 1024);
-      for (int j = 0; j < innerLimit; j += ByteVector.SPECIES_64.length()) {
-        // packed
-        ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, packed, i + j);
-        // unpacked
-        ByteVector va8 =
-            ByteVector.fromArray(ByteVector.SPECIES_64, unpacked, i + j + packed.length);
+  @Override
+  public int int4DotProductBothPacked(byte[] a, byte[] b) {
+    return int4DotProductBothPackedBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
 
-        // upper
-        ByteVector prod8 = vb8.and((byte) 0x0F).mul(va8);
-        ShortVector prod16 =
-            prod8.convertShape(B2S, ShortVector.SPECIES_128, 0).reinterpretAsShorts();
-        acc0 = acc0.add(prod16.and((short) 0xFF));
+  public static int int4DotProductBothPacked(MemorySegment a, MemorySegment b) {
+    return int4DotProductBothPackedBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
 
-        // lower
-        va8 = ByteVector.fromArray(ByteVector.SPECIES_64, unpacked, i + j);
-        prod8 = vb8.lanewise(LSHR, 4).mul(va8);
-        prod16 = prod8.convertShape(B2S, ShortVector.SPECIES_128, 0).reinterpretAsShorts();
-        acc1 = acc1.add(prod16.and((short) 0xFF));
-      }
-      IntVector intAcc0 = acc0.convertShape(S2I, IntVector.SPECIES_128, 0).reinterpretAsInts();
-      IntVector intAcc1 = acc0.convertShape(S2I, IntVector.SPECIES_128, 1).reinterpretAsInts();
-      IntVector intAcc2 = acc1.convertShape(S2I, IntVector.SPECIES_128, 0).reinterpretAsInts();
-      IntVector intAcc3 = acc1.convertShape(S2I, IntVector.SPECIES_128, 1).reinterpretAsInts();
-      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reduceLanes(ADD);
+  private static int int4DotProductBothPackedBody(ByteVectorLoader a, ByteVectorLoader b) {
+    int i = 0;
+    int res = 0;
+    if (a.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(a.length());
+      res += int4DotProductBothPackedBody(a, b, i);
     }
-    return sum;
+    // scalar tail
+    for (; i < a.length(); i++) {
+      byte aByte = a.tail(i);
+      byte bByte = b.tail(i);
+      res += (aByte & 0x0F) * (bByte & 0x0F);
+      res += ((aByte & 0xFF) >> 4) * ((bByte & 0xFF) >> 4);
+    }
+    return res;
   }
 
-  private int int4DotProductBody128(byte[] a, byte[] b, int limit) {
+  private static int int4DotProductBothPackedBody(
+      ByteVectorLoader a, ByteVectorLoader b, int limit) {
     int sum = 0;
-    // iterate in chunks of 1024 items to ensure we don't overflow the short accumulator
-    for (int i = 0; i < limit; i += 1024) {
-      ShortVector acc0 = ShortVector.zero(ShortVector.SPECIES_128);
-      ShortVector acc1 = ShortVector.zero(ShortVector.SPECIES_128);
-      int innerLimit = Math.min(limit - i, 1024);
-      for (int j = 0; j < innerLimit; j += ByteVector.SPECIES_128.length()) {
-        ByteVector va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i + j);
-        ByteVector vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i + j);
-        ByteVector prod8 = va8.mul(vb8);
-        ShortVector prod16 =
-            prod8.convertShape(B2S, ShortVector.SPECIES_128, 0).reinterpretAsShorts();
-        acc0 = acc0.add(prod16.and((short) 0xFF));
-
-        va8 = ByteVector.fromArray(ByteVector.SPECIES_64, a, i + j + 8);
-        vb8 = ByteVector.fromArray(ByteVector.SPECIES_64, b, i + j + 8);
-        prod8 = va8.mul(vb8);
-        prod16 = prod8.convertShape(B2S, ShortVector.SPECIES_128, 0).reinterpretAsShorts();
-        acc1 = acc1.add(prod16.and((short) 0xFF));
+    // iterate in chunks to ensure we don't overflow the short accumulator
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc0 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      ShortVector acc1 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
+        // packed
+        var vb8 = b.load(Int4Constants.BYTE_SPECIES, i + j);
+        // packed
+        var va8 = a.load(Int4Constants.BYTE_SPECIES, i + j);
+
+        // low nibbles: the upper half of the dimensions
+        ByteVector prod8 = vb8.and((byte) 0x0F).mul(va8.and((byte) 0x0F));
+        Vector<Short> prod16 = prod8.convertShape(ZERO_EXTEND_B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc0 = acc0.add(prod16);
+
+        // high nibbles: the lower half of the dimensions
+        ByteVector prod8a = vb8.lanewise(LSHR, 4).mul(va8.lanewise(LSHR, 4));
+        Vector<Short> prod16a =
+            prod8a.convertShape(ZERO_EXTEND_B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc1 = acc1.add(prod16a);
       }
-      IntVector intAcc0 = acc0.convertShape(S2I, IntVector.SPECIES_128, 0).reinterpretAsInts();
-      IntVector intAcc1 = acc0.convertShape(S2I, IntVector.SPECIES_128, 1).reinterpretAsInts();
-      IntVector intAcc2 = acc1.convertShape(S2I, IntVector.SPECIES_128, 0).reinterpretAsInts();
-      IntVector intAcc3 = acc1.convertShape(S2I, IntVector.SPECIES_128, 1).reinterpretAsInts();
-      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reduceLanes(ADD);
+      Vector<Integer> intAcc0 = acc0.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc0.convert(S2I, 1);
+      Vector<Integer> intAcc2 = acc1.convert(S2I, 0);
+      Vector<Integer> intAcc3 = acc1.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reinterpretAsInts().reduceLanes(ADD);
     }
     return sum;
   }
@@ -788,7 +812,7 @@ private static float[] cosineBody128(ByteVectorLoader a, ByteVectorLoader b, int
 
   @Override
   public int squareDistance(byte[] a, byte[] b) {
-    return squareDistanceBody(new ArrayLoader(a), new ArrayLoader(b));
+    return squareDistanceBody(new ArrayLoader(a), new ArrayLoader(b), true);
   }
 
   @Override
@@ -797,15 +821,19 @@ public int uint8SquareDistance(byte[] a, byte[] b) {
   }
 
   public static int squareDistance(MemorySegment a, MemorySegment b) {
-    return squareDistanceBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+    return squareDistanceBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b), true);
   }
 
   public static int squareDistance(byte[] a, MemorySegment b) {
-    return squareDistanceBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+    return squareDistanceBody(new ArrayLoader(a), new MemorySegmentLoader(b), true);
   }
 
-  private static int squareDistanceBody(ByteVectorLoader a, ByteVectorLoader b) {
-    return squareDistanceBody(a, b, true);
+  public static int uint8SquareDistance(MemorySegment a, MemorySegment b) {
+    return squareDistanceBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b), false);
+  }
+
+  public static int uint8SquareDistance(byte[] a, MemorySegment b) {
+    return squareDistanceBody(new ArrayLoader(a), new MemorySegmentLoader(b), false);
   }
 
   private static int squareDistanceBody(ByteVectorLoader a, ByteVectorLoader b, boolean signed) {
@@ -886,6 +914,183 @@ private static int squareDistanceBody128(
     return acc1.add(acc2).reduceLanes(ADD);
   }
 
+  @Override
+  public int int4SquareDistance(byte[] a, byte[] b) {
+    return int4SquareDistanceBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
+
+  public static int int4SquareDistance(byte[] a, MemorySegment b) {
+    return int4SquareDistanceBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+  }
+
+  public static int int4SquareDistance(MemorySegment a, MemorySegment b) {
+    return int4SquareDistanceBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int int4SquareDistanceBody(ByteVectorLoader a, ByteVectorLoader b) {
+    int i = 0;
+    int res = 0;
+    if (a.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(a.length());
+      res += int4SquareDistanceBody(a, b, i);
+    }
+    // scalar tail: whatever the vector loop did not cover (everything when length < 32)
+    for (; i < a.length(); i++) {
+      int diff = a.tail(i) - b.tail(i);
+      res += diff * diff;
+    }
+    return res;
+  }
+
+  private static int int4SquareDistanceBody(ByteVectorLoader a, ByteVectorLoader b, int limit) {
+    int sum = 0;
+    // work in chunks, widening to int after each one, so the short accumulator cannot overflow
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
+        // unpacked: one value per byte, no nibble masking needed
+        var vb8 = b.load(Int4Constants.BYTE_SPECIES, i + j);
+        // unpacked: one value per byte, no nibble masking needed
+        var va8 = a.load(Int4Constants.BYTE_SPECIES, i + j);
+
+        ByteVector diff8 = vb8.sub(va8);
+        Vector<Short> diff16 = diff8.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc = acc.add(diff16.mul(diff16));
+      }
+      Vector<Integer> intAcc0 = acc.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).reinterpretAsInts().reduceLanes(ADD);
+    }
+    return sum;
+  }
+
+  @Override
+  public int int4SquareDistanceSinglePacked(byte[] a, byte[] b) {
+    return int4SquareDistanceSinglePackedBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
+
+  public static int int4SquareDistanceSinglePacked(byte[] a, MemorySegment b) {
+    return int4SquareDistanceSinglePackedBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int int4SquareDistanceSinglePackedBody(
+      ByteVectorLoader unpacked, ByteVectorLoader packed) {
+    int i = 0;
+    int res = 0;
+    if (packed.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(packed.length());
+      res += int4SquareDistanceSinglePackedBody(unpacked, packed, i);
+    }
+    // scalar tail: high nibble vs unpacked[i], low nibble vs unpacked[i + packed.length()]
+    for (; i < packed.length(); i++) {
+      byte packedByte = packed.tail(i);
+      byte unpacked1 = unpacked.tail(i);
+      byte unpacked2 = unpacked.tail(i + packed.length());
+
+      int diff1 = (packedByte & 0x0F) - unpacked2;
+      int diff2 = ((packedByte & 0xFF) >> 4) - unpacked1;
+
+      res += diff1 * diff1 + diff2 * diff2;
+    }
+    return res;
+  }
+
+  private static int int4SquareDistanceSinglePackedBody(
+      ByteVectorLoader unpacked, ByteVectorLoader packed, int limit) {
+    int sum = 0;
+    // work in chunks, widening to int after each one, so the short accumulators cannot overflow
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc0 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      ShortVector acc1 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
+        // packed: two 4-bit values per byte
+        ByteVector vb8 = packed.load(Int4Constants.BYTE_SPECIES, i + j);
+
+        // low nibbles vs. the upper half of unpacked (offset by packed.length())
+        ByteVector va8 = unpacked.load(Int4Constants.BYTE_SPECIES, i + j + packed.length());
+        ByteVector diff8 = vb8.and((byte) 0x0F).sub(va8);
+        Vector<Short> diff16 = diff8.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc0 = acc0.add(diff16.mul(diff16));
+
+        // high nibbles vs. the lower half of unpacked (same offset as the packed bytes)
+        ByteVector vc8 = unpacked.load(Int4Constants.BYTE_SPECIES, i + j);
+        ByteVector diff8a = vb8.lanewise(LSHR, 4).sub(vc8);
+        Vector<Short> diff16a = diff8a.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc1 = acc1.add(diff16a.mul(diff16a));
+      }
+      Vector<Integer> intAcc0 = acc0.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc0.convert(S2I, 1);
+      Vector<Integer> intAcc2 = acc1.convert(S2I, 0);
+      Vector<Integer> intAcc3 = acc1.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reinterpretAsInts().reduceLanes(ADD);
+    }
+    return sum;
+  }
+
+  @Override
+  public int int4SquareDistanceBothPacked(byte[] a, byte[] b) {
+    return int4SquareDistanceBothPackedBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
+
+  public static int int4SquareDistanceBothPacked(MemorySegment a, MemorySegment b) {
+    return int4SquareDistanceBothPackedBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int int4SquareDistanceBothPackedBody(ByteVectorLoader a, ByteVectorLoader b) {
+    int i = 0;
+    int res = 0;
+    if (a.length() >= 32) {
+      i += Int4Constants.BYTE_SPECIES.loopBound(a.length());
+      res += int4SquareDistanceBothPackedBody(a, b, i);
+    }
+    // scalar tail: compare low nibbles and high nibbles of each byte pair separately
+    for (; i < a.length(); i++) {
+      byte aByte = a.tail(i);
+      byte bByte = b.tail(i);
+
+      int diff1 = (aByte & 0x0F) - (bByte & 0x0F);
+      int diff2 = ((aByte & 0xFF) >> 4) - ((bByte & 0xFF) >> 4);
+
+      res += diff1 * diff1 + diff2 * diff2;
+    }
+    return res;
+  }
+
+  private static int int4SquareDistanceBothPackedBody(
+      ByteVectorLoader a, ByteVectorLoader b, int limit) {
+    int sum = 0;
+    // work in chunks, widening to int after each one, so the short accumulators cannot overflow
+    for (int i = 0; i < limit; i += Int4Constants.CHUNK) {
+      ShortVector acc0 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      ShortVector acc1 = ShortVector.zero(Int4Constants.SHORT_SPECIES);
+      int innerLimit = Math.min(limit - i, Int4Constants.CHUNK);
+      for (int j = 0; j < innerLimit; j += Int4Constants.BYTE_SPECIES.length()) {
+        // packed: two 4-bit values per byte
+        var vb8 = b.load(Int4Constants.BYTE_SPECIES, i + j);
+        // packed: two 4-bit values per byte
+        var va8 = a.load(Int4Constants.BYTE_SPECIES, i + j);
+
+        // low nibbles of both inputs (mask 0x0F)
+        ByteVector diff8 = vb8.and((byte) 0x0F).sub(va8.and((byte) 0x0F));
+        Vector<Short> diff16 = diff8.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc0 = acc0.add(diff16.mul(diff16));
+
+        // high nibbles of both inputs (per-lane logical shift right by 4)
+        ByteVector diff8a = vb8.lanewise(LSHR, 4).sub(va8.lanewise(LSHR, 4));
+        Vector<Short> diff16a = diff8a.convertShape(B2S, Int4Constants.SHORT_SPECIES, 0);
+        acc1 = acc1.add(diff16a.mul(diff16a));
+      }
+      Vector<Integer> intAcc0 = acc0.convert(S2I, 0);
+      Vector<Integer> intAcc1 = acc0.convert(S2I, 1);
+      Vector<Integer> intAcc2 = acc1.convert(S2I, 0);
+      Vector<Integer> intAcc3 = acc1.convert(S2I, 1);
+      sum += intAcc0.add(intAcc1).add(intAcc2).add(intAcc3).reinterpretAsInts().reduceLanes(ADD);
+    }
+    return sum;
+  }
+
   // Experiments suggest that we need at least 8 lanes so that the overhead of going with the vector
   // approach and counting trues on vector masks pays off.
   private static final boolean ENABLE_FIND_NEXT_GEQ_VECTOR_OPTO = INT_SPECIES.length() >= 8;
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java
index 54b3be67afcb..cf3ab94f417c 100644
--- a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java
+++ b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java
@@ -78,6 +78,11 @@ public FlatVectorsScorer getLucene99FlatVectorsScorer() {
     return Lucene99MemorySegmentFlatVectorsScorer.INSTANCE;
   }
 
+  @Override
+  public FlatVectorsScorer getLucene99ScalarQuantizedVectorsScorer() {
+    return Lucene99MemorySegmentScalarQuantizedVectorScorer.INSTANCE;
+  }
+
   @Override
   public PostingDecodingUtil newPostingDecodingUtil(IndexInput input) throws IOException {
     if (input instanceof MemorySegmentAccessInput msai) {
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
index 2c6c54cece73..3ad2cab88690 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99HnswQuantizedVectorsFormat.java
@@ -308,10 +308,19 @@ public KnnVectorsFormat knnVectorsFormat() {
           }
         };
     String expectedPattern =
-        "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s())))";
-    var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer");
+        "Lucene99HnswScalarQuantizedVectorsFormat(name=Lucene99HnswScalarQuantizedVectorsFormat, maxConn=10, beamWidth=20, flatVectorFormat=Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=%s, rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s)))";
+    var defaultScorer =
+        format(
+            Locale.ROOT,
+            expectedPattern,
+            "ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer())",
+            "DefaultFlatVectorScorer()");
     var memSegScorer =
-        format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer");
+        format(
+            Locale.ROOT,
+            expectedPattern,
+            "Lucene99MemorySegmentScalarQuantizedVectorScorer()",
+            "Lucene99MemorySegmentFlatVectorsScorer()");
     assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer)));
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
index e04054c27e37..7156afd9cc3c 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene99/TestLucene99ScalarQuantizedVectorsFormat.java
@@ -372,10 +372,19 @@ public KnnVectorsFormat knnVectorsFormat() {
           }
         };
     String expectedPattern =
-        "Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer()), rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s()))";
-    var defaultScorer = format(Locale.ROOT, expectedPattern, "DefaultFlatVectorScorer");
+        "Lucene99ScalarQuantizedVectorsFormat(name=Lucene99ScalarQuantizedVectorsFormat, confidenceInterval=0.9, bits=4, compress=false, flatVectorScorer=%s, rawVectorFormat=Lucene99FlatVectorsFormat(vectorsScorer=%s))";
+    var defaultScorer =
+        format(
+            Locale.ROOT,
+            expectedPattern,
+            "ScalarQuantizedVectorScorer(nonQuantizedDelegate=DefaultFlatVectorScorer())",
+            "DefaultFlatVectorScorer()");
     var memSegScorer =
-        format(Locale.ROOT, expectedPattern, "Lucene99MemorySegmentFlatVectorsScorer");
+        format(
+            Locale.ROOT,
+            expectedPattern,
+            "Lucene99MemorySegmentScalarQuantizedVectorScorer()",
+            "Lucene99MemorySegmentFlatVectorsScorer()");
     assertThat(customCodec.knnVectorsFormat().toString(), is(oneOf(defaultScorer, memSegScorer)));
   }
 
diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorUtilSupport.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorUtilSupport.java
index 7ec661b3659f..78280e7e4c36 100644
--- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorUtilSupport.java
+++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestVectorUtilSupport.java
@@ -107,11 +107,23 @@ public void testInt4DotProduct() {
       b[i] = (byte) random().nextInt(16);
     }
 
-    assertIntReturningProviders(p -> p.int4DotProduct(a, false, pack(b), true));
-    assertIntReturningProviders(p -> p.int4DotProduct(pack(a), true, b, false));
+    assertIntReturningProviders(p -> p.int4DotProduct(a, b));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(a, pack(b)));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(b, pack(a)));
+    assertIntReturningProviders(p -> p.int4DotProductBothPacked(pack(a), pack(b)));
+
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, b));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(a, pack(b)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(b, pack(a)));
     assertEquals(
         LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
-        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, false, pack(b), true));
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductBothPacked(pack(a), pack(b)));
   }
 
   public void testInt4DotProductBoundaries() {
@@ -122,20 +134,106 @@ public void testInt4DotProductBoundaries() {
 
     Arrays.fill(a, MAX_VALUE);
     Arrays.fill(b, MAX_VALUE);
-    assertIntReturningProviders(p -> p.int4DotProduct(a, false, pack(b), true));
-    assertIntReturningProviders(p -> p.int4DotProduct(pack(a), true, b, false));
+
+    assertIntReturningProviders(p -> p.int4DotProduct(a, b));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(a, pack(b)));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(b, pack(a)));
+    assertIntReturningProviders(p -> p.int4DotProductBothPacked(pack(a), pack(b)));
+
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, b));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(a, pack(b)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(b, pack(a)));
     assertEquals(
         LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
-        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, false, pack(b), true));
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductBothPacked(pack(a), pack(b)));
 
     byte MIN_VALUE = 0;
     Arrays.fill(a, MIN_VALUE);
     Arrays.fill(b, MIN_VALUE);
-    assertIntReturningProviders(p -> p.int4DotProduct(a, false, pack(b), true));
-    assertIntReturningProviders(p -> p.int4DotProduct(pack(a), true, b, false));
+
+    assertIntReturningProviders(p -> p.int4DotProduct(a, b));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(a, pack(b)));
+    assertIntReturningProviders(p -> p.int4DotProductSinglePacked(b, pack(a)));
+    assertIntReturningProviders(p -> p.int4DotProductBothPacked(pack(a), pack(b)));
+
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, b));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(a, pack(b)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductSinglePacked(b, pack(a)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProductBothPacked(pack(a), pack(b)));
+  }
+
+  public void testInt4SquareDistance() {
+    assumeTrue("even sizes only", size % 2 == 0);
+    var a = new byte[size];
+    var b = new byte[size];
+    for (int i = 0; i < size; ++i) {
+      a[i] = (byte) random().nextInt(16);
+      b[i] = (byte) random().nextInt(16);
+    }
+
+    assertIntReturningProviders(p -> p.int4SquareDistance(a, b));
+    assertIntReturningProviders(p -> p.int4SquareDistanceSinglePacked(a, pack(b)));
+    assertIntReturningProviders(p -> p.int4SquareDistanceSinglePacked(b, pack(a)));
+    assertIntReturningProviders(p -> p.int4SquareDistanceBothPacked(pack(a), pack(b)));
+
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistance(a, b));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceSinglePacked(a, pack(b)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceSinglePacked(b, pack(a)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceBothPacked(pack(a), pack(b)));
+  }
+
+  public void testInt4SquareDistanceBoundaries() {
+    assumeTrue("even sizes only", size % 2 == 0);
+
+    // squareDistance is maximal when each component differs by the full int4 range (15 vs 0)
+
+    byte MAX_VALUE = 15;
+    var a = new byte[size];
+    Arrays.fill(a, MAX_VALUE);
+
+    byte MIN_VALUE = 0;
+    var b = new byte[size];
+    Arrays.fill(b, MIN_VALUE);
+
+    assertIntReturningProviders(p -> p.int4SquareDistance(a, b));
+    assertIntReturningProviders(p -> p.int4SquareDistanceSinglePacked(a, pack(b)));
+    assertIntReturningProviders(p -> p.int4SquareDistanceSinglePacked(b, pack(a)));
+    assertIntReturningProviders(p -> p.int4SquareDistanceBothPacked(pack(a), pack(b)));
+
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistance(a, b));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceSinglePacked(a, pack(b)));
+    assertEquals(
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceSinglePacked(b, pack(a)));
     assertEquals(
-        LUCENE_PROVIDER.getVectorUtilSupport().dotProduct(a, b),
-        PANAMA_PROVIDER.getVectorUtilSupport().int4DotProduct(a, false, pack(b), true));
+        LUCENE_PROVIDER.getVectorUtilSupport().squareDistance(a, b),
+        PANAMA_PROVIDER.getVectorUtilSupport().int4SquareDistanceBothPacked(pack(a), pack(b)));
   }
 
   public void testInt4BitDotProduct() {