From 88a008593b323a3175197edcc127ad59e4d70ead Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 20:28:41 +0000 Subject: [PATCH 1/9] Implement git-upload-pack request normalization and add pkt-line parsing --- internal/cache/handlers.go | 66 +++++++++- internal/cache/handlers_test.go | 202 +++++++++++++++++++++++++++++++ internal/pktline/pktline.go | 104 ++++++++++++++++ internal/pktline/pktline_test.go | 150 +++++++++++++++++++++++ 4 files changed, 521 insertions(+), 1 deletion(-) create mode 100644 internal/pktline/pktline.go create mode 100644 internal/pktline/pktline_test.go diff --git a/internal/cache/handlers.go b/internal/cache/handlers.go index 44fb396..10e3359 100644 --- a/internal/cache/handlers.go +++ b/internal/cache/handlers.go @@ -12,12 +12,15 @@ import ( "path/filepath" "regexp" "sort" + "strings" "sync" "github.com/elazarl/goproxy" "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" + "github.com/dependabot/proxy/internal/pktline" + "github.com/dependabot/proxy/internal/ctxdata" ) @@ -62,6 +65,63 @@ var ignoreHeaders = map[string]struct{}{ "X-Pub-Session-Id": {}, } +// gitInlineAgentRegex matches the volatile inline "agent=" capability token +// in v1 want lines (e.g., " agent=git/2.43.0-Linux"). Real git always emits +// agent= as a trailing capability after a leading space. +var gitInlineAgentRegex = regexp.MustCompile(` agent=[^ \r\n]*`) + +// isGitUploadPack returns true if the request is a POST to a git-upload-pack endpoint. +func isGitUploadPack(r *http.Request) bool { + return r.Method == "POST" && strings.HasSuffix(r.URL.Path, "/git-upload-pack") +} + +// normalizeGitBody parses the pkt-line stream from a git-upload-pack POST body +// and rebuilds it with volatile fields removed, producing a stable cache key. +// +// Specifically: +// - "have" lines are dropped (local negotiation state that varies between clients) +// - Standalone "agent=" pkt-lines (v2) are dropped +// - Inline " agent=" tokens in v1 capability lines are stripped +// - All other lines (want, deepen, filter, command, shallow, etc.) are preserved +// with recomputed pkt-line length prefixes +// - Special packets (flush 0000, delim 0001, response-end 0002) are preserved +// +// If the body is not valid pkt-line data, it is included as-is so the hash falls +// back to full-body behavior (cache miss, not corruption). +// +// Safety assumption: Dependabot updaters run in clean ephemeral containers with +// no pre-existing git objects, so "have" negotiation state is effectively +// identical across runs for any ecosystem that uses git-upload-pack (nix, bundler +// git sources, Go modules, git submodules, etc.). Stripping "have" lines is safe +// because the server response does not depend on local object state when haves +// are empty or near-empty. +func normalizeGitBody(data []byte) []byte { + packets := pktline.Parse(data) + var filtered []pktline.Packet + for _, p := range packets { + if p.Type != pktline.Data { + filtered = append(filtered, p) + continue + } + payload := string(p.Payload) + + // Drop "have" lines + if strings.HasPrefix(payload, "have ") { + continue + } + + // Drop standalone "agent=" pkt-lines (v2) + if strings.HasPrefix(payload, "agent=") { + continue + } + + // Strip inline " agent=" tokens from v1 capability lines + cleaned := gitInlineAgentRegex.ReplaceAllString(payload, "") + filtered = append(filtered, pktline.Packet{Type: pktline.Data, Payload: []byte(cleaned)}) + } + return pktline.Encode(filtered) +} + // generates the key used in the DB, includes a hash of the body func key(r *http.Request) Key { data, _ := io.ReadAll(r.Body) @@ -97,8 +157,12 @@ func key(r *http.Request) Key { k.HeaderHash = hex.EncodeToString(headerHash.Sum(nil)) } if len(data) > 0 { + hashData := data + if isGitUploadPack(r) { + hashData = normalizeGitBody(data) + } hash := sha256.New() - hash.Write(data) + hash.Write(hashData) k.BodyHash = hex.EncodeToString(hash.Sum(nil)) } return k diff --git a/internal/cache/handlers_test.go b/internal/cache/handlers_test.go index 1bc65fe..7e3eb3b 100644 --- a/internal/cache/handlers_test.go +++ b/internal/cache/handlers_test.go @@ -245,6 +245,208 @@ func Test_key(t *testing.T) { t.Error("headerHash should be blank, got", key.HeaderHash) } }) + + t.Run("git-upload-pack: same wants, different haves produce same key", func(t *testing.T) { + // Real protocol v1 format: want lines with capabilities, followed by have lines + // OIDs from github.com/octocat/Hello-World + body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + + "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n0000" + + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" + body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + + "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n0000" + + "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 != key2 { + t.Error("Same wants with different haves should produce the same cache key") + } + }) + + t.Run("git-upload-pack: different wants produce different keys", func(t *testing.T) { + // Two requests wanting different refs from the same repo + body1 := "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n00000009done\n" + body2 := "0032want b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf\n00000009done\n" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Different wants should produce different cache keys") + } + }) + + t.Run("git-upload-pack: different agent strings produce same key", func(t *testing.T) { + // Use different-length agent strings to verify pkt-line length prefixes are also normalized + body1 := "00a3want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.9.5\n00000009done\n" + body2 := "00aawant 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0-Linux\n00000009done\n" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 != key2 { + t.Error("Different agent strings should produce the same cache key") + } + }) + + t.Run("git-upload-pack: non-git-upload-pack POST still uses full body hash", func(t *testing.T) { + body1 := `{"query":"{ viewer { login } }"}` + body2 := `{"query":"{ repository(owner:\"octocat\") { name } }"}` + + req1 := httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Non-git-upload-pack POST should use full body hash") + } + }) + + t.Run("git-upload-pack: GET to git-upload-pack URL is not normalized", func(t *testing.T) { + body1 := "body with have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e in it" + body2 := "body with have b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf in it" + + req1 := httptest.NewRequest("GET", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("GET", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("GET requests should not have body normalization") + } + }) + + t.Run("git-upload-pack: protocol v2 ls-refs with no want lines still contributes to body hash", func(t *testing.T) { + // Real protocol v2 ls-refs commands have no want lines + body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" + body2 := "0014command=ls-refs\n0015agent=git/2.43.0\n001aref-prefix refs/tags/\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1.BodyHash == "" { + t.Error("git-upload-pack POST with no want lines should still include a body hash") + } + if key2.BodyHash == "" { + t.Error("git-upload-pack POST with no want lines should still include a body hash") + } + if key1 == key2 { + t.Error("Different non-want git-upload-pack requests should produce different cache keys") + } + }) + + t.Run("git-upload-pack: protocol v2 fetch same wants different agent produces same key", func(t *testing.T) { + // Real protocol v2 fetch command as seen in shallow clones + body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + body2 := "0012command=fetch\n001bagent=git/2.53.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 != key2 { + t.Error("Same wants with different agent versions in v2 fetch should produce the same cache key") + } + }) + + t.Run("git-upload-pack: different deepen values produce different keys", func(t *testing.T) { + body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 2\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Same wants with different deepen values should produce different cache keys") + } + }) + + t.Run("git-upload-pack: different filter values produce different keys", func(t *testing.T) { + body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010015filter blob:none\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010012filter tree:0\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Same wants with different filter values should produce different cache keys") + } + }) + + t.Run("git-upload-pack: shallow vs full clone produce different keys", func(t *testing.T) { + // Shallow clone with deepen + body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + // Full clone without deepen + body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Shallow clone with deepen and full clone should produce different cache keys") + } + }) + + t.Run("git-upload-pack: protocol v2 ls-refs same prefix different agent produces same key", func(t *testing.T) { + body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" + body2 := "0014command=ls-refs\n0015agent=git/2.53.0\n001bref-prefix refs/heads/\n0000" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 != key2 { + t.Error("Same ls-refs with different agent should produce the same cache key") + } + }) + + t.Run("git-upload-pack: different non-agent capabilities produce different keys", func(t *testing.T) { + // Capabilities like thin-pack affect response encoding and must remain in the hash + body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n00000009done\n" + body2 := "0076want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k ofs-delta agent=git/2.43.0\n00000009done\n" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 == key2 { + t.Error("Different non-agent capabilities should produce different cache keys") + } + }) + + t.Run("git-upload-pack: v1 agent-only difference with identical capabilities produces same key", func(t *testing.T) { + // Only the agent= suffix differs; all other capabilities are identical + body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n00000009done\n" + body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n00000009done\n" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1 != key2 { + t.Error("Identical capabilities with only agent= difference should produce the same cache key") + } + }) } type BufferWithClose struct { diff --git a/internal/pktline/pktline.go b/internal/pktline/pktline.go new file mode 100644 index 0000000..41dba81 --- /dev/null +++ b/internal/pktline/pktline.go @@ -0,0 +1,104 @@ +// Package pktline implements parsing and encoding of the git pkt-line format. +// +// The pkt-line format is used by the git smart HTTP protocol (git-upload-pack, +// git-receive-pack) to frame variable-length data. Each line is prefixed with a +// 4-hex-digit length that includes itself, or is a special packet: +// +// - "0000" flush packet (stream boundary) +// - "0001" delimiter packet (section separator, protocol v2) +// - "0002" response-end packet +// - "0003" reserved +// - "0004"+ data packet with payload of (length - 4) bytes +// +// See https://git-scm.com/docs/protocol-common#_pkt_line_format +package pktline + +import ( + "fmt" + "strconv" +) + +// PacketType identifies the kind of pkt-line packet. +type PacketType int + +const ( + // Data is a normal data packet with a payload. + Data PacketType = iota + // Flush is the "0000" packet indicating a stream boundary. + Flush + // Delim is the "0001" packet separating sections in protocol v2. + Delim + // ResponseEnd is the "0002" packet indicating the end of a response. + ResponseEnd +) + +// Packet represents a single pkt-line packet. +type Packet struct { + Type PacketType + Payload []byte // nil for Flush, Delim, ResponseEnd +} + +// Parse parses a pkt-line byte stream into a slice of packets. +// If the stream contains malformed data (bad length prefix, truncated packet), +// the remainder is returned as a single Data packet so callers can degrade +// gracefully rather than losing data. +func Parse(data []byte) []Packet { + var packets []Packet + for len(data) > 0 { + if len(data) < 4 { + packets = append(packets, Packet{Type: Data, Payload: data}) + break + } + + length, err := strconv.ParseUint(string(data[:4]), 16, 16) + if err != nil { + packets = append(packets, Packet{Type: Data, Payload: data}) + break + } + + switch { + case length == 0: + packets = append(packets, Packet{Type: Flush}) + data = data[4:] + case length == 1: + packets = append(packets, Packet{Type: Delim}) + data = data[4:] + case length == 2: + packets = append(packets, Packet{Type: ResponseEnd}) + data = data[4:] + case length == 3: + // Reserved; treat as opaque data + packets = append(packets, Packet{Type: Data, Payload: data[:4]}) + data = data[4:] + default: + if int(length) > len(data) { + // Truncated packet; include remainder as-is + packets = append(packets, Packet{Type: Data, Payload: data}) + data = nil + } else { + packets = append(packets, Packet{Type: Data, Payload: data[4:length]}) + data = data[length:] + } + } + } + return packets +} + +// Encode serializes a slice of packets back into the pkt-line wire format. +func Encode(packets []Packet) []byte { + var buf []byte + for _, p := range packets { + switch p.Type { + case Flush: + buf = append(buf, "0000"...) + case Delim: + buf = append(buf, "0001"...) + case ResponseEnd: + buf = append(buf, "0002"...) + case Data: + buf = append(buf, fmt.Sprintf("%04x", 4+len(p.Payload))...) + buf = append(buf, p.Payload...) + } + } + return buf +} diff --git a/internal/pktline/pktline_test.go b/internal/pktline/pktline_test.go new file mode 100644 index 0000000..761111a --- /dev/null +++ b/internal/pktline/pktline_test.go @@ -0,0 +1,150 @@ +package pktline + +import ( + "bytes" + "testing" +) + +func TestParseFlush(t *testing.T) { + packets := Parse([]byte("0000")) + if len(packets) != 1 || packets[0].Type != Flush { + t.Fatal("expected single Flush packet") + } +} + +func TestParseDelim(t *testing.T) { + packets := Parse([]byte("0001")) + if len(packets) != 1 || packets[0].Type != Delim { + t.Fatal("expected single Delim packet") + } +} + +func TestParseResponseEnd(t *testing.T) { + packets := Parse([]byte("0002")) + if len(packets) != 1 || packets[0].Type != ResponseEnd { + t.Fatal("expected single ResponseEnd packet") + } +} + +func TestParseDataPacket(t *testing.T) { + // "000ahello\n" = length 10 (0x000a), payload "hello\n" + packets := Parse([]byte("000ahello\n")) + if len(packets) != 1 { + t.Fatalf("expected 1 packet, got %d", len(packets)) + } + if packets[0].Type != Data { + t.Fatal("expected Data packet") + } + if string(packets[0].Payload) != "hello\n" { + t.Fatalf("expected payload %q, got %q", "hello\n", string(packets[0].Payload)) + } +} + +func TestParseMultiplePackets(t *testing.T) { + // Two data packets + flush + input := "000ahello\n" + "000aworld\n" + "0000" + packets := Parse([]byte(input)) + if len(packets) != 3 { + t.Fatalf("expected 3 packets, got %d", len(packets)) + } + if packets[0].Type != Data || string(packets[0].Payload) != "hello\n" { + t.Error("first packet wrong") + } + if packets[1].Type != Data || string(packets[1].Payload) != "world\n" { + t.Error("second packet wrong") + } + if packets[2].Type != Flush { + t.Error("third packet should be Flush") + } +} + +func TestParseMalformedLength(t *testing.T) { + // "gggg" is not valid hex — should return as raw data + packets := Parse([]byte("gggghello")) + if len(packets) != 1 || packets[0].Type != Data { + t.Fatal("expected single raw Data packet for malformed input") + } + if string(packets[0].Payload) != "gggghello" { + t.Fatalf("expected full input as payload, got %q", string(packets[0].Payload)) + } +} + +func TestParseTruncatedPacket(t *testing.T) { + // Claims length 0x0020 (32 bytes) but only has 10 bytes total + packets := Parse([]byte("0020short")) + if len(packets) != 1 || packets[0].Type != Data { + t.Fatal("expected single raw Data packet for truncated input") + } +} + +func TestParseRealV1Body(t *testing.T) { + // Real protocol v1 upload-pack body captured from git clone of octocat/Hello-World + input := "00a4want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0\n" + + "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n" + + "0000" + + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n" + + "0009done\n" + packets := Parse([]byte(input)) + if len(packets) != 5 { + t.Fatalf("expected 5 packets, got %d", len(packets)) + } + if packets[0].Type != Data { + t.Error("packet 0 should be Data (want with caps)") + } + if packets[1].Type != Data { + t.Error("packet 1 should be Data (want)") + } + if packets[2].Type != Flush { + t.Error("packet 2 should be Flush") + } + if packets[3].Type != Data { + t.Error("packet 3 should be Data (have)") + } + if packets[4].Type != Data { + t.Error("packet 4 should be Data (done)") + } +} + +func TestParseRealV2Body(t *testing.T) { + // Real protocol v2 fetch command + input := "0012command=fetch\n" + + "0015agent=git/2.43.0\n" + + "0001" + + "000ddeepen 1\n" + + "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n" + + "0009done\n" + + "0000" + packets := Parse([]byte(input)) + if len(packets) != 7 { + t.Fatalf("expected 7 packets, got %d", len(packets)) + } + if packets[0].Type != Data || string(packets[0].Payload) != "command=fetch\n" { + t.Error("packet 0 should be command=fetch") + } + if packets[1].Type != Data || string(packets[1].Payload) != "agent=git/2.43.0\n" { + t.Error("packet 1 should be agent=") + } + if packets[2].Type != Delim { + t.Error("packet 2 should be Delim") + } + if packets[6].Type != Flush { + t.Error("packet 6 should be Flush") + } +} + +func TestEncodeRoundTrip(t *testing.T) { + input := []byte("000ahello\n" + "0000" + "0001" + "000aworld\n" + "0002") + packets := Parse(input) + output := Encode(packets) + if !bytes.Equal(input, output) { + t.Fatalf("round-trip failed:\n input: %q\n output: %q", input, output) + } +} + +func TestEncodeEmptyPayload(t *testing.T) { + packets := []Packet{{Type: Flush}, {Type: Delim}} + output := Encode(packets) + if string(output) != "00000001" { + t.Fatalf("expected %q, got %q", "00000001", string(output)) + } +} From 9725946a344e17d40c4839516dbe9c0578253a1a Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 20:39:15 +0000 Subject: [PATCH 2/9] Refactor pktline parsing to return status flag and improve error handling --- internal/cache/handlers.go | 10 ++++-- internal/cache/handlers_test.go | 20 +++++++++++ internal/pktline/pktline.go | 32 +++++++++-------- internal/pktline/pktline_test.go | 59 +++++++++++++++++++++++++------- 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/internal/cache/handlers.go b/internal/cache/handlers.go index 10e3359..9297539 100644 --- a/internal/cache/handlers.go +++ b/internal/cache/handlers.go @@ -86,8 +86,9 @@ func isGitUploadPack(r *http.Request) bool { // with recomputed pkt-line length prefixes // - Special packets (flush 0000, delim 0001, response-end 0002) are preserved // -// If the body is not valid pkt-line data, it is included as-is so the hash falls -// back to full-body behavior (cache miss, not corruption). +// If the body is not valid pkt-line data, the original bytes are returned +// unchanged so the hash falls back to full-body behavior (cache miss, not +// corruption). // // Safety assumption: Dependabot updaters run in clean ephemeral containers with // no pre-existing git objects, so "have" negotiation state is effectively @@ -96,7 +97,10 @@ func isGitUploadPack(r *http.Request) bool { // because the server response does not depend on local object state when haves // are empty or near-empty. func normalizeGitBody(data []byte) []byte { - packets := pktline.Parse(data) + packets, ok := pktline.Parse(data) + if !ok { + return data + } var filtered []pktline.Packet for _, p := range packets { if p.Type != pktline.Data { diff --git a/internal/cache/handlers_test.go b/internal/cache/handlers_test.go index 7e3eb3b..80c75f9 100644 --- a/internal/cache/handlers_test.go +++ b/internal/cache/handlers_test.go @@ -447,6 +447,26 @@ func Test_key(t *testing.T) { t.Error("Identical capabilities with only agent= difference should produce the same cache key") } }) + + t.Run("git-upload-pack: malformed body falls back to full-body hash", func(t *testing.T) { + // Not valid pkt-line data (no 4-hex length prefix). normalizeGitBody must + // return the original bytes so that two different malformed bodies hash to + // different keys (no collisions from re-encoding). + body1 := "this is not pkt-line data" + body2 := "this is also not pkt-line data" + + req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) + req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) + + key1 := key(req1) + key2 := key(req2) + if key1.BodyHash == "" || key2.BodyHash == "" { + t.Error("malformed body should still produce a body hash") + } + if key1 == key2 { + t.Error("different malformed bodies must not collide on the same cache key") + } + }) } type BufferWithClose struct { diff --git a/internal/pktline/pktline.go b/internal/pktline/pktline.go index 41dba81..7854133 100644 --- a/internal/pktline/pktline.go +++ b/internal/pktline/pktline.go @@ -39,22 +39,25 @@ type Packet struct { } // Parse parses a pkt-line byte stream into a slice of packets. -// If the stream contains malformed data (bad length prefix, truncated packet), -// the remainder is returned as a single Data packet so callers can degrade -// gracefully rather than losing data. -func Parse(data []byte) []Packet { - var packets []Packet +// +// The returned ok flag is true when the entire stream was consumed as +// well-formed pkt-line data. If the stream contains malformed data (bad +// length prefix, truncated packet), ok is false and the remainder of the +// stream is returned as a final Data packet containing the unparsed bytes +// verbatim, so callers can degrade gracefully rather than losing data. +func Parse(data []byte) (packets []Packet, ok bool) { for len(data) > 0 { if len(data) < 4 { packets = append(packets, Packet{Type: Data, Payload: data}) - break + return packets, false } - length, err := strconv.ParseUint(string(data[:4]), 16, 16) + length64, err := strconv.ParseUint(string(data[:4]), 16, 16) if err != nil { packets = append(packets, Packet{Type: Data, Payload: data}) - break + return packets, false } + length := int(length64) switch { case length == 0: @@ -71,17 +74,16 @@ func Parse(data []byte) []Packet { packets = append(packets, Packet{Type: Data, Payload: data[:4]}) data = data[4:] default: - if int(length) > len(data) { - // Truncated packet; include remainder as-is + if length > len(data) { + // Truncated packet; include remainder as-is. packets = append(packets, Packet{Type: Data, Payload: data}) - data = nil - } else { - packets = append(packets, Packet{Type: Data, Payload: data[4:length]}) - data = data[length:] + return packets, false } + packets = append(packets, Packet{Type: Data, Payload: data[4:length]}) + data = data[length:] } } - return packets + return packets, true } // Encode serializes a slice of packets back into the pkt-line wire format. diff --git a/internal/pktline/pktline_test.go b/internal/pktline/pktline_test.go index 761111a..9cbfcd7 100644 --- a/internal/pktline/pktline_test.go +++ b/internal/pktline/pktline_test.go @@ -5,22 +5,51 @@ import ( "testing" ) +func TestParseEmptyInput(t *testing.T) { + packets, ok := Parse(nil) + if !ok { + t.Error("expected ok=true for empty input") + } + if len(packets) != 0 { + t.Fatalf("expected 0 packets, got %d", len(packets)) + } +} + +func TestParseReservedLength3(t *testing.T) { + // Length 0x0003 is reserved; we treat the 4 prefix bytes as opaque data + // and continue parsing. + packets, ok := Parse([]byte("0003" + "0000")) + if !ok { + t.Error("expected ok=true; reserved length is treated as opaque, not an error") + } + if len(packets) != 2 { + t.Fatalf("expected 2 packets, got %d", len(packets)) + } + if packets[0].Type != Data || string(packets[0].Payload) != "0003" { + t.Errorf("packet 0 should be Data with payload %q, got type=%v payload=%q", + "0003", packets[0].Type, string(packets[0].Payload)) + } + if packets[1].Type != Flush { + t.Error("packet 1 should be Flush") + } +} + func TestParseFlush(t *testing.T) { - packets := Parse([]byte("0000")) + packets, _ := Parse([]byte("0000")) if len(packets) != 1 || packets[0].Type != Flush { t.Fatal("expected single Flush packet") } } func TestParseDelim(t *testing.T) { - packets := Parse([]byte("0001")) + packets, _ := Parse([]byte("0001")) if len(packets) != 1 || packets[0].Type != Delim { t.Fatal("expected single Delim packet") } } func TestParseResponseEnd(t *testing.T) { - packets := Parse([]byte("0002")) + packets, _ := Parse([]byte("0002")) if len(packets) != 1 || packets[0].Type != ResponseEnd { t.Fatal("expected single ResponseEnd packet") } @@ -28,7 +57,7 @@ func TestParseResponseEnd(t *testing.T) { func TestParseDataPacket(t *testing.T) { // "000ahello\n" = length 10 (0x000a), payload "hello\n" - packets := Parse([]byte("000ahello\n")) + packets, _ := Parse([]byte("000ahello\n")) if len(packets) != 1 { t.Fatalf("expected 1 packet, got %d", len(packets)) } @@ -43,7 +72,7 @@ func TestParseDataPacket(t *testing.T) { func TestParseMultiplePackets(t *testing.T) { // Two data packets + flush input := "000ahello\n" + "000aworld\n" + "0000" - packets := Parse([]byte(input)) + packets, _ := Parse([]byte(input)) if len(packets) != 3 { t.Fatalf("expected 3 packets, got %d", len(packets)) } @@ -59,8 +88,11 @@ func TestParseMultiplePackets(t *testing.T) { } func TestParseMalformedLength(t *testing.T) { - // "gggg" is not valid hex — should return as raw data - packets := Parse([]byte("gggghello")) + // "gggg" is not valid hex — should return as raw data with ok=false + packets, ok := Parse([]byte("gggghello")) + if ok { + t.Error("expected ok=false for malformed input") + } if len(packets) != 1 || packets[0].Type != Data { t.Fatal("expected single raw Data packet for malformed input") } @@ -70,8 +102,11 @@ func TestParseMalformedLength(t *testing.T) { } func TestParseTruncatedPacket(t *testing.T) { - // Claims length 0x0020 (32 bytes) but only has 10 bytes total - packets := Parse([]byte("0020short")) + // Claims length 0x0020 (32 bytes) but only has 9 bytes total + packets, ok := Parse([]byte("0020short")) + if ok { + t.Error("expected ok=false for truncated input") + } if len(packets) != 1 || packets[0].Type != Data { t.Fatal("expected single raw Data packet for truncated input") } @@ -84,7 +119,7 @@ func TestParseRealV1Body(t *testing.T) { "0000" + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n" + "0009done\n" - packets := Parse([]byte(input)) + packets, _ := Parse([]byte(input)) if len(packets) != 5 { t.Fatalf("expected 5 packets, got %d", len(packets)) } @@ -114,7 +149,7 @@ func TestParseRealV2Body(t *testing.T) { "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n" + "0009done\n" + "0000" - packets := Parse([]byte(input)) + packets, _ := Parse([]byte(input)) if len(packets) != 7 { t.Fatalf("expected 7 packets, got %d", len(packets)) } @@ -134,7 +169,7 @@ func TestParseRealV2Body(t *testing.T) { func TestEncodeRoundTrip(t *testing.T) { input := []byte("000ahello\n" + "0000" + "0001" + "000aworld\n" + "0002") - packets := Parse(input) + packets, _ := Parse(input) output := Encode(packets) if !bytes.Equal(input, output) { t.Fatalf("round-trip failed:\n input: %q\n output: %q", input, output) From cf3fe58878e9638287521b6f8c9027194849189f Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 21:14:52 +0000 Subject: [PATCH 3/9] Implement pkt-line parsing and encoding for git smart-HTTP protocol - Introduced `pktline.go` and `pktline_test.go` for handling pkt-line format. - Added functions to parse and encode pkt-lines, supporting special packets like flush, delim, and response-end. - Created tests to validate parsing of various pkt-line scenarios including empty input, malformed data, and real-world examples from git repositories. - Moved pkt-line functionality into a dedicated package to streamline the gitproto implementation. - Implemented `upload_pack.go` and `upload_pack_test.go` to manage upload-pack requests and normalize request bodies, ensuring consistent cache keys by stripping volatile fields. - Removed legacy `pktline` package to consolidate functionality within `gitproto`. --- internal/cache/handlers.go | 69 +------- internal/cache/handlers_test.go | 230 +++----------------------- internal/gitproto/pktline.go | 96 +++++++++++ internal/gitproto/pktline_test.go | 107 ++++++++++++ internal/gitproto/upload_pack.go | 95 +++++++++++ internal/gitproto/upload_pack_test.go | 201 ++++++++++++++++++++++ internal/pktline/pktline.go | 106 ------------ internal/pktline/pktline_test.go | 185 --------------------- 8 files changed, 521 insertions(+), 568 deletions(-) create mode 100644 internal/gitproto/pktline.go create mode 100644 internal/gitproto/pktline_test.go create mode 100644 internal/gitproto/upload_pack.go create mode 100644 internal/gitproto/upload_pack_test.go delete mode 100644 internal/pktline/pktline.go delete mode 100644 internal/pktline/pktline_test.go diff --git a/internal/cache/handlers.go b/internal/cache/handlers.go index 9297539..2cefda4 100644 --- a/internal/cache/handlers.go +++ b/internal/cache/handlers.go @@ -12,16 +12,14 @@ import ( "path/filepath" "regexp" "sort" - "strings" "sync" "github.com/elazarl/goproxy" "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" - "github.com/dependabot/proxy/internal/pktline" - "github.com/dependabot/proxy/internal/ctxdata" + "github.com/dependabot/proxy/internal/gitproto" ) // DB contains the metadata of the disk cache @@ -65,67 +63,6 @@ var ignoreHeaders = map[string]struct{}{ "X-Pub-Session-Id": {}, } -// gitInlineAgentRegex matches the volatile inline "agent=" capability token -// in v1 want lines (e.g., " agent=git/2.43.0-Linux"). Real git always emits -// agent= as a trailing capability after a leading space. -var gitInlineAgentRegex = regexp.MustCompile(` agent=[^ \r\n]*`) - -// isGitUploadPack returns true if the request is a POST to a git-upload-pack endpoint. -func isGitUploadPack(r *http.Request) bool { - return r.Method == "POST" && strings.HasSuffix(r.URL.Path, "/git-upload-pack") -} - -// normalizeGitBody parses the pkt-line stream from a git-upload-pack POST body -// and rebuilds it with volatile fields removed, producing a stable cache key. -// -// Specifically: -// - "have" lines are dropped (local negotiation state that varies between clients) -// - Standalone "agent=" pkt-lines (v2) are dropped -// - Inline " agent=" tokens in v1 capability lines are stripped -// - All other lines (want, deepen, filter, command, shallow, etc.) are preserved -// with recomputed pkt-line length prefixes -// - Special packets (flush 0000, delim 0001, response-end 0002) are preserved -// -// If the body is not valid pkt-line data, the original bytes are returned -// unchanged so the hash falls back to full-body behavior (cache miss, not -// corruption). -// -// Safety assumption: Dependabot updaters run in clean ephemeral containers with -// no pre-existing git objects, so "have" negotiation state is effectively -// identical across runs for any ecosystem that uses git-upload-pack (nix, bundler -// git sources, Go modules, git submodules, etc.). Stripping "have" lines is safe -// because the server response does not depend on local object state when haves -// are empty or near-empty. -func normalizeGitBody(data []byte) []byte { - packets, ok := pktline.Parse(data) - if !ok { - return data - } - var filtered []pktline.Packet - for _, p := range packets { - if p.Type != pktline.Data { - filtered = append(filtered, p) - continue - } - payload := string(p.Payload) - - // Drop "have" lines - if strings.HasPrefix(payload, "have ") { - continue - } - - // Drop standalone "agent=" pkt-lines (v2) - if strings.HasPrefix(payload, "agent=") { - continue - } - - // Strip inline " agent=" tokens from v1 capability lines - cleaned := gitInlineAgentRegex.ReplaceAllString(payload, "") - filtered = append(filtered, pktline.Packet{Type: pktline.Data, Payload: []byte(cleaned)}) - } - return pktline.Encode(filtered) -} - // generates the key used in the DB, includes a hash of the body func key(r *http.Request) Key { data, _ := io.ReadAll(r.Body) @@ -162,8 +99,8 @@ func key(r *http.Request) Key { } if len(data) > 0 { hashData := data - if isGitUploadPack(r) { - hashData = normalizeGitBody(data) + if gitproto.IsUploadPackRequest(r) { + hashData = gitproto.NormalizeUploadPackBody(data) } hash := sha256.New() hash.Write(hashData) diff --git a/internal/cache/handlers_test.go b/internal/cache/handlers_test.go index 80c75f9..3a6fd02 100644 --- a/internal/cache/handlers_test.go +++ b/internal/cache/handlers_test.go @@ -246,225 +246,33 @@ func Test_key(t *testing.T) { } }) - t.Run("git-upload-pack: same wants, different haves produce same key", func(t *testing.T) { - // Real protocol v1 format: want lines with capabilities, followed by have lines - // OIDs from github.com/octocat/Hello-World + // Integration check: git-upload-pack POSTs are routed through + // gitproto.NormalizeUploadPackBody before hashing. Edge-case behaviour of + // the normalizer itself is exercised in internal/gitproto. + t.Run("git-upload-pack body is normalized before hashing", func(t *testing.T) { + const url = "https://github.com/octocat/Hello-World.git/git-upload-pack" + // Two bodies that differ only in volatile fields (have line + agent). body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + - "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n0000" + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" - body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + - "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n0000" + + body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n" + "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 != key2 { - t.Error("Same wants with different haves should produce the same cache key") - } - }) - - t.Run("git-upload-pack: different wants produce different keys", func(t *testing.T) { - // Two requests wanting different refs from the same repo - body1 := "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n00000009done\n" - body2 := "0032want b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf\n00000009done\n" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Different wants should produce different cache keys") - } - }) - - t.Run("git-upload-pack: different agent strings produce same key", func(t *testing.T) { - // Use different-length agent strings to verify pkt-line length prefixes are also normalized - body1 := "00a3want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.9.5\n00000009done\n" - body2 := "00aawant 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0-Linux\n00000009done\n" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 != key2 { - t.Error("Different agent strings should produce the same cache key") - } - }) - - t.Run("git-upload-pack: non-git-upload-pack POST still uses full body hash", func(t *testing.T) { - body1 := `{"query":"{ viewer { login } }"}` - body2 := `{"query":"{ repository(owner:\"octocat\") { name } }"}` - - req1 := httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Non-git-upload-pack POST should use full body hash") - } - }) - - t.Run("git-upload-pack: GET to git-upload-pack URL is not normalized", func(t *testing.T) { - body1 := "body with have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e in it" - body2 := "body with have b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf in it" - - req1 := httptest.NewRequest("GET", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("GET", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("GET requests should not have body normalization") - } - }) - - t.Run("git-upload-pack: protocol v2 ls-refs with no want lines still contributes to body hash", func(t *testing.T) { - // Real protocol v2 ls-refs commands have no want lines - body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" - body2 := "0014command=ls-refs\n0015agent=git/2.43.0\n001aref-prefix refs/tags/\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1.BodyHash == "" { - t.Error("git-upload-pack POST with no want lines should still include a body hash") - } - if key2.BodyHash == "" { - t.Error("git-upload-pack POST with no want lines should still include a body hash") - } - if key1 == key2 { - t.Error("Different non-want git-upload-pack requests should produce different cache keys") - } - }) - - t.Run("git-upload-pack: protocol v2 fetch same wants different agent produces same key", func(t *testing.T) { - // Real protocol v2 fetch command as seen in shallow clones - body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - body2 := "0012command=fetch\n001bagent=git/2.53.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 != key2 { - t.Error("Same wants with different agent versions in v2 fetch should produce the same cache key") - } - }) - - t.Run("git-upload-pack: different deepen values produce different keys", func(t *testing.T) { - body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 2\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Same wants with different deepen values should produce different cache keys") - } - }) - - t.Run("git-upload-pack: different filter values produce different keys", func(t *testing.T) { - body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010015filter blob:none\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010012filter tree:0\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Same wants with different filter values should produce different cache keys") - } - }) - - t.Run("git-upload-pack: shallow vs full clone produce different keys", func(t *testing.T) { - // Shallow clone with deepen - body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - // Full clone without deepen - body2 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n00010032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0009done\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Shallow clone with deepen and full clone should produce different cache keys") - } - }) - - t.Run("git-upload-pack: protocol v2 ls-refs same prefix different agent produces same key", func(t *testing.T) { - body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" - body2 := "0014command=ls-refs\n0015agent=git/2.53.0\n001bref-prefix refs/heads/\n0000" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 != key2 { - t.Error("Same ls-refs with different agent should produce the same cache key") - } - }) - - t.Run("git-upload-pack: different non-agent capabilities produce different keys", func(t *testing.T) { - // Capabilities like thin-pack affect response encoding and must remain in the hash - body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n00000009done\n" - body2 := "0076want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k ofs-delta agent=git/2.43.0\n00000009done\n" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 == key2 { - t.Error("Different non-agent capabilities should produce different cache keys") + k1 := key(httptest.NewRequest("POST", url, strings.NewReader(body1))) + k2 := key(httptest.NewRequest("POST", url, strings.NewReader(body2))) + if k1 != k2 { + t.Error("git-upload-pack POSTs differing only in have/agent must produce the same cache key") } }) - t.Run("git-upload-pack: v1 agent-only difference with identical capabilities produces same key", func(t *testing.T) { - // Only the agent= suffix differs; all other capabilities are identical - body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n00000009done\n" - body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n00000009done\n" + t.Run("non-git POST with similar substrings is not normalized", func(t *testing.T) { + // Confirms the gitproto integration is gated by IsUploadPackRequest. + body1 := `{"q":"have stuff agent=foo"}` + body2 := `{"q":"have other agent=bar"}` - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1 != key2 { - t.Error("Identical capabilities with only agent= difference should produce the same cache key") - } - }) - - t.Run("git-upload-pack: malformed body falls back to full-body hash", func(t *testing.T) { - // Not valid pkt-line data (no 4-hex length prefix). normalizeGitBody must - // return the original bytes so that two different malformed bodies hash to - // different keys (no collisions from re-encoding). - body1 := "this is not pkt-line data" - body2 := "this is also not pkt-line data" - - req1 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body1)) - req2 := httptest.NewRequest("POST", "https://github.com/octocat/Hello-World.git/git-upload-pack", strings.NewReader(body2)) - - key1 := key(req1) - key2 := key(req2) - if key1.BodyHash == "" || key2.BodyHash == "" { - t.Error("malformed body should still produce a body hash") - } - if key1 == key2 { - t.Error("different malformed bodies must not collide on the same cache key") + k1 := key(httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body1))) + k2 := key(httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body2))) + if k1 == k2 { + t.Error("non-git POSTs must not be normalized") } }) } diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go new file mode 100644 index 0000000..9e93a26 --- /dev/null +++ b/internal/gitproto/pktline.go @@ -0,0 +1,96 @@ +package gitproto + +import ( + "fmt" + "strconv" +) + +// pkt-line is the framing format used by the git smart-HTTP protocol +// (git-upload-pack, git-receive-pack). Each line is prefixed with a 4-hex-digit +// length that includes itself, or is a special packet: +// +// - "0000" flush (stream boundary) +// - "0001" delim (section separator, protocol v2) +// - "0002" response-end +// - "0004"+ data packet with payload of (length - 4) bytes +// +// See https://git-scm.com/docs/protocol-common#_pkt_line_format + +type pktType int + +const ( + pktData pktType = iota + pktFlush + pktDelim + pktResponseEnd +) + +type packet struct { + typ pktType + payload []byte // nil for non-Data packets +} + +// parsePktLine parses a pkt-line byte stream into packets. The returned ok flag +// is false when the stream contains malformed or truncated data; the unparsed +// remainder is appended as a final Data packet so callers can degrade +// gracefully (e.g., fall back to hashing the raw input). +func parsePktLine(data []byte) (packets []packet, ok bool) { + for len(data) > 0 { + if len(data) < 4 { + packets = append(packets, packet{typ: pktData, payload: data}) + return packets, false + } + length64, err := strconv.ParseUint(string(data[:4]), 16, 16) + if err != nil { + packets = append(packets, packet{typ: pktData, payload: data}) + return packets, false + } + length := int(length64) + switch { + case length == 0: + packets = append(packets, packet{typ: pktFlush}) + data = data[4:] + case length == 1: + packets = append(packets, packet{typ: pktDelim}) + data = data[4:] + case length == 2: + packets = append(packets, packet{typ: pktResponseEnd}) + data = data[4:] + case length == 3: + // Reserved by the spec; not used by real git. Treat as malformed + // so callers fall back to full-input behaviour. + packets = append(packets, packet{typ: pktData, payload: data}) + return packets, false + default: + if length > len(data) { + packets = append(packets, packet{typ: pktData, payload: data}) + return packets, false + } + packets = append(packets, packet{typ: pktData, payload: data[4:length]}) + data = data[length:] + } + } + return packets, true +} + +// encodePktLine serializes packets back into the pkt-line wire format. +// Re-encoding recomputes each Data packet's length prefix, which is what makes +// upstream normalization correct across requests whose data payloads differ +// only in length (e.g. different agent string lengths). +func encodePktLine(packets []packet) []byte { + var buf []byte + for _, p := range packets { + switch p.typ { + case pktFlush: + buf = append(buf, "0000"...) + case pktDelim: + buf = append(buf, "0001"...) + case pktResponseEnd: + buf = append(buf, "0002"...) + case pktData: + buf = append(buf, fmt.Sprintf("%04x", 4+len(p.payload))...) + buf = append(buf, p.payload...) + } + } + return buf +} diff --git a/internal/gitproto/pktline_test.go b/internal/gitproto/pktline_test.go new file mode 100644 index 0000000..ed0de08 --- /dev/null +++ b/internal/gitproto/pktline_test.go @@ -0,0 +1,107 @@ +package gitproto + +import ( + "bytes" + "testing" +) + +func TestParsePktLine_Empty(t *testing.T) { + pkts, ok := parsePktLine(nil) + if !ok { + t.Error("expected ok=true for empty input") + } + if len(pkts) != 0 { + t.Fatalf("expected 0 packets, got %d", len(pkts)) + } +} + +func TestParsePktLine_SpecialPackets(t *testing.T) { + cases := map[string]pktType{ + "0000": pktFlush, + "0001": pktDelim, + "0002": pktResponseEnd, + } + for input, want := range cases { + pkts, ok := parsePktLine([]byte(input)) + if !ok || len(pkts) != 1 || pkts[0].typ != want { + t.Errorf("input %q: got %+v ok=%v, want type %d", input, pkts, ok, want) + } + } +} + +func TestParsePktLine_DataPacket(t *testing.T) { + // "000ahello\n" = length 0x000a (10), payload "hello\n" + pkts, ok := parsePktLine([]byte("000ahello\n")) + if !ok || len(pkts) != 1 || pkts[0].typ != pktData || string(pkts[0].payload) != "hello\n" { + t.Errorf("got %+v ok=%v", pkts, ok) + } +} + +func TestParsePktLine_MalformedAndTruncated(t *testing.T) { + // Bad hex prefix. + if _, ok := parsePktLine([]byte("gggghi")); ok { + t.Error("expected ok=false for malformed length prefix") + } + // Length claims 0x0020 but only 9 bytes available. + if _, ok := parsePktLine([]byte("0020short")); ok { + t.Error("expected ok=false for truncated packet") + } + // Length 3 is reserved; we treat as malformed. + if _, ok := parsePktLine([]byte("00030000")); ok { + t.Error("expected ok=false for reserved length 3") + } + // Less than 4 bytes. + if _, ok := parsePktLine([]byte("ab")); ok { + t.Error("expected ok=false for sub-prefix input") + } +} + +func TestParsePktLine_RealV1Body(t *testing.T) { + // Realistic v1 upload-pack body from github.com/octocat/Hello-World + input := "00a4want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0\n" + + "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n" + + "0000" + + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n" + + "0009done\n" + pkts, ok := parsePktLine([]byte(input)) + if !ok { + t.Fatal("expected ok=true for well-formed v1 body") + } + wantTypes := []pktType{pktData, pktData, pktFlush, pktData, pktData} + if len(pkts) != len(wantTypes) { + t.Fatalf("got %d packets, want %d", len(pkts), len(wantTypes)) + } + for i, want := range wantTypes { + if pkts[i].typ != want { + t.Errorf("packet %d: got type %d, want %d", i, pkts[i].typ, want) + } + } +} + +func TestParsePktLine_RealV2Body(t *testing.T) { + input := "0012command=fetch\n" + + "0015agent=git/2.43.0\n" + + "0001" + + "000ddeepen 1\n" + + "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n" + + "0009done\n" + + "0000" + pkts, ok := parsePktLine([]byte(input)) + if !ok || len(pkts) != 7 { + t.Fatalf("got %d packets ok=%v, want 7 ok=true", len(pkts), ok) + } + if pkts[2].typ != pktDelim || pkts[6].typ != pktFlush { + t.Error("special packets misidentified") + } +} + +func TestEncodePktLine_RoundTrip(t *testing.T) { + input := []byte("000ahello\n" + "0000" + "0001" + "000aworld\n" + "0002") + pkts, ok := parsePktLine(input) + if !ok { + t.Fatal("parse failed on well-formed input") + } + if got := encodePktLine(pkts); !bytes.Equal(got, input) { + t.Errorf("round-trip mismatch:\n in: %q\n out: %q", input, got) + } +} diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go new file mode 100644 index 0000000..d118b0f --- /dev/null +++ b/internal/gitproto/upload_pack.go @@ -0,0 +1,95 @@ +// Package gitproto implements a small set of operations on the git smart-HTTP +// wire protocol, in support of the proxy's disk cache. +// +// The exported surface is intentionally narrow: callers should only need +// IsUploadPackRequest and NormalizeUploadPackBody. The pkt-line framing format +// is an internal implementation detail. +package gitproto + +import ( + "bytes" + "net/http" + "regexp" + "strings" +) + +// gitInlineVolatileRegex matches volatile inline capability tokens — currently +// " agent=" and " session-id=" — that appear as trailing +// capabilities on a v1 upload-pack want line. Real git always emits these +// after a leading space, so the leading-space anchor distinguishes them from +// a payload that merely starts with the same prefix. +var gitInlineVolatileRegex = regexp.MustCompile(` (?:agent|session-id)=[^ \r\n]*`) + +// volatileStandalonePrefixes lists payload prefixes whose entire pkt-line is +// dropped from the normalized body. These are protocol-v2 capability or +// negotiation lines that vary between semantically identical requests: +// +// - "have " local object negotiation state, varies per client +// - "agent=" v2 client version capability +// - "session-id=" v2 trace2 session identifier (Git 2.36+), unique per invocation +var volatileStandalonePrefixes = [][]byte{[]byte("have "), []byte("agent="), []byte("session-id=")} + +// IsUploadPackRequest reports whether r is a POST to a git-upload-pack +// endpoint, i.e. a smart-HTTP fetch negotiation. +func IsUploadPackRequest(r *http.Request) bool { + return r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/git-upload-pack") +} + +// NormalizeUploadPackBody returns a normalized form of a git-upload-pack POST +// body, suitable for use as a stable cache-key input. +// +// The output is opaque hash input — not intended to be sent on the wire. +// +// Normalization removes only fields that vary between semantically identical +// requests: +// +// - "have" pkt-lines (local object negotiation state, varies per client) +// - Standalone "agent=" pkt-lines (protocol v2 client version) +// - Standalone "session-id=" pkt-lines (protocol v2 trace2 session id, Git 2.36+) +// - Inline " agent=" and " session-id=" tokens within v1 capability lines +// +// All response-shaping fields (want, capabilities, command=, deepen, filter, +// shallow, ref-prefix, object-format, ...) are preserved, and special framing +// packets (flush, delim, response-end) are preserved verbatim. Each retained +// Data packet is re-emitted with a recomputed length prefix, so two requests +// that differ only in the length of a volatile field still hash identically. +// +// If the body is not valid pkt-line data, it is returned unchanged so callers +// fall back to full-body hashing (cache miss, never collision). +// +// Safety contract: this is safe to use as a cache key in environments where +// "have" sets are stable across runs (e.g. ephemeral containers with no +// pre-existing git objects, as used by Dependabot updaters). Under that +// assumption an upstream response generated for one normalized request body +// is valid for any request that normalizes to the same bytes. +func NormalizeUploadPackBody(data []byte) []byte { + packets, ok := parsePktLine(data) + if !ok { + return data + } + filtered := packets[:0] + for _, p := range packets { + if p.typ != pktData { + filtered = append(filtered, p) + continue + } + payload := p.payload + dropped := false + for _, prefix := range volatileStandalonePrefixes { + if bytes.HasPrefix(payload, prefix) { + dropped = true + break + } + } + if dropped { + continue + } + if gitInlineVolatileRegex.Match(payload) { + cleaned := gitInlineVolatileRegex.ReplaceAll(payload, nil) + filtered = append(filtered, packet{typ: pktData, payload: cleaned}) + continue + } + filtered = append(filtered, p) + } + return encodePktLine(filtered) +} diff --git a/internal/gitproto/upload_pack_test.go b/internal/gitproto/upload_pack_test.go new file mode 100644 index 0000000..6c096d0 --- /dev/null +++ b/internal/gitproto/upload_pack_test.go @@ -0,0 +1,201 @@ +package gitproto + +import ( + "bytes" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// hashableKey condenses a normalized body to a comparable string for tests. +func normalize(body string) string { + return string(NormalizeUploadPackBody([]byte(body))) +} + +func TestIsUploadPackRequest(t *testing.T) { + cases := []struct { + name string + method string + url string + want bool + }{ + {"POST to upload-pack", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", true}, + {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", false}, + {"POST to other path", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", false}, + {"POST to non-git host", http.MethodPost, "https://api.github.com/graphql", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + req := httptest.NewRequest(tc.method, tc.url, nil) + if got := IsUploadPackRequest(req); got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestNormalizeUploadPackBody(t *testing.T) { + // All bodies use realistic OIDs and pkt-line lengths captured from + // github.com/octocat/Hello-World. + const ( + oidA = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d" + oidB = "b1b3f9723831141a31a1a7252a213e216ea76e56" + oidC = "b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf" + // A made-up have OID; have content is irrelevant since these get dropped. + oidH1 = "553c2077f0edc3d5dc5d17262f6aa498e69d6f8e" + oidH2 = "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2" + ) + + t.Run("same wants, different haves and different-length agents → identical", func(t *testing.T) { + // The flake fix: real-world drift in `have` lines and `agent=` versions + // (including different string lengths) must collapse to the same output. + body1 := "00a4want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0\n" + + "0032want " + oidB + "\n" + + "0000" + + "0032have " + oidH1 + "\n" + + "0009done\n" + body2 := "00a3want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.9.5\n" + + "0032want " + oidB + "\n" + + "0000" + + "0032have " + oidH2 + "\n" + + "0009done\n" + if normalize(body1) != normalize(body2) { + t.Errorf("normalized bodies differ:\n %q\n %q", normalize(body1), normalize(body2)) + } + }) + + t.Run("different wants → different", func(t *testing.T) { + body1 := "0032want " + oidA + "\n0009done\n" + body2 := "0032want " + oidC + "\n0009done\n" + if normalize(body1) == normalize(body2) { + t.Error("different wants must not collide") + } + }) + + t.Run("response-shaping fields are preserved", func(t *testing.T) { + // Each variant must produce a distinct normalized body. Otherwise a + // shallow clone could be served from a full-clone cache entry, etc. + bodies := map[string]string{ + "plain": "0032want " + oidA + "\n0009done\n", + "shallow": "000ddeepen 1\n0032want " + oidA + "\n0009done\n", + "shallow-deeper": "000ddeepen 2\n0032want " + oidA + "\n0009done\n", + "filter-blobless": "0015filter blob:none\n0032want " + oidA + "\n0009done\n", + "filter-treeless": "0012filter tree:0\n0032want " + oidA + "\n0009done\n", + "thin-pack-on": "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + + "0009done\n", + "thin-pack-off": "0076want " + oidA + " multi_ack_detailed no-done side-band-64k ofs-delta agent=git/2.43.0\n" + + "0009done\n", + } + seen := make(map[string]string) + for name, body := range bodies { + n := normalize(body) + if other, ok := seen[n]; ok { + t.Errorf("normalized collision: %q and %q produce the same bytes:\n %q", name, other, n) + } + seen[n] = name + } + }) + + t.Run("v2 ls-refs preserves ref-prefix differences", func(t *testing.T) { + body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" + body2 := "0014command=ls-refs\n0015agent=git/2.43.0\n001aref-prefix refs/tags/\n0000" + if normalize(body1) == normalize(body2) { + t.Error("different ref-prefix must not collide") + } + }) + + t.Run("v2 fetch with same wants different agents → identical", func(t *testing.T) { + body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want " + oidA + "\n0009done\n0000" + body2 := "0012command=fetch\n001bagent=git/2.53.0-Linux\n0001000ddeepen 1\n0032want " + oidA + "\n0009done\n0000" + if normalize(body1) != normalize(body2) { + t.Error("v2 agent drift must not affect normalization") + } + }) + + t.Run("v1 inline agent stripped, capability bytes preserved", func(t *testing.T) { + body1 := "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n0009done\n" + body2 := "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n0009done\n" + got1, got2 := normalize(body1), normalize(body2) + if got1 != got2 { + t.Errorf("agent-only difference must collapse:\n %q\n %q", got1, got2) + } + if !strings.Contains(got1, "thin-pack") { + t.Errorf("non-agent capabilities must be preserved, got %q", got1) + } + if strings.Contains(got1, "agent=") { + t.Errorf("agent= must be removed, got %q", got1) + } + }) + + t.Run("malformed body returned unchanged", func(t *testing.T) { + body := []byte("this is not pkt-line data") + got := NormalizeUploadPackBody(body) + if !bytes.Equal(got, body) { + t.Errorf("malformed body must be returned unchanged, got %q", got) + } + }) + + t.Run("empty body → empty output", func(t *testing.T) { + if got := NormalizeUploadPackBody(nil); len(got) != 0 { + t.Errorf("empty body should produce empty output, got %q", got) + } + }) + + t.Run("non-have payload that begins with 'have' substring is preserved", func(t *testing.T) { + // Defensive: the prefix check requires "have " (with space). A payload + // like "haven 123\n" must not be dropped. + body := "000ehaven foo\n0009done\n" + got := normalize(body) + if !strings.Contains(got, "haven foo") { + t.Errorf("non-have payload was incorrectly stripped: %q", got) + } + }) + + t.Run("v2 standalone session-id stripped (Git 2.36+ trace2 capability)", func(t *testing.T) { + // session-id is a per-invocation UUID added in Git 2.36 (April 2022). + // It is purely informational and must not affect the cache key. + // Payload "session-id=abcdef\n" = 18 bytes → pkt-line length 0016. + body1 := "0012command=fetch\n0015agent=git/2.43.0\n" + + "0016session-id=abcdef\n" + + "00010032want " + oidA + "\n0009done\n0000" + body2 := "0012command=fetch\n0015agent=git/2.43.0\n" + + "0016session-id=fedcba\n" + + "00010032want " + oidA + "\n0009done\n0000" + n1, n2 := normalize(body1), normalize(body2) + if n1 != n2 { + t.Errorf("session-id drift must not affect normalization:\n %q\n %q", n1, n2) + } + if strings.Contains(n1, "session-id=") { + t.Errorf("session-id= must be stripped, got %q", n1) + } + }) + + t.Run("v1 inline session-id stripped alongside agent", func(t *testing.T) { + // Newer git clients can include session-id as an inline capability on + // the first v1 want line. Both volatile tokens must collapse together. + body1 := "009awant " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0 session-id=aaa-1\n0009done\n" + body2 := "009awant " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0 session-id=zzz-2\n0009done\n" + n1, n2 := normalize(body1), normalize(body2) + if n1 != n2 { + t.Errorf("inline agent+session-id drift must collapse:\n %q\n %q", n1, n2) + } + if strings.Contains(n1, "session-id=") || strings.Contains(n1, "agent=") { + t.Errorf("volatile inline tokens must be removed, got %q", n1) + } + if !strings.Contains(n1, "thin-pack") { + t.Errorf("non-volatile capabilities must be preserved, got %q", n1) + } + }) + + t.Run("payload that merely starts with 'session-id' substring is preserved", func(t *testing.T) { + // Same defensive check as for "haven": only an exact "session-id=" + // prefix should trigger drop. A hypothetical future "session-ids=..." + // argument or a want-line containing the literal text must survive. + body := "0014session-ids=keep\n0009done\n" + got := normalize(body) + if !strings.Contains(got, "session-ids=keep") { + t.Errorf("non-session-id payload was incorrectly stripped: %q", got) + } + }) +} diff --git a/internal/pktline/pktline.go b/internal/pktline/pktline.go deleted file mode 100644 index 7854133..0000000 --- a/internal/pktline/pktline.go +++ /dev/null @@ -1,106 +0,0 @@ -// Package pktline implements parsing and encoding of the git pkt-line format. -// -// The pkt-line format is used by the git smart HTTP protocol (git-upload-pack, -// git-receive-pack) to frame variable-length data. Each line is prefixed with a -// 4-hex-digit length that includes itself, or is a special packet: -// -// - "0000" flush packet (stream boundary) -// - "0001" delimiter packet (section separator, protocol v2) -// - "0002" response-end packet -// - "0003" reserved -// - "0004"+ data packet with payload of (length - 4) bytes -// -// See https://git-scm.com/docs/protocol-common#_pkt_line_format -package pktline - -import ( - "fmt" - "strconv" -) - -// PacketType identifies the kind of pkt-line packet. -type PacketType int - -const ( - // Data is a normal data packet with a payload. - Data PacketType = iota - // Flush is the "0000" packet indicating a stream boundary. - Flush - // Delim is the "0001" packet separating sections in protocol v2. - Delim - // ResponseEnd is the "0002" packet indicating the end of a response. - ResponseEnd -) - -// Packet represents a single pkt-line packet. -type Packet struct { - Type PacketType - Payload []byte // nil for Flush, Delim, ResponseEnd -} - -// Parse parses a pkt-line byte stream into a slice of packets. -// -// The returned ok flag is true when the entire stream was consumed as -// well-formed pkt-line data. If the stream contains malformed data (bad -// length prefix, truncated packet), ok is false and the remainder of the -// stream is returned as a final Data packet containing the unparsed bytes -// verbatim, so callers can degrade gracefully rather than losing data. -func Parse(data []byte) (packets []Packet, ok bool) { - for len(data) > 0 { - if len(data) < 4 { - packets = append(packets, Packet{Type: Data, Payload: data}) - return packets, false - } - - length64, err := strconv.ParseUint(string(data[:4]), 16, 16) - if err != nil { - packets = append(packets, Packet{Type: Data, Payload: data}) - return packets, false - } - length := int(length64) - - switch { - case length == 0: - packets = append(packets, Packet{Type: Flush}) - data = data[4:] - case length == 1: - packets = append(packets, Packet{Type: Delim}) - data = data[4:] - case length == 2: - packets = append(packets, Packet{Type: ResponseEnd}) - data = data[4:] - case length == 3: - // Reserved; treat as opaque data - packets = append(packets, Packet{Type: Data, Payload: data[:4]}) - data = data[4:] - default: - if length > len(data) { - // Truncated packet; include remainder as-is. - packets = append(packets, Packet{Type: Data, Payload: data}) - return packets, false - } - packets = append(packets, Packet{Type: Data, Payload: data[4:length]}) - data = data[length:] - } - } - return packets, true -} - -// Encode serializes a slice of packets back into the pkt-line wire format. -func Encode(packets []Packet) []byte { - var buf []byte - for _, p := range packets { - switch p.Type { - case Flush: - buf = append(buf, "0000"...) - case Delim: - buf = append(buf, "0001"...) - case ResponseEnd: - buf = append(buf, "0002"...) - case Data: - buf = append(buf, fmt.Sprintf("%04x", 4+len(p.Payload))...) - buf = append(buf, p.Payload...) - } - } - return buf -} diff --git a/internal/pktline/pktline_test.go b/internal/pktline/pktline_test.go deleted file mode 100644 index 9cbfcd7..0000000 --- a/internal/pktline/pktline_test.go +++ /dev/null @@ -1,185 +0,0 @@ -package pktline - -import ( - "bytes" - "testing" -) - -func TestParseEmptyInput(t *testing.T) { - packets, ok := Parse(nil) - if !ok { - t.Error("expected ok=true for empty input") - } - if len(packets) != 0 { - t.Fatalf("expected 0 packets, got %d", len(packets)) - } -} - -func TestParseReservedLength3(t *testing.T) { - // Length 0x0003 is reserved; we treat the 4 prefix bytes as opaque data - // and continue parsing. - packets, ok := Parse([]byte("0003" + "0000")) - if !ok { - t.Error("expected ok=true; reserved length is treated as opaque, not an error") - } - if len(packets) != 2 { - t.Fatalf("expected 2 packets, got %d", len(packets)) - } - if packets[0].Type != Data || string(packets[0].Payload) != "0003" { - t.Errorf("packet 0 should be Data with payload %q, got type=%v payload=%q", - "0003", packets[0].Type, string(packets[0].Payload)) - } - if packets[1].Type != Flush { - t.Error("packet 1 should be Flush") - } -} - -func TestParseFlush(t *testing.T) { - packets, _ := Parse([]byte("0000")) - if len(packets) != 1 || packets[0].Type != Flush { - t.Fatal("expected single Flush packet") - } -} - -func TestParseDelim(t *testing.T) { - packets, _ := Parse([]byte("0001")) - if len(packets) != 1 || packets[0].Type != Delim { - t.Fatal("expected single Delim packet") - } -} - -func TestParseResponseEnd(t *testing.T) { - packets, _ := Parse([]byte("0002")) - if len(packets) != 1 || packets[0].Type != ResponseEnd { - t.Fatal("expected single ResponseEnd packet") - } -} - -func TestParseDataPacket(t *testing.T) { - // "000ahello\n" = length 10 (0x000a), payload "hello\n" - packets, _ := Parse([]byte("000ahello\n")) - if len(packets) != 1 { - t.Fatalf("expected 1 packet, got %d", len(packets)) - } - if packets[0].Type != Data { - t.Fatal("expected Data packet") - } - if string(packets[0].Payload) != "hello\n" { - t.Fatalf("expected payload %q, got %q", "hello\n", string(packets[0].Payload)) - } -} - -func TestParseMultiplePackets(t *testing.T) { - // Two data packets + flush - input := "000ahello\n" + "000aworld\n" + "0000" - packets, _ := Parse([]byte(input)) - if len(packets) != 3 { - t.Fatalf("expected 3 packets, got %d", len(packets)) - } - if packets[0].Type != Data || string(packets[0].Payload) != "hello\n" { - t.Error("first packet wrong") - } - if packets[1].Type != Data || string(packets[1].Payload) != "world\n" { - t.Error("second packet wrong") - } - if packets[2].Type != Flush { - t.Error("third packet should be Flush") - } -} - -func TestParseMalformedLength(t *testing.T) { - // "gggg" is not valid hex — should return as raw data with ok=false - packets, ok := Parse([]byte("gggghello")) - if ok { - t.Error("expected ok=false for malformed input") - } - if len(packets) != 1 || packets[0].Type != Data { - t.Fatal("expected single raw Data packet for malformed input") - } - if string(packets[0].Payload) != "gggghello" { - t.Fatalf("expected full input as payload, got %q", string(packets[0].Payload)) - } -} - -func TestParseTruncatedPacket(t *testing.T) { - // Claims length 0x0020 (32 bytes) but only has 9 bytes total - packets, ok := Parse([]byte("0020short")) - if ok { - t.Error("expected ok=false for truncated input") - } - if len(packets) != 1 || packets[0].Type != Data { - t.Fatal("expected single raw Data packet for truncated input") - } -} - -func TestParseRealV1Body(t *testing.T) { - // Real protocol v1 upload-pack body captured from git clone of octocat/Hello-World - input := "00a4want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0\n" + - "0032want b1b3f9723831141a31a1a7252a213e216ea76e56\n" + - "0000" + - "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n" + - "0009done\n" - packets, _ := Parse([]byte(input)) - if len(packets) != 5 { - t.Fatalf("expected 5 packets, got %d", len(packets)) - } - if packets[0].Type != Data { - t.Error("packet 0 should be Data (want with caps)") - } - if packets[1].Type != Data { - t.Error("packet 1 should be Data (want)") - } - if packets[2].Type != Flush { - t.Error("packet 2 should be Flush") - } - if packets[3].Type != Data { - t.Error("packet 3 should be Data (have)") - } - if packets[4].Type != Data { - t.Error("packet 4 should be Data (done)") - } -} - -func TestParseRealV2Body(t *testing.T) { - // Real protocol v2 fetch command - input := "0012command=fetch\n" + - "0015agent=git/2.43.0\n" + - "0001" + - "000ddeepen 1\n" + - "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n" + - "0009done\n" + - "0000" - packets, _ := Parse([]byte(input)) - if len(packets) != 7 { - t.Fatalf("expected 7 packets, got %d", len(packets)) - } - if packets[0].Type != Data || string(packets[0].Payload) != "command=fetch\n" { - t.Error("packet 0 should be command=fetch") - } - if packets[1].Type != Data || string(packets[1].Payload) != "agent=git/2.43.0\n" { - t.Error("packet 1 should be agent=") - } - if packets[2].Type != Delim { - t.Error("packet 2 should be Delim") - } - if packets[6].Type != Flush { - t.Error("packet 6 should be Flush") - } -} - -func TestEncodeRoundTrip(t *testing.T) { - input := []byte("000ahello\n" + "0000" + "0001" + "000aworld\n" + "0002") - packets, _ := Parse(input) - output := Encode(packets) - if !bytes.Equal(input, output) { - t.Fatalf("round-trip failed:\n input: %q\n output: %q", input, output) - } -} - -func TestEncodeEmptyPayload(t *testing.T) { - packets := []Packet{{Type: Flush}, {Type: Delim}} - output := Encode(packets) - if string(output) != "00000001" { - t.Fatalf("expected %q, got %q", "00000001", string(output)) - } -} From 89bbd7a300b7a5f97f58c26c4a9550b91e0d7de6 Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 21:42:12 +0000 Subject: [PATCH 4/9] Refactor pkt-line handling and improve test descriptions for clarity --- internal/gitproto/pktline.go | 80 +++++++++++++++------------ internal/gitproto/upload_pack.go | 51 +++++++++-------- internal/gitproto/upload_pack_test.go | 8 ++- 3 files changed, 79 insertions(+), 60 deletions(-) diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go index 9e93a26..fc38335 100644 --- a/internal/gitproto/pktline.go +++ b/internal/gitproto/pktline.go @@ -1,9 +1,6 @@ package gitproto -import ( - "fmt" - "strconv" -) +import "strconv" // pkt-line is the framing format used by the git smart-HTTP protocol // (git-upload-pack, git-receive-pack). Each line is prefixed with a 4-hex-digit @@ -16,6 +13,8 @@ import ( // // See https://git-scm.com/docs/protocol-common#_pkt_line_format +const hexDigits = "0123456789abcdef" + type pktType int const ( @@ -27,58 +26,54 @@ const ( type packet struct { typ pktType - payload []byte // nil for non-Data packets + payload []byte // nil for non-data packets } -// parsePktLine parses a pkt-line byte stream into packets. The returned ok flag -// is false when the stream contains malformed or truncated data; the unparsed -// remainder is appended as a final Data packet so callers can degrade -// gracefully (e.g., fall back to hashing the raw input). +// parsePktLine parses a pkt-line byte stream into packets. The returned ok +// flag is false when the stream contains malformed or truncated data, in +// which case the packets slice is nil and callers should fall back to +// treating the original input opaquely (e.g., hashing it whole). func parsePktLine(data []byte) (packets []packet, ok bool) { for len(data) > 0 { if len(data) < 4 { - packets = append(packets, packet{typ: pktData, payload: data}) - return packets, false + return nil, false } - length64, err := strconv.ParseUint(string(data[:4]), 16, 16) + parsed, err := strconv.ParseUint(string(data[:4]), 16, 16) if err != nil { - packets = append(packets, packet{typ: pktData, payload: data}) - return packets, false + return nil, false } - length := int(length64) - switch { - case length == 0: + n := int(parsed) + switch n { + case 0: packets = append(packets, packet{typ: pktFlush}) data = data[4:] - case length == 1: + case 1: packets = append(packets, packet{typ: pktDelim}) data = data[4:] - case length == 2: + case 2: packets = append(packets, packet{typ: pktResponseEnd}) data = data[4:] - case length == 3: - // Reserved by the spec; not used by real git. Treat as malformed - // so callers fall back to full-input behaviour. - packets = append(packets, packet{typ: pktData, payload: data}) - return packets, false + case 3: + // Reserved by the spec; not used by real git. Treat as + // malformed so callers fall back to full-input behaviour. + return nil, false default: - if length > len(data) { - packets = append(packets, packet{typ: pktData, payload: data}) - return packets, false + if n > len(data) { + return nil, false } - packets = append(packets, packet{typ: pktData, payload: data[4:length]}) - data = data[length:] + packets = append(packets, packet{typ: pktData, payload: data[4:n]}) + data = data[n:] } } return packets, true } // encodePktLine serializes packets back into the pkt-line wire format. -// Re-encoding recomputes each Data packet's length prefix, which is what makes -// upstream normalization correct across requests whose data payloads differ -// only in length (e.g. different agent string lengths). +// Re-encoding recomputes each data packet's length prefix, which is what +// makes upstream normalization correct across requests whose payloads +// differ only in length (e.g. different agent string lengths). func encodePktLine(packets []packet) []byte { - var buf []byte + buf := make([]byte, 0, encodedSize(packets)) for _, p := range packets { switch p.typ { case pktFlush: @@ -88,9 +83,26 @@ func encodePktLine(packets []packet) []byte { case pktResponseEnd: buf = append(buf, "0002"...) case pktData: - buf = append(buf, fmt.Sprintf("%04x", 4+len(p.payload))...) + n := 4 + len(p.payload) + buf = append(buf, + hexDigits[(n>>12)&0xf], + hexDigits[(n>>8)&0xf], + hexDigits[(n>>4)&0xf], + hexDigits[n&0xf], + ) buf = append(buf, p.payload...) } } return buf } + +func encodedSize(packets []packet) int { + size := 0 + for _, p := range packets { + size += 4 + if p.typ == pktData { + size += len(p.payload) + } + } + return size +} diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go index d118b0f..9e08f79 100644 --- a/internal/gitproto/upload_pack.go +++ b/internal/gitproto/upload_pack.go @@ -1,5 +1,5 @@ -// Package gitproto implements a small set of operations on the git smart-HTTP -// wire protocol, in support of the proxy's disk cache. +// Package gitproto provides helpers for working with the git smart-HTTP wire +// protocol, in support of the proxy's disk cache. // // The exported surface is intentionally narrow: callers should only need // IsUploadPackRequest and NormalizeUploadPackBody. The pkt-line framing format @@ -13,22 +13,33 @@ import ( "strings" ) -// gitInlineVolatileRegex matches volatile inline capability tokens — currently -// " agent=" and " session-id=" — that appear as trailing -// capabilities on a v1 upload-pack want line. Real git always emits these -// after a leading space, so the leading-space anchor distinguishes them from -// a payload that merely starts with the same prefix. -var gitInlineVolatileRegex = regexp.MustCompile(` (?:agent|session-id)=[^ \r\n]*`) +// inlineVolatileTokenRegex matches volatile inline capability tokens — +// currently " agent=" and " session-id=" — that appear as +// trailing capabilities on a v1 upload-pack want line. Real git always emits +// these after a leading space, so the leading-space anchor distinguishes them +// from a payload that merely starts with the same prefix. +var inlineVolatileTokenRegex = regexp.MustCompile(` (?:agent|session-id)=[^ \r\n]*`) // volatileStandalonePrefixes lists payload prefixes whose entire pkt-line is -// dropped from the normalized body. These are protocol-v2 capability or -// negotiation lines that vary between semantically identical requests: +// dropped from the normalized body. These are negotiation or capability +// lines that vary between semantically identical requests: // -// - "have " local object negotiation state, varies per client +// - "have " v1 + v2 local object negotiation state, varies per client // - "agent=" v2 client version capability // - "session-id=" v2 trace2 session identifier (Git 2.36+), unique per invocation var volatileStandalonePrefixes = [][]byte{[]byte("have "), []byte("agent="), []byte("session-id=")} +// hasVolatilePrefix reports whether payload begins with any prefix in +// volatileStandalonePrefixes. +func hasVolatilePrefix(payload []byte) bool { + for _, prefix := range volatileStandalonePrefixes { + if bytes.HasPrefix(payload, prefix) { + return true + } + } + return false +} + // IsUploadPackRequest reports whether r is a POST to a git-upload-pack // endpoint, i.e. a smart-HTTP fetch negotiation. func IsUploadPackRequest(r *http.Request) bool { @@ -51,7 +62,7 @@ func IsUploadPackRequest(r *http.Request) bool { // All response-shaping fields (want, capabilities, command=, deepen, filter, // shallow, ref-prefix, object-format, ...) are preserved, and special framing // packets (flush, delim, response-end) are preserved verbatim. Each retained -// Data packet is re-emitted with a recomputed length prefix, so two requests +// data packet is re-emitted with a recomputed length prefix, so two requests // that differ only in the length of a volatile field still hash identically. // // If the body is not valid pkt-line data, it is returned unchanged so callers @@ -73,19 +84,13 @@ func NormalizeUploadPackBody(data []byte) []byte { filtered = append(filtered, p) continue } - payload := p.payload - dropped := false - for _, prefix := range volatileStandalonePrefixes { - if bytes.HasPrefix(payload, prefix) { - dropped = true - break - } - } - if dropped { + if hasVolatilePrefix(p.payload) { continue } - if gitInlineVolatileRegex.Match(payload) { - cleaned := gitInlineVolatileRegex.ReplaceAll(payload, nil) + // Match-then-Replace avoids an allocation on the common no-match path + // (most pkt-lines do not carry inline capability tokens). + if inlineVolatileTokenRegex.Match(p.payload) { + cleaned := inlineVolatileTokenRegex.ReplaceAll(p.payload, nil) filtered = append(filtered, packet{typ: pktData, payload: cleaned}) continue } diff --git a/internal/gitproto/upload_pack_test.go b/internal/gitproto/upload_pack_test.go index 6c096d0..a42638f 100644 --- a/internal/gitproto/upload_pack_test.go +++ b/internal/gitproto/upload_pack_test.go @@ -8,7 +8,8 @@ import ( "testing" ) -// hashableKey condenses a normalized body to a comparable string for tests. +// normalize is a test helper that runs NormalizeUploadPackBody and returns the +// result as a string for direct comparison. func normalize(body string) string { return string(NormalizeUploadPackBody([]byte(body))) } @@ -23,7 +24,7 @@ func TestIsUploadPackRequest(t *testing.T) { {"POST to upload-pack", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", true}, {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", false}, {"POST to other path", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", false}, - {"POST to non-git host", http.MethodPost, "https://api.github.com/graphql", false}, + {"POST to non-git path", http.MethodPost, "https://api.github.com/graphql", false}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { @@ -42,7 +43,8 @@ func TestNormalizeUploadPackBody(t *testing.T) { oidA = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d" oidB = "b1b3f9723831141a31a1a7252a213e216ea76e56" oidC = "b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf" - // A made-up have OID; have content is irrelevant since these get dropped. + // have OIDs are arbitrary — these pkt-lines are dropped during + // normalization, so their content is irrelevant to the test. oidH1 = "553c2077f0edc3d5dc5d17262f6aa498e69d6f8e" oidH2 = "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2" ) From e2835ea3b922b977f90dcea73f0a923ebd590ea8 Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 21:54:44 +0000 Subject: [PATCH 5/9] Enhance git-upload-pack request handling and tests for improved normalization and validation --- internal/cache/handlers_test.go | 43 +++++++++++++++++++++++++-- internal/gitproto/upload_pack.go | 22 ++++++++++++-- internal/gitproto/upload_pack_test.go | 26 +++++++++++----- 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/internal/cache/handlers_test.go b/internal/cache/handlers_test.go index 3a6fd02..1d76029 100644 --- a/internal/cache/handlers_test.go +++ b/internal/cache/handlers_test.go @@ -251,19 +251,39 @@ func Test_key(t *testing.T) { // the normalizer itself is exercised in internal/gitproto. t.Run("git-upload-pack body is normalized before hashing", func(t *testing.T) { const url = "https://github.com/octocat/Hello-World.git/git-upload-pack" + const ct = "application/x-git-upload-pack-request" // Two bodies that differ only in volatile fields (have line + agent). body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n" + "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" - k1 := key(httptest.NewRequest("POST", url, strings.NewReader(body1))) - k2 := key(httptest.NewRequest("POST", url, strings.NewReader(body2))) - if k1 != k2 { + mkReq := func(body string) *http.Request { + r := httptest.NewRequest("POST", url, strings.NewReader(body)) + r.Header.Set("Content-Type", ct) + return r + } + if key(mkReq(body1)) != key(mkReq(body2)) { t.Error("git-upload-pack POSTs differing only in have/agent must produce the same cache key") } }) + t.Run("malformed git-upload-pack body falls back to raw-body hashing", func(t *testing.T) { + // Even with the right URL, method, and Content-Type, a body that fails + // pkt-line parsing must produce distinct cache keys for distinct bytes + // (cache miss is acceptable; collision is not). + const url = "https://github.com/octocat/Hello-World.git/git-upload-pack" + const ct = "application/x-git-upload-pack-request" + mkReq := func(body string) *http.Request { + r := httptest.NewRequest("POST", url, strings.NewReader(body)) + r.Header.Set("Content-Type", ct) + return r + } + if key(mkReq("garbage one")) == key(mkReq("garbage two")) { + t.Error("malformed bodies must hash distinctly") + } + }) + t.Run("non-git POST with similar substrings is not normalized", func(t *testing.T) { // Confirms the gitproto integration is gated by IsUploadPackRequest. body1 := `{"q":"have stuff agent=foo"}` @@ -275,6 +295,23 @@ func Test_key(t *testing.T) { t.Error("non-git POSTs must not be normalized") } }) + + t.Run("POST to fake /git-upload-pack URL without Content-Type is not normalized", func(t *testing.T) { + // Defense in depth: the Content-Type gate prevents non-git traffic that + // happens to share the path suffix from being routed through pkt-line + // normalization. + const url = "https://example.com/foo/git-upload-pack" + // Bodies that, if normalized, would collide because they only differ in + // would-be "have"/"agent" tokens. + body1 := "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" + body2 := "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" + + k1 := key(httptest.NewRequest("POST", url, strings.NewReader(body1))) + k2 := key(httptest.NewRequest("POST", url, strings.NewReader(body2))) + if k1 == k2 { + t.Error("non-git POST without Content-Type must not be normalized") + } + }) } type BufferWithClose struct { diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go index 9e08f79..adb89a8 100644 --- a/internal/gitproto/upload_pack.go +++ b/internal/gitproto/upload_pack.go @@ -40,10 +40,26 @@ func hasVolatilePrefix(payload []byte) bool { return false } -// IsUploadPackRequest reports whether r is a POST to a git-upload-pack -// endpoint, i.e. a smart-HTTP fetch negotiation. +// uploadPackContentType is the request Content-Type sent by all real git +// clients on a smart-HTTP fetch. See https://git-scm.com/docs/http-protocol. +const uploadPackContentType = "application/x-git-upload-pack-request" + +// IsUploadPackRequest reports whether r is a smart-HTTP git-upload-pack POST, +// i.e. a fetch negotiation. It checks method, path suffix, and Content-Type +// together so that a non-git POST to an arbitrary URL ending in +// "/git-upload-pack" cannot accidentally be routed through pkt-line +// normalization. func IsUploadPackRequest(r *http.Request) bool { - return r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/git-upload-pack") + if r.Method != http.MethodPost { + return false + } + if !strings.HasSuffix(r.URL.Path, "/git-upload-pack") { + return false + } + // Content-Type may include parameters (e.g. "; charset=utf-8"), though + // real git never sends them. Match on the media-type prefix. + ct := r.Header.Get("Content-Type") + return ct == uploadPackContentType || strings.HasPrefix(ct, uploadPackContentType+";") } // NormalizeUploadPackBody returns a normalized form of a git-upload-pack POST diff --git a/internal/gitproto/upload_pack_test.go b/internal/gitproto/upload_pack_test.go index a42638f..daa91f7 100644 --- a/internal/gitproto/upload_pack_test.go +++ b/internal/gitproto/upload_pack_test.go @@ -15,20 +15,30 @@ func normalize(body string) string { } func TestIsUploadPackRequest(t *testing.T) { + const ct = "application/x-git-upload-pack-request" cases := []struct { - name string - method string - url string - want bool + name string + method string + url string + contentType string + want bool }{ - {"POST to upload-pack", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", true}, - {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", false}, - {"POST to other path", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", false}, - {"POST to non-git path", http.MethodPost, "https://api.github.com/graphql", false}, + {"real git POST", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, true}, + {"real git POST with charset parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + "; charset=utf-8", true}, + {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, false}, + {"POST to other git path (info/refs)", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", ct, false}, + {"POST to non-git path", http.MethodPost, "https://api.github.com/graphql", "application/json", false}, + {"POST to fake upload-pack path with wrong Content-Type", http.MethodPost, "https://example.com/foo/git-upload-pack", "application/json", false}, + {"POST to upload-pack path with no Content-Type", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "", false}, + {"path ends in git-upload-pack but no leading slash", http.MethodPost, "https://example.com/notgit-upload-pack", ct, false}, + {"path has trailing segment after git-upload-pack", http.MethodPost, "https://github.com/foo.git/git-upload-pack/extra", ct, false}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { req := httptest.NewRequest(tc.method, tc.url, nil) + if tc.contentType != "" { + req.Header.Set("Content-Type", tc.contentType) + } if got := IsUploadPackRequest(req); got != tc.want { t.Errorf("got %v, want %v", got, tc.want) } From f12f61c4d2d5a023467b9b92cd67d306dc742191 Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 21:56:24 +0000 Subject: [PATCH 6/9] Refactor hex parsing in pkt-line handling to eliminate string allocation and improve performance --- internal/gitproto/pktline.go | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go index fc38335..bfa1b5c 100644 --- a/internal/gitproto/pktline.go +++ b/internal/gitproto/pktline.go @@ -1,7 +1,5 @@ package gitproto -import "strconv" - // pkt-line is the framing format used by the git smart-HTTP protocol // (git-upload-pack, git-receive-pack). Each line is prefixed with a 4-hex-digit // length that includes itself, or is a special packet: @@ -29,6 +27,27 @@ type packet struct { payload []byte // nil for non-data packets } +// parseHex4 decodes a 4-byte ASCII hex prefix into its integer value without +// allocating an intermediate string. Returns ok=false on any non-hex byte. +func parseHex4(b []byte) (n int, ok bool) { + for i := 0; i < 4; i++ { + c := b[i] + var v int + switch { + case c >= '0' && c <= '9': + v = int(c - '0') + case c >= 'a' && c <= 'f': + v = int(c-'a') + 10 + case c >= 'A' && c <= 'F': + v = int(c-'A') + 10 + default: + return 0, false + } + n = n<<4 | v + } + return n, true +} + // parsePktLine parses a pkt-line byte stream into packets. The returned ok // flag is false when the stream contains malformed or truncated data, in // which case the packets slice is nil and callers should fall back to @@ -38,11 +57,10 @@ func parsePktLine(data []byte) (packets []packet, ok bool) { if len(data) < 4 { return nil, false } - parsed, err := strconv.ParseUint(string(data[:4]), 16, 16) - if err != nil { + n, ok := parseHex4(data[:4]) + if !ok { return nil, false } - n := int(parsed) switch n { case 0: packets = append(packets, packet{typ: pktFlush}) From b4c94e57703ada76fbdd5ef7ed62d83d686b8b9e Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 22:09:13 +0000 Subject: [PATCH 7/9] Refactor pkt-line comments for clarity and enhance IsUploadPackRequest to handle case-insensitivity and whitespace in Content-Type --- internal/gitproto/pktline.go | 16 +++++++++++----- internal/gitproto/upload_pack.go | 18 ++++++++++-------- internal/gitproto/upload_pack_test.go | 2 ++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go index bfa1b5c..affd0f2 100644 --- a/internal/gitproto/pktline.go +++ b/internal/gitproto/pktline.go @@ -4,15 +4,17 @@ package gitproto // (git-upload-pack, git-receive-pack). Each line is prefixed with a 4-hex-digit // length that includes itself, or is a special packet: // -// - "0000" flush (stream boundary) -// - "0001" delim (section separator, protocol v2) -// - "0002" response-end -// - "0004"+ data packet with payload of (length - 4) bytes +// - "0000" flush (stream boundary) +// - "0001" delim (section separator, protocol v2) +// - "0002" response-end +// - any length >= 4 data packet with payload of (length - 4) bytes // // See https://git-scm.com/docs/protocol-common#_pkt_line_format const hexDigits = "0123456789abcdef" +// pktType distinguishes ordinary data packets from the three special framing +// packets defined by the protocol (flush, delim, response-end). type pktType int const ( @@ -22,9 +24,11 @@ const ( pktResponseEnd ) +// packet is one parsed pkt-line. payload is set only when typ == pktData and +// excludes the 4-byte length prefix. type packet struct { typ pktType - payload []byte // nil for non-data packets + payload []byte } // parseHex4 decodes a 4-byte ASCII hex prefix into its integer value without @@ -114,6 +118,8 @@ func encodePktLine(packets []packet) []byte { return buf } +// encodedSize returns the exact wire size of packets, used to pre-allocate the +// output buffer in encodePktLine without intermediate growth. func encodedSize(packets []packet) int { size := 0 for _, p := range packets { diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go index adb89a8..1e6ebc4 100644 --- a/internal/gitproto/upload_pack.go +++ b/internal/gitproto/upload_pack.go @@ -8,11 +8,16 @@ package gitproto import ( "bytes" + "mime" "net/http" "regexp" "strings" ) +// uploadPackContentType is the request Content-Type sent by all real git +// clients on a smart-HTTP fetch. See https://git-scm.com/docs/http-protocol. +const uploadPackContentType = "application/x-git-upload-pack-request" + // inlineVolatileTokenRegex matches volatile inline capability tokens — // currently " agent=" and " session-id=" — that appear as // trailing capabilities on a v1 upload-pack want line. Real git always emits @@ -40,10 +45,6 @@ func hasVolatilePrefix(payload []byte) bool { return false } -// uploadPackContentType is the request Content-Type sent by all real git -// clients on a smart-HTTP fetch. See https://git-scm.com/docs/http-protocol. -const uploadPackContentType = "application/x-git-upload-pack-request" - // IsUploadPackRequest reports whether r is a smart-HTTP git-upload-pack POST, // i.e. a fetch negotiation. It checks method, path suffix, and Content-Type // together so that a non-git POST to an arbitrary URL ending in @@ -56,10 +57,11 @@ func IsUploadPackRequest(r *http.Request) bool { if !strings.HasSuffix(r.URL.Path, "/git-upload-pack") { return false } - // Content-Type may include parameters (e.g. "; charset=utf-8"), though - // real git never sends them. Match on the media-type prefix. - ct := r.Header.Get("Content-Type") - return ct == uploadPackContentType || strings.HasPrefix(ct, uploadPackContentType+";") + // Per RFC 7231 §3.1.1.1 media types are case-insensitive and may carry + // optional parameters and surrounding whitespace; mime.ParseMediaType + // normalizes all of that to the canonical lowercase media type. + mediaType, _, err := mime.ParseMediaType(r.Header.Get("Content-Type")) + return err == nil && mediaType == uploadPackContentType } // NormalizeUploadPackBody returns a normalized form of a git-upload-pack POST diff --git a/internal/gitproto/upload_pack_test.go b/internal/gitproto/upload_pack_test.go index daa91f7..3d228cc 100644 --- a/internal/gitproto/upload_pack_test.go +++ b/internal/gitproto/upload_pack_test.go @@ -25,6 +25,8 @@ func TestIsUploadPackRequest(t *testing.T) { }{ {"real git POST", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, true}, {"real git POST with charset parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + "; charset=utf-8", true}, + {"real git POST with uppercase media type", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "Application/X-Git-Upload-Pack-Request", true}, + {"real git POST with extra whitespace around parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + " ; charset=utf-8", true}, {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, false}, {"POST to other git path (info/refs)", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", ct, false}, {"POST to non-git path", http.MethodPost, "https://api.github.com/graphql", "application/json", false}, From db6135d0180858639fec33662f9b9beea65b4aa6 Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 22:26:45 +0000 Subject: [PATCH 8/9] Refactor git-upload-pack request handling and tests for improved normalization and clarity --- internal/cache/handlers_test.go | 81 ++++++------- internal/gitproto/pktline.go | 38 ++----- internal/gitproto/upload_pack.go | 84 +++++--------- internal/gitproto/upload_pack_test.go | 158 ++++++++++---------------- 4 files changed, 136 insertions(+), 225 deletions(-) diff --git a/internal/cache/handlers_test.go b/internal/cache/handlers_test.go index 1d76029..a722543 100644 --- a/internal/cache/handlers_test.go +++ b/internal/cache/handlers_test.go @@ -246,70 +246,59 @@ func Test_key(t *testing.T) { } }) - // Integration check: git-upload-pack POSTs are routed through - // gitproto.NormalizeUploadPackBody before hashing. Edge-case behaviour of - // the normalizer itself is exercised in internal/gitproto. - t.Run("git-upload-pack body is normalized before hashing", func(t *testing.T) { - const url = "https://github.com/octocat/Hello-World.git/git-upload-pack" - const ct = "application/x-git-upload-pack-request" - // Two bodies that differ only in volatile fields (have line + agent). + // Integration tests for the gitproto hookup. Edge-case behaviour of the + // normalizer itself lives in internal/gitproto. + const upUrl = "https://github.com/octocat/Hello-World.git/git-upload-pack" + const upCT = "application/x-git-upload-pack-request" + mkUpReq := func(url, ct, body string) *http.Request { + r := httptest.NewRequest("POST", url, strings.NewReader(body)) + if ct != "" { + r.Header.Set("Content-Type", ct) + } + return r + } + + t.Run("git-upload-pack: agent= drift collapses to one key", func(t *testing.T) { body1 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" body2 := "0080want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n" + - "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" - - mkReq := func(body string) *http.Request { - r := httptest.NewRequest("POST", url, strings.NewReader(body)) - r.Header.Set("Content-Type", ct) - return r - } - if key(mkReq(body1)) != key(mkReq(body2)) { - t.Error("git-upload-pack POSTs differing only in have/agent must produce the same cache key") + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" + if key(mkUpReq(upUrl, upCT, body1)) != key(mkUpReq(upUrl, upCT, body2)) { + t.Error("agent-only difference must collapse") } }) - t.Run("malformed git-upload-pack body falls back to raw-body hashing", func(t *testing.T) { - // Even with the right URL, method, and Content-Type, a body that fails - // pkt-line parsing must produce distinct cache keys for distinct bytes - // (cache miss is acceptable; collision is not). - const url = "https://github.com/octocat/Hello-World.git/git-upload-pack" - const ct = "application/x-git-upload-pack-request" - mkReq := func(body string) *http.Request { - r := httptest.NewRequest("POST", url, strings.NewReader(body)) - r.Header.Set("Content-Type", ct) - return r + t.Run("git-upload-pack: different haves hash distinctly", func(t *testing.T) { + body1 := "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0000" + + "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" + body2 := "0032want 7fd1a60b01f91b314f59955a4e4d4e80d8edf11d\n0000" + + "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" + if key(mkUpReq(upUrl, upCT, body1)) == key(mkUpReq(upUrl, upCT, body2)) { + t.Error("haves shape the upstream pack and must not collapse") } - if key(mkReq("garbage one")) == key(mkReq("garbage two")) { + }) + + t.Run("git-upload-pack: malformed body falls back to raw hashing", func(t *testing.T) { + if key(mkUpReq(upUrl, upCT, "garbage one")) == key(mkUpReq(upUrl, upCT, "garbage two")) { t.Error("malformed bodies must hash distinctly") } }) - t.Run("non-git POST with similar substrings is not normalized", func(t *testing.T) { - // Confirms the gitproto integration is gated by IsUploadPackRequest. - body1 := `{"q":"have stuff agent=foo"}` - body2 := `{"q":"have other agent=bar"}` - - k1 := key(httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body1))) - k2 := key(httptest.NewRequest("POST", "https://api.github.com/graphql", strings.NewReader(body2))) + t.Run("non-git POST is not normalized even with similar substrings", func(t *testing.T) { + const u = "https://api.github.com/graphql" + k1 := key(httptest.NewRequest("POST", u, strings.NewReader(`{"q":"have stuff agent=foo"}`))) + k2 := key(httptest.NewRequest("POST", u, strings.NewReader(`{"q":"have other agent=bar"}`))) if k1 == k2 { t.Error("non-git POSTs must not be normalized") } }) - t.Run("POST to fake /git-upload-pack URL without Content-Type is not normalized", func(t *testing.T) { - // Defense in depth: the Content-Type gate prevents non-git traffic that - // happens to share the path suffix from being routed through pkt-line - // normalization. - const url = "https://example.com/foo/git-upload-pack" - // Bodies that, if normalized, would collide because they only differ in - // would-be "have"/"agent" tokens. + t.Run("upload-pack path without Content-Type is not normalized", func(t *testing.T) { + const u = "https://example.com/foo/git-upload-pack" body1 := "0032have 553c2077f0edc3d5dc5d17262f6aa498e69d6f8e\n0009done\n" body2 := "0032have a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2\n0009done\n" - - k1 := key(httptest.NewRequest("POST", url, strings.NewReader(body1))) - k2 := key(httptest.NewRequest("POST", url, strings.NewReader(body2))) - if k1 == k2 { - t.Error("non-git POST without Content-Type must not be normalized") + if key(mkUpReq(u, "", body1)) == key(mkUpReq(u, "", body2)) { + t.Error("missing Content-Type must skip normalization") } }) } diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go index affd0f2..ca6505f 100644 --- a/internal/gitproto/pktline.go +++ b/internal/gitproto/pktline.go @@ -1,20 +1,13 @@ package gitproto -// pkt-line is the framing format used by the git smart-HTTP protocol -// (git-upload-pack, git-receive-pack). Each line is prefixed with a 4-hex-digit -// length that includes itself, or is a special packet: -// -// - "0000" flush (stream boundary) -// - "0001" delim (section separator, protocol v2) -// - "0002" response-end -// - any length >= 4 data packet with payload of (length - 4) bytes -// +// pkt-line is git's smart-HTTP framing format. Each line begins with a +// 4-hex-digit length (including itself), or is one of three special packets: +// "0000" flush, "0001" delim (v2), "0002" response-end. Any length >= 4 is a +// data packet whose payload is (length - 4) bytes. // See https://git-scm.com/docs/protocol-common#_pkt_line_format const hexDigits = "0123456789abcdef" -// pktType distinguishes ordinary data packets from the three special framing -// packets defined by the protocol (flush, delim, response-end). type pktType int const ( @@ -24,15 +17,13 @@ const ( pktResponseEnd ) -// packet is one parsed pkt-line. payload is set only when typ == pktData and -// excludes the 4-byte length prefix. +// payload is set only when typ == pktData and excludes the length prefix. type packet struct { typ pktType payload []byte } -// parseHex4 decodes a 4-byte ASCII hex prefix into its integer value without -// allocating an intermediate string. Returns ok=false on any non-hex byte. +// parseHex4 decodes a 4-byte ASCII hex prefix without allocating a string. func parseHex4(b []byte) (n int, ok bool) { for i := 0; i < 4; i++ { c := b[i] @@ -52,10 +43,8 @@ func parseHex4(b []byte) (n int, ok bool) { return n, true } -// parsePktLine parses a pkt-line byte stream into packets. The returned ok -// flag is false when the stream contains malformed or truncated data, in -// which case the packets slice is nil and callers should fall back to -// treating the original input opaquely (e.g., hashing it whole). +// parsePktLine returns ok=false on malformed or truncated input so callers +// can fall back to opaque hashing of the original bytes. func parsePktLine(data []byte) (packets []packet, ok bool) { for len(data) > 0 { if len(data) < 4 { @@ -76,8 +65,7 @@ func parsePktLine(data []byte) (packets []packet, ok bool) { packets = append(packets, packet{typ: pktResponseEnd}) data = data[4:] case 3: - // Reserved by the spec; not used by real git. Treat as - // malformed so callers fall back to full-input behaviour. + // Reserved; not used by real git. Treat as malformed. return nil, false default: if n > len(data) { @@ -90,10 +78,8 @@ func parsePktLine(data []byte) (packets []packet, ok bool) { return packets, true } -// encodePktLine serializes packets back into the pkt-line wire format. -// Re-encoding recomputes each data packet's length prefix, which is what -// makes upstream normalization correct across requests whose payloads -// differ only in length (e.g. different agent string lengths). +// encodePktLine recomputes each data packet's length prefix, which is what +// makes normalization stable across payloads of differing length. func encodePktLine(packets []packet) []byte { buf := make([]byte, 0, encodedSize(packets)) for _, p := range packets { @@ -118,8 +104,6 @@ func encodePktLine(packets []packet) []byte { return buf } -// encodedSize returns the exact wire size of packets, used to pre-allocate the -// output buffer in encodePktLine without intermediate growth. func encodedSize(packets []packet) int { size := 0 for _, p := range packets { diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go index 1e6ebc4..112002c 100644 --- a/internal/gitproto/upload_pack.go +++ b/internal/gitproto/upload_pack.go @@ -1,9 +1,7 @@ -// Package gitproto provides helpers for working with the git smart-HTTP wire -// protocol, in support of the proxy's disk cache. +// Package gitproto provides helpers for stabilizing git smart-HTTP cache keys. // -// The exported surface is intentionally narrow: callers should only need -// IsUploadPackRequest and NormalizeUploadPackBody. The pkt-line framing format -// is an internal implementation detail. +// Only IsUploadPackRequest and NormalizeUploadPackBody are exported; the +// pkt-line framing parser is an implementation detail. package gitproto import ( @@ -14,28 +12,22 @@ import ( "strings" ) -// uploadPackContentType is the request Content-Type sent by all real git -// clients on a smart-HTTP fetch. See https://git-scm.com/docs/http-protocol. +// uploadPackContentType is the media type real git clients send on a fetch. +// See https://git-scm.com/docs/http-protocol. const uploadPackContentType = "application/x-git-upload-pack-request" -// inlineVolatileTokenRegex matches volatile inline capability tokens — -// currently " agent=" and " session-id=" — that appear as -// trailing capabilities on a v1 upload-pack want line. Real git always emits -// these after a leading space, so the leading-space anchor distinguishes them -// from a payload that merely starts with the same prefix. +// inlineVolatileTokenRegex matches " agent=…" / " session-id=…" tokens +// trailing a v1 capability list. The leading space anchors the match so a +// payload that merely starts with the same text is not affected. var inlineVolatileTokenRegex = regexp.MustCompile(` (?:agent|session-id)=[^ \r\n]*`) // volatileStandalonePrefixes lists payload prefixes whose entire pkt-line is -// dropped from the normalized body. These are negotiation or capability -// lines that vary between semantically identical requests: +// dropped: per-process v2 capability lines that don't influence the pack. // -// - "have " v1 + v2 local object negotiation state, varies per client -// - "agent=" v2 client version capability -// - "session-id=" v2 trace2 session identifier (Git 2.36+), unique per invocation -var volatileStandalonePrefixes = [][]byte{[]byte("have "), []byte("agent="), []byte("session-id=")} +// "have" lines are intentionally NOT here — they drive object negotiation and +// the upstream response depends on them. +var volatileStandalonePrefixes = [][]byte{[]byte("agent="), []byte("session-id=")} -// hasVolatilePrefix reports whether payload begins with any prefix in -// volatileStandalonePrefixes. func hasVolatilePrefix(payload []byte) bool { for _, prefix := range volatileStandalonePrefixes { if bytes.HasPrefix(payload, prefix) { @@ -45,11 +37,9 @@ func hasVolatilePrefix(payload []byte) bool { return false } -// IsUploadPackRequest reports whether r is a smart-HTTP git-upload-pack POST, -// i.e. a fetch negotiation. It checks method, path suffix, and Content-Type -// together so that a non-git POST to an arbitrary URL ending in -// "/git-upload-pack" cannot accidentally be routed through pkt-line -// normalization. +// IsUploadPackRequest reports whether r is a smart-HTTP git-upload-pack POST. +// All three of method, path suffix, and Content-Type must match so that an +// unrelated POST sharing the URL suffix isn't routed through normalization. func IsUploadPackRequest(r *http.Request) bool { if r.Method != http.MethodPost { return false @@ -57,40 +47,27 @@ func IsUploadPackRequest(r *http.Request) bool { if !strings.HasSuffix(r.URL.Path, "/git-upload-pack") { return false } - // Per RFC 7231 §3.1.1.1 media types are case-insensitive and may carry - // optional parameters and surrounding whitespace; mime.ParseMediaType - // normalizes all of that to the canonical lowercase media type. + // mime.ParseMediaType handles RFC 7231 case-insensitivity and parameter + // whitespace; we only care about the canonical media type. mediaType, _, err := mime.ParseMediaType(r.Header.Get("Content-Type")) return err == nil && mediaType == uploadPackContentType } -// NormalizeUploadPackBody returns a normalized form of a git-upload-pack POST -// body, suitable for use as a stable cache-key input. +// NormalizeUploadPackBody returns a stable cache-key input derived from a +// git-upload-pack POST body. The output is hash input only — never sent on +// the wire. // -// The output is opaque hash input — not intended to be sent on the wire. +// Stripped (per-process noise that doesn't shape the pack): +// - standalone "agent=" / "session-id=" pkt-lines (v2 capabilities) +// - inline " agent=" / " session-id=" tokens on v1 want lines // -// Normalization removes only fields that vary between semantically identical -// requests: +// Preserved (everything that can change the upstream response): wants, haves, +// capabilities, command=, deepen/shallow, filter, ref-prefix, object-format, +// and all framing packets. Re-encoding recomputes pkt-line length prefixes, +// so requests differing only in a stripped value's length still hash equal. // -// - "have" pkt-lines (local object negotiation state, varies per client) -// - Standalone "agent=" pkt-lines (protocol v2 client version) -// - Standalone "session-id=" pkt-lines (protocol v2 trace2 session id, Git 2.36+) -// - Inline " agent=" and " session-id=" tokens within v1 capability lines -// -// All response-shaping fields (want, capabilities, command=, deepen, filter, -// shallow, ref-prefix, object-format, ...) are preserved, and special framing -// packets (flush, delim, response-end) are preserved verbatim. Each retained -// data packet is re-emitted with a recomputed length prefix, so two requests -// that differ only in the length of a volatile field still hash identically. -// -// If the body is not valid pkt-line data, it is returned unchanged so callers -// fall back to full-body hashing (cache miss, never collision). -// -// Safety contract: this is safe to use as a cache key in environments where -// "have" sets are stable across runs (e.g. ephemeral containers with no -// pre-existing git objects, as used by Dependabot updaters). Under that -// assumption an upstream response generated for one normalized request body -// is valid for any request that normalizes to the same bytes. +// Malformed input is returned unchanged so callers fall back to opaque +// hashing (cache miss is acceptable; collision is not). func NormalizeUploadPackBody(data []byte) []byte { packets, ok := parsePktLine(data) if !ok { @@ -105,8 +82,7 @@ func NormalizeUploadPackBody(data []byte) []byte { if hasVolatilePrefix(p.payload) { continue } - // Match-then-Replace avoids an allocation on the common no-match path - // (most pkt-lines do not carry inline capability tokens). + // Match-then-Replace skips the alloc on the common no-match path. if inlineVolatileTokenRegex.Match(p.payload) { cleaned := inlineVolatileTokenRegex.ReplaceAll(p.payload, nil) filtered = append(filtered, packet{typ: pktData, payload: cleaned}) diff --git a/internal/gitproto/upload_pack_test.go b/internal/gitproto/upload_pack_test.go index 3d228cc..a9bf038 100644 --- a/internal/gitproto/upload_pack_test.go +++ b/internal/gitproto/upload_pack_test.go @@ -8,8 +8,6 @@ import ( "testing" ) -// normalize is a test helper that runs NormalizeUploadPackBody and returns the -// result as a string for direct comparison. func normalize(body string) string { return string(NormalizeUploadPackBody([]byte(body))) } @@ -25,13 +23,13 @@ func TestIsUploadPackRequest(t *testing.T) { }{ {"real git POST", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, true}, {"real git POST with charset parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + "; charset=utf-8", true}, - {"real git POST with uppercase media type", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "Application/X-Git-Upload-Pack-Request", true}, - {"real git POST with extra whitespace around parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + " ; charset=utf-8", true}, + {"uppercase media type (RFC 7231 case-insensitive)", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "Application/X-Git-Upload-Pack-Request", true}, + {"extra whitespace around parameter", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct + " ; charset=utf-8", true}, {"GET to upload-pack URL", http.MethodGet, "https://github.com/octocat/Hello-World.git/git-upload-pack", ct, false}, {"POST to other git path (info/refs)", http.MethodPost, "https://github.com/octocat/Hello-World.git/info/refs", ct, false}, {"POST to non-git path", http.MethodPost, "https://api.github.com/graphql", "application/json", false}, - {"POST to fake upload-pack path with wrong Content-Type", http.MethodPost, "https://example.com/foo/git-upload-pack", "application/json", false}, - {"POST to upload-pack path with no Content-Type", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "", false}, + {"fake upload-pack path with wrong Content-Type", http.MethodPost, "https://example.com/foo/git-upload-pack", "application/json", false}, + {"upload-pack path with no Content-Type", http.MethodPost, "https://github.com/octocat/Hello-World.git/git-upload-pack", "", false}, {"path ends in git-upload-pack but no leading slash", http.MethodPost, "https://example.com/notgit-upload-pack", ct, false}, {"path has trailing segment after git-upload-pack", http.MethodPost, "https://github.com/foo.git/git-upload-pack/extra", ct, false}, } @@ -48,38 +46,37 @@ func TestIsUploadPackRequest(t *testing.T) { } } +// Realistic OIDs and pkt-line lengths captured from github.com/octocat/Hello-World. +const ( + oidA = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d" + oidB = "b1b3f9723831141a31a1a7252a213e216ea76e56" + oidC = "b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf" + oidH1 = "553c2077f0edc3d5dc5d17262f6aa498e69d6f8e" + oidH2 = "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2" +) + func TestNormalizeUploadPackBody(t *testing.T) { - // All bodies use realistic OIDs and pkt-line lengths captured from - // github.com/octocat/Hello-World. - const ( - oidA = "7fd1a60b01f91b314f59955a4e4d4e80d8edf11d" - oidB = "b1b3f9723831141a31a1a7252a213e216ea76e56" - oidC = "b3cbd5bbd7e81436d2eee04537ea2b4c0cad4cdf" - // have OIDs are arbitrary — these pkt-lines are dropped during - // normalization, so their content is irrelevant to the test. - oidH1 = "553c2077f0edc3d5dc5d17262f6aa498e69d6f8e" - oidH2 = "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2" - ) - - t.Run("same wants, different haves and different-length agents → identical", func(t *testing.T) { - // The flake fix: real-world drift in `have` lines and `agent=` versions - // (including different string lengths) must collapse to the same output. + t.Run("agent= drift collapses; haves preserved", func(t *testing.T) { body1 := "00a4want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.43.0\n" + - "0032want " + oidB + "\n" + - "0000" + - "0032have " + oidH1 + "\n" + - "0009done\n" + "0032want " + oidB + "\n0000" + + "0032have " + oidH1 + "\n0009done\n" body2 := "00a3want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack no-progress ofs-delta deepen-since deepen-not agent=git/2.9.5\n" + - "0032want " + oidB + "\n" + - "0000" + - "0032have " + oidH2 + "\n" + - "0009done\n" + "0032want " + oidB + "\n0000" + + "0032have " + oidH1 + "\n0009done\n" if normalize(body1) != normalize(body2) { t.Errorf("normalized bodies differ:\n %q\n %q", normalize(body1), normalize(body2)) } }) - t.Run("different wants → different", func(t *testing.T) { + t.Run("different haves do not collide", func(t *testing.T) { + body1 := "0032want " + oidA + "\n0000" + "0032have " + oidH1 + "\n0009done\n" + body2 := "0032want " + oidA + "\n0000" + "0032have " + oidH2 + "\n0009done\n" + if normalize(body1) == normalize(body2) { + t.Error("haves drive the upstream pack and must not collapse") + } + }) + + t.Run("different wants do not collide", func(t *testing.T) { body1 := "0032want " + oidA + "\n0009done\n" body2 := "0032want " + oidC + "\n0009done\n" if normalize(body1) == normalize(body2) { @@ -87,31 +84,27 @@ func TestNormalizeUploadPackBody(t *testing.T) { } }) - t.Run("response-shaping fields are preserved", func(t *testing.T) { - // Each variant must produce a distinct normalized body. Otherwise a - // shallow clone could be served from a full-clone cache entry, etc. + t.Run("response-shaping fields stay distinct", func(t *testing.T) { bodies := map[string]string{ "plain": "0032want " + oidA + "\n0009done\n", "shallow": "000ddeepen 1\n0032want " + oidA + "\n0009done\n", "shallow-deeper": "000ddeepen 2\n0032want " + oidA + "\n0009done\n", "filter-blobless": "0015filter blob:none\n0032want " + oidA + "\n0009done\n", "filter-treeless": "0012filter tree:0\n0032want " + oidA + "\n0009done\n", - "thin-pack-on": "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n" + - "0009done\n", - "thin-pack-off": "0076want " + oidA + " multi_ack_detailed no-done side-band-64k ofs-delta agent=git/2.43.0\n" + - "0009done\n", + "thin-pack-on": "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n0009done\n", + "thin-pack-off": "0076want " + oidA + " multi_ack_detailed no-done side-band-64k ofs-delta agent=git/2.43.0\n0009done\n", } seen := make(map[string]string) for name, body := range bodies { n := normalize(body) if other, ok := seen[n]; ok { - t.Errorf("normalized collision: %q and %q produce the same bytes:\n %q", name, other, n) + t.Errorf("collision: %q == %q", name, other) } seen[n] = name } }) - t.Run("v2 ls-refs preserves ref-prefix differences", func(t *testing.T) { + t.Run("v2 ls-refs ref-prefix is preserved", func(t *testing.T) { body1 := "0014command=ls-refs\n0015agent=git/2.43.0\n001bref-prefix refs/heads/\n0000" body2 := "0014command=ls-refs\n0015agent=git/2.43.0\n001aref-prefix refs/tags/\n0000" if normalize(body1) == normalize(body2) { @@ -119,7 +112,7 @@ func TestNormalizeUploadPackBody(t *testing.T) { } }) - t.Run("v2 fetch with same wants different agents → identical", func(t *testing.T) { + t.Run("v2 fetch agent drift collapses", func(t *testing.T) { body1 := "0012command=fetch\n001bagent=git/2.43.0-Linux\n0001000ddeepen 1\n0032want " + oidA + "\n0009done\n0000" body2 := "0012command=fetch\n001bagent=git/2.53.0-Linux\n0001000ddeepen 1\n0032want " + oidA + "\n0009done\n0000" if normalize(body1) != normalize(body2) { @@ -127,89 +120,58 @@ func TestNormalizeUploadPackBody(t *testing.T) { } }) - t.Run("v1 inline agent stripped, capability bytes preserved", func(t *testing.T) { + t.Run("v1 inline agent stripped, other capabilities kept", func(t *testing.T) { body1 := "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0\n0009done\n" body2 := "0080want " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0\n0009done\n" - got1, got2 := normalize(body1), normalize(body2) - if got1 != got2 { - t.Errorf("agent-only difference must collapse:\n %q\n %q", got1, got2) - } - if !strings.Contains(got1, "thin-pack") { - t.Errorf("non-agent capabilities must be preserved, got %q", got1) + got := normalize(body1) + if got != normalize(body2) { + t.Errorf("agent-only difference must collapse:\n %q\n %q", got, normalize(body2)) } - if strings.Contains(got1, "agent=") { - t.Errorf("agent= must be removed, got %q", got1) + if !strings.Contains(got, "thin-pack") || strings.Contains(got, "agent=") { + t.Errorf("got %q", got) } }) t.Run("malformed body returned unchanged", func(t *testing.T) { body := []byte("this is not pkt-line data") - got := NormalizeUploadPackBody(body) - if !bytes.Equal(got, body) { - t.Errorf("malformed body must be returned unchanged, got %q", got) + if got := NormalizeUploadPackBody(body); !bytes.Equal(got, body) { + t.Errorf("got %q", got) } }) - t.Run("empty body → empty output", func(t *testing.T) { + t.Run("empty body produces empty output", func(t *testing.T) { if got := NormalizeUploadPackBody(nil); len(got) != 0 { - t.Errorf("empty body should produce empty output, got %q", got) + t.Errorf("got %q", got) } }) - t.Run("non-have payload that begins with 'have' substring is preserved", func(t *testing.T) { - // Defensive: the prefix check requires "have " (with space). A payload - // like "haven 123\n" must not be dropped. - body := "000ehaven foo\n0009done\n" - got := normalize(body) - if !strings.Contains(got, "haven foo") { - t.Errorf("non-have payload was incorrectly stripped: %q", got) - } - }) - - t.Run("v2 standalone session-id stripped (Git 2.36+ trace2 capability)", func(t *testing.T) { - // session-id is a per-invocation UUID added in Git 2.36 (April 2022). - // It is purely informational and must not affect the cache key. - // Payload "session-id=abcdef\n" = 18 bytes → pkt-line length 0016. - body1 := "0012command=fetch\n0015agent=git/2.43.0\n" + - "0016session-id=abcdef\n" + - "00010032want " + oidA + "\n0009done\n0000" - body2 := "0012command=fetch\n0015agent=git/2.43.0\n" + - "0016session-id=fedcba\n" + - "00010032want " + oidA + "\n0009done\n0000" - n1, n2 := normalize(body1), normalize(body2) - if n1 != n2 { - t.Errorf("session-id drift must not affect normalization:\n %q\n %q", n1, n2) - } - if strings.Contains(n1, "session-id=") { - t.Errorf("session-id= must be stripped, got %q", n1) + // session-id is a per-invocation UUID added in Git 2.36 (April 2022). + t.Run("v2 standalone session-id is stripped", func(t *testing.T) { + body1 := "0012command=fetch\n0015agent=git/2.43.0\n0016session-id=abcdef\n00010032want " + oidA + "\n0009done\n0000" + body2 := "0012command=fetch\n0015agent=git/2.43.0\n0016session-id=fedcba\n00010032want " + oidA + "\n0009done\n0000" + got := normalize(body1) + if got != normalize(body2) || strings.Contains(got, "session-id=") { + t.Errorf("got %q", got) } }) t.Run("v1 inline session-id stripped alongside agent", func(t *testing.T) { - // Newer git clients can include session-id as an inline capability on - // the first v1 want line. Both volatile tokens must collapse together. body1 := "009awant " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.43.0 session-id=aaa-1\n0009done\n" body2 := "009awant " + oidA + " multi_ack_detailed no-done side-band-64k thin-pack ofs-delta agent=git/2.53.0 session-id=zzz-2\n0009done\n" - n1, n2 := normalize(body1), normalize(body2) - if n1 != n2 { - t.Errorf("inline agent+session-id drift must collapse:\n %q\n %q", n1, n2) - } - if strings.Contains(n1, "session-id=") || strings.Contains(n1, "agent=") { - t.Errorf("volatile inline tokens must be removed, got %q", n1) - } - if !strings.Contains(n1, "thin-pack") { - t.Errorf("non-volatile capabilities must be preserved, got %q", n1) + got := normalize(body1) + if got != normalize(body2) || + strings.Contains(got, "session-id=") || strings.Contains(got, "agent=") || + !strings.Contains(got, "thin-pack") { + t.Errorf("got %q", got) } }) - t.Run("payload that merely starts with 'session-id' substring is preserved", func(t *testing.T) { - // Same defensive check as for "haven": only an exact "session-id=" - // prefix should trigger drop. A hypothetical future "session-ids=..." - // argument or a want-line containing the literal text must survive. - body := "0014session-ids=keep\n0009done\n" + // Prefix matching is exact: substrings of stripped tokens must survive. + t.Run("'haven' and 'session-ids' prefixes are not stripped", func(t *testing.T) { + body := "000ehaven foo\n0014session-ids=keep\n0009done\n" got := normalize(body) - if !strings.Contains(got, "session-ids=keep") { - t.Errorf("non-session-id payload was incorrectly stripped: %q", got) + if !strings.Contains(got, "haven foo") || !strings.Contains(got, "session-ids=keep") { + t.Errorf("got %q", got) } }) } From 72936b492a218e3774c726645a385a01682f08f7 Mon Sep 17 00:00:00 2001 From: kbukum1 Date: Mon, 27 Apr 2026 22:29:18 +0000 Subject: [PATCH 9/9] Refactor pkt-line documentation for clarity and add hasVolatilePrefix function to check payload prefixes --- internal/gitproto/pktline.go | 20 +++++++++++--------- internal/gitproto/upload_pack.go | 2 ++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/internal/gitproto/pktline.go b/internal/gitproto/pktline.go index ca6505f..ba00b98 100644 --- a/internal/gitproto/pktline.go +++ b/internal/gitproto/pktline.go @@ -1,13 +1,12 @@ package gitproto -// pkt-line is git's smart-HTTP framing format. Each line begins with a -// 4-hex-digit length (including itself), or is one of three special packets: -// "0000" flush, "0001" delim (v2), "0002" response-end. Any length >= 4 is a -// data packet whose payload is (length - 4) bytes. -// See https://git-scm.com/docs/protocol-common#_pkt_line_format - -const hexDigits = "0123456789abcdef" - +// pktType is the kind of a single pkt-line: either a data packet or one of +// the three special framing packets defined by git's smart-HTTP protocol. +// +// Each pkt-line on the wire begins with a 4-hex-digit length that includes +// itself, or is one of: "0000" flush, "0001" delim (v2), "0002" response-end. +// Any length >= 4 is a data packet whose payload is (length - 4) bytes. +// See https://git-scm.com/docs/protocol-common#_pkt_line_format. type pktType int const ( @@ -17,12 +16,15 @@ const ( pktResponseEnd ) -// payload is set only when typ == pktData and excludes the length prefix. +// packet is one parsed pkt-line. payload is set only when typ == pktData and +// excludes the 4-byte length prefix. type packet struct { typ pktType payload []byte } +const hexDigits = "0123456789abcdef" + // parseHex4 decodes a 4-byte ASCII hex prefix without allocating a string. func parseHex4(b []byte) (n int, ok bool) { for i := 0; i < 4; i++ { diff --git a/internal/gitproto/upload_pack.go b/internal/gitproto/upload_pack.go index 112002c..2af9590 100644 --- a/internal/gitproto/upload_pack.go +++ b/internal/gitproto/upload_pack.go @@ -28,6 +28,8 @@ var inlineVolatileTokenRegex = regexp.MustCompile(` (?:agent|session-id)=[^ \r\n // the upstream response depends on them. var volatileStandalonePrefixes = [][]byte{[]byte("agent="), []byte("session-id=")} +// hasVolatilePrefix reports whether payload begins with any prefix in +// volatileStandalonePrefixes. func hasVolatilePrefix(payload []byte) bool { for _, prefix := range volatileStandalonePrefixes { if bytes.HasPrefix(payload, prefix) {