diff --git a/datadog/client_test.go b/datadog/client_test.go index a3b05a8..473e8c6 100644 --- a/datadog/client_test.go +++ b/datadog/client_test.go @@ -55,13 +55,16 @@ func TestClientSanitizesMetricNames(t *testing.T) { {Name: "colon:atsign@pipe|end", Value: stats.ValueOf(5)}, }, Tags: []stats.Tag{ - stats.T("colon:atsign@pipe|end", "42"), + stats.T("colon:atsign@pipe|end", "http://example.com/path|with:special"), + stats.T("env", "prod"), }, } client.HandleMeasures(time.Time{}, testMeasure) client.Flush() - expectedPacket1 := "request_colon_atsign_laughing_end.colon_atsign_pipe_end:5|c|#colon_atsign_pipe_end:42\n" + // Note: Tag names are strictly sanitized, but tag values are lenient + // The tag value can contain colons and slashes (but not pipes or commas) + expectedPacket1 := "request_colon_atsign_laughing_end.colon_atsign_pipe_end:5|c|#colon_atsign_pipe_end:http://example.com/path_with:special,env:prod\n" select { case packet := <-packets: assert.EqualValues(t, expectedPacket1, string(packet)) diff --git a/datadog/serializer.go b/datadog/serializer.go index ba22cfc..c6f6c58 100644 --- a/datadog/serializer.go +++ b/datadog/serializer.go @@ -93,11 +93,18 @@ func (s *serializer) AppendMeasures(b []byte, _ time.Time, measures ...stats.Mea // 2-byte UTF-8 sequences that decode to these codepoints. var latin1SupplementMap [256]byte -// valid[byte] = 1 if the ASCII char is allowed, 0 otherwise. +// valid[byte] = true if the ASCII char is allowed in metric/tag names, false otherwise. var valid = [256]bool{ '.': true, '-': true, '_': true, } +// validTagValue[byte] = true if the ASCII char is allowed in tag values, false otherwise. +// Tag values are more lenient than metric names - they may also contain '/' and ':'. +// Commas (tag separator) and pipes (datagram field separator) are never allowed. 
+var validTagValue = [256]bool{ + '.': true, '-': true, '_': true, '/': true, ':': true, +} + func init() { // Initialize all to identity mapping for i := range latin1SupplementMap { @@ -203,12 +210,15 @@ func init() { for c := '0'; c <= '9'; c++ { valid[c] = true + validTagValue[c] = true } for c := 'A'; c <= 'Z'; c++ { valid[c] = true + validTagValue[c] = true } for c := 'a'; c <= 'z'; c++ { valid[c] = true + validTagValue[c] = true } } @@ -269,6 +279,54 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte { return dst } +// appendSanitizedTagValue appends a DogStatsD-safe copy of raw to dst and returns dst. +// Tag values are more lenient than metric names - they can also contain colons and +// slashes. Commas and pipes are replaced because they separate tags and datagram +// fields in the protocol. +func appendSanitizedTagValue(dst []byte, raw string) []byte { + origLen := len(dst) + if raw == "" { + return dst + } + + // Simple transformation: iterate through runes and convert/replace as needed + lastWasRepl := false + for i, r := range raw { + if i >= maxLen { + break + } + + if r < utf8.RuneSelf && validTagValue[byte(r)] { + // Valid ASCII character + dst = append(dst, byte(r)) + lastWasRepl = false + } else if r >= 0xC0 && r <= 0xFF { + // Latin-1 Supplement block (common accented characters like À, É, ñ) + mapped := latin1SupplementMap[r] + if validTagValue[mapped] { + dst = append(dst, mapped) + lastWasRepl = false + } else if !lastWasRepl { + dst = append(dst, replacement) + lastWasRepl = true + } + } else if !lastWasRepl { + // Invalid or unsupported character - only append if we didn't just add a replacement + dst = append(dst, replacement) + lastWasRepl = true + } + } + + // Trim leading/trailing '.', '_' or '-' + trimmed := bytes.Trim(dst[origLen:], "._-") + dst = append(dst[:origLen], trimmed...) + + if len(dst) == origLen { + return append(dst, "_truncated_"...) 
+ } + return dst +} + // AppendMeasure is a formatting routine to append the dogstatsd protocol // representation of a measure to a memory buffer. // Tags listed in the s.filters are removed. (some tags may not be suitable for submission to DataDog) @@ -325,7 +383,7 @@ func (s *serializer) AppendMeasure(b []byte, m stats.Measure) []byte { } b = appendSanitizedMetricName(b, t.Name) b = append(b, ':') - b = appendSanitizedMetricName(b, t.Value) + b = appendSanitizedTagValue(b, t.Value) } } b = append(b, '\n') diff --git a/datadog/serializer_test.go b/datadog/serializer_test.go index 5fd801e..5208941 100644 --- a/datadog/serializer_test.go +++ b/datadog/serializer_test.go @@ -67,7 +67,7 @@ request.rtt:0.1|h|#answer:42,hello:world stats.T("hello", "world"), }, }, - s: `request.count:5|c|#ans_wer_blah:also_pipe_colon_comma,hello:world + s: `request.count:5|c|#ans_wer_blah:also_pipe:colon_comma,hello:world `, dp: []string{}, }, @@ -87,6 +87,24 @@ request.rtt:0.1|h|#answer:42,hello:world `, dp: []string{"dist_"}, }, + + // Test lenient tag values - URLs, paths, colons + { + m: stats.Measure{ + Name: "api.request", + Fields: []stats.Field{ + stats.MakeField("count", 1, stats.Counter), + }, + Tags: []stats.Tag{ + stats.T("url", "http://api.example.com/v1/users"), + stats.T("path", "/api/v1/users"), + stats.T("env", "prod:us-east-1"), + }, + }, + s: `api.request.count:1|c|#url:http://api.example.com/v1/users,path:/api/v1/users,env:prod:us-east-1 +`, + dp: []string{}, + }, } func TestAppendMeasure(t *testing.T) { @@ -511,6 +529,109 @@ func TestSanitizationPreservesUTF8Validity(t *testing.T) { } } +func TestAppendSanitizedTagValue(t *testing.T) { + long := strings.Repeat("x", 300) // longer than maxLen + cases := []struct { + prefix string // existing data in buffer + in, want string + }{ + // Basic cases - tag values should be more lenient + {"", "simple", "simple"}, + {"", "with-dashes_underscores.dots", "with-dashes_underscores.dots"}, + + // Tag values can contain 
colons and slashes; pipes are replaced (they separate datagram fields) + {"", "http://example.com", "http://example.com"}, + {"", "path/to/resource", "path/to/resource"}, + {"", "key:value:pair", "key:value:pair"}, + {"", "pipe|separated|values", "pipe_separated_values"}, + {"", "mixed:pipe|slash/colon", "mixed:pipe_slash/colon"}, + + // Commas must be sanitized (they separate tags in the protocol) + {"", "value,with,commas", "value_with_commas"}, + {"", "item1,item2,item3", "item1_item2_item3"}, + + // Special characters that should be sanitized + {"", "value@sign#hash", "value_sign_hash"}, + {"", "brackets[test]", "brackets_test"}, + {"", "parens(test)", "parens_test"}, + + // Accented characters should be normalized + {"", "café", "cafe"}, + {"", "naïve", "naive"}, + {"", "señor", "senor"}, + + // Leading/trailing special chars should be trimmed + {"", "-leading-dash", "leading-dash"}, + {"", "trailing-dash-", "trailing-dash"}, + {"", "...dots...", "dots"}, + {"", "__underscores__", "underscores"}, + + // Empty string handling + {"", "", ""}, + {"prefix:", "", "prefix:"}, + + // Multiple consecutive special chars collapse + {"", "foo!!!bar", "foo_bar"}, + {"", "test@@@value", "test_value"}, + + // Mixed valid and invalid characters + {"", "env:prod|region:us-east-1", "env:prod_region:us-east-1"}, + {"", "url:http://api.example.com/v1", "url:http://api.example.com/v1"}, + {"", "list:item1,item2,item3", "list:item1_item2_item3"}, + + // Emojis and other multi-byte sequences + {"", "test🤡emoji", "test_emoji"}, + {"", "hello🌍world", "hello_world"}, + {"", "测试值", "_truncated_"}, // Chinese -> _truncated_ (all invalid) + + // Over-long values should be truncated + {"", long, strings.Repeat("x", maxLen)}, + + // With prefix + {"tagname:", "http://example.com", "tagname:http://example.com"}, + {"key:", "value,with,comma", "key:value_with_comma"}, + {"env:", "production", "env:production"}, + } + + for _, c := range cases { + // Start with prefix data in buffer + buf := []byte(c.prefix) + 
originalLen := len(buf) + + // Append sanitized tag value + buf = appendSanitizedTagValue(buf, c.in) + got := string(buf) + + if got != c.want { + t.Fatalf("prefix=%q in=%q want=%q (len %v) got=%q (len %v)", c.prefix, c.in, c.want, len(c.want), got, len(got)) + } + + // Verify prefix is preserved + if len(c.prefix) > 0 && !strings.HasPrefix(got, c.prefix) { + t.Errorf("prefix %q not preserved in result %q", c.prefix, got) + } + + // Verify we only modified the buffer from the original length onward + if originalLen > 0 && originalLen <= len(buf) { + originalPart := string(buf[:originalLen]) + if originalPart != c.prefix { + t.Errorf("original buffer data corrupted: want %q, got %q", c.prefix, originalPart) + } + } + } + + // Additional test: verify behavior with various buffer capacities + t.Run("BufferReuse", func(t *testing.T) { + buf := make([]byte, 0, 100) // pre-allocated capacity + buf = append(buf, "tag:"...) + buf = appendSanitizedTagValue(buf, "http://café.com/path") + expected := "tag:http://cafe.com/path" + if string(buf) != expected { + t.Errorf("buffer reuse failed: want %q, got %q", expected, string(buf)) + } + }) +} + // BenchmarkAppendSanitizedMetricName measures performance of metric name sanitization // across different input types to ensure the implementation is efficient. func BenchmarkAppendSanitizedMetricName(b *testing.B) {