Skip to content

Commit fe95c46

Browse files
author
Your Name
committed
1
1 parent d9b0258 commit fe95c46

File tree

3 files changed

+89
-87
lines changed

3 files changed

+89
-87
lines changed

utf8/test.out.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�􏿿􏿿􏿿􏿿􏿿􏿿􏿿􏿿

utf8/valid_arm64.s

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,20 @@ aligned_loop:
7676
B small_no_const
7777

7878
small:
79+
CBZ R11, valid_ascii
80+
81+
tail_loop:
82+
MOVBU (R10), R2
83+
AND $0x80, R2
84+
CBNZ R2, check_utf8
85+
ADD $1, R10
86+
SUB $1, R11
87+
CBNZ R11, tail_loop
88+
B valid_ascii
89+
90+
91+
check_utf8:
92+
7993
VMOVQ $0x0202020202020202, $0x4915012180808080, V11
8094
VMOVQ $0xcbcbcb8b8383a3e7, $0xcbcbdbcbcbcbcbcb, V13
8195
VMOVQ $0x0101010101010101, $0x01010101babaaee6, V15
@@ -95,7 +109,7 @@ small_no_const:
95109
ADD R11, R10, R10
96110
VLD1.P 16(R10), [V4.B16]
97111

98-
ADR shift_table, R2
112+
ADR shift_table, R2
99113
MOVW R11, R3
100114
LSL $2, R3
101115
ADD R3, R2
@@ -206,21 +220,11 @@ no_valid:
206220
MOVD R0, ret+24(FP)
207221
RET
208222

209-
210-
end_7:
211-
MOVD $7, R0
223+
valid_ascii:
224+
MOVD $3, R0
212225
MOVD R0, ret+24(FP)
213226
RET
214227

215-
end_R11:
216-
MOVD R11, R0
217-
MOVD R0, ret+24(FP)
218-
RET
219228

220-
221-
ret7:
222-
MOVD $7, R0
223-
MOVD R0, ret+24(FP) // Возвращаем 0 (строка не валидна)
224-
RET
225229
///////////////////////////
226230

utf8/valid_test.go

Lines changed: 71 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ func genExamples(current string, ranges []byteRange) []string {
4646
}
4747

4848
func TestValid(t *testing.T) {
49-
5049
var examples = []string{
5150
// Tests copied from the stdlib
5251
"",
@@ -57,85 +56,87 @@ func TestValid(t *testing.T) {
5756
"брэд-ЛГТМ",
5857
"☺☻☹",
5958

60-
// // overlong
61-
// "\xE0\x80",
62-
// // unfinished continuation
63-
// "aa\xE2",
59+
// overlong
60+
"\xE0\x80",
61+
// unfinished continuation
62+
"aa\xE2",
6463

65-
// string([]byte{66, 250}),
64+
string([]byte{66, 250}),
6665

67-
// string([]byte{66, 250, 67}),
66+
string([]byte{66, 250, 67}),
6867

69-
// "a\uFFFDb",
68+
"a\uFFFDb",
7069

71-
// "\xF4\x8F\xBF\xBF", // U+10FFFF
70+
"\xF4\x8F\xBF\xBF", // U+10FFFF
7271

73-
// "\xF4\x90\x80\x80", // U+10FFFF+1; out of range
74-
// "\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range
72+
"\xF4\x90\x80\x80", // U+10FFFF+1; out of range
73+
"\xF7\xBF\xBF\xBF", // 0x1FFFFF; out of range
7574

76-
// "\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range
75+
"\xFB\xBF\xBF\xBF\xBF", // 0x3FFFFFF; out of range
7776

78-
// "\xc0\x80", // U+0000 encoded in two bytes: incorrect
79-
// "\xed\xa0\x80", // U+D800 high surrogate (sic)
80-
// "\xed\xbf\xbf", // U+DFFF low surrogate (sic)
77+
"\xc0\x80", // U+0000 encoded in two bytes: incorrect
78+
"\xed\xa0\x80", // U+D800 high surrogate (sic)
79+
"\xed\xbf\xbf", // U+DFFF low surrogate (sic)
8180

82-
// // valid at boundary
83-
// strings.Repeat("a", 32+28) + "☺☻☹",
84-
// strings.Repeat("a", 32+29) + "☺☻☹",
85-
// strings.Repeat("a", 32+30) + "☺☻☹",
86-
// strings.Repeat("a", 32+31) + "☺☻☹",
87-
// // invalid at boundary
88-
// strings.Repeat("a", 32+31) + "\xE2a",
81+
// valid at boundary
82+
strings.Repeat("a", 32+28) + "☺☻☹",
83+
strings.Repeat("a", 32+29) + "☺☻☹",
84+
strings.Repeat("a", 32+30) + "☺☻☹",
85+
strings.Repeat("a", 32+31) + "☺☻☹",
86+
// invalid at boundary
87+
strings.Repeat("a", 32+31) + "\xE2a",
8988

90-
// // same inputs as benchmarks
91-
// "0123456789",
92-
// "日本語日本語日本語日",
93-
// "\xF4\x8F\xBF\xBF",
89+
// same inputs as benchmarks
90+
"0123456789",
91+
"日本語日本語日本語日",
92+
"\xF4\x8F\xBF\xBF",
9493

95-
// // bugs found with fuzzing
96-
// "0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60",
97-
// "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300",
98-
// "߀0000000000000000000000000000訨",
99-
// "0000000000000000000000000000000˂00000000000000000000000000000000",
94+
// bugs found with fuzzing
95+
"0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc60",
96+
"000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000\xc300",
97+
"߀0000000000000000000000000000訨",
98+
"0000000000000000000000000000000˂00000000000000000000000000000000",
10099
}
101100

102-
// any := byteRange{0, 0xFF}
103-
// ascii := byteRange{0, 0x7F}
104-
// cont := byteRange{0x80, 0xBF}
101+
any := byteRange{0, 0xFF}
102+
ascii := byteRange{0, 0x7F}
103+
cont := byteRange{0x80, 0xBF}
105104

106105
rangesToTest := [][]byteRange{
107-
// {one(0x20), ascii, ascii, ascii},
108-
109-
// // 2-byte sequences
110-
// {one(0xC2)},
111-
// {one(0xC2), ascii},
112-
// {one(0xC2), cont},
113-
// {one(0xC2), {0xC0, 0xFF}},
114-
// {one(0xC2), cont, cont},
115-
// {one(0xC2), cont, cont, cont},
116-
117-
// // 3-byte sequences
118-
// {one(0xE1)},
119-
// {one(0xE1), cont},
120-
// {one(0xE1), cont, cont},
121-
// {one(0xE1), cont, cont, ascii},
122-
// {one(0xE1), cont, ascii},
123-
// {one(0xE1), cont, cont, cont},
124-
125-
// // 4-byte sequences
126-
// {one(0xF1)},
127-
// {one(0xF1), cont},
128-
// {one(0xF1), cont, cont},
129-
// {one(0xF1), cont, cont, cont},
130-
// {one(0xF1), cont, cont, ascii},
131-
// {one(0xF1), cont, cont, cont, ascii},
132-
133-
// // overlong
134-
// {{0xC0, 0xC1}, any},
135-
// {{0xC0, 0xC1}, any, any},
136-
// {{0xC0, 0xC1}, any, any, any},
137-
// {one(0xE0), {0x0, 0x9F}, cont},
138-
// {one(0xE0), {0xA0, 0xBF}, cont},
106+
{one(0x20), ascii, ascii, ascii},
107+
108+
{one(0x04), ascii, ascii, ascii},
109+
110+
// 2-byte sequences
111+
{one(0xC2)},
112+
{one(0xC2), ascii},
113+
{one(0xC2), cont},
114+
{one(0xC2), {0xC0, 0xFF}},
115+
{one(0xC2), cont, cont},
116+
{one(0xC2), cont, cont, cont},
117+
118+
// 3-byte sequences
119+
{one(0xE1)},
120+
{one(0xE1), cont},
121+
{one(0xE1), cont, cont},
122+
{one(0xE1), cont, cont, ascii},
123+
{one(0xE1), cont, ascii},
124+
{one(0xE1), cont, cont, cont},
125+
126+
// 4-byte sequences
127+
{one(0xF1)},
128+
{one(0xF1), cont},
129+
{one(0xF1), cont, cont},
130+
{one(0xF1), cont, cont, cont},
131+
{one(0xF1), cont, cont, ascii},
132+
{one(0xF1), cont, cont, cont, ascii},
133+
134+
// overlong
135+
{{0xC0, 0xC1}, any},
136+
{{0xC0, 0xC1}, any, any},
137+
{{0xC0, 0xC1}, any, any, any},
138+
{one(0xE0), {0x0, 0x9F}, cont},
139+
{one(0xE0), {0xA0, 0xBF}, cont},
139140
}
140141

141142
for _, r := range rangesToTest {
@@ -165,7 +166,7 @@ func TestValid(t *testing.T) {
165166

166167
t.Run("boundary-"+tt, func(t *testing.T) {
167168
size := 32 - len(tt)
168-
prefix := strings.Repeat("q", size)
169+
prefix := strings.Repeat("a", size)
169170
b := []byte(prefix + tt)
170171
check(t, b)
171172
})
@@ -195,7 +196,6 @@ func TestValid(t *testing.T) {
195196
}
196197

197198
func TestValidPageBoundary(t *testing.T) {
198-
199199
buf, err := buffer.New(64)
200200
if err != nil {
201201
t.Fatal(err)
@@ -233,8 +233,7 @@ func check(t *testing.T, b []byte) {
233233
if err != nil {
234234
panic(err)
235235
}
236-
fmt.Println("qwe\tValid(b)", Valid(b))
237-
fmt.Println("qwe\tutf8.Valid(b)", utf8.Valid(b))
236+
238237
t.Errorf("Valid(%q) = %v; want %v", string(b), !expected, expected)
239238
}
240239

@@ -246,9 +245,7 @@ func check(t *testing.T, b []byte) {
246245

247246
expected = ascii.Valid(b)
248247
if v.IsASCII() != expected {
249-
// t.Errorf("qwe\tValid(b) %q", ascii.Valid(b))
250-
t.Errorf("qwe\tascii.Valid(b) %v", ascii.Valid(b))
251-
t.Errorf("qwe\tascii.Valid(b) %v", Valid(b))
248+
t.Errorf("STRING(%q): %v", b, string(b))
252249
t.Errorf("Validate(%q) ascii valid: %v; want %v", string(b), !expected, expected)
253250
}
254251
}
@@ -259,7 +256,7 @@ var someutf8 = []byte("\xF4\x8F\xBF\xBF")
259256

260257
func BenchmarkValid(b *testing.B) {
261258
impls := map[string]func([]byte) bool{
262-
"SIMD": Valid,
259+
"AVX": Valid,
263260
"Stdlib": utf8.Valid,
264261
}
265262

0 commit comments

Comments
 (0)