diff --git a/src/java.base/share/classes/java/io/DataInputStream.java b/src/java.base/share/classes/java/io/DataInputStream.java index daf75b1318f6e..f5447226a68ce 100644 --- a/src/java.base/share/classes/java/io/DataInputStream.java +++ b/src/java.base/share/classes/java/io/DataInputStream.java @@ -599,7 +599,7 @@ public static final String readUTF(DataInput in) throws IOException { if (ascii == utflen) { String str; if (trusted) { - str = JLA.uncheckedNewStringNoRepl(bytearr, StandardCharsets.ISO_8859_1); + str = JLA.uncheckedNewStringNoReplacement(bytearr, StandardCharsets.ISO_8859_1); } else { str = new String(bytearr, 0, utflen, StandardCharsets.ISO_8859_1); } diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java index eac8a1355b7f2..6255d1dfd7657 100644 --- a/src/java.base/share/classes/java/lang/String.java +++ b/src/java.base/share/classes/java/lang/String.java @@ -604,14 +604,14 @@ private static String utf8(byte[] bytes, int offset, int length) { } byte[] utf16 = StringUTF16.newBytesFor(length); StringLatin1.inflate(latin1, 0, utf16, 0, dp); - dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true); + dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp); if (dp != length) { utf16 = Arrays.copyOf(utf16, dp << 1); } return new String(utf16, UTF16); } else { // !COMPACT_STRINGS byte[] dst = StringUTF16.newBytesFor(length); - int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, true); + int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0); if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); } @@ -689,15 +689,22 @@ private static String decode(Charset charset, byte[] bytes, int offset, int leng } /* - * Throws iae, instead of replacing, if malformed or unmappable. 
+ * {@return a new string by decoding from the given UTF-8 bytes array} * + * @param offset the index of the first byte to decode + * @param length the number of bytes to decode * @param noShare * {@code true} if the resulting string MUST NOT share the byte array, * {@code false} if the byte array can be exclusively used to construct * the string and is not modified or used for any other purpose. + * @throws NullPointerException If {@code bytes} is null + * @throws StringIndexOutOfBoundsException If {@code offset} is negative, + * {@code length} is negative, or {@code offset} is greater than + * {@code bytes.length - length} + * @throws CharacterCodingException for malformed input or unmappable characters */ - static String newStringUTF8NoRepl(byte[] bytes, int offset, int length, boolean noShare) { - checkBoundsOffCount(offset, length, bytes.length); + static String newStringUTF8NoReplacement(byte[] bytes, int offset, int length, boolean noShare) throws CharacterCodingException { + checkBoundsOffCount(offset, length, bytes.length); // Implicit null check on `bytes` if (length == 0) { return ""; } @@ -748,10 +755,10 @@ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length, boolean StringLatin1.inflate(dst, 0, buf, 0, dp); dst = buf; } - dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); + dp = decodeUTF8_UTF16NoReplacement(bytes, offset, sl, dst, dp); } else { // !COMPACT_STRINGS dst = StringUTF16.newBytesFor(length); - dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, false); + dp = decodeUTF8_UTF16NoReplacement(bytes, offset, offset + length, dst, 0); } if (dp != length) { dst = Arrays.copyOf(dst, dp << 1); @@ -759,26 +766,13 @@ static String newStringUTF8NoRepl(byte[] bytes, int offset, int length, boolean return new String(dst, UTF16); } - static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { - try { - return newStringNoRepl1(src, cs); - } catch (IllegalArgumentException e) { - 
//newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause - Throwable cause = e.getCause(); - if (cause instanceof MalformedInputException mie) { - throw mie; - } - throw (CharacterCodingException)cause; - } - } - - private static String newStringNoRepl1(byte[] src, Charset cs) { + static String newStringNoReplacement(byte[] src, Charset cs) throws CharacterCodingException { int len = src.length; if (len == 0) { return ""; } if (cs == UTF_8.INSTANCE) { - return newStringUTF8NoRepl(src, 0, src.length, false); + return newStringUTF8NoReplacement(src, 0, src.length, false); } if (cs == ISO_8859_1.INSTANCE) { if (COMPACT_STRINGS) @@ -791,7 +785,7 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { return new String(src, LATIN1); return new String(StringLatin1.inflate(src, 0, src.length), UTF16); } else { - throwMalformed(src); + throw malformedInputException(src); } } @@ -806,13 +800,7 @@ private static String newStringNoRepl1(byte[] src, Charset cs) { } int en = scale(len, cd.maxCharsPerByte()); char[] ca = new char[en]; - int caLen; - try { - caLen = decodeWithDecoder(cd, ca, src, 0, src.length); - } catch (CharacterCodingException x) { - // throw via IAE - throw new IllegalArgumentException(x); - } + int caLen = decodeWithDecoder(cd, ca, src, 0, src.length); if (COMPACT_STRINGS) { byte[] val = StringUTF16.compress(ca, 0, caLen); byte coder = StringUTF16.coderFromArrayLen(val, caLen); @@ -849,7 +837,7 @@ private static Charset lookupCharset(String csn) throws UnsupportedEncodingExcep private static byte[] encode(Charset cs, byte coder, byte[] val) { if (cs == UTF_8.INSTANCE) { - return encodeUTF8(coder, val, true); + return encodeUTF8(coder, val); } if (cs == ISO_8859_1.INSTANCE) { return encode8859_1(coder, val); @@ -857,15 +845,15 @@ private static byte[] encode(Charset cs, byte coder, byte[] val) { if (cs == US_ASCII.INSTANCE) { return encodeASCII(coder, val); } - return encodeWithEncoder(cs, coder, val, true); + return 
encodeWithEncoder(cs, coder, val); } - private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, boolean doReplace) { + private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val) { CharsetEncoder ce = cs.newEncoder(); int len = val.length >> coder; // assume LATIN1=0/UTF16=1; int en = scale(len, ce.maxBytesPerChar()); - // fastpath with ArrayEncoder implies `doReplace`. - if (doReplace && ce instanceof ArrayEncoder ae) { + // Fast-path with `ArrayEncoder` implies replacement. + if (ce instanceof ArrayEncoder ae) { // fastpath for ascii compatible if (coder == LATIN1 && ae.isASCIICompatible() && @@ -888,10 +876,8 @@ private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, bool if (len == 0) { return ba; } - if (doReplace) { - ce.onMalformedInput(CodingErrorAction.REPLACE) - .onUnmappableCharacter(CodingErrorAction.REPLACE); - } + ce.onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) : StringUTF16.toChars(val); ByteBuffer bb = ByteBuffer.wrap(ba); @@ -904,67 +890,87 @@ private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, bool if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { - if (!doReplace) { - throw new IllegalArgumentException(x); - } else { - throw new Error(x); - } + throw new Error(x); } return trimArray(ba, bb.position()); } - /* - * Throws iae, instead of replacing, if unmappable. + private static byte[] encodeWithEncoderNoReplacement(Charset cs, byte coder, byte[] val) throws CharacterCodingException { + CharsetEncoder ce = cs.newEncoder(); + int len = val.length >> coder; // assume LATIN1=0/UTF16=1; + int en = scale(len, ce.maxBytesPerChar()); + byte[] ba = new byte[en]; + if (len == 0) { + return ba; + } + char[] ca = (coder == LATIN1 ) ? 
StringLatin1.toChars(val) + : StringUTF16.toChars(val); + ByteBuffer bb = ByteBuffer.wrap(ba); + CharBuffer cb = CharBuffer.wrap(ca, 0, len); + CoderResult cr = ce.encode(cb, bb, true); + if (!cr.isUnderflow()) + cr.throwException(); + cr = ce.flush(bb); + if (!cr.isUnderflow()) + cr.throwException(); + return trimArray(ba, bb.position()); + } + + /** + * {@return the sequence of bytes obtained by encoding the given string in UTF-8} + * + * @param s the string to encode + * @throws NullPointerException If {@code s} is null + * @throws CharacterCodingException For malformed input or unmappable characters */ - static byte[] getBytesUTF8NoRepl(String s) { - return encodeUTF8(s.coder(), s.value(), false); + static byte[] getBytesUTF8NoReplacement(String s) throws CharacterCodingException { + return encodeUTF8NoReplacement(s.coder(), s.value()); // Implicit null check on `s` } private static boolean isASCII(byte[] src) { return !StringCoding.hasNegatives(src, 0, src.length); } - /* - * Throws CCE, instead of replacing, if unmappable. + /** + * {@return the sequence of bytes obtained by encoding the given string in + * the specified {@linkplain java.nio.charset.Charset charset}} + *

+ * WARNING: This method returns the {@code byte[]} backing the provided + * {@code String}, if the input is ASCII. Hence, the returned byte array + * must not be modified. + * + * @param s the string to encode + * @param cs the charset + * @throws NullPointerException If {@code s} or {@code cs} is null + * @throws CharacterCodingException For malformed input or unmappable characters */ - static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { - try { - return getBytesNoRepl1(s, cs); - } catch (IllegalArgumentException e) { - //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause - Throwable cause = e.getCause(); - if (cause instanceof UnmappableCharacterException) { - throw (UnmappableCharacterException)cause; - } - throw (CharacterCodingException)cause; - } - } - - private static byte[] getBytesNoRepl1(String s, Charset cs) { + static byte[] getBytesNoReplacement(String s, Charset cs) throws CharacterCodingException { + Objects.requireNonNull(s, "s"); + Objects.requireNonNull(cs, "cs"); byte[] val = s.value(); byte coder = s.coder(); if (cs == UTF_8.INSTANCE) { if (coder == LATIN1 && isASCII(val)) { return val; } - return encodeUTF8(coder, val, false); + return encodeUTF8NoReplacement(coder, val); } if (cs == ISO_8859_1.INSTANCE) { if (coder == LATIN1) { return val; } - return encode8859_1(coder, val, false); + return encode8859_1NoReplacement(coder, val); } if (cs == US_ASCII.INSTANCE) { if (coder == LATIN1) { if (isASCII(val)) { return val; } else { - throwUnmappable(val); + throw unmappableCharacterException(val); } } } - return encodeWithEncoder(cs, coder, val, false); + return encodeWithEncoderNoReplacement(cs, coder, val); } private static byte[] encodeASCII(byte coder, byte[] val) { @@ -1006,10 +1012,6 @@ private static void replaceNegatives(byte[] val, int fromIndex) { } private static byte[] encode8859_1(byte coder, byte[] val) { - return encode8859_1(coder, val, true); - } - - private static 
byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { if (coder == LATIN1) { return val.clone(); } @@ -1023,9 +1025,6 @@ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { sp = sp + ret; dp = dp + ret; if (ret != len) { - if (!doReplace) { - throwUnmappable(sp); - } char c = StringUTF16.getChar(val, sp++); if (Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { @@ -1041,6 +1040,28 @@ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { return Arrays.copyOf(dst, dp); } + private static byte[] encode8859_1NoReplacement(byte coder, byte[] val) throws UnmappableCharacterException { + if (coder == LATIN1) { + return val.clone(); + } + int len = val.length >> 1; + byte[] dst = new byte[len]; + int dp = 0; + int sp = 0; + while (sp < len) { + int ret = StringCoding.implEncodeISOArray(val, sp, dst, dp, len); + sp = sp + ret; + dp = dp + ret; + if (ret != len) { + throw unmappableCharacterException(sp); + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + //------------------------------ utf8 ------------------------------------ /** @@ -1118,7 +1139,7 @@ private static int decode4(int b1, int b2, int b3, int b4) { ((byte) 0x80 << 0)))); } - private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) { + private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp) { while (sp < sl) { int b1 = src[sp++]; if (b1 >= 0) { @@ -1127,9 +1148,6 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int if (sp < sl) { int b2 = src[sp++]; if (isNotContinuation(b2)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } StringUTF16.putChar(dst, dp++, REPL); sp--; } else { @@ -1137,9 +1155,6 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int } continue; } - if (!doReplace) { - throwMalformed(sp, 1); // underflow() - 
} StringUTF16.putChar(dst, dp++, REPL); break; } else if ((b1 >> 4) == -2) { @@ -1147,18 +1162,12 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int int b2 = src[sp++]; int b3 = src[sp++]; if (isMalformed3(b1, b2, b3)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } StringUTF16.putChar(dst, dp++, REPL); sp -= 3; sp += malformed3(src, sp); } else { char c = decode3(b1, b2, b3); if (Character.isSurrogate(c)) { - if (!doReplace) { - throwMalformed(sp - 3, 3); - } StringUTF16.putChar(dst, dp++, REPL); } else { StringUTF16.putChar(dst, dp++, c); @@ -1167,15 +1176,9 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int continue; } if (sp < sl && isMalformed3_2(b1, src[sp])) { - if (!doReplace) { - throwMalformed(sp - 1, 2); - } StringUTF16.putChar(dst, dp++, REPL); continue; } - if (!doReplace) { - throwMalformed(sp, 1); - } StringUTF16.putChar(dst, dp++, REPL); break; } else if ((b1 >> 3) == -2) { @@ -1186,9 +1189,6 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int int uc = decode4(b1, b2, b3, b4); if (isMalformed4(b2, b3, b4) || !Character.isSupplementaryCodePoint(uc)) { // shortest form check - if (!doReplace) { - throwMalformed(sp - 4, 4); - } StringUTF16.putChar(dst, dp++, REPL); sp -= 4; sp += malformed4(src, sp); @@ -1200,15 +1200,9 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int } b1 &= 0xff; if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { - if (!doReplace) { - throwMalformed(sp - 1, 1); // or 2 - } StringUTF16.putChar(dst, dp++, REPL); continue; } - if (!doReplace) { - throwMalformed(sp - 1, 1); - } sp++; StringUTF16.putChar(dst, dp++, REPL); if (sp < sl && isMalformed4_3(src[sp])) { @@ -1216,15 +1210,76 @@ private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int } break; } else { - if (!doReplace) { - throwMalformed(sp - 1, 1); - } StringUTF16.putChar(dst, dp++, REPL); } } return dp; } + 
private static int decodeUTF8_UTF16NoReplacement(byte[] src, int sp, int sl, byte[] dst, int dp) + throws CharacterCodingException { + while (sp < sl) { + int b1 = src[sp++]; + if (b1 >= 0) { + StringUTF16.putChar(dst, dp++, (char) b1); + } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { + if (sp < sl) { + int b2 = src[sp++]; + if (isNotContinuation(b2)) { + throw malformedInputException(sp - 1, 1); + } else { + StringUTF16.putChar(dst, dp++, decode2(b1, b2)); + } + continue; + } + throw malformedInputException(sp, 1); // underflow() + } else if ((b1 >> 4) == -2) { + if (sp + 1 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + if (isMalformed3(b1, b2, b3)) { + throw malformedInputException(sp - 3, 3); + } else { + char c = decode3(b1, b2, b3); + if (Character.isSurrogate(c)) { + throw malformedInputException(sp - 3, 3); + } else { + StringUTF16.putChar(dst, dp++, c); + } + } + continue; + } + if (sp < sl && isMalformed3_2(b1, src[sp])) { + throw malformedInputException(sp - 1, 2); + } + throw malformedInputException(sp, 1); + } else if ((b1 >> 3) == -2) { + if (sp + 2 < sl) { + int b2 = src[sp++]; + int b3 = src[sp++]; + int b4 = src[sp++]; + int uc = decode4(b1, b2, b3, b4); + if (isMalformed4(b2, b3, b4) || + !Character.isSupplementaryCodePoint(uc)) { // shortest form check + throw malformedInputException(sp - 4, 4); + } else { + StringUTF16.putChar(dst, dp++, Character.highSurrogate(uc)); + StringUTF16.putChar(dst, dp++, Character.lowSurrogate(uc)); + } + continue; + } + b1 &= 0xff; + if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { + throw malformedInputException(sp - 1, 1); // or 2 + } + throw malformedInputException(sp - 1, 1); + } else { + throw malformedInputException(sp - 1, 1); + } + } + return dp; + } + private static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) throws CharacterCodingException { ByteBuffer bb = ByteBuffer.wrap(src, offset, length); @@ -1259,29 +1314,58 @@ private static int 
malformed4(byte[] src, int sp) { return 3; } - private static void throwMalformed(int off, int nb) { + private static MalformedInputException malformedInputException(int off, int nb) { + MalformedInputException mie = new MalformedInputException(nb); String msg = "malformed input off : " + off + ", length : " + nb; - throw new IllegalArgumentException(msg, new MalformedInputException(nb)); + mie.initCause(new IllegalArgumentException(msg)); + return mie; } - private static void throwMalformed(byte[] val) { + private static MalformedInputException malformedInputException(byte[] val) { int dp = StringCoding.countPositives(val, 0, val.length); - throwMalformed(dp, 1); + return malformedInputException(dp, 1); } - private static void throwUnmappable(int off) { + private static UnmappableCharacterException unmappableCharacterException(int off) { + UnmappableCharacterException uce = new UnmappableCharacterException(1); String msg = "malformed input off : " + off + ", length : 1"; - throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); + uce.initCause(new IllegalArgumentException(msg)); /* the IAE only carries the offset message; it must not point back at uce, which would make the cause chain cyclic */ + return uce; } - private static void throwUnmappable(byte[] val) { + private static UnmappableCharacterException unmappableCharacterException(byte[] val) { int dp = StringCoding.countPositives(val, 0, val.length); - throwUnmappable(dp); + return unmappableCharacterException(dp); + } + + private static byte[] encodeUTF8(byte coder, byte[] val) { + if (coder == UTF16) { + return encodeUTF8_UTF16(val); + } + + if (!StringCoding.hasNegatives(val, 0, val.length)) { + return val.clone(); + } + + int dp = 0; + byte[] dst = StringUTF16.newBytesFor(val.length); + for (byte c : val) { + if (c < 0) { + dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); + dst[dp++] = (byte) (0x80 | (c & 0x3f)); + } else { + dst[dp++] = c; + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); } - private static byte[] encodeUTF8(byte coder, byte[] val, boolean
doReplace) { + private static byte[] encodeUTF8NoReplacement(byte coder, byte[] val) throws UnmappableCharacterException { if (coder == UTF16) { - return encodeUTF8_UTF16(val, doReplace); + return encodeUTF8_UTF16NoReplacement(val); } if (!StringCoding.hasNegatives(val, 0, val.length)) { @@ -1304,13 +1388,13 @@ private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { return Arrays.copyOf(dst, dp); } - private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { + private static byte[] encodeUTF8_UTF16(byte[] val) { int dp = 0; int sp = 0; int sl = val.length >> 1; // UTF-8 encoded can be as much as 3 times the string length // For very large estimate, (as in overflow of 32 bit int), precompute the exact size - long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16(val, doReplace) : sl * 3; + long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16(val) : sl * 3; if (allocLen > (long)Integer.MAX_VALUE) { throw new OutOfMemoryError("Required length exceeds implementation limit"); } @@ -1339,11 +1423,63 @@ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { uc = Character.toCodePoint(c, c2); } if (uc < 0) { - if (doReplace) { - dst[dp++] = '?'; - } else { - throwUnmappable(sp - 1); - } + dst[dp++] = '?'; + } else { + dst[dp++] = (byte)(0xf0 | ((uc >> 18))); + dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); + dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (uc & 0x3f)); + sp++; // 2 chars + } + } else { + // 3 bytes, 16 bits + dst[dp++] = (byte)(0xe0 | ((c >> 12))); + dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } + } + if (dp == dst.length) { + return dst; + } + return Arrays.copyOf(dst, dp); + } + + private static byte[] encodeUTF8_UTF16NoReplacement(byte[] val) throws UnmappableCharacterException { + int dp = 0; + int sp = 0; + int sl = val.length >> 1; + // UTF-8 encoded can be as much as 3 times the string length + // For very large estimate, (as in 
overflow of 32 bit int), precompute the exact size + long allocLen = (sl * 3 < 0) ? computeSizeUTF8_UTF16NoReplacement(val) : sl * 3; + if (allocLen > (long)Integer.MAX_VALUE) { + throw new OutOfMemoryError("Required length exceeds implementation limit"); + } + byte[] dst = new byte[(int) allocLen]; + while (sp < sl) { + // ascii fast loop; + char c = StringUTF16.getChar(val, sp); + if (c >= '\u0080') { + break; + } + dst[dp++] = (byte)c; + sp++; + } + while (sp < sl) { + char c = StringUTF16.getChar(val, sp++); + if (c < 0x80) { + dst[dp++] = (byte)c; + } else if (c < 0x800) { + dst[dp++] = (byte)(0xc0 | (c >> 6)); + dst[dp++] = (byte)(0x80 | (c & 0x3f)); + } else if (Character.isSurrogate(c)) { + int uc = -1; + char c2; + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { + uc = Character.toCodePoint(c, c2); + } + if (uc < 0) { + throw unmappableCharacterException(sp - 1); } else { dst[dp++] = (byte)(0xf0 | ((uc >> 18))); dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); @@ -1367,9 +1503,8 @@ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { /** * {@return the exact size required to UTF_8 encode this UTF16 string} * @param val UTF16 encoded byte array - * @param doReplace true to replace unmappable characters */ - private static long computeSizeUTF8_UTF16(byte[] val, boolean doReplace) { + private static long computeSizeUTF8_UTF16(byte[] val) { long dp = 0L; int sp = 0; int sl = val.length >> 1; @@ -1388,11 +1523,43 @@ private static long computeSizeUTF8_UTF16(byte[] val, boolean doReplace) { uc = Character.toCodePoint(c, c2); } if (uc < 0) { - if (doReplace) { - dp++; - } else { - throwUnmappable(sp - 1); - } + dp++; + } else { + dp += 4; + sp++; // 2 chars + } + } else { + // 3 bytes, 16 bits + dp += 3; + } + } + return dp; + } + + /** + * {@return the exact size required to UTF_8 encode this UTF16 string} + * @param val UTF16 encoded byte array + */ + private static long 
computeSizeUTF8_UTF16NoReplacement(byte[] val) throws UnmappableCharacterException { + long dp = 0L; + int sp = 0; + int sl = val.length >> 1; + + while (sp < sl) { + char c = StringUTF16.getChar(val, sp++); + if (c < 0x80) { + dp++; + } else if (c < 0x800) { + dp += 2; + } else if (Character.isSurrogate(c)) { + int uc = -1; + char c2; + if (Character.isHighSurrogate(c) && sp < sl && + Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { + uc = Character.toCodePoint(c, c2); + } + if (uc < 0) { + throw unmappableCharacterException(sp - 1); } else { dp += 4; sp++; // 2 chars @@ -1847,7 +2014,7 @@ public byte[] getBytes(String charsetName) public byte[] getBytes(Charset charset) { if (charset == null) throw new NullPointerException(); return encode(charset, coder(), value); - } + } /** * Encodes this {@code String} into a sequence of bytes using the diff --git a/src/java.base/share/classes/java/lang/System.java b/src/java.base/share/classes/java/lang/System.java index 1d62d69889603..2db2953aac693 100644 --- a/src/java.base/share/classes/java/lang/System.java +++ b/src/java.base/share/classes/java/lang/System.java @@ -55,7 +55,6 @@ import java.util.ResourceBundle; import java.util.Set; import java.util.concurrent.Executor; -import java.util.concurrent.ScheduledExecutorService; import java.util.function.Supplier; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Stream; @@ -2124,28 +2123,33 @@ public Stream layers(ClassLoader loader) { public int countPositives(byte[] bytes, int offset, int length) { return StringCoding.countPositives(bytes, offset, length); } + public int countNonZeroAscii(String s) { return StringCoding.countNonZeroAscii(s); } - public String uncheckedNewStringNoRepl(byte[] bytes, Charset cs) throws CharacterCodingException { - return String.newStringNoRepl(bytes, cs); + + public String uncheckedNewStringNoReplacement(byte[] bytes, Charset cs) throws CharacterCodingException { + return 
String.newStringNoReplacement(bytes, cs); } + public char uncheckedGetUTF16Char(byte[] bytes, int index) { return StringUTF16.getChar(bytes, index); } + public void uncheckedPutCharUTF16(byte[] bytes, int index, int ch) { StringUTF16.putChar(bytes, index, ch); } - public byte[] uncheckedGetBytesNoRepl(String s, Charset cs) throws CharacterCodingException { - return String.getBytesNoRepl(s, cs); + + public byte[] uncheckedGetBytesNoReplacement(String s, Charset cs) throws CharacterCodingException { + return String.getBytesNoReplacement(s, cs); } - public String newStringUTF8NoRepl(byte[] bytes, int off, int len) { - return String.newStringUTF8NoRepl(bytes, off, len, true); + public String newStringUTF8NoReplacement(byte[] bytes, int off, int len) throws CharacterCodingException { + return String.newStringUTF8NoReplacement(bytes, off, len, true); } - public byte[] getBytesUTF8NoRepl(String s) { - return String.getBytesUTF8NoRepl(s); + public byte[] getBytesUTF8NoReplacement(String s) throws CharacterCodingException { + return String.getBytesUTF8NoReplacement(s); } public void inflateBytesToChars(byte[] src, int srcOff, char[] dst, int dstOff, int len) { diff --git a/src/java.base/share/classes/java/math/BigDecimal.java b/src/java.base/share/classes/java/math/BigDecimal.java index 3b7d9e0d65bb8..4fd5946a2ad2e 100644 --- a/src/java.base/share/classes/java/math/BigDecimal.java +++ b/src/java.base/share/classes/java/math/BigDecimal.java @@ -4150,7 +4150,7 @@ private String layoutChars(boolean sci) { buf[highIntSize] = '.'; DecimalDigits.uncheckedPutPairLatin1(buf, highIntSize + 1, lowInt); try { - return JLA.uncheckedNewStringNoRepl(buf, StandardCharsets.ISO_8859_1); + return JLA.uncheckedNewStringNoReplacement(buf, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } diff --git a/src/java.base/share/classes/java/nio/file/Files.java b/src/java.base/share/classes/java/nio/file/Files.java index f8278fa2642f2..c7c250452c260 
100644 --- a/src/java.base/share/classes/java/nio/file/Files.java +++ b/src/java.base/share/classes/java/nio/file/Files.java @@ -3043,7 +3043,7 @@ public static String readString(Path path, Charset cs) throws IOException { byte[] ba = readAllBytes(path); if (path.getClass().getModule() != Object.class.getModule()) ba = ba.clone(); - return JLA.uncheckedNewStringNoRepl(ba, cs); + return JLA.uncheckedNewStringNoReplacement(ba, cs); } /** @@ -3362,7 +3362,7 @@ public static Path writeString(Path path, CharSequence csq, Charset cs, OpenOpti Objects.requireNonNull(csq); Objects.requireNonNull(cs); - byte[] bytes = JLA.uncheckedGetBytesNoRepl(String.valueOf(csq), cs); + byte[] bytes = JLA.uncheckedGetBytesNoReplacement(String.valueOf(csq), cs); if (path.getClass().getModule() != Object.class.getModule()) bytes = bytes.clone(); write(path, bytes, options); diff --git a/src/java.base/share/classes/java/util/HexFormat.java b/src/java.base/share/classes/java/util/HexFormat.java index 99d047995fdb9..c5db3e9b956e6 100644 --- a/src/java.base/share/classes/java/util/HexFormat.java +++ b/src/java.base/share/classes/java/util/HexFormat.java @@ -462,7 +462,7 @@ private String formatOptDelimiter(byte[] bytes, int fromIndex, int toIndex) { } try { // Return a new string using the bytes without making a copy - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } @@ -696,7 +696,7 @@ public String toHexDigits(byte value) { rep[0] = (byte)toHighHexDigit(value); rep[1] = (byte)toLowHexDigit(value); try { - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } @@ -732,7 +732,7 @@ public String toHexDigits(short value) { rep[3] = 
(byte)toLowHexDigit((byte)value); try { - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } @@ -760,7 +760,7 @@ public String toHexDigits(int value) { rep[7] = (byte)toLowHexDigit((byte)value); try { - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } @@ -796,7 +796,7 @@ public String toHexDigits(long value) { rep[15] = (byte)toLowHexDigit((byte)value); try { - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } @@ -824,7 +824,7 @@ public String toHexDigits(long value, int digits) { value = value >>> 4; } try { - return jla.uncheckedNewStringNoRepl(rep, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(rep, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } diff --git a/src/java.base/share/classes/java/util/UUID.java b/src/java.base/share/classes/java/util/UUID.java index 5961fce9cb23c..f1cab61884cd4 100644 --- a/src/java.base/share/classes/java/util/UUID.java +++ b/src/java.base/share/classes/java/util/UUID.java @@ -480,7 +480,7 @@ public String toString() { ByteArrayLittleEndian.setLong(buf, 28, hex8(leastSigBits)); try { - return jla.uncheckedNewStringNoRepl(buf, StandardCharsets.ISO_8859_1); + return jla.uncheckedNewStringNoReplacement(buf, StandardCharsets.ISO_8859_1); } catch (CharacterCodingException cce) { throw new AssertionError(cce); } diff --git a/src/java.base/share/classes/java/util/zip/ZipCoder.java b/src/java.base/share/classes/java/util/zip/ZipCoder.java 
index 0c3282e351841..146b1f87b31d5 100644 --- a/src/java.base/share/classes/java/util/zip/ZipCoder.java +++ b/src/java.base/share/classes/java/util/zip/ZipCoder.java @@ -252,12 +252,20 @@ boolean isUTF8() { @Override String toString(byte[] ba, int off, int length) { - return JLA.newStringUTF8NoRepl(ba, off, length); + try { + return JLA.newStringUTF8NoReplacement(ba, off, length); + } catch (CharacterCodingException cce) { + throw new IllegalArgumentException(cce); + } } @Override byte[] getBytes(String s) { - return JLA.getBytesUTF8NoRepl(s); + try { + return JLA.getBytesUTF8NoReplacement(s); + } catch (CharacterCodingException cce) { + throw new IllegalArgumentException(cce); + } } @Override @@ -271,9 +279,9 @@ int checkedHash(byte[] a, int off, int len) throws Exception { // Non-ASCII, fall back to decoding a String // We avoid using decoder() here since the UTF8ZipCoder is // shared and that decoder is not thread safe. - // We use the JLA.newStringUTF8NoRepl variant to throw + // We use the JLA.newStringUTF8NoReplacement variant to throw // exceptions eagerly when opening ZipFiles - return hash(JLA.newStringUTF8NoRepl(a, off, len)); + return hash(JLA.newStringUTF8NoReplacement(a, off, len)); } int h = ArraysSupport.hashCodeOfUnsigned(a, off, len, 0); if (a[end - 1] != '/') { @@ -289,7 +297,7 @@ private boolean hasTrailingSlash(byte[] a, int end) { @Override byte compare(String str, byte[] b, int off, int len, boolean matchDirectory) { try { - byte[] encoded = JLA.uncheckedGetBytesNoRepl(str, UTF_8.INSTANCE); + byte[] encoded = JLA.uncheckedGetBytesNoReplacement(str, UTF_8.INSTANCE); int mismatch = Arrays.mismatch(encoded, 0, encoded.length, b, off, off+len); if (mismatch == -1) { return EXACT_MATCH; diff --git a/src/java.base/share/classes/jdk/internal/access/JavaLangAccess.java b/src/java.base/share/classes/jdk/internal/access/JavaLangAccess.java index efa36b5b2d8fc..bff9de846d224 100644 --- a/src/java.base/share/classes/jdk/internal/access/JavaLangAccess.java 
+++ b/src/java.base/share/classes/jdk/internal/access/JavaLangAccess.java @@ -330,35 +330,35 @@ public interface JavaLangAccess { * @return the newly created string * @throws CharacterCodingException for malformed or unmappable bytes */ - String uncheckedNewStringNoRepl(byte[] bytes, Charset cs) throws CharacterCodingException; + String uncheckedNewStringNoReplacement(byte[] bytes, Charset cs) throws CharacterCodingException; /** - * Encode the given string into a sequence of bytes using the specified - * {@linkplain java.nio.charset.Charset charset}. + * {@return the sequence of bytes obtained by encoding the given string in + * the specified {@linkplain java.nio.charset.Charset charset}} *

* WARNING: This method returns the {@code byte[]} backing the provided * {@code String}, if the input is ASCII. Hence, the returned byte array * must not be modified. - *

- * This method throws {@code CharacterCodingException} instead of replacing - * when malformed input or unmappable characters are encountered. * * @param s the string to encode * @param cs the charset - * @return the encoded bytes - * @throws CharacterCodingException for malformed input or unmappable characters + * @throws NullPointerException If {@code s} or {@code cs} is null + * @throws CharacterCodingException For malformed input or unmappable characters */ - byte[] uncheckedGetBytesNoRepl(String s, Charset cs) throws CharacterCodingException; + byte[] uncheckedGetBytesNoReplacement(String s, Charset cs) throws CharacterCodingException; /** - * Returns a new string by decoding from the given UTF-8 bytes array. + * {@return a new string by decoding from the given UTF-8 bytes array} * - * @param off the index of the first byte to decode - * @param len the number of bytes to decode - * @return the newly created string - * @throws IllegalArgumentException for malformed or unmappable bytes. + * @param offset the index of the first byte to decode + * @param length the number of bytes to decode + * @throws NullPointerException If {@code bytes} is null + * @throws StringIndexOutOfBoundsException If {@code offset} is negative, + * {@code length} is negative, or {@code offset} is greater than + * {@code bytes.length - length} + * @throws CharacterCodingException For malformed input or unmappable characters */ - String newStringUTF8NoRepl(byte[] bytes, int off, int len); + String newStringUTF8NoReplacement(byte[] bytes, int offset, int length) throws CharacterCodingException; /** * Get the {@code char} at {@code index} in a {@code byte[]} in internal @@ -384,13 +384,13 @@ public interface JavaLangAccess { void uncheckedPutCharUTF16(byte[] bytes, int index, int ch); /** - * Encode the given string into a sequence of bytes using utf8. 
+ * {@return the sequence of bytes obtained by encoding the given string in UTF-8} * * @param s the string to encode - * @return the encoded bytes in utf8 - * @throws IllegalArgumentException for malformed surrogates + * @throws NullPointerException If {@code s} is null + * @throws CharacterCodingException For malformed input or unmappable characters */ - byte[] getBytesUTF8NoRepl(String s); + byte[] getBytesUTF8NoReplacement(String s) throws CharacterCodingException; /** * Inflated copy from {@code byte[]} to {@code char[]}, as defined by diff --git a/src/java.base/unix/classes/sun/nio/fs/UnixPath.java b/src/java.base/unix/classes/sun/nio/fs/UnixPath.java index 5dfc73f57aaed..c47c3fb70f1b2 100644 --- a/src/java.base/unix/classes/sun/nio/fs/UnixPath.java +++ b/src/java.base/unix/classes/sun/nio/fs/UnixPath.java @@ -126,7 +126,7 @@ private static String normalize(String input, int len, int off) { private static byte[] encode(UnixFileSystem fs, String input) { input = fs.normalizeNativePath(input); try { - return JLA.uncheckedGetBytesNoRepl(input, Util.jnuEncoding()); + return JLA.uncheckedGetBytesNoReplacement(input, Util.jnuEncoding()); } catch (CharacterCodingException cce) { throw new InvalidPathException(input, "Malformed input or input contains unmappable characters"); diff --git a/test/jdk/java/lang/String/NoReplTest.java b/test/jdk/java/lang/String/NoReplTest.java index 1817a1ffe7367..2f7344327824a 100644 --- a/test/jdk/java/lang/String/NoReplTest.java +++ b/test/jdk/java/lang/String/NoReplTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,7 +24,7 @@ /* * @test * @bug 8286287 8288589 - * @summary Tests for *NoRepl() shared secret methods. 
+ * @summary Tests for *NoReplacement() shared secret methods. * @run testng NoReplTest * @modules jdk.charsets */ @@ -45,11 +45,11 @@ public class NoReplTest { private final static Charset WINDOWS_1252 = Charset.forName("windows-1252"); /** - * Verifies newStringNoRepl() throws a CharacterCodingException. + * Verifies `uncheckedNewStringNoReplacement()` throws a `CharacterCodingException`. * The method is invoked by `Files.readString()` method. */ @Test - public void newStringNoReplTest() throws IOException { + public void uncheckedNewStringNoReplacementTest() throws IOException { var f = Files.createTempFile(null, null); try (var fos = Files.newOutputStream(f)) { fos.write(MALFORMED_UTF16); @@ -67,11 +67,11 @@ public void newStringNoReplTest() throws IOException { } /** - * Verifies getBytesNoRepl() throws a CharacterCodingException. + * Verifies `uncheckedGetBytesNoReplacement()` throws a `CharacterCodingException`. * The method is invoked by `Files.writeString()` method. */ @Test - public void getBytesNoReplTest() throws IOException { + public void uncheckedGetBytesNoReplacementTest() throws IOException { var f = Files.createTempFile(null, null); try { Files.writeString(f, MALFORMED_WINDOWS_1252, WINDOWS_1252);