perl11
diff --git a/‎MANIFEST‎
Lines changed: 1 addition & 0 deletions b/‎MANIFEST‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Porting/Maintainers.pl‎
Lines changed: 7 additions & 1 deletion b/‎Porting/Maintainers.pl‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎cpan/Encode/Encode.pm‎
Lines changed: 32 additions & 21 deletions b/‎cpan/Encode/Encode.pm‎
Lines changed: 32 additions & 21 deletions
diff --git a/‎cpan/Encode/Encode.xs‎
Lines changed: 76 additions & 43 deletions b/‎cpan/Encode/Encode.xs‎
Lines changed: 76 additions & 43 deletions
diff --git a/‎cpan/Encode/Makefile.PL‎
Lines changed: 1 addition & 1 deletion b/‎cpan/Encode/Makefile.PL‎
Lines changed: 1 addition & 1 deletion
@@ -1031,6 +1031,7 @@ cpan/Encode/t/rt86327.t
 cpan/Encode/t/taint.t
 cpan/Encode/t/unibench.pl		benchmark script
 cpan/Encode/t/Unicode.t			test script
+cpan/Encode/t/utf8messages.t
 cpan/Encode/t/utf8ref.t			test script
 cpan/Encode/t/utf8strict.t		test script
 cpan/Encode/t/utf8warnings.t
 
@@ -568,8 +568,14 @@ package Maintainers;
     },
 
     'Encode' => {
-        'DISTRIBUTION' => 'DANKOGAI/Encode-2.88.tar.gz',
+        'DISTRIBUTION' => 'DANKOGAI/Encode-2.89.tar.gz',
         'FILES'        => q[cpan/Encode],
+        # undeprecate encoding
+        'CUSTOMIZED'   => [ qw(
+                            encoding.pm
+                            t/enc_eucjp.t
+                            t/enc_utf8.t
+                            )],
     },
 
     'encoding::warnings' => {
 
@@ -1,10 +1,10 @@
 #
-# $Id: Encode.pm,v 2.88 2016/11/29 23:30:30 dankogai Exp dankogai $
+# $Id: Encode.pm,v 2.89 2017/04/21 05:20:14 dankogai Exp dankogai $
 #
 package Encode;
 use strict;
 use warnings;
-our $VERSION = sprintf "%d.%02d", q$Revision: 2.88 $ =~ /(\d+)/g;
+our $VERSION = sprintf "%d.%02d", q$Revision: 2.89 $ =~ /(\d+)/g;
 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
 use XSLoader ();
 XSLoader::load( __PACKAGE__, $VERSION );
@@ -516,14 +516,16 @@ ISO-8859-1, also known as Latin1:
 
   $octets = encode("iso-8859-1", $string);
 
-B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
+B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
 $octets I<might not be equal to> $string.  Though both contain the
 same data, the UTF8 flag for $octets is I<always> off.  When you
 encode anything, the UTF8 flag on the result is always off, even when it
-contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
+contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
 
 If the $string is C<undef>, then C<undef> is returned.
 
+C<str2bytes> may be used as an alias for C<encode>.
+
 =head3 decode
 
   $string = decode(ENCODING, OCTETS[, CHECK])
@@ -544,13 +546,15 @@ internal format:
 
   $string = decode("iso-8859-1", $octets);
 
-B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
+B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
 I<might not be equal to> $octets.  Though both contain the same data, the
 UTF8 flag for $string is on.  See L</"The UTF8 flag">
 below.
 
 If the $string is C<undef>, then C<undef> is returned.
 
+C<bytes2str> may be used as an alias for C<decode>.
+
 =head3 find_encoding
 
   [$obj =] find_encoding(ENCODING)
@@ -559,11 +563,11 @@ Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
 C<undef> if no matching I<ENCODING> is find.  The returned object is
 what does the actual encoding or decoding.
 
-  $utf8 = decode($name, $bytes);
+  $string = decode($name, $bytes);
 
 is in fact
 
-    $utf8 = do {
+    $string = do {
         $obj = find_encoding($name);
         croak qq(encoding "$name" not found) unless ref $obj;
         $obj->decode($bytes);
@@ -575,8 +579,8 @@ You can therefore save time by reusing this object as follows;
 
     my $enc = find_encoding("iso-8859-1");
     while(<>) {
-        my $utf8 = $enc->decode($_);
-        ... # now do something with $utf8;
+        my $string = $enc->decode($_);
+        ... # now do something with $string;
     }
 
 Besides L</decode> and L</encode>, other methods are
@@ -624,13 +628,13 @@ and C<undef> on error.
 
 B<CAVEAT>: The following operations may look the same, but are not:
 
-  from_to($data, "iso-8859-1", "utf8"); #1
+  from_to($data, "iso-8859-1", "UTF-8"); #1
   $data = decode("iso-8859-1", $data);  #2
 
 Both #1 and #2 make $data consist of a completely valid UTF-8 string,
 but only #2 turns the UTF8 flag on.  #1 is equivalent to:
 
-  $data = encode("utf8", decode("iso-8859-1", $data));
+  $data = encode("UTF-8", decode("iso-8859-1", $data));
 
 See L</"The UTF8 flag"> below.
 
@@ -655,19 +659,27 @@ followed by C<encode> as follows:
 Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
 $string are encoded in Perl's internal format, and the result is returned
 as a sequence of octets.  Because all possible characters in Perl have a
-(loose, not strict) UTF-8 representation, this function cannot fail.
+(loose, not strict) utf8 representation, this function cannot fail.
+
+B<WARNING>: do not use this function for data exchange, as it can produce
+$octets that are not strictly valid UTF-8! For strictly valid UTF-8 output
+use C<$octets = encode("UTF-8", $string)>.
 
 =head3 decode_utf8
 
   $string = decode_utf8($octets [, CHECK]);
 
 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
 The sequence of octets represented by $octets is decoded
-from UTF-8 into a sequence of logical characters.
-Because not all sequences of octets are valid UTF-8,
+from (loose, not strict) utf8 into a sequence of logical characters.
+Because not all sequences of octets are valid even as loose utf8,
 it is quite possible for this function to fail.
 For CHECK, see L</"Handling Malformed Data">.
 
+B<WARNING>: do not use this function for data exchange, as it can produce
+a $string whose utf8 representation is not strict! For a strictly valid UTF-8
+$string representation, use C<$string = decode("UTF-8", $octets [, CHECK])>.
+
 B<CAVEAT>: the input I<$octets> might be modified in-place depending on
 what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
 left unchanged.
@@ -903,15 +915,14 @@ octets that represent the fallback character.  For instance:
 
 Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
 
-Even the fallback for C<decode> must return octets, which are
-then decoded with the character encoding that C<decode> accepts. So for
+The fallback for C<decode> must return a decoded string (a sequence of
+characters) and takes a list of ordinal values as its arguments. So for
 example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
 a fallback for bytes that are not valid UTF-8, you could write
 
     $str = decode 'UTF-8', $octets, sub {
-        my $tmp = chr shift;
-        from_to $tmp, 'ISO-8859-15', 'UTF-8';
-        return $tmp;
+        my $tmp = join '', map chr, @_;
+        return decode 'ISO-8859-15', $tmp;
     };
 
 =head1 Defining Encodings
@@ -980,9 +991,9 @@ When you I<encode>, the resulting UTF8 flag is always B<off>.
 
 When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
 unambiguously represent data.  Here is what we mean by "unambiguously".
-After C<$utf8 = decode("foo", $octet)>,
+After C<$str = decode("foo", $octet)>,
 
-  When $octet is...   The UTF8 flag in $utf8 is
+  When $octet is...    The UTF8 flag in $str is
   ---------------------------------------------
   In ASCII only (or EBCDIC only)            OFF
   In ISO-8859-1                              ON
 
@@ -1,5 +1,5 @@
 /*
- $Id: Encode.xs,v 2.39 2016/11/29 23:29:23 dankogai Exp dankogai $
+ $Id: Encode.xs,v 2.40 2017/04/21 05:20:14 dankogai Exp dankogai $
  */
 
 #define PERL_NO_GET_CONTEXT
@@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
 #define SvIV_nomg SvIV
 #endif
 
-#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-#   define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-#else
-#   define UTF8_ALLOW_STRICT 0
-#endif
-
-#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY &                    \
-                              ~(UTF8_ALLOW_CONTINUATION |         \
-                                UTF8_ALLOW_NON_CONTINUATION |     \
-                                UTF8_ALLOW_LONG))
-
 static void
 Encode_XSEncoding(pTHX_ encode_t * enc)
 {
@@ -114,17 +103,18 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)
 
 #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
 #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
+#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode"
 
 static SV *
 do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
 {
     dSP;
     int argc;
-    SV *retval = newSVpv("",0);
+    SV *retval = newSVpvn("",0);
     ENTER;
     SAVETMPS;
     PUSHMARK(sp);
-    XPUSHs(sv_2mortal(newSVnv((UV)ch)));
+    XPUSHs(sv_2mortal(newSVuv(ch)));
     PUTBACK;
     argc = call_sv(fallback_cb, G_SCALAR);
     SPAGAIN;
@@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
     return retval;
 }
 
+static SV *
+do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb)
+{
+    dSP;
+    int argc;
+    STRLEN i;
+    SV *retval = newSVpvn("",0);
+    ENTER;
+    SAVETMPS;
+    PUSHMARK(sp);
+    for (i=0; i<slen; ++i)
+        XPUSHs(sv_2mortal(newSVuv(s[i])));
+    PUTBACK;
+    argc = call_sv(fallback_cb, G_SCALAR);
+    SPAGAIN;
+    if (argc != 1){
+        croak("fallback sub must return scalar!");
+    }
+    sv_catsv(retval, POPs);
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+    return retval;
+}
+
 static SV *
 encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
 	      int check, STRLEN * offset, SV * term, int * retcode, 
@@ -382,7 +397,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
     U8 *ptr = s;
     bool overflowed = 0;
 
-    uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
+    uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));
 
     len--;
     s++;
@@ -401,7 +416,6 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
     *rlen = s-ptr;
 
     if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) {
-        *rlen = 1;
         return 0;
     }
 
@@ -413,11 +427,13 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
              bool encode, bool strict, bool stop_at_partial)
 {
     UV uv;
+    STRLEN i;
     STRLEN ulen;
     SV *fallback_cb;
     int check;
     U8 *d;
     STRLEN dlen;
+    char esc[80]; /* need to store UTF8SKIP * 6 + 1 */
 
     if (SvROK(check_sv)) {
 	/* croak("UTF-8 decoder doesn't support callback CHECK"); */
@@ -442,21 +458,22 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
         }
 
         ulen = 1;
-        if (UTF8_IS_START(*s)) {
+        if (! UTF8_IS_CONTINUATION(*s)) {
+            /* Neither an invariant nor a continuation; must be a start byte.
+             * (We can't test for UTF8_IS_START as that excludes things like
+             * \xC0, which are start bytes but always lead to overlongs.) */
+
             U8 skip = UTF8SKIP(s);
             if ((s + skip) > e) {
-                if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
-                    const U8 *p = s + 1;
-                    for (; p < e; p++) {
-                        if (!UTF8_IS_CONTINUATION(*p)) {
-                            ulen = p-s;
-                            goto malformed_byte;
-                        }
-                    }
+                /* just calculate ulen; in pathological cases it can be smaller than e-s */
+                if (e-s >= 2)
+                    convert_utf8_multi_seq(s, e-s, &ulen);
+                else
+                    ulen = 1;
+
+                if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == (STRLEN)(e-s))
                     break;
-                }
 
-                ulen = e-s;
                 goto malformed_byte;
             }
 
@@ -475,40 +492,56 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
         }
 
         /* If we get here there is something wrong with alleged UTF-8 */
+        /* uv is used only when encoding */
     malformed_byte:
-        uv = (UV)*s;
-        if (ulen == 0)
+        if (uv == 0)
+            uv = (UV)*s;
+        if (encode || ulen == 0)
             ulen = 1;
 
     malformed:
+        if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
+            for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
         if (check & ENCODE_DIE_ON_ERR){
             if (encode)
-                Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
+                Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
             else
-                Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
+                Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
         }
         if (check & ENCODE_WARN_ON_ERR){
             if (encode)
                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                            ERR_ENCODE_NOMAP, uv, "utf8");
+                            ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
             else
                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                            ERR_DECODE_NOMAP, "utf8", uv);
+                            ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
         }
         if (check & ENCODE_RETURN_ON_ERR) {
                 break;
         }
         if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
-	    SV* subchar =
-		(fallback_cb != &PL_sv_undef)
-		? do_fallback_cb(aTHX_ uv, fallback_cb)
-		: newSVpvf(check & ENCODE_PERLQQ 
-			   ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
-			   :  check & ENCODE_HTMLCREF ? "&#%" UVuf ";" 
-			   : "&#x%" UVxf ";", uv);
-	    if (encode){
-		SvUTF8_off(subchar); /* make sure no decoded string gets in */
-	    }
+            SV* subchar;
+            if (encode) {
+                subchar =
+                    (fallback_cb != &PL_sv_undef)
+                    ? do_fallback_cb(aTHX_ uv, fallback_cb)
+                    : newSVpvf(check & ENCODE_PERLQQ
+                        ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
+                        :  check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
+                        : "&#x%" UVxf ";", uv);
+                SvUTF8_off(subchar); /* make sure no decoded string gets in */
+            } else {
+                if (fallback_cb != &PL_sv_undef) {
+                    /* in decode mode we have sequence of wrong bytes */
+                    subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb);
+                } else {
+                    char *ptr = esc;
+                    /* ENCODE_PERLQQ is already stored in esc */
+                    if (check & (ENCODE_HTMLCREF|ENCODE_XMLCREF))
+                        for (i=0; i<ulen; ++i) ptr += sprintf(ptr, ((check & ENCODE_HTMLCREF) ? "&#%u;" : "&#x%02X;"), s[i]);
+                    subchar = newSVpvn(esc, strlen(esc));
+                }
+            }
             dlen += SvCUR(subchar) - ulen;
             SvCUR_set(dst, d-(U8 *)SvPVX(dst));
             *SvEND(dst) = '\0';
 
@@ -1,5 +1,5 @@
 #
-# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp dankogai $
+# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp $
 #
 use 5.007003;
 use strict;
Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`#`
`2`		`-# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp dankogai $`
	`2`	`+# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp $`
`3`	`3`	`#`
`4`	`4`	`use 5.007003;`
`5`	`5`	`use strict;`