Skip to content
This repository was archived by the owner on Jun 1, 2023. It is now read-only.

Commit 086ff1b

Browse files
committed
Encode: Update to 2.89, more encoding undeprecation
1 parent c663e21 commit 086ff1b

File tree

25 files changed

+221
-92
lines changed

25 files changed

+221
-92
lines changed

MANIFEST

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,7 @@ cpan/Encode/t/rt86327.t
10311031
cpan/Encode/t/taint.t
10321032
cpan/Encode/t/unibench.pl benchmark script
10331033
cpan/Encode/t/Unicode.t test script
1034+
cpan/Encode/t/utf8messages.t
10341035
cpan/Encode/t/utf8ref.t test script
10351036
cpan/Encode/t/utf8strict.t test script
10361037
cpan/Encode/t/utf8warnings.t

Porting/Maintainers.pl

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -568,8 +568,14 @@ package Maintainers;
568568
},
569569

570570
'Encode' => {
571-
'DISTRIBUTION' => 'DANKOGAI/Encode-2.88.tar.gz',
571+
'DISTRIBUTION' => 'DANKOGAI/Encode-2.89.tar.gz',
572572
'FILES' => q[cpan/Encode],
573+
# undeprecate encoding
574+
'CUSTOMIZED' => [ qw(
575+
encoding.pm
576+
t/enc_eucjp.t
577+
t/enc_utf8.t
578+
)],
573579
},
574580

575581
'encoding::warnings' => {

cpan/Encode/Encode.pm

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
#
2-
# $Id: Encode.pm,v 2.88 2016/11/29 23:30:30 dankogai Exp dankogai $
2+
# $Id: Encode.pm,v 2.89 2017/04/21 05:20:14 dankogai Exp dankogai $
33
#
44
package Encode;
55
use strict;
66
use warnings;
7-
our $VERSION = sprintf "%d.%02d", q$Revision: 2.88 $ =~ /(\d+)/g;
7+
our $VERSION = sprintf "%d.%02d", q$Revision: 2.89 $ =~ /(\d+)/g;
88
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
99
use XSLoader ();
1010
XSLoader::load( __PACKAGE__, $VERSION );
@@ -516,14 +516,16 @@ ISO-8859-1, also known as Latin1:
516516
517517
$octets = encode("iso-8859-1", $string);
518518
519-
B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then
519+
B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
520520
$octets I<might not be equal to> $string. Though both contain the
521521
same data, the UTF8 flag for $octets is I<always> off. When you
522522
encode anything, the UTF8 flag on the result is always off, even when it
523-
contains a completely valid utf8 string. See L</"The UTF8 flag"> below.
523+
contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
524524
525525
If the $string is C<undef>, then C<undef> is returned.
526526
527+
C<str2bytes> may be used as an alias for C<encode>.
528+
527529
=head3 decode
528530
529531
$string = decode(ENCODING, OCTETS[, CHECK])
@@ -544,13 +546,15 @@ internal format:
544546
545547
$string = decode("iso-8859-1", $octets);
546548
547-
B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
549+
B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
548550
I<might not be equal to> $octets. Though both contain the same data, the
549551
UTF8 flag for $string is on. See L</"The UTF8 flag">
550552
below.
551553
552554
If the $string is C<undef>, then C<undef> is returned.
553555
556+
C<bytes2str> may be used as an alias for C<decode>.
557+
554558
=head3 find_encoding
555559
556560
[$obj =] find_encoding(ENCODING)
@@ -559,11 +563,11 @@ Returns the I<encoding object> corresponding to I<ENCODING>. Returns
559563
C<undef> if no matching I<ENCODING> is find. The returned object is
560564
what does the actual encoding or decoding.
561565
562-
$utf8 = decode($name, $bytes);
566+
$string = decode($name, $bytes);
563567
564568
is in fact
565569
566-
$utf8 = do {
570+
$string = do {
567571
$obj = find_encoding($name);
568572
croak qq(encoding "$name" not found) unless ref $obj;
569573
$obj->decode($bytes);
@@ -575,8 +579,8 @@ You can therefore save time by reusing this object as follows;
575579
576580
my $enc = find_encoding("iso-8859-1");
577581
while(<>) {
578-
my $utf8 = $enc->decode($_);
579-
... # now do something with $utf8;
582+
my $string = $enc->decode($_);
583+
... # now do something with $string;
580584
}
581585
582586
Besides L</decode> and L</encode>, other methods are
@@ -624,13 +628,13 @@ and C<undef> on error.
624628
625629
B<CAVEAT>: The following operations may look the same, but are not:
626630
627-
from_to($data, "iso-8859-1", "utf8"); #1
631+
from_to($data, "iso-8859-1", "UTF-8"); #1
628632
$data = decode("iso-8859-1", $data); #2
629633
630634
Both #1 and #2 make $data consist of a completely valid UTF-8 string,
631635
but only #2 turns the UTF8 flag on. #1 is equivalent to:
632636
633-
$data = encode("utf8", decode("iso-8859-1", $data));
637+
$data = encode("UTF-8", decode("iso-8859-1", $data));
634638
635639
See L</"The UTF8 flag"> below.
636640
@@ -655,19 +659,27 @@ followed by C<encode> as follows:
655659
Equivalent to C<$octets = encode("utf8", $string)>. The characters in
656660
$string are encoded in Perl's internal format, and the result is returned
657661
as a sequence of octets. Because all possible characters in Perl have a
658-
(loose, not strict) UTF-8 representation, this function cannot fail.
662+
(loose, not strict) utf8 representation, this function cannot fail.
663+
664+
B<WARNING>: do not use this function for data exchange as it can produce
665+
not strict utf8 $octets! For strictly valid UTF-8 output use
666+
C<$octets = encode("UTF-8", $string)>.
659667
660668
=head3 decode_utf8
661669
662670
$string = decode_utf8($octets [, CHECK]);
663671
664672
Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
665673
The sequence of octets represented by $octets is decoded
666-
from UTF-8 into a sequence of logical characters.
667-
Because not all sequences of octets are valid UTF-8,
674+
from (loose, not strict) utf8 into a sequence of logical characters.
675+
Because not all sequences of octets are valid not strict utf8,
668676
it is quite possible for this function to fail.
669677
For CHECK, see L</"Handling Malformed Data">.
670678
679+
B<WARNING>: do not use this function for data exchange as it can produce
680+
$string with not strict utf8 representation! For strictly valid UTF-8
681+
$string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
682+
671683
B<CAVEAT>: the input I<$octets> might be modified in-place depending on
672684
what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
673685
left unchanged.
@@ -903,15 +915,14 @@ octets that represent the fallback character. For instance:
903915
904916
Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
905917
906-
Even the fallback for C<decode> must return octets, which are
907-
then decoded with the character encoding that C<decode> accepts. So for
918+
Fallback for C<decode> must return decoded string (sequence of characters)
919+
and takes a list of ordinal values as its arguments. So for
908920
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
909921
a fallback for bytes that are not valid UTF-8, you could write
910922
911923
$str = decode 'UTF-8', $octets, sub {
912-
my $tmp = chr shift;
913-
from_to $tmp, 'ISO-8859-15', 'UTF-8';
914-
return $tmp;
924+
my $tmp = join '', map chr, @_;
925+
return decode 'ISO-8859-15', $tmp;
915926
};
916927
917928
=head1 Defining Encodings
@@ -980,9 +991,9 @@ When you I<encode>, the resulting UTF8 flag is always B<off>.
980991
981992
When you I<decode>, the resulting UTF8 flag is B<on>--I<unless> you can
982993
unambiguously represent data. Here is what we mean by "unambiguously".
983-
After C<$utf8 = decode("foo", $octet)>,
994+
After C<$str = decode("foo", $octet)>,
984995
985-
When $octet is... The UTF8 flag in $utf8 is
996+
When $octet is... The UTF8 flag in $str is
986997
---------------------------------------------
987998
In ASCII only (or EBCDIC only) OFF
988999
In ISO-8859-1 ON

cpan/Encode/Encode.xs

Lines changed: 76 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
$Id: Encode.xs,v 2.39 2016/11/29 23:29:23 dankogai Exp dankogai $
2+
$Id: Encode.xs,v 2.40 2017/04/21 05:20:14 dankogai Exp dankogai $
33
*/
44

55
#define PERL_NO_GET_CONTEXT
@@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
3535
#define SvIV_nomg SvIV
3636
#endif
3737

38-
#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
39-
# define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE
40-
#else
41-
# define UTF8_ALLOW_STRICT 0
42-
#endif
43-
44-
#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \
45-
~(UTF8_ALLOW_CONTINUATION | \
46-
UTF8_ALLOW_NON_CONTINUATION | \
47-
UTF8_ALLOW_LONG))
48-
4938
static void
5039
Encode_XSEncoding(pTHX_ encode_t * enc)
5140
{
@@ -114,17 +103,18 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)
114103

115104
#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
116105
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
106+
#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode"
117107

118108
static SV *
119109
do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
120110
{
121111
dSP;
122112
int argc;
123-
SV *retval = newSVpv("",0);
113+
SV *retval = newSVpvn("",0);
124114
ENTER;
125115
SAVETMPS;
126116
PUSHMARK(sp);
127-
XPUSHs(sv_2mortal(newSVnv((UV)ch)));
117+
XPUSHs(sv_2mortal(newSVuv(ch)));
128118
PUTBACK;
129119
argc = call_sv(fallback_cb, G_SCALAR);
130120
SPAGAIN;
@@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
138128
return retval;
139129
}
140130

131+
static SV *
132+
do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb)
133+
{
134+
dSP;
135+
int argc;
136+
STRLEN i;
137+
SV *retval = newSVpvn("",0);
138+
ENTER;
139+
SAVETMPS;
140+
PUSHMARK(sp);
141+
for (i=0; i<slen; ++i)
142+
XPUSHs(sv_2mortal(newSVuv(s[i])));
143+
PUTBACK;
144+
argc = call_sv(fallback_cb, G_SCALAR);
145+
SPAGAIN;
146+
if (argc != 1){
147+
croak("fallback sub must return scalar!");
148+
}
149+
sv_catsv(retval, POPs);
150+
PUTBACK;
151+
FREETMPS;
152+
LEAVE;
153+
return retval;
154+
}
155+
141156
static SV *
142157
encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
143158
int check, STRLEN * offset, SV * term, int * retcode,
@@ -382,7 +397,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
382397
U8 *ptr = s;
383398
bool overflowed = 0;
384399

385-
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
400+
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));
386401

387402
len--;
388403
s++;
@@ -401,7 +416,6 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
401416
*rlen = s-ptr;
402417

403418
if (overflowed || *rlen > (STRLEN)UNISKIP(uv)) {
404-
*rlen = 1;
405419
return 0;
406420
}
407421

@@ -413,11 +427,13 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
413427
bool encode, bool strict, bool stop_at_partial)
414428
{
415429
UV uv;
430+
STRLEN i;
416431
STRLEN ulen;
417432
SV *fallback_cb;
418433
int check;
419434
U8 *d;
420435
STRLEN dlen;
436+
char esc[80]; /* need to store UTF8SKIP * 6 + 1 */
421437

422438
if (SvROK(check_sv)) {
423439
/* croak("UTF-8 decoder doesn't support callback CHECK"); */
@@ -442,21 +458,22 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
442458
}
443459

444460
ulen = 1;
445-
if (UTF8_IS_START(*s)) {
461+
if (! UTF8_IS_CONTINUATION(*s)) {
462+
/* Not an invariant nor a continuation; must be a start byte. (We
463+
* can't test for UTF8_IS_START as that excludes things like \xC0
464+
* which are start bytes, but always lead to overlongs */
465+
446466
U8 skip = UTF8SKIP(s);
447467
if ((s + skip) > e) {
448-
if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
449-
const U8 *p = s + 1;
450-
for (; p < e; p++) {
451-
if (!UTF8_IS_CONTINUATION(*p)) {
452-
ulen = p-s;
453-
goto malformed_byte;
454-
}
455-
}
468+
/* just calculate ulen, in pathological cases can be smaller then e-s */
469+
if (e-s >= 2)
470+
convert_utf8_multi_seq(s, e-s, &ulen);
471+
else
472+
ulen = 1;
473+
474+
if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == (STRLEN)(e-s))
456475
break;
457-
}
458476

459-
ulen = e-s;
460477
goto malformed_byte;
461478
}
462479

@@ -475,40 +492,56 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
475492
}
476493

477494
/* If we get here there is something wrong with alleged UTF-8 */
495+
/* uv is used only when encoding */
478496
malformed_byte:
479-
uv = (UV)*s;
480-
if (ulen == 0)
497+
if (uv == 0)
498+
uv = (UV)*s;
499+
if (encode || ulen == 0)
481500
ulen = 1;
482501

483502
malformed:
503+
if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
504+
for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
484505
if (check & ENCODE_DIE_ON_ERR){
485506
if (encode)
486-
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
507+
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
487508
else
488-
Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
509+
Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
489510
}
490511
if (check & ENCODE_WARN_ON_ERR){
491512
if (encode)
492513
Perl_warner(aTHX_ packWARN(WARN_UTF8),
493-
ERR_ENCODE_NOMAP, uv, "utf8");
514+
ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
494515
else
495516
Perl_warner(aTHX_ packWARN(WARN_UTF8),
496-
ERR_DECODE_NOMAP, "utf8", uv);
517+
ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
497518
}
498519
if (check & ENCODE_RETURN_ON_ERR) {
499520
break;
500521
}
501522
if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
502-
SV* subchar =
503-
(fallback_cb != &PL_sv_undef)
504-
? do_fallback_cb(aTHX_ uv, fallback_cb)
505-
: newSVpvf(check & ENCODE_PERLQQ
506-
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
507-
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
508-
: "&#x%" UVxf ";", uv);
509-
if (encode){
510-
SvUTF8_off(subchar); /* make sure no decoded string gets in */
511-
}
523+
SV* subchar;
524+
if (encode) {
525+
subchar =
526+
(fallback_cb != &PL_sv_undef)
527+
? do_fallback_cb(aTHX_ uv, fallback_cb)
528+
: newSVpvf(check & ENCODE_PERLQQ
529+
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
530+
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
531+
: "&#x%" UVxf ";", uv);
532+
SvUTF8_off(subchar); /* make sure no decoded string gets in */
533+
} else {
534+
if (fallback_cb != &PL_sv_undef) {
535+
/* in decode mode we have sequence of wrong bytes */
536+
subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb);
537+
} else {
538+
char *ptr = esc;
539+
/* ENCODE_PERLQQ is already stored in esc */
540+
if (check & (ENCODE_HTMLCREF|ENCODE_XMLCREF))
541+
for (i=0; i<ulen; ++i) ptr += sprintf(ptr, ((check & ENCODE_HTMLCREF) ? "&#%u;" : "&#x%02X;"), s[i]);
542+
subchar = newSVpvn(esc, strlen(esc));
543+
}
544+
}
512545
dlen += SvCUR(subchar) - ulen;
513546
SvCUR_set(dst, d-(U8 *)SvPVX(dst));
514547
*SvEND(dst) = '\0';

cpan/Encode/Makefile.PL

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp dankogai $
2+
# $Id: Makefile.PL,v 2.18 2016/11/29 23:29:23 dankogai Exp $
33
#
44
use 5.007003;
55
use strict;

0 commit comments

Comments
 (0)