Skip to content

Commit c57d33e

Browse files
committed
Merge branch 'lukeshu/no-cvtutf'
Fix: #567 Fix: #277
2 parents 98ca195 + ff8edcd commit c57d33e

File tree

5 files changed

+229
-399
lines changed

5 files changed

+229
-399
lines changed

ext/json/ext/generator/generator.c

Lines changed: 98 additions & 263 deletions
Original file line numberDiff line numberDiff line change
@@ -18,282 +18,122 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
1818
i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
1919
i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
2020

21-
/*
22-
* Copyright 2001-2004 Unicode, Inc.
21+
/* Converts in_string to a JSON string (without the wrapping '"'
22+
* characters) in FBuffer out_buffer.
2323
*
24-
* Disclaimer
24+
* Characters are JSON-escaped according to:
2525
*
26-
* This source code is provided as is by Unicode, Inc. No claims are
27-
* made as to fitness for any particular purpose. No warranties of any
28-
* kind are expressed or implied. The recipient agrees to determine
29-
* applicability of information provided. If this file has been
30-
* purchased on magnetic or optical media from Unicode, Inc., the
31-
* sole remedy for any claim will be exchange of defective media
32-
* within 90 days of receipt.
26+
* - Always: ASCII control characters (0x00-0x1F), dquote, and
27+
* backslash.
3328
*
34-
* Limitations on Rights to Redistribute This Code
29+
* - If out_ascii_only: non-ASCII characters (>0x7F)
3530
*
36-
* Unicode, Inc. hereby grants the right to freely use the information
37-
* supplied in this file in the creation of products supporting the
38-
* Unicode Standard, and to make copies of this file in any form
39-
* for internal or external distribution as long as this notice
40-
* remains attached.
41-
*/
42-
43-
/*
44-
* Index into the table below with the first byte of a UTF-8 sequence to
45-
* get the number of trailing bytes that are supposed to follow it.
46-
* Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
47-
* left as-is for anyone who may want to do such conversion, which was
48-
* allowed in earlier algorithms.
49-
*/
50-
static const char trailingBytesForUTF8[256] = {
51-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
58-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
59-
};
60-
61-
/*
62-
* Magic values subtracted from a buffer value during UTF8 conversion.
63-
* This table contains as many values as there might be trailing bytes
64-
* in a UTF-8 sequence.
31+
* - If out_script_safe: forward slash, line separator (U+2028), and
32+
* paragraph separator (U+2029)
33+
*
34+
* Everything else (should be UTF-8) is just passed through and
35+
* appended to the result.
6536
*/
66-
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
67-
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
68-
69-
/* Escapes the UTF16 character and stores the result in the buffer buf. */
70-
static void unicode_escape(char *buf, UTF16 character)
37+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
7138
{
72-
const char *digits = "0123456789abcdef";
39+
const char *hexdig = "0123456789abcdef";
40+
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
7341

74-
buf[2] = digits[character >> 12];
75-
buf[3] = digits[(character >> 8) & 0xf];
76-
buf[4] = digits[(character >> 4) & 0xf];
77-
buf[5] = digits[character & 0xf];
78-
}
42+
const char *in_utf8_str = RSTRING_PTR(in_string);
43+
unsigned long in_utf8_len = RSTRING_LEN(in_string);
44+
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
7945

80-
/* Escapes the UTF16 character and stores the result in the buffer buf, then
81-
* the buffer buf is appended to the FBuffer buffer. */
82-
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
83-
character)
84-
{
85-
unicode_escape(buf, character);
86-
fbuffer_append(buffer, buf, 6);
87-
}
88-
89-
/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
90-
* and control characters are JSON escaped. */
91-
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe)
92-
{
93-
const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
94-
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
95-
char buf[6] = { '\\', 'u' };
96-
97-
int ascii_only = rb_enc_str_asciionly_p(string);
46+
unsigned long beg = 0, pos;
9847

99-
if (!ascii_only) {
100-
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
101-
rb_raise(rb_path2class("JSON::GeneratorError"),
102-
"source sequence is illegal/malformed utf-8");
103-
}
104-
}
48+
for (pos = 0; pos < in_utf8_len;) {
49+
uint32_t ch;
50+
unsigned long ch_len;
51+
bool should_escape;
10552

106-
while (source < sourceEnd) {
107-
UTF32 ch = 0;
108-
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
109-
/*
110-
* The cases all fall through. See "Note A" below.
111-
*/
112-
switch (extraBytesToRead) {
113-
case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
114-
case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
115-
case 3: ch += *source++; ch <<= 6;
116-
case 2: ch += *source++; ch <<= 6;
117-
case 1: ch += *source++; ch <<= 6;
118-
case 0: ch += *source++;
119-
}
120-
ch -= offsetsFromUTF8[extraBytesToRead];
121-
122-
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
123-
/* UTF-16 surrogate values are illegal in UTF-32 */
124-
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
125-
#if UNI_STRICT_CONVERSION
126-
source -= (extraBytesToRead+1); /* return to the illegal value itself */
53+
/* UTF-8 decoding */
54+
if (in_is_ascii_only) {
55+
ch = in_utf8_str[pos];
56+
ch_len = 1;
57+
} else {
58+
unsigned long i;
59+
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
60+
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
61+
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
62+
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
63+
else
12764
rb_raise(rb_path2class("JSON::GeneratorError"),
128-
"source sequence is illegal/malformed utf-8");
129-
#else
130-
unicode_escape_to_buffer(buffer, buf, UNI_REPLACEMENT_CHAR);
131-
#endif
132-
} else {
133-
/* normal case */
134-
if (ch >= 0x20 && ch <= 0x7f) {
135-
switch (ch) {
136-
case '\\':
137-
fbuffer_append(buffer, "\\\\", 2);
138-
break;
139-
case '"':
140-
fbuffer_append(buffer, "\\\"", 2);
141-
break;
142-
case '/':
143-
if(script_safe) {
144-
fbuffer_append(buffer, "\\/", 2);
145-
break;
146-
}
147-
default:
148-
fbuffer_append_char(buffer, (char)ch);
149-
break;
150-
}
151-
} else {
152-
switch (ch) {
153-
case '\n':
154-
fbuffer_append(buffer, "\\n", 2);
155-
break;
156-
case '\r':
157-
fbuffer_append(buffer, "\\r", 2);
158-
break;
159-
case '\t':
160-
fbuffer_append(buffer, "\\t", 2);
161-
break;
162-
case '\f':
163-
fbuffer_append(buffer, "\\f", 2);
164-
break;
165-
case '\b':
166-
fbuffer_append(buffer, "\\b", 2);
167-
break;
168-
default:
169-
unicode_escape_to_buffer(buffer, buf, (UTF16) ch);
170-
break;
171-
}
172-
}
65+
"source sequence is illegal/malformed utf-8");
66+
if ((pos+ch_len) > in_utf8_len)
67+
rb_raise(rb_path2class("JSON::GeneratorError"),
68+
"partial character in source, but hit end");
69+
for (i = 1; i < ch_len; i++) {
70+
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
71+
rb_raise(rb_path2class("JSON::GeneratorError"),
72+
"source sequence is illegal/malformed utf-8");
73+
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
17374
}
174-
} else if (ch > UNI_MAX_UTF16) {
175-
#if UNI_STRICT_CONVERSION
176-
source -= (extraBytesToRead+1); /* return to the start */
177-
rb_raise(rb_path2class("JSON::GeneratorError"),
178-
"source sequence is illegal/malformed utf8");
179-
#else
180-
unicode_escape_to_buffer(buffer, buf, UNI_REPLACEMENT_CHAR);
181-
#endif
182-
} else {
183-
/* target is a character in range 0xFFFF - 0x10FFFF. */
184-
ch -= halfBase;
185-
unicode_escape_to_buffer(buffer, buf, (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
186-
unicode_escape_to_buffer(buffer, buf, (UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
187-
}
188-
}
189-
RB_GC_GUARD(string);
190-
}
191-
192-
/* Converts string to a JSON string in FBuffer buffer, where only the
193-
* characters required by the JSON standard are JSON escaped. The remaining
194-
* characters (should be UTF8) are just passed through and appended to the
195-
* result. */
196-
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe)
197-
{
198-
const char *ptr = RSTRING_PTR(string), *p;
199-
unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
200-
const char *escape = NULL;
201-
int escape_len;
202-
unsigned char c;
203-
char buf[6] = { '\\', 'u' };
204-
int ascii_only = rb_enc_str_asciionly_p(string);
205-
206-
if (!ascii_only) {
207-
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
208-
rb_raise(rb_path2class("JSON::GeneratorError"),
209-
"source sequence is illegal/malformed utf-8");
75+
if (ch > 0x10FFFF)
76+
rb_raise(rb_path2class("JSON::GeneratorError"),
77+
"source sequence is illegal/malformed utf-8");
21078
}
211-
}
21279

213-
for (start = 0, end = 0; end < len;) {
214-
p = ptr + end;
215-
c = (unsigned char) *p;
216-
if (c < 0x20) {
217-
switch (c) {
218-
case '\n':
219-
escape = "\\n";
220-
escape_len = 2;
221-
break;
222-
case '\r':
223-
escape = "\\r";
224-
escape_len = 2;
225-
break;
226-
case '\t':
227-
escape = "\\t";
228-
escape_len = 2;
229-
break;
230-
case '\f':
231-
escape = "\\f";
232-
escape_len = 2;
233-
break;
234-
case '\b':
235-
escape = "\\b";
236-
escape_len = 2;
237-
break;
238-
default:
239-
unicode_escape(buf, (UTF16) *p);
240-
escape = buf;
241-
escape_len = 6;
242-
break;
243-
}
244-
} else {
245-
switch (c) {
246-
case '\\':
247-
escape = "\\\\";
248-
escape_len = 2;
249-
break;
250-
case '"':
251-
escape = "\\\"";
252-
escape_len = 2;
253-
break;
254-
case '/':
255-
if(script_safe) {
256-
escape = "\\/";
257-
escape_len = 2;
258-
break;
259-
}
80+
/* JSON policy */
81+
should_escape =
82+
(ch < 0x20) ||
83+
(ch == '"') ||
84+
(ch == '\\') ||
85+
(out_ascii_only && (ch > 0x7F)) ||
86+
(out_script_safe && (ch == '/')) ||
87+
(out_script_safe && (ch == 0x2028)) ||
88+
(out_script_safe && (ch == 0x2029));
89+
90+
/* JSON encoding */
91+
if (should_escape) {
92+
if (pos > beg)
93+
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
94+
beg = pos + ch_len;
95+
switch (ch) {
96+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
97+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
98+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
99+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
100+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
101+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
102+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
103+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
260104
default:
261-
{
262-
unsigned short clen = 1;
263-
if (!ascii_only) {
264-
clen += trailingBytesForUTF8[c];
265-
if (end + clen > len) {
266-
rb_raise(rb_path2class("JSON::GeneratorError"),
267-
"partial character in source, but hit end");
268-
}
269-
270-
if (script_safe && c == 0xE2) {
271-
unsigned char c2 = (unsigned char) *(p+1);
272-
unsigned char c3 = (unsigned char) *(p+2);
273-
if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9)) {
274-
fbuffer_append(buffer, ptr + start, end - start);
275-
start = end = (end + clen);
276-
if (c3 == 0xA8) {
277-
fbuffer_append(buffer, "\\u2028", 6);
278-
} else {
279-
fbuffer_append(buffer, "\\u2029", 6);
280-
}
281-
continue;
282-
}
283-
}
284-
}
285-
end += clen;
105+
if (ch <= 0xFFFF) {
106+
scratch[2] = hexdig[ch >> 12];
107+
scratch[3] = hexdig[(ch >> 8) & 0xf];
108+
scratch[4] = hexdig[(ch >> 4) & 0xf];
109+
scratch[5] = hexdig[ch & 0xf];
110+
fbuffer_append(out_buffer, scratch, 6);
111+
} else {
112+
uint16_t hi, lo;
113+
ch -= 0x10000;
114+
hi = 0xD800 + (uint16_t)(ch >> 10);
115+
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
116+
117+
scratch[2] = hexdig[hi >> 12];
118+
scratch[3] = hexdig[(hi >> 8) & 0xf];
119+
scratch[4] = hexdig[(hi >> 4) & 0xf];
120+
scratch[5] = hexdig[hi & 0xf];
121+
122+
scratch[8] = hexdig[lo >> 12];
123+
scratch[9] = hexdig[(lo >> 8) & 0xf];
124+
scratch[10] = hexdig[(lo >> 4) & 0xf];
125+
scratch[11] = hexdig[lo & 0xf];
126+
127+
fbuffer_append(out_buffer, scratch, 12);
286128
}
287-
continue;
288-
break;
289129
}
290130
}
291-
fbuffer_append(buffer, ptr + start, end - start);
292-
fbuffer_append(buffer, escape, escape_len);
293-
start = ++end;
294-
escape = NULL;
131+
132+
pos += ch_len;
295133
}
296-
fbuffer_append(buffer, ptr + start, end - start);
134+
if (beg < in_utf8_len)
135+
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
136+
RB_GC_GUARD(in_string);
297137
}
298138

299139
static char *fstrndup(const char *ptr, unsigned long len) {
@@ -930,12 +770,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
930770
if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
931771
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
932772
}
933-
934-
if (state->ascii_only) {
935-
convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
936-
} else {
937-
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
938-
}
773+
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
939774
fbuffer_append_char(buffer, '"');
940775
}
941776

0 commit comments

Comments
 (0)