@@ -18,282 +18,122 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
1818 i_aref , i_send , i_respond_to_p , i_match , i_keys , i_depth ,
1919 i_buffer_initial_length , i_dup , i_script_safe , i_escape_slash , i_strict ;
2020
21- /*
22- * Copyright 2001-2004 Unicode, Inc .
21+ /* Converts in_string to a JSON string (without the wrapping '"'
22+ * characters) in FBuffer out_buffer .
2323 *
24- * Disclaimer
24+ * Characters are JSON-escaped according to:
2525 *
26- * This source code is provided as is by Unicode, Inc. No claims are
27- * made as to fitness for any particular purpose. No warranties of any
28- * kind are expressed or implied. The recipient agrees to determine
29- * applicability of information provided. If this file has been
30- * purchased on magnetic or optical media from Unicode, Inc., the
31- * sole remedy for any claim will be exchange of defective media
32- * within 90 days of receipt.
26+ * - Always: ASCII control characters (0x00-0x1F), dquote, and
27+ * backslash.
3328 *
34- * Limitations on Rights to Redistribute This Code
29+ * - If out_ascii_only: non-ASCII characters (>0x7F)
3530 *
36- * Unicode, Inc. hereby grants the right to freely use the information
37- * supplied in this file in the creation of products supporting the
38- * Unicode Standard, and to make copies of this file in any form
39- * for internal or external distribution as long as this notice
40- * remains attached.
41- */
42-
43- /*
44- * Index into the table below with the first byte of a UTF-8 sequence to
45- * get the number of trailing bytes that are supposed to follow it.
46- * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
47- * left as-is for anyone who may want to do such conversion, which was
48- * allowed in earlier algorithms.
49- */
50- static const char trailingBytesForUTF8 [256 ] = {
51- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
52- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
53- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
54- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
55- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
56- 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 , 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,
57- 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,
58- 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , 3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,4 ,4 ,4 ,4 ,5 ,5 ,5 ,5
59- };
60-
61- /*
62- * Magic values subtracted from a buffer value during UTF8 conversion.
63- * This table contains as many values as there might be trailing bytes
64- * in a UTF-8 sequence.
31+ * - If out_script_safe: forward slash, line separator (U+2028), and
32+ * paragraph separator (U+2029)
33+ *
34+ * Everything else (should be UTF-8) is just passed through and
35+ * appended to the result.
6536 */
66- static const UTF32 offsetsFromUTF8 [6 ] = { 0x00000000UL , 0x00003080UL , 0x000E2080UL ,
67- 0x03C82080UL , 0xFA082080UL , 0x82082080UL };
68-
69- /* Escapes the UTF16 character and stores the result in the buffer buf. */
70- static void unicode_escape (char * buf , UTF16 character )
37+ static void convert_UTF8_to_JSON (FBuffer * out_buffer , VALUE in_string , bool out_ascii_only , bool out_script_safe )
7138{
72- const char * digits = "0123456789abcdef" ;
39+ const char * hexdig = "0123456789abcdef" ;
40+ char scratch [12 ] = { '\\' , 'u' , 0 , 0 , 0 , 0 , '\\' , 'u' };
7341
74- buf [2 ] = digits [character >> 12 ];
75- buf [3 ] = digits [(character >> 8 ) & 0xf ];
76- buf [4 ] = digits [(character >> 4 ) & 0xf ];
77- buf [5 ] = digits [character & 0xf ];
78- }
42+ const char * in_utf8_str = RSTRING_PTR (in_string );
43+ unsigned long in_utf8_len = RSTRING_LEN (in_string );
44+ bool in_is_ascii_only = rb_enc_str_asciionly_p (in_string );
7945
80- /* Escapes the UTF16 character and stores the result in the buffer buf, then
81- * the buffer buf is appended to the FBuffer buffer. */
82- static void unicode_escape_to_buffer (FBuffer * buffer , char buf [6 ], UTF16
83- character )
84- {
85- unicode_escape (buf , character );
86- fbuffer_append (buffer , buf , 6 );
87- }
88-
89- /* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
90- * and control characters are JSON escaped. */
91- static void convert_UTF8_to_JSON_ASCII (FBuffer * buffer , VALUE string , char script_safe )
92- {
93- const UTF8 * source = (UTF8 * ) RSTRING_PTR (string );
94- const UTF8 * sourceEnd = source + RSTRING_LEN (string );
95- char buf [6 ] = { '\\' , 'u' };
96-
97- int ascii_only = rb_enc_str_asciionly_p (string );
46+ unsigned long beg = 0 , pos ;
9847
99- if (!ascii_only ) {
100- if (RB_ENCODING_GET_INLINED (string ) != rb_utf8_encindex () || RB_ENC_CODERANGE (string ) != RUBY_ENC_CODERANGE_VALID ) {
101- rb_raise (rb_path2class ("JSON::GeneratorError" ),
102- "source sequence is illegal/malformed utf-8" );
103- }
104- }
48+ for (pos = 0 ; pos < in_utf8_len ;) {
49+ uint32_t ch ;
50+ unsigned long ch_len ;
51+ bool should_escape ;
10552
106- while (source < sourceEnd ) {
107- UTF32 ch = 0 ;
108- unsigned short extraBytesToRead = trailingBytesForUTF8 [* source ];
109- /*
110- * The cases all fall through. See "Note A" below.
111- */
112- switch (extraBytesToRead ) {
113- case 5 : ch += * source ++ ; ch <<= 6 ; /* remember, illegal UTF-8 */
114- case 4 : ch += * source ++ ; ch <<= 6 ; /* remember, illegal UTF-8 */
115- case 3 : ch += * source ++ ; ch <<= 6 ;
116- case 2 : ch += * source ++ ; ch <<= 6 ;
117- case 1 : ch += * source ++ ; ch <<= 6 ;
118- case 0 : ch += * source ++ ;
119- }
120- ch -= offsetsFromUTF8 [extraBytesToRead ];
121-
122- if (ch <= UNI_MAX_BMP ) { /* Target is a character <= 0xFFFF */
123- /* UTF-16 surrogate values are illegal in UTF-32 */
124- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END ) {
125- #if UNI_STRICT_CONVERSION
126- source -= (extraBytesToRead + 1 ); /* return to the illegal value itself */
53+ /* UTF-8 decoding */
54+ if (in_is_ascii_only ) {
55+ ch = in_utf8_str [pos ];
56+ ch_len = 1 ;
57+ } else {
58+ unsigned long i ;
59+ if ((in_utf8_str [pos ] & 0x80 ) == 0x00 ) { ch_len = 1 ; ch = in_utf8_str [pos ]; } /* leading 1 bit is 0b0 */
60+ else if ((in_utf8_str [pos ] & 0xE0 ) == 0xC0 ) { ch_len = 2 ; ch = in_utf8_str [pos ] & 0x1F ; } /* leading 3 bits are 0b110 */
61+ else if ((in_utf8_str [pos ] & 0xF0 ) == 0xE0 ) { ch_len = 3 ; ch = in_utf8_str [pos ] & 0x0F ; } /* leading 4 bits are 0b1110 */
62+ else if ((in_utf8_str [pos ] & 0xF8 ) == 0xF0 ) { ch_len = 4 ; ch = in_utf8_str [pos ] & 0x07 ; } /* leading 5 bits are 0b11110 */
63+ else
12764 rb_raise (rb_path2class ("JSON::GeneratorError" ),
128- "source sequence is illegal/malformed utf-8" );
129- #else
130- unicode_escape_to_buffer (buffer , buf , UNI_REPLACEMENT_CHAR );
131- #endif
132- } else {
133- /* normal case */
134- if (ch >= 0x20 && ch <= 0x7f ) {
135- switch (ch ) {
136- case '\\' :
137- fbuffer_append (buffer , "\\\\" , 2 );
138- break ;
139- case '"' :
140- fbuffer_append (buffer , "\\\"" , 2 );
141- break ;
142- case '/' :
143- if (script_safe ) {
144- fbuffer_append (buffer , "\\/" , 2 );
145- break ;
146- }
147- default :
148- fbuffer_append_char (buffer , (char )ch );
149- break ;
150- }
151- } else {
152- switch (ch ) {
153- case '\n' :
154- fbuffer_append (buffer , "\\n" , 2 );
155- break ;
156- case '\r' :
157- fbuffer_append (buffer , "\\r" , 2 );
158- break ;
159- case '\t' :
160- fbuffer_append (buffer , "\\t" , 2 );
161- break ;
162- case '\f' :
163- fbuffer_append (buffer , "\\f" , 2 );
164- break ;
165- case '\b' :
166- fbuffer_append (buffer , "\\b" , 2 );
167- break ;
168- default :
169- unicode_escape_to_buffer (buffer , buf , (UTF16 ) ch );
170- break ;
171- }
172- }
65+ "source sequence is illegal/malformed utf-8" );
66+ if ((pos + ch_len ) > in_utf8_len )
67+ rb_raise (rb_path2class ("JSON::GeneratorError" ),
68+ "partial character in source, but hit end" );
69+ for (i = 1 ; i < ch_len ; i ++ ) {
70+ if ((in_utf8_str [pos + i ] & 0xC0 ) != 0x80 ) /* leading 2 bits should be 0b10 */
71+ rb_raise (rb_path2class ("JSON::GeneratorError" ),
72+ "source sequence is illegal/malformed utf-8" );
73+ ch = (ch <<6 ) | (in_utf8_str [pos + i ] & 0x3F );
17374 }
174- } else if (ch > UNI_MAX_UTF16 ) {
175- #if UNI_STRICT_CONVERSION
176- source -= (extraBytesToRead + 1 ); /* return to the start */
177- rb_raise (rb_path2class ("JSON::GeneratorError" ),
178- "source sequence is illegal/malformed utf8" );
179- #else
180- unicode_escape_to_buffer (buffer , buf , UNI_REPLACEMENT_CHAR );
181- #endif
182- } else {
183- /* target is a character in range 0xFFFF - 0x10FFFF. */
184- ch -= halfBase ;
185- unicode_escape_to_buffer (buffer , buf , (UTF16 )((ch >> halfShift ) + UNI_SUR_HIGH_START ));
186- unicode_escape_to_buffer (buffer , buf , (UTF16 )((ch & halfMask ) + UNI_SUR_LOW_START ));
187- }
188- }
189- RB_GC_GUARD (string );
190- }
191-
192- /* Converts string to a JSON string in FBuffer buffer, where only the
193- * characters required by the JSON standard are JSON escaped. The remaining
194- * characters (should be UTF8) are just passed through and appended to the
195- * result. */
196- static void convert_UTF8_to_JSON (FBuffer * buffer , VALUE string , char script_safe )
197- {
198- const char * ptr = RSTRING_PTR (string ), * p ;
199- unsigned long len = RSTRING_LEN (string ), start = 0 , end = 0 ;
200- const char * escape = NULL ;
201- int escape_len ;
202- unsigned char c ;
203- char buf [6 ] = { '\\' , 'u' };
204- int ascii_only = rb_enc_str_asciionly_p (string );
205-
206- if (!ascii_only ) {
207- if (RB_ENCODING_GET_INLINED (string ) != rb_utf8_encindex () || RB_ENC_CODERANGE (string ) != RUBY_ENC_CODERANGE_VALID ) {
208- rb_raise (rb_path2class ("JSON::GeneratorError" ),
209- "source sequence is illegal/malformed utf-8" );
75+ if (ch > 0x10FFFF )
76+ rb_raise (rb_path2class ("JSON::GeneratorError" ),
77+ "source sequence is illegal/malformed utf-8" );
21078 }
211- }
21279
213- for (start = 0 , end = 0 ; end < len ;) {
214- p = ptr + end ;
215- c = (unsigned char ) * p ;
216- if (c < 0x20 ) {
217- switch (c ) {
218- case '\n' :
219- escape = "\\n" ;
220- escape_len = 2 ;
221- break ;
222- case '\r' :
223- escape = "\\r" ;
224- escape_len = 2 ;
225- break ;
226- case '\t' :
227- escape = "\\t" ;
228- escape_len = 2 ;
229- break ;
230- case '\f' :
231- escape = "\\f" ;
232- escape_len = 2 ;
233- break ;
234- case '\b' :
235- escape = "\\b" ;
236- escape_len = 2 ;
237- break ;
238- default :
239- unicode_escape (buf , (UTF16 ) * p );
240- escape = buf ;
241- escape_len = 6 ;
242- break ;
243- }
244- } else {
245- switch (c ) {
246- case '\\' :
247- escape = "\\\\" ;
248- escape_len = 2 ;
249- break ;
250- case '"' :
251- escape = "\\\"" ;
252- escape_len = 2 ;
253- break ;
254- case '/' :
255- if (script_safe ) {
256- escape = "\\/" ;
257- escape_len = 2 ;
258- break ;
259- }
80+ /* JSON policy */
81+ should_escape =
82+ (ch < 0x20 ) ||
83+ (ch == '"' ) ||
84+ (ch == '\\' ) ||
85+ (out_ascii_only && (ch > 0x7F )) ||
86+ (out_script_safe && (ch == '/' )) ||
87+ (out_script_safe && (ch == 0x2028 )) ||
88+ (out_script_safe && (ch == 0x2029 ));
89+
90+ /* JSON encoding */
91+ if (should_escape ) {
92+ if (pos > beg )
93+ fbuffer_append (out_buffer , & in_utf8_str [beg ], pos - beg );
94+ beg = pos + ch_len ;
95+ switch (ch ) {
96+ case '"' : fbuffer_append (out_buffer , "\\\"" , 2 ); break ;
97+ case '\\' : fbuffer_append (out_buffer , "\\\\" , 2 ); break ;
98+ case '/' : fbuffer_append (out_buffer , "\\/" , 2 ); break ;
99+ case '\b' : fbuffer_append (out_buffer , "\\b" , 2 ); break ;
100+ case '\f' : fbuffer_append (out_buffer , "\\f" , 2 ); break ;
101+ case '\n' : fbuffer_append (out_buffer , "\\n" , 2 ); break ;
102+ case '\r' : fbuffer_append (out_buffer , "\\r" , 2 ); break ;
103+ case '\t' : fbuffer_append (out_buffer , "\\t" , 2 ); break ;
260104 default :
261- {
262- unsigned short clen = 1 ;
263- if (!ascii_only ) {
264- clen += trailingBytesForUTF8 [c ];
265- if (end + clen > len ) {
266- rb_raise (rb_path2class ("JSON::GeneratorError" ),
267- "partial character in source, but hit end" );
268- }
269-
270- if (script_safe && c == 0xE2 ) {
271- unsigned char c2 = (unsigned char ) * (p + 1 );
272- unsigned char c3 = (unsigned char ) * (p + 2 );
273- if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9 )) {
274- fbuffer_append (buffer , ptr + start , end - start );
275- start = end = (end + clen );
276- if (c3 == 0xA8 ) {
277- fbuffer_append (buffer , "\\u2028" , 6 );
278- } else {
279- fbuffer_append (buffer , "\\u2029" , 6 );
280- }
281- continue ;
282- }
283- }
284- }
285- end += clen ;
105+ if (ch <= 0xFFFF ) {
106+ scratch [2 ] = hexdig [ch >> 12 ];
107+ scratch [3 ] = hexdig [(ch >> 8 ) & 0xf ];
108+ scratch [4 ] = hexdig [(ch >> 4 ) & 0xf ];
109+ scratch [5 ] = hexdig [ch & 0xf ];
110+ fbuffer_append (out_buffer , scratch , 6 );
111+ } else {
112+ uint16_t hi , lo ;
113+ ch -= 0x10000 ;
114+ hi = 0xD800 + (uint16_t )(ch >> 10 );
115+ lo = 0xDC00 + (uint16_t )(ch & 0x3FF );
116+
117+ scratch [2 ] = hexdig [hi >> 12 ];
118+ scratch [3 ] = hexdig [(hi >> 8 ) & 0xf ];
119+ scratch [4 ] = hexdig [(hi >> 4 ) & 0xf ];
120+ scratch [5 ] = hexdig [hi & 0xf ];
121+
122+ scratch [8 ] = hexdig [lo >> 12 ];
123+ scratch [9 ] = hexdig [(lo >> 8 ) & 0xf ];
124+ scratch [10 ] = hexdig [(lo >> 4 ) & 0xf ];
125+ scratch [11 ] = hexdig [lo & 0xf ];
126+
127+ fbuffer_append (out_buffer , scratch , 12 );
286128 }
287- continue ;
288- break ;
289129 }
290130 }
291- fbuffer_append (buffer , ptr + start , end - start );
292- fbuffer_append (buffer , escape , escape_len );
293- start = ++ end ;
294- escape = NULL ;
131+
132+ pos += ch_len ;
295133 }
296- fbuffer_append (buffer , ptr + start , end - start );
134+ if (beg < in_utf8_len )
135+ fbuffer_append (out_buffer , & in_utf8_str [beg ], in_utf8_len - beg );
136+ RB_GC_GUARD (in_string );
297137}
298138
299139static char * fstrndup (const char * ptr , unsigned long len ) {
@@ -930,12 +770,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
930770 if (!enc_utf8_compatible_p (rb_enc_get (obj ))) {
931771 obj = rb_str_export_to_enc (obj , rb_utf8_encoding ());
932772 }
933-
934- if (state -> ascii_only ) {
935- convert_UTF8_to_JSON_ASCII (buffer , obj , state -> script_safe );
936- } else {
937- convert_UTF8_to_JSON (buffer , obj , state -> script_safe );
938- }
773+ convert_UTF8_to_JSON (buffer , obj , state -> ascii_only , state -> script_safe );
939774 fbuffer_append_char (buffer , '"' );
940775}
941776
0 commit comments