ruby
diff --git a/‎ext/json/ext/generator/generator.c‎
Lines changed: 98 additions & 263 deletions b/‎ext/json/ext/generator/generator.c‎
Lines changed: 98 additions & 263 deletions
@@ -18,282 +18,122 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
           i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
           i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
 
-/*
- * Copyright 2001-2004 Unicode, Inc.
+/* Converts in_string to a JSON string (without the wrapping '"'
+ * characters) in FBuffer out_buffer.
  *
- * Disclaimer
+ * Character are JSON-escaped according to:
  *
- * This source code is provided as is by Unicode, Inc. No claims are
- * made as to fitness for any particular purpose. No warranties of any
- * kind are expressed or implied. The recipient agrees to determine
- * applicability of information provided. If this file has been
- * purchased on magnetic or optical media from Unicode, Inc., the
- * sole remedy for any claim will be exchange of defective media
- * within 90 days of receipt.
+ * - Always: ASCII control characters (0x00-0x1F), dquote, and
+ *   backslash.
  *
- * Limitations on Rights to Redistribute This Code
+ * - If out_ascii_only: non-ASCII characters (>0x7F)
  *
- * Unicode, Inc. hereby grants the right to freely use the information
- * supplied in this file in the creation of products supporting the
- * Unicode Standard, and to make copies of this file in any form
- * for internal or external distribution as long as this notice
- * remains attached.
- */
-
-/*
- * Index into the table below with the first byte of a UTF-8 sequence to
- * get the number of trailing bytes that are supposed to follow it.
- * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
- * left as-is for anyone who may want to do such conversion, which was
- * allowed in earlier algorithms.
- */
-static const char trailingBytesForUTF8[256] = {
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
-};
-
-/*
- * Magic values subtracted from a buffer value during UTF8 conversion.
- * This table contains as many values as there might be trailing bytes
- * in a UTF-8 sequence.
+ * - If out_script_safe: forwardslash, line separator (U+2028), and
+ *   paragraph separator (U+2029)
+ *
+ * Everything else (should be UTF-8) is just passed through and
+ * appended to the result.
  */
-static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
-    0x03C82080UL, 0xFA082080UL, 0x82082080UL };
-
-/* Escapes the UTF16 character and stores the result in the buffer buf. */
-static void unicode_escape(char *buf, UTF16 character)
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
 {
-    const char *digits = "0123456789abcdef";
+    const char *hexdig = "0123456789abcdef";
+    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
 
-    buf[2] = digits[character >> 12];
-    buf[3] = digits[(character >> 8) & 0xf];
-    buf[4] = digits[(character >> 4) & 0xf];
-    buf[5] = digits[character & 0xf];
-}
+    const char *in_utf8_str = RSTRING_PTR(in_string);
+    unsigned long in_utf8_len = RSTRING_LEN(in_string);
+    bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
 
-/* Escapes the UTF16 character and stores the result in the buffer buf, then
- * the buffer buf is appended to the FBuffer buffer. */
-static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
-        character)
-{
-    unicode_escape(buf, character);
-    fbuffer_append(buffer, buf, 6);
-}
-
-/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
- * and control characters are JSON escaped. */
-static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe)
-{
-    const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
-    const UTF8 *sourceEnd = source + RSTRING_LEN(string);
-    char buf[6] = { '\\', 'u' };
-
-    int ascii_only = rb_enc_str_asciionly_p(string);
+    unsigned long beg = 0, pos;
 
-    if (!ascii_only) {
-        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
-            rb_raise(rb_path2class("JSON::GeneratorError"),
-                    "source sequence is illegal/malformed utf-8");
-        }
-    }
+    for (pos =  0; pos < in_utf8_len;) {
+        uint32_t ch;
+        unsigned long ch_len;
+        bool should_escape;
 
-    while (source < sourceEnd) {
-        UTF32 ch = 0;
-        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-        /*
-         * The cases all fall through. See "Note A" below.
-         */
-        switch (extraBytesToRead) {
-            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
-            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
-            case 3: ch += *source++; ch <<= 6;
-            case 2: ch += *source++; ch <<= 6;
-            case 1: ch += *source++; ch <<= 6;
-            case 0: ch += *source++;
-        }
-        ch -= offsetsFromUTF8[extraBytesToRead];
-
-        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
-            /* UTF-16 surrogate values are illegal in UTF-32 */
-            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
-#if UNI_STRICT_CONVERSION
-                source -= (extraBytesToRead+1); /* return to the illegal value itself */
+        /* UTF-8 decoding */
+        if (in_is_ascii_only) {
+            ch = in_utf8_str[pos];
+            ch_len = 1;
+        } else {
+            unsigned long i;
+            if      ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos];        } /* leading 1 bit is   0b0     */
+            else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110   */
+            else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110  */
+            else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
+            else
                 rb_raise(rb_path2class("JSON::GeneratorError"),
-                        "source sequence is illegal/malformed utf-8");
-#else
-                unicode_escape_to_buffer(buffer, buf, UNI_REPLACEMENT_CHAR);
-#endif
-            } else {
-                /* normal case */
-                if (ch >= 0x20 && ch <= 0x7f) {
-                    switch (ch) {
-                        case '\\':
-                            fbuffer_append(buffer, "\\\\", 2);
-                            break;
-                        case '"':
-                            fbuffer_append(buffer, "\\\"", 2);
-                            break;
-                        case '/':
-                            if(script_safe) {
-                                fbuffer_append(buffer, "\\/", 2);
-                                break;
-                            }
-                        default:
-                            fbuffer_append_char(buffer, (char)ch);
-                            break;
-                    }
-                } else {
-                    switch (ch) {
-                        case '\n':
-                            fbuffer_append(buffer, "\\n", 2);
-                            break;
-                        case '\r':
-                            fbuffer_append(buffer, "\\r", 2);
-                            break;
-                        case '\t':
-                            fbuffer_append(buffer, "\\t", 2);
-                            break;
-                        case '\f':
-                            fbuffer_append(buffer, "\\f", 2);
-                            break;
-                        case '\b':
-                            fbuffer_append(buffer, "\\b", 2);
-                            break;
-                        default:
-                            unicode_escape_to_buffer(buffer, buf, (UTF16) ch);
-                            break;
-                    }
-                }
+                         "source sequence is illegal/malformed utf-8");
+            if ((pos+ch_len) > in_utf8_len)
+                rb_raise(rb_path2class("JSON::GeneratorError"),
+                         "partial character in source, but hit end");
+            for (i = 1; i < ch_len; i++) {
+                if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
+                    rb_raise(rb_path2class("JSON::GeneratorError"),
+                             "source sequence is illegal/malformed utf-8");
+                ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
             }
-        } else if (ch > UNI_MAX_UTF16) {
-#if UNI_STRICT_CONVERSION
-            source -= (extraBytesToRead+1); /* return to the start */
-            rb_raise(rb_path2class("JSON::GeneratorError"),
-                    "source sequence is illegal/malformed utf8");
-#else
-            unicode_escape_to_buffer(buffer, buf, UNI_REPLACEMENT_CHAR);
-#endif
-        } else {
-            /* target is a character in range 0xFFFF - 0x10FFFF. */
-            ch -= halfBase;
-            unicode_escape_to_buffer(buffer, buf, (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START));
-            unicode_escape_to_buffer(buffer, buf, (UTF16)((ch & halfMask) + UNI_SUR_LOW_START));
-        }
-    }
-    RB_GC_GUARD(string);
-}
-
-/* Converts string to a JSON string in FBuffer buffer, where only the
- * characters required by the JSON standard are JSON escaped. The remaining
- * characters (should be UTF8) are just passed through and appended to the
- * result. */
-static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe)
-{
-    const char *ptr = RSTRING_PTR(string), *p;
-    unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
-    const char *escape = NULL;
-    int escape_len;
-    unsigned char c;
-    char buf[6] = { '\\', 'u' };
-    int ascii_only = rb_enc_str_asciionly_p(string);
-
-    if (!ascii_only) {
-        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
-            rb_raise(rb_path2class("JSON::GeneratorError"),
-                    "source sequence is illegal/malformed utf-8");
+            if (ch > 0x10FFFF)
+                rb_raise(rb_path2class("JSON::GeneratorError"),
+                         "source sequence is illegal/malformed utf-8");
         }
-    }
 
-    for (start = 0, end = 0; end < len;) {
-        p = ptr + end;
-        c = (unsigned char) *p;
-        if (c < 0x20) {
-            switch (c) {
-                case '\n':
-                    escape = "\\n";
-                    escape_len = 2;
-                    break;
-                case '\r':
-                    escape = "\\r";
-                    escape_len = 2;
-                    break;
-                case '\t':
-                    escape = "\\t";
-                    escape_len = 2;
-                    break;
-                case '\f':
-                    escape = "\\f";
-                    escape_len = 2;
-                    break;
-                case '\b':
-                    escape = "\\b";
-                    escape_len = 2;
-                    break;
-                default:
-                    unicode_escape(buf, (UTF16) *p);
-                    escape = buf;
-                    escape_len = 6;
-                    break;
-            }
-        } else {
-            switch (c) {
-                case '\\':
-                    escape = "\\\\";
-                    escape_len = 2;
-                    break;
-                case '"':
-                    escape =  "\\\"";
-                    escape_len = 2;
-                    break;
-                case '/':
-                    if(script_safe) {
-                        escape = "\\/";
-                        escape_len = 2;
-                        break;
-                    }
+        /* JSON policy */
+        should_escape =
+            (ch < 0x20) ||
+            (ch == '"') ||
+            (ch == '\\') ||
+            (out_ascii_only && (ch > 0x7F)) ||
+            (out_script_safe && (ch == '/')) ||
+            (out_script_safe && (ch == 0x2028)) ||
+            (out_script_safe && (ch == 0x2029));
+
+        /* JSON encoding */
+        if (should_escape) {
+            if (pos > beg)
+                fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
+            beg = pos + ch_len;
+            switch (ch) {
+                case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
                 default:
-                    {
-                        unsigned short clen = 1;
-                        if (!ascii_only) {
-                            clen += trailingBytesForUTF8[c];
-                            if (end + clen > len) {
-                                rb_raise(rb_path2class("JSON::GeneratorError"),
-                                        "partial character in source, but hit end");
-                            }
-
-                            if (script_safe && c == 0xE2) {
-                                unsigned char c2 = (unsigned char) *(p+1);
-                                unsigned char c3 = (unsigned char) *(p+2);
-                                if (c2 == 0x80 && (c3 == 0xA8 || c3 == 0xA9)) {
-                                    fbuffer_append(buffer, ptr + start, end - start);
-                                    start = end = (end + clen);
-                                    if (c3 == 0xA8) {
-                                        fbuffer_append(buffer, "\\u2028", 6);
-                                    } else {
-                                        fbuffer_append(buffer, "\\u2029", 6);
-                                    }
-                                    continue;
-                                }
-                            }
-                        }
-                        end += clen;
+                    if (ch <= 0xFFFF) {
+                        scratch[2] = hexdig[ch >> 12];
+                        scratch[3] = hexdig[(ch >> 8) & 0xf];
+                        scratch[4] = hexdig[(ch >> 4) & 0xf];
+                        scratch[5] = hexdig[ch & 0xf];
+                        fbuffer_append(out_buffer, scratch, 6);
+                    } else {
+			uint16_t hi, lo;
+                        ch -= 0x10000;
+                        hi = 0xD800 + (uint16_t)(ch >> 10);
+                        lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
+
+                        scratch[2] = hexdig[hi >> 12];
+                        scratch[3] = hexdig[(hi >> 8) & 0xf];
+                        scratch[4] = hexdig[(hi >> 4) & 0xf];
+                        scratch[5] = hexdig[hi & 0xf];
+
+                        scratch[8] = hexdig[lo >> 12];
+                        scratch[9] = hexdig[(lo >> 8) & 0xf];
+                        scratch[10] = hexdig[(lo >> 4) & 0xf];
+                        scratch[11] = hexdig[lo & 0xf];
+
+                        fbuffer_append(out_buffer, scratch, 12);
                     }
-                    continue;
-                    break;
             }
         }
-        fbuffer_append(buffer, ptr + start, end - start);
-        fbuffer_append(buffer, escape, escape_len);
-        start = ++end;
-        escape = NULL;
+
+        pos += ch_len;
     }
-    fbuffer_append(buffer, ptr + start, end - start);
+    if (beg < in_utf8_len)
+        fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
+    RB_GC_GUARD(in_string);
 }
 
 static char *fstrndup(const char *ptr, unsigned long len) {
@@ -930,12 +770,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
     if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
         obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
     }
-
-    if (state->ascii_only) {
-        convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
-    } else {
-        convert_UTF8_to_JSON(buffer, obj, state->script_safe);
-    }
+    convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
     fbuffer_append_char(buffer, '"');
 }