diff --git a/Cargo.toml b/Cargo.toml index 8252dbd..1d4c65e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ simd-json = "0.15.0" mockalloc = "0.1.2" criterion = "0.5.1" proptest = "1.7" +json-five = "0.3.0" [features] default = ["databend", "preserve_order", "arbitrary_precision"] diff --git a/src/core/databend/de.rs b/src/core/databend/de.rs index 798b409..1ebabb8 100644 --- a/src/core/databend/de.rs +++ b/src/core/databend/de.rs @@ -147,7 +147,7 @@ impl<'de> Deserializer<'de> { Ok(Cow::Borrowed(s)) } - fn read_payload_extension(&mut self, length: usize) -> Result { + fn read_payload_extension(&mut self, length: usize) -> Result> { let start = self.index; let end = self.index + length; let val = ExtensionValue::decode(&self.raw.data[start..end])?; diff --git a/src/core/databend/util.rs b/src/core/databend/util.rs index bf441eb..fc63d8d 100644 --- a/src/core/databend/util.rs +++ b/src/core/databend/util.rs @@ -439,7 +439,7 @@ impl ExtensionValue<'_> { } #[inline] - pub(crate) fn decode(bytes: &[u8]) -> Result { + pub(crate) fn decode(bytes: &[u8]) -> Result> { let mut len = bytes.len(); assert!(len > 0); len -= 1; diff --git a/src/error.rs b/src/error.rs index ce5f326..e1ced0c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -36,6 +36,8 @@ pub enum ParseErrorCode { InvalidSurrogateInHexEscape(u16), UnexpectedEndOfHexEscape, ObjectDuplicateKey(String), + ObjectKeyInvalidNumber, + ObjectKeyInvalidCharacter, } pub type Result = std::result::Result; @@ -72,6 +74,12 @@ impl Display for ParseErrorCode { ParseErrorCode::ObjectDuplicateKey(key) => { write!(f, "duplicate object attribute \"{}\"", key) } + ParseErrorCode::ObjectKeyInvalidNumber => { + f.write_str("object attribute name cannot be a number") + } + ParseErrorCode::ObjectKeyInvalidCharacter => { + f.write_str("object attribute name cannot be invalid character") + } } } } diff --git a/src/functions/scalar.rs b/src/functions/scalar.rs index a01109f..71afd15 100644 --- a/src/functions/scalar.rs +++ b/src/functions/scalar.rs @@ -1499,7 +1499,7 @@ impl RawJsonb<'_> { /// let raw_jsonb = RawJsonb::new(&buf); /// assert_eq!(raw_jsonb.as_extension_value().unwrap(), Some(ExtensionValue::Binary(&[1,2,3]))); /// ``` - pub fn as_extension_value(&self) -> Result> { + pub fn as_extension_value(&self) -> Result>> { let jsonb_item = JsonbItem::from_raw_jsonb(*self)?; match jsonb_item { JsonbItem::Extension(data) => { diff --git a/src/parser.rs b/src/parser.rs index 9259a51..b3c76fc 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -53,6 +53,19 @@ const DECIMAL128_MIN: i128 = -99999999999999999999999999999999999999i128; #[cfg(feature = "arbitrary_precision")] const DECIMAL128_MAX: i128 = 99999999999999999999999999999999999999i128; +// JSON literal constants +const NULL_LOWERCASE: [u8; 4] = [b'n', b'u', b'l', b'l']; +const NULL_UPPERCASE: [u8; 4] = [b'N', b'U', b'L', b'L']; +const TRUE_LOWERCASE: [u8; 4] = [b't', b'r', b'u', b'e']; +const TRUE_UPPERCASE: [u8; 4] = [b'T', b'R', b'U', b'E']; +const FALSE_LOWERCASE: [u8; 5] = [b'f', b'a', b'l', b's', b'e']; +const FALSE_UPPERCASE: [u8; 5] = [b'F', b'A', b'L', b'S', b'E']; + +const NAN_LOWERCASE: [u8; 3] = [b'n', b'a', b'n']; +const NAN_UPPERCASE: [u8; 3] = [b'N', b'A', b'N']; +const INFINITY_LOWERCASE: [u8; 8] = [b'i', b'n', b'f', b'i', b'n', b'i', b't', b'y']; +const INFINITY_UPPERCASE: [u8; 8] = [b'I', b'N', b'F', b'I', b'N', b'I', b'T', b'Y']; + #[cfg(feature = "arbitrary_precision")] static POWER_TABLE: std::sync::LazyLock<[i256; 39]> = std::sync::LazyLock::new(|| { [ @@ -122,7 +135,7 @@ 
static POWER_TABLE: std::sync::LazyLock<[i256; 39]> = std::sync::LazyLock::new(| /// /// This approach separates the parsing concerns from the final representation concerns, /// allowing each to be optimized independently. -#[derive(Clone, PartialEq, Default, Eq)] +#[derive(Clone, PartialEq, Default, Eq, Debug)] pub(crate) enum JsonAst<'a> { #[default] Null, @@ -134,13 +147,6 @@ pub(crate) enum JsonAst<'a> { } impl<'a> JsonAst<'a> { - fn as_string(&self) -> Option> { - match self { - JsonAst::String(s) => Some(s.clone()), - _ => None, - } - } - /// Converts the intermediate `JsonAst` representation to the final `Value` type. fn into_value(self) -> Result> { let value = match self { @@ -302,10 +308,12 @@ struct Parser<'a> { buf: &'a [u8], /// Current position in the buffer idx: usize, - /// Function pointer for parsing numbers based on the mode - parse_number_fn: fn(&mut Self) -> Result>, - /// Function pointer for parsing arrays based on the mode - parse_array_fn: fn(&mut Self) -> Result>, + /// Function pointer for parsing json value based on the mode + parse_value_fn: fn(&mut Self) -> Result>, + /// Function pointer for parsing array value based on the mode + parse_array_value_fn: fn(&mut Self) -> Result>, + /// Function pointer for parsing object_key based on the mode + parse_object_key_fn: fn(&mut Self) -> Result>, } impl<'a> Parser<'a> { @@ -313,8 +321,9 @@ impl<'a> Parser<'a> { Self { buf, idx: 0, - parse_number_fn: Self::parse_json_number, - parse_array_fn: Self::parse_json_array, + parse_value_fn: Self::parse_json_value, + parse_array_value_fn: Self::parse_array_value, + parse_object_key_fn: Self::parse_object_key, } } @@ -322,37 +331,36 @@ impl<'a> Parser<'a> { Self { buf, idx: 0, - parse_number_fn: Self::parse_standard_json_number, - parse_array_fn: Self::parse_standard_json_array, + parse_value_fn: Self::parse_standard_json_value, + parse_array_value_fn: Self::parse_standard_json_value, + parse_object_key_fn: Self::parse_standard_object_key, } } /// Parse a complete JSON document from the input buffer. fn parse(&mut self) -> Result> { - let val = self.parse_json_value()?; + let value = (self.parse_value_fn)(self)?; + self.skip_unused(); if self.idx < self.buf.len() { self.step(); return Err(self.error(ParseErrorCode::UnexpectedTrailingCharacters)); } - Ok(val) + Ok(value) } - /// Parse a JSON value, dispatching to the appropriate parser based on the first character. - /// - /// This is an optimized version that avoids runtime mode checks by using function pointers - /// selected during parser initialization. + /// Parse a JSON value in standard mode, following strict JSON syntax rules as RFC 8259. #[inline] - fn parse_json_value(&mut self) -> Result> { + fn parse_standard_json_value(&mut self) -> Result> { self.skip_unused(); let c = self.next()?; match c { - b'n' => self.parse_json_null(), - b't' => self.parse_json_true(), - b'f' => self.parse_json_false(), - b'0'..=b'9' | b'-' | b'+' | b'.' 
=> (self.parse_number_fn)(self), - b'"' => self.parse_json_string(), - b'[' => (self.parse_array_fn)(self), + b'n' => self.parse_standard_json_null(), + b't' => self.parse_standard_json_true(), + b'f' => self.parse_standard_json_false(), + b'0'..=b'9' | b'-' => self.parse_standard_json_number(), + b'"' => self.parse_standard_json_string(), + b'[' => self.parse_json_array(), b'{' => self.parse_json_object(), _ => { self.step(); @@ -361,20 +369,44 @@ impl<'a> Parser<'a> { } } + /// Parse a JSON value in extended mode with more lenient syntax rules #[inline] - fn next(&mut self) -> Result<&u8> { + fn parse_json_value(&mut self) -> Result> { + self.skip_unused(); + // Parse empty string to Null value + let Ok(c) = self.next() else { + return Ok(JsonAst::Null); + }; + match c { + b'n' | b'N' => self.parse_json_null_or_nan(), + b't' | b'T' => self.parse_json_true(), + b'f' | b'F' => self.parse_json_false(), + b'i' | b'I' => self.parse_json_infinity(), + b'0'..=b'9' | b'-' | b'+' | b'.' => self.parse_json_number(), + b'"' | b'\'' => self.parse_json_string(), + b'[' => self.parse_json_array(), + b'{' => self.parse_json_object(), + _ => { + self.step(); + Err(self.error(ParseErrorCode::ExpectedSomeValue)) + } + } + } + + #[inline] + fn next(&mut self) -> Result { match self.buf.get(self.idx) { - Some(c) => Ok(c), + Some(c) => Ok(*c), None => Err(self.error(ParseErrorCode::InvalidEOF)), } } #[inline] - fn must_is(&mut self, c: u8) -> Result<()> { + fn must_is(&mut self, c: &u8) -> Result<()> { match self.buf.get(self.idx) { Some(v) => { self.step(); - if v == &c { + if v == c { Ok(()) } else { Err(self.error(ParseErrorCode::ExpectedSomeIdent)) @@ -385,21 +417,24 @@ impl<'a> Parser<'a> { } #[inline] - fn check_next(&mut self, c: u8) -> bool { - if self.idx < self.buf.len() { - let v = self.buf.get(self.idx).unwrap(); - if v == &c { - return true; + fn must_either(&mut self, c1: &u8, c2: &u8) -> Result { + match self.buf.get(self.idx) { + Some(v) => { + self.step(); + if v == c1 || v == c2 { + Ok(*v) + } else { + Err(self.error(ParseErrorCode::ExpectedSomeIdent)) + } } + None => Err(self.error(ParseErrorCode::InvalidEOF)), } - false } #[inline] - fn check_next_either(&mut self, c1: u8, c2: u8) -> bool { - if self.idx < self.buf.len() { - let v = self.buf.get(self.idx).unwrap(); - if v == &c1 || v == &c2 { + fn check_next(&mut self, c: &u8) -> bool { + if let Some(v) = self.buf.get(self.idx) { + if v == c { return true; } } @@ -407,22 +442,44 @@ impl<'a> Parser<'a> { } #[inline] - fn check_digit(&mut self) -> bool { - if self.idx < self.buf.len() { - let v = self.buf.get(self.idx).unwrap(); + fn check_next_either(&mut self, c1: &u8, c2: &u8) -> Option { + if let Some(v) = self.buf.get(self.idx) { + if v == c1 || v == c2 { + return Some(*v); + } + } + None + } + + #[inline] + fn check_digit(&mut self) -> Option { + if let Some(v) = self.buf.get(self.idx) { if v.is_ascii_digit() { - return true; + let digit = v - b'0'; + return Some(digit); } } - false + None } #[inline] fn step_digits(&mut self) -> usize { let mut len = 0; - while self.idx < self.buf.len() { - let c = self.buf.get(self.idx).unwrap(); - if !c.is_ascii_digit() { + while let Some(v) = self.buf.get(self.idx) { + if !v.is_ascii_digit() { + break; + } + len += 1; + self.step(); + } + len + } + + #[inline] + fn step_hexdigits(&mut self) -> usize { + let mut len = 0; + while let Some(v) = self.buf.get(self.idx) { + if !v.is_ascii_hexdigit() { break; } len += 1; @@ -483,30 +540,96 @@ impl<'a> Parser<'a> { } } - fn parse_json_null(&mut self) -> 
Result> { - let data = [b'n', b'u', b'l', b'l']; - for v in data.into_iter() { + /// Parse a JSON null literal in standard mode + #[inline] + fn parse_standard_json_null(&mut self) -> Result> { + for v in NULL_LOWERCASE.iter() { self.must_is(v)?; } Ok(JsonAst::Null) } - fn parse_json_true(&mut self) -> Result> { - let data = [b't', b'r', b'u', b'e']; - for v in data.into_iter() { + /// Parse a JSON null or NaN literal in extended mode with case-insensitivity + #[inline] + fn parse_json_null_or_nan(&mut self) -> Result> { + let idx = self.idx; + if let Ok(null) = self.parse_json_null() { + Ok(null) + } else { + // fallback idx to check if it is NaN + self.idx = idx; + self.parse_json_nan() + } + } + + /// Parse a JSON null literal in extended mode with case-insensitivity + /// Accepts any case variation of "null" (e.g., "Null", "NULL", "nUlL"). + #[inline] + fn parse_json_null(&mut self) -> Result> { + for (v1, v2) in NULL_LOWERCASE.iter().zip(NULL_UPPERCASE.iter()) { + self.must_either(v1, v2)?; + } + Ok(JsonAst::Null) + } + + /// Parse a JSON true literal in standard mode + #[inline] + fn parse_standard_json_true(&mut self) -> Result> { + for v in TRUE_LOWERCASE.iter() { self.must_is(v)?; } Ok(JsonAst::Bool(true)) } - fn parse_json_false(&mut self) -> Result> { - let data = [b'f', b'a', b'l', b's', b'e']; - for v in data.into_iter() { + /// Parse a JSON true literal in extended mode with case-insensitivity + /// Accepts any case variation of "true" (e.g., "True", "TRUE", "tRuE"). + #[inline] + fn parse_json_true(&mut self) -> Result> { + for (v1, v2) in TRUE_LOWERCASE.iter().zip(TRUE_UPPERCASE.iter()) { + self.must_either(v1, v2)?; + } + Ok(JsonAst::Bool(true)) + } + + /// Parse a JSON false literal in standard mode + #[inline] + fn parse_standard_json_false(&mut self) -> Result> { + for v in FALSE_LOWERCASE.iter() { self.must_is(v)?; } Ok(JsonAst::Bool(false)) } + /// Parse a JSON false literal in extended mode with case-insensitivity + /// Accepts any case variation of "false" (e.g., "False", "FALSE", "fAlSe"). + #[inline] + fn parse_json_false(&mut self) -> Result> { + for (v1, v2) in FALSE_LOWERCASE.iter().zip(FALSE_UPPERCASE.iter()) { + self.must_either(v1, v2)?; + } + Ok(JsonAst::Bool(false)) + } + + /// Parse a JSON infinity literal in extended mode with case-insensitivity + /// Accepts any case variation of "infinity" (e.g., "Infinity", "INFINITY"). + #[inline] + fn parse_json_infinity(&mut self) -> Result> { + for (v1, v2) in INFINITY_LOWERCASE.iter().zip(INFINITY_UPPERCASE.iter()) { + self.must_either(v1, v2)?; + } + Ok(JsonAst::Number(Number::Float64(f64::INFINITY))) + } + + /// Parse a JSON NaN literal in extended mode with case-insensitivity + /// Accepts any case variation of "NaN" (e.g., "nan", "NAN"). + #[inline] + fn parse_json_nan(&mut self) -> Result> { + for (v1, v2) in NAN_LOWERCASE.iter().zip(NAN_UPPERCASE.iter()) { + self.must_either(v1, v2)?; + } + Ok(JsonAst::Number(Number::Float64(f64::NAN))) + } + /// Parse JSON numbers in standard mode /// /// This function implements strict parsing according to the standard JSON specification: @@ -525,30 +648,23 @@ impl<'a> Parser<'a> { let mut has_fraction = false; let mut has_exponent = false; - let c = self.next()?; - if *c == b'-' { + if self.check_next(&b'-') { negative = true; self.step(); - } else if *c == b'+' || *c == b'.' 
{ - self.step(); - return Err(self.error(ParseErrorCode::InvalidNumberValue)); } - if self.check_next(b'0') { + if self.check_next(&b'0') { self.step(); - if self.check_digit() { + if self.check_digit().is_some() { self.step(); return Err(self.error(ParseErrorCode::InvalidNumberValue)); } } else { let len = self.step_digits(); if len == 0 { - if !negative { - self.step(); - } return Err(self.error(ParseErrorCode::InvalidNumberValue)); } } - if self.check_next(b'.') { + if self.check_next(&b'.') { has_fraction = true; self.step(); let len = self.step_digits(); @@ -557,10 +673,10 @@ impl<'a> Parser<'a> { return Err(self.error(ParseErrorCode::InvalidNumberValue)); } } - if self.check_next_either(b'E', b'e') { + if self.check_next_either(&b'E', &b'e').is_some() { has_exponent = true; self.step(); - if self.check_next_either(b'+', b'-') { + if self.check_next_either(&b'+', &b'-').is_some() { self.step(); } let len = self.step_digits(); @@ -592,37 +708,42 @@ impl<'a> Parser<'a> { /// 1. Support for leading plus sign (e.g., `+123`) /// 2. Support for multiple leading zeros (e.g., `000123`) /// 3. Support for decimal point without digits on either side (e.g., `.123` or `123.`) + /// 4. Support for special values like `NaN`, `Infinity`, and `-Infinity` (case-insensitive) + /// 5. Support for hexadecimal notation with optional fractional part (e.g., `0xFF`, `0x1A.B`) /// /// Zero-allocation parsing strategy: /// 1. Uses direct digit accumulation without intermediate string conversions /// 2. For standard numeric types (Int64/UInt64), directly builds the value during parsing - /// 3. For decimal types, tracks scale and precision during the single-pass parse - /// 4. Falls back to Float64 parsing only when necessary + /// 3. For decimal types (requires `arbitrary_precision` feature), tracks scale and precision during the single-pass parse + /// 4. For hexadecimal numbers, uses specialized parsing with support for fractional parts + /// 5. 
Falls back to Float64 parsing only when necessary /// /// This implementation prioritizes performance through: /// - Single-pass approach with minimal branching /// - Avoiding heap allocations and string conversions /// - Optimized handling of common number formats + /// - Specialized handling for extended syntax elements fn parse_json_number(&mut self) -> Result> { // Store the starting position for potential fallback parsing let start_idx = self.idx; + let mut negative = false; - let mut leading_zeros = false; + let mut leading_zeros = 0; // Handle sign prefix (+ or -), extending JSON to support leading plus sign let c = self.next()?; - if *c == b'-' { + if c == b'-' { negative = true; self.step(); - } else if *c == b'+' { + } else if c == b'+' { // Extended syntax: Support for leading plus sign self.step(); } // Extended syntax: Support for multiple leading zeros (e.g., 000123) loop { - if self.check_next(b'0') { - leading_zeros = true; + if self.check_next(&b'0') { + leading_zeros += 1; self.step(); } else { break; @@ -639,20 +760,18 @@ impl<'a> Parser<'a> { // Parse digits, supporting up to MAX_DECIMAL256_PRECISION digits while precision < MAX_DECIMAL256_PRECISION { - if self.check_digit() { + if let Some(digit) = self.check_digit() { // Parse digit and accumulate value - let digit = (self.buf[self.idx] - b'0') as i128; - // Store in hi_value or lo_value based on precision if precision < MAX_DECIMAL128_PRECISION { hi_value = unsafe { hi_value.unchecked_mul(10_i128) }; - hi_value = unsafe { hi_value.unchecked_add(digit) }; + hi_value = unsafe { hi_value.unchecked_add(digit as i128) }; } else { lo_value = unsafe { lo_value.unchecked_mul(10_i128) }; - lo_value = unsafe { lo_value.unchecked_add(digit) }; + lo_value = unsafe { lo_value.unchecked_add(digit as i128) }; } self.step(); - } else if self.check_next(b'.') { + } else if self.check_next(&b'.') { // Handle decimal point - can only appear once if has_fraction { return Err(self.error(ParseErrorCode::InvalidNumberValue)); @@ -678,7 +797,7 @@ impl<'a> Parser<'a> { if !has_fraction { let len = self.step_digits(); precision += len; - if self.check_next(b'.') { + if self.check_next(&b'.') { has_fraction = true; self.step(); } @@ -691,16 +810,146 @@ impl<'a> Parser<'a> { } } - // Handle empty precision - if !leading_zeros && precision == 0 { + if leading_zeros == 0 && precision == 0 { + // Handle special values + if !has_fraction { + if let Ok(c) = self.next() { + match c { + b'i' | b'I' => { + let val = self.parse_json_infinity()?; + if negative { + return Ok(JsonAst::Number(Number::Float64(f64::NEG_INFINITY))); + } else { + return Ok(val); + } + } + b'n' | b'N' => { + let val = self.parse_json_nan()?; + if negative { + // `-Nan` is not allowed + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } else { + return Ok(val); + } + } + _ => {} + } + } + } return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } else if leading_zeros == 1 && precision == 0 && !has_fraction { + // Handle hexadecimal number (0x...) 
+ if self.check_next_either(&b'x', &b'X').is_some() { + self.step(); + + // Mark the start position of hex digits + let hex_start = self.idx; + let int_len = self.step_hexdigits(); + if int_len == 0 { + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + + // Check if we have a fractional part + if self.check_next(&b'.') { + // Skip the decimal point + self.step(); + + // Mark the start of fractional digits + let frac_start = self.idx; + let frac_len = self.step_hexdigits(); + if frac_len == 0 { + return Err(self.error(ParseErrorCode::InvalidNumberValue)); + } + + let int_str = std::str::from_utf8(&self.buf[hex_start..hex_start + int_len]) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + let frac_str = + std::str::from_utf8(&self.buf[frac_start..frac_start + frac_len]) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + + // Parse integer part + let int_val = u128::from_str_radix(int_str, 16) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + + // Parse fractional part and calculate its value + let frac_val = u128::from_str_radix(frac_str, 16) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + let frac_divisor = 16.0_f64.powi(frac_len as i32); + + // Combine integer and fractional parts + let mut final_val = int_val as f64 + (frac_val as f64 / frac_divisor); + if negative { + final_val = -final_val; + } + return Ok(JsonAst::Number(Number::Float64(final_val))); + } else { + // Integer-only hex value + let int_str = std::str::from_utf8(&self.buf[hex_start..self.idx]) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + + // Parse the hex value + let value = u128::from_str_radix(int_str, 16) + .map_err(|_| self.error(ParseErrorCode::InvalidNumberValue))?; + + // Convert to appropriate number type based on size + if negative { + // Handle negative values + if value <= (i64::MAX as u128 + 1) { + let i_val = -(value as i64); + return Ok(JsonAst::Number(Number::Int64(i_val))); + } + #[cfg(feature = "arbitrary_precision")] + { + if value <= (DECIMAL128_MAX as u128 + 1) { + return Ok(JsonAst::Number(Number::Decimal128(Decimal128 { + scale: 0, + value: -(value as i128), + }))); + } else { + return Ok(JsonAst::Number(Number::Decimal256(Decimal256 { + scale: 0, + value: i256::from(value) * -1, + }))); + } + } + #[cfg(not(feature = "arbitrary_precision"))] + { + return Ok(JsonAst::Number(Number::Float64(-(value as f64)))); + } + } else { + // Handle positive values + if value <= u64::MAX as u128 { + return Ok(JsonAst::Number(Number::UInt64(value as u64))); + } + #[cfg(feature = "arbitrary_precision")] + { + if value <= DECIMAL128_MAX as u128 { + return Ok(JsonAst::Number(Number::Decimal128(Decimal128 { + scale: 0, + value: value as i128, + }))); + } else { + return Ok(JsonAst::Number(Number::Decimal256(Decimal256 { + scale: 0, + value: i256::from(value), + }))); + } + } + #[cfg(not(feature = "arbitrary_precision"))] + { + return Ok(JsonAst::Number(Number::Float64(value as f64))); + } + } + } + } } + // Handle exponent notation (e.g., 1e10, 1.5E-7) - if self.check_next_either(b'E', b'e') { + if self.check_next_either(&b'E', &b'e').is_some() { has_exponent = true; self.step(); // Handle exponent sign - if self.check_next_either(b'+', b'-') { + if self.check_next_either(&b'+', &b'-').is_some() { self.step(); } // Parse exponent digits @@ -767,20 +1016,38 @@ impl<'a> Parser<'a> { } } - /// Parse a JSON string value with support for escape sequences. 
+ /// Parse a JSON string in standard mode /// - /// This function implements a high-performance JSON string parser that: - /// 1. Efficiently handles strings without escape sequences using direct memory access - /// 2. Falls back to a more complex parsing routine only when escape sequences are present - /// 3. Supports standard JSON escape sequences and Unicode escapes (\uXXXX and \u{XXXX}) + /// Only supports double quotes (") as string delimiters + /// and follows strict JSON specification. + #[inline] + fn parse_standard_json_string(&mut self) -> Result> { + self.must_is(&b'"')?; + let val = self.parse_quoted_string(b'"')?; + Ok(JsonAst::String(val)) + } + + /// Parse a JSON string with extended syntax support /// - /// The implementation uses a two-pass approach for strings with escapes: - /// - First pass: Count escapes and determine string boundaries - /// - Second pass: Process escape sequences only when necessary + /// Extended syntax allows both double quotes (") and single quotes (') + /// as string delimiters, which is not allowed in standard JSON. + #[inline] fn parse_json_string(&mut self) -> Result> { - // Ensure the string starts with a quote - self.must_is(b'"')?; + let end_quote = self.must_either(&b'"', &b'\'')?; + let val = self.parse_quoted_string(end_quote)?; + Ok(JsonAst::String(val)) + } + /// Parse a quoted string with support for escape sequences + /// + /// Handles both standard and extended Unicode escape sequences: + /// - Standard: \uXXXX (4 hex digits) + /// - Extended: \u{XXXX} (variable number of hex digits in braces) + /// + /// Uses a two-pass approach for efficiency: + /// 1. First pass: Find string boundaries and count escapes + /// 2. Second pass: Process escapes only when necessary + fn parse_quoted_string(&mut self, end_quote: u8) -> Result> { // Mark the starting position (after the opening quote) let start_idx = self.idx; let mut escapes = 0; @@ -794,11 +1061,11 @@ impl<'a> Parser<'a> { self.step(); escapes += 1; let next_c = self.next()?; - if *next_c == b'u' { + if next_c == b'u' { // Handle Unicode escape sequence self.step(); let next_c = self.next()?; - if *next_c == b'{' { + if next_c == b'{' { // Extended Unicode format: \u{XXXX} self.step_by(UNICODE_LEN + 2); } else { @@ -809,16 +1076,14 @@ impl<'a> Parser<'a> { // Simple escape sequence like \n, \t, etc. self.step(); } - continue; } - b'"' => { - // End of string found + _ => { self.step(); - break; + if c == end_quote { + break; + } } - _ => {} } - self.step(); } // Get the string data (excluding quotes) @@ -837,59 +1102,97 @@ impl<'a> Parser<'a> { .map(Cow::Borrowed) .map_err(|_| self.error(ParseErrorCode::InvalidStringValue))? }; - Ok(JsonAst::String(val)) + Ok(val) } - /// Parse a JSON array with standard mode. - fn parse_standard_json_array(&mut self) -> Result> { - // Ensure the array starts with an opening bracket - self.must_is(b'[')?; + /// Parse an unquoted string literal for object keys + /// + /// Extended syntax feature that allows object keys without quotes. 
+ /// + /// Restrictions: + /// - Only letters, numbers, underscore, dollar and UTF-8 multi-byte characters are allowed + /// - First character cannot be a number + /// - Must contain at least one character + fn parse_unquoted_string(&mut self) -> Result> { + let start_idx = self.idx; - let mut first = true; - let mut values = Vec::with_capacity(8); + let c = self.next()?; + if c.is_ascii_digit() { + self.step(); + return Err(self.error(ParseErrorCode::ObjectKeyInvalidNumber)); + } - // Parse array elements until closing bracket is found loop { - self.skip_unused(); let c = self.next()?; - - // Check for end of array - if *c == b']' { + if c.is_ascii_alphanumeric() || matches!(c, b'_' | b'$') { self.step(); - break; - } + } else if c >= 0x80 { + // Handle UTF-8 multi-byte characters (including Chinese) + // UTF-8 continuation bytes start with binary 10xxxxxx (0x80-0xBF) + // Determine how many continuation bytes to expect based on the first byte + let continuation_bytes = if c >= 0xF0 { + 4 // 4-byte sequence (U+10000 to U+10FFFF) + } else if c >= 0xE0 { + 3 // 3-byte sequence (U+0800 to U+FFFF) - includes most Chinese characters + } else if c >= 0xC0 { + 2 // 2-byte sequence (U+0080 to U+07FF) + } else { + // Invalid UTF-8 start byte + return Err(self.error(ParseErrorCode::ObjectKeyInvalidCharacter)); + }; - // Handle comma separator between elements (not for the first element) - if !first { - if *c != b',' { - return Err(self.error(ParseErrorCode::ExpectedArrayCommaOrEnd)); - } - self.step(); + // Consume the expected continuation bytes + self.step_by(continuation_bytes); + } else { + break; } - first = false; + } + if self.idx == start_idx { + return Err(self.error(ParseErrorCode::ObjectKeyInvalidCharacter)); + } - self.skip_unused(); + // Get the string data + let data = &self.buf[start_idx..self.idx]; + let val = std::str::from_utf8(data) + .map(Cow::Borrowed) + .map_err(|_| self.error(ParseErrorCode::InvalidStringValue))?; + Ok(val) + } - // Parse a regular array element - let value = self.parse_json_value()?; - values.push(value); + /// Parse an array value with support for empty elements + /// + /// Extended syntax feature that treats empty elements as null: + /// - [1,,3] is parsed as [1,null,3] + /// - [1,2,] is parsed as [1,2,null] + /// + /// This is not allowed in standard JSON but supported in extended mode. + #[inline] + fn parse_array_value(&mut self) -> Result> { + if self.check_next_either(&b',', &b']').is_some() { + Ok(JsonAst::Null) + } else { + self.parse_json_value() } - Ok(JsonAst::Array(values)) } - /// Parse a JSON array with extended syntax support. + /// Parse a JSON array with support for both standard and extended syntax + /// + /// This function handles the common array parsing logic for both modes: + /// - Parses arrays enclosed in square brackets [...] + /// - Handles comma-separated values + /// - Validates proper syntax for separators and closing brackets /// - /// This function implements a JSON array parser that: - /// 1. Handles standard JSON arrays with comma-separated values - /// 2. Extends JSON syntax to support empty elements (e.g., [1,,3]) which are parsed as null values - /// 3. 
Efficiently processes arrays of any size with minimal allocations + /// The behavior differs between standard and extended mode through the function pointer: + /// - In standard mode: Uses parse_standard_json_value which enforces strict JSON rules + /// - In extended mode: Uses parse_array_value which allows empty elements (treated as null) /// - /// Extended JSON array syntax support: - /// - Empty elements between commas (e.g., [1,,3]) which standard JSON doesn't allow - /// - Empty elements at the end of arrays (e.g., [1,2,]) which standard JSON doesn't allow + /// Examples of valid arrays in extended mode: + /// - [1,2,3] (standard JSON) + /// - [1,,3] (empty element treated as null) + /// - [1,2,] (trailing comma treated as null element) fn parse_json_array(&mut self) -> Result> { // Ensure the array starts with an opening bracket - self.must_is(b'[')?; + self.must_is(&b'[')?; let mut first = true; let mut values = Vec::with_capacity(8); @@ -900,14 +1203,14 @@ impl<'a> Parser<'a> { let c = self.next()?; // Check for end of array - if *c == b']' { + if c == b']' { self.step(); break; } // Handle comma separator between elements (not for the first element) if !first { - if *c != b',' { + if c != b',' { return Err(self.error(ParseErrorCode::ExpectedArrayCommaOrEnd)); } self.step(); @@ -915,36 +1218,64 @@ impl<'a> Parser<'a> { first = false; self.skip_unused(); - - // Extended syntax: Check for empty elements (consecutive commas or comma before closing bracket) - // This is where the parser extends standard JSON by allowing empty elements - if self.check_next_either(b',', b']') { - // Insert null for empty element - values.push(JsonAst::Null); - continue; - } - // Parse a regular array element - let value = self.parse_json_value()?; + let value = (self.parse_array_value_fn)(self)?; values.push(value); } Ok(JsonAst::Array(values)) } - /// Parse a JSON object with key-value pairs. + /// Parse an object key in standard mode + /// + /// Only supports double-quoted strings as keys, + /// following strict JSON specification. + #[inline] + fn parse_standard_object_key(&mut self) -> Result> { + self.must_is(&b'"')?; + self.parse_quoted_string(b'"') + } + + /// Parse an object key with extended syntax support + /// + /// Extended syntax allows: + /// 1. Double-quoted strings (") + /// 2. Single-quoted strings (') + /// 3. Unquoted identifiers (letters, numbers, underscore, dollar and UTF-8 characters) + /// with the restriction that they cannot start with a number + #[inline] + fn parse_object_key(&mut self) -> Result> { + if let Some(end_quote) = self.check_next_either(&b'"', &b'\'') { + self.step(); + self.parse_quoted_string(end_quote) + } else { + self.parse_unquoted_string() + } + } + + /// Parse a JSON object with support for both standard and extended syntax /// - /// This function implements a standard-compliant JSON object parser that: - /// 1. Handles objects with string keys and any valid JSON values - /// 2. Enforces that keys must be strings as per JSON specification - /// 3. 
Efficiently builds a hash map representation of the object + /// This function handles the common object parsing logic for both modes: + /// - Parses objects enclosed in curly braces {...} + /// - Handles key-value pairs separated by colons + /// - Validates proper syntax for separators and closing braces + /// - Detects and reports duplicate keys /// - /// The implementation follows standard JSON syntax requirements: - /// - Keys must be strings - /// - Keys and values are separated by colons - /// - Key-value pairs are separated by commas + /// The behavior differs between standard and extended mode through function pointers: + /// - In standard mode: + /// * Uses parse_standard_object_key which only accepts double-quoted keys + /// * Uses parse_standard_json_value which enforces strict JSON rules for values + /// - In extended mode: + /// * Uses parse_object_key which accepts quoted (double/single) and unquoted keys + /// * Uses parse_json_value which allows extended syntax for values + /// + /// Examples of valid objects in extended mode: + /// - {"key": "value"} (standard JSON) + /// - {'key': 'value'} (single quotes) + /// - {key: "value"} (unquoted key) + /// - {_user123: 'value'} (unquoted key with underscore) fn parse_json_object(&mut self) -> Result> { // Ensure the object starts with an opening brace - self.must_is(b'{')?; + self.must_is(&b'{')?; let mut first = true; let mut obj = Vec::with_capacity(16); @@ -955,38 +1286,35 @@ impl<'a> Parser<'a> { let c = self.next()?; // Check for end of object - if *c == b'}' { + if c == b'}' { self.step(); break; } // Handle comma separator between key-value pairs (not for the first pair) if !first { - if *c != b',' { + if c != b',' { return Err(self.error(ParseErrorCode::ExpectedObjectCommaOrEnd)); } self.step(); } first = false; - // Parse the key (must be a string) - let key = self.parse_json_value()?; - let Some(key_str) = key.as_string() else { - return Err(self.error(ParseErrorCode::KeyMustBeAString)); - }; + self.skip_unused(); + let key_str = (self.parse_object_key_fn)(self)?; let pos = self.idx; self.skip_unused(); // Ensure key and value are separated by a colon let c = self.next()?; - if *c != b':' { + if c != b':' { return Err(self.error(ParseErrorCode::ExpectedColon)); } self.step(); // Parse the value - let value = self.parse_json_value()?; + let value = (self.parse_value_fn)(self)?; // Add the key-value pair to the object obj.push((key_str, value, pos)); @@ -1011,7 +1339,134 @@ impl<'a> Parser<'a> { mod tests { use super::*; use proptest::prelude::*; + use std::collections::BTreeMap; + use std::fmt::Display; + use std::fmt::Formatter; + + /// Json5Value represents the extended [JSON5 syntax](https://json5.org/) for testing purposes + /// + /// This enum is used to generate test data that conforms to the JSON5 specification, + /// including features like hexadecimal numbers, single-quoted strings, and unquoted object keys. + #[derive(Clone, PartialEq, Default, Eq, Debug)] + pub enum Json5Value { + #[default] + Null, + Bool(bool), + Number(Number), + HexNumber(String), + DoubleQuotedString(String), + SingleQuotedString(String), + Array(Vec), + DoubleQuotedKeyObject(BTreeMap), + SingleQuotedKeyObject(BTreeMap), + UnquotedKeyObject(BTreeMap), + } + + /// Display implementation for Json5Value that formats values according to JSON5 syntax + /// + /// This implementation handles proper escaping of special characters in strings + /// and ensures the output conforms to the JSON5 specification. 
+ impl Display for Json5Value { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + match self { + Json5Value::Null => write!(f, "null"), + Json5Value::Bool(v) => { + if *v { + write!(f, "true") + } else { + write!(f, "false") + } + } + Json5Value::Number(ref v) => write!(f, "{}", v), + Json5Value::HexNumber(ref v) => write!(f, "{}", v), + Json5Value::DoubleQuotedString(ref v) => { + write!(f, "\"")?; + for c in v.chars() { + match c { + '"' => write!(f, "\\\"")?, + '\\' => write!(f, "\\\\")?, + c => write!(f, "{}", c)?, + } + } + write!(f, "\"") + } + Json5Value::SingleQuotedString(ref v) => { + write!(f, "'")?; + for c in v.chars() { + match c { + '\'' => write!(f, "\\\'")?, + '\\' => write!(f, "\\\\")?, + c => write!(f, "{}", c)?, + } + } + write!(f, "'") + } + Json5Value::Array(ref vs) => { + write!(f, "[")?; + for (i, v) in vs.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "{v}")?; + } + write!(f, "]") + } + Json5Value::DoubleQuotedKeyObject(ref vs) => { + write!(f, "{{")?; + for (i, (k, v)) in vs.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "\"")?; + for c in k.chars() { + match c { + '"' => write!(f, "\\\"")?, + '\\' => write!(f, "\\\\")?, + c => write!(f, "{}", c)?, + } + } + write!(f, "\"")?; + write!(f, ":{v}")?; + } + write!(f, "}}") + } + Json5Value::SingleQuotedKeyObject(ref vs) => { + write!(f, "{{")?; + for (i, (k, v)) in vs.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "'")?; + for c in k.chars() { + match c { + '\'' => write!(f, "\\\'")?, + '\\' => write!(f, "\\\\")?, + c => write!(f, "{}", c)?, + } + } + write!(f, "'")?; + write!(f, ":{v}")?; + } + write!(f, "}}") + } + Json5Value::UnquotedKeyObject(ref vs) => { + write!(f, "{{")?; + for (i, (k, v)) in vs.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "{k}:{v}")?; + } + write!(f, "}}") + } + } + } + } + /// Strategy to generate standard strings for testing + /// + /// Generates strings containing ASCII characters and CJK Unicode characters + /// for testing standard JSON string handling fn string_strategy() -> impl Strategy { let ascii = '!'..='~'; // CJK Unified Ideographs @@ -1022,6 +1477,51 @@ mod tests { .prop_map(|v| v.into_iter().collect()) } + /// Strategy to generate strings suitable for quoted keys and values in JSON5 + /// + /// Excludes quote characters (single and double) and backslashes to simplify testing + /// while still providing a diverse set of characters including CJK Unicode + fn quoted_string_strategy() -> impl Strategy { + // ignore ' " \ + let ascii1 = '('..='['; + let ascii2 = ']'..='~'; + // CJK Unified Ideographs + let cjk = '\u{4E00}'..='\u{9FFF}'; + + let chars: Vec = ascii1.chain(ascii2).chain(cjk).collect(); + prop::collection::vec(prop::sample::select(chars), 1..50) + .prop_map(|v| v.into_iter().collect()) + } + + /// Strategy to generate strings suitable for unquoted object keys in JSON5 + /// + /// Generates strings containing alphanumeric characters, underscores, dollar signs, + /// and CJK Unicode characters that are valid as unquoted keys in JSON5. + /// This tests the parser's ability to handle extended syntax for object keys. 
+ fn unquoted_string_strategy() -> impl Strategy { + let number = '0'..='9'; + let lowercase = 'a'..='f'; + let uppercase = 'A'..='F'; + let underline = '_'; + let dollar = '$'; + // CJK Unified Ideographs + let cjk = '\u{4E00}'..='\u{9FFF}'; + + let mut chars: Vec = number + .chain(lowercase) + .chain(uppercase) + .chain(cjk) + .collect(); + chars.push(underline); + chars.push(dollar); + prop::collection::vec(prop::sample::select(chars), 1..50) + .prop_map(|v| v.into_iter().collect()) + } + + /// Strategy to generate standard JSON number values + /// + /// Generates integers (signed and unsigned) and floating-point numbers + /// while excluding special cases like -0.0 that might cause comparison issues fn standard_number_strategy() -> impl Strategy { prop_oneof![ any::().prop_map(Number::UInt64), @@ -1032,6 +1532,13 @@ mod tests { ] } + /// Strategy to generate arbitrary precision number values when the feature is enabled + /// + /// Generates various numeric types including: + /// - Standard integers (i64, u64) + /// - Floating-point numbers (f64) + /// - Decimal types with different scales (Decimal64, Decimal128, Decimal256) + /// This tests the parser's ability to handle the full range of numeric formats #[cfg(feature = "arbitrary_precision")] fn number_strategy() -> impl Strategy { use crate::Decimal128; @@ -1054,6 +1561,63 @@ mod tests { ] } + /// Strategy to generate random hexadecimal numbers for testing + /// + /// Generates hexadecimal numbers with 0x/0X prefix (e.g., "0xFF", "0X1A3") + /// to test the parser's ability to handle extended JSON5 hex number syntax + fn hex_number_strategy() -> impl Strategy { + let number = '0'..='9'; + let lowercase = 'a'..='f'; + let uppercase = 'A'..='F'; + + let hex_digit = + prop::sample::select(number.chain(lowercase).chain(uppercase).collect::>()); + let hex_prefix = prop::sample::select(vec!['x', 'X']); + let int_part = prop::collection::vec(hex_digit.clone(), 1..16) + .prop_map(|v| v.into_iter().collect::()); + + (hex_prefix, int_part).prop_map(|(x, i)| format!("0{}{}", x, i)) + } + + /// Strategy to generate JSON5 values for testing the extended JSON parser + /// + /// Creates a comprehensive set of JSON5 values including all extended syntax features: + /// - Standard JSON literals (null, true, false) + /// - Numbers (standard format) + /// - Hexadecimal numbers (with 0x/0X prefix) + /// - Double-quoted strings + /// - Single-quoted strings + /// - Arrays + /// - Objects with different key styles (double-quoted, single-quoted, and unquoted) + /// + /// This strategy is used to verify that our parser correctly handles all JSON5 extensions + fn json5_strategy() -> impl Strategy { + let leaf = prop_oneof![ + Just(Json5Value::Null), + any::().prop_map(Json5Value::Bool), + standard_number_strategy().prop_map(Json5Value::Number), + hex_number_strategy().prop_map(Json5Value::HexNumber), + quoted_string_strategy().prop_map(Json5Value::DoubleQuotedString), + quoted_string_strategy().prop_map(Json5Value::SingleQuotedString), + ]; + + leaf.prop_recursive(8, 256, 30, |inner| { + prop_oneof![ + prop::collection::vec(inner.clone(), 0..10).prop_map(Json5Value::Array), + prop::collection::btree_map(quoted_string_strategy(), inner.clone(), 0..20) + .prop_map(Json5Value::DoubleQuotedKeyObject), + prop::collection::btree_map(quoted_string_strategy(), inner.clone(), 0..20) + .prop_map(Json5Value::SingleQuotedKeyObject), + prop::collection::btree_map(unquoted_string_strategy(), inner, 0..20) + .prop_map(Json5Value::UnquotedKeyObject), + ] + }) + } + + /// 
Strategy to generate standard JSON values with arbitrary precision when enabled + /// + /// Used for testing the parser's compatibility with standard JSON format + /// while supporting arbitrary precision numbers #[cfg(feature = "arbitrary_precision")] fn json_strategy() -> impl Strategy> { let leaf = prop_oneof![ @@ -1072,6 +1636,10 @@ mod tests { }) } + /// Strategy to generate standard JSON values without arbitrary precision + /// + /// Used for testing the parser in standard mode to ensure it strictly + /// follows the JSON specification without any extensions fn standard_json_strategy() -> impl Strategy> { let leaf = prop_oneof![ Just(Value::Null), @@ -1090,6 +1658,33 @@ mod tests { } proptest! { + /// Tests the parser's ability to handle JSON5 syntax + /// + /// Generates JSON5 values and verifies that our parser produces + /// the same results as the json_five crate + #[test] + fn test_json5_parser(json in json5_strategy()) { + let source = format!("{}", json); + + let res1 = json_five::from_str::(&source); + let res2 = parse_value(source.as_bytes()); + let res3 = parse_owned_jsonb(source.as_bytes()); + assert_eq!(res1.is_ok(), res2.is_ok()); + assert_eq!(res1.is_ok(), res3.is_ok()); + if res1.is_ok() { + let res1 = format!("{}", res1.unwrap()); + let res2 = format!("{}", res2.unwrap()); + let res3 = format!("{}", res3.unwrap()); + assert_eq!(res1, res2); + assert_eq!(res1, res3); + } + } + } + + proptest! { + /// Tests the parser's ability to handle standard JSON with arbitrary precision + /// + /// Compares our parser's results with serde_json for standard JSON input #[test] #[cfg(feature = "arbitrary_precision")] fn test_json_parser(json in json_strategy()) { @@ -1110,6 +1705,9 @@ mod tests { } proptest! { + /// Tests the parser in standard mode with standard JSON input + /// + /// Verifies that the parser strictly follows the JSON specification in standard mode #[test] fn test_standard_json_parser(json in standard_json_strategy()) { let source = format!("{}", json); diff --git a/src/util.rs b/src/util.rs index 4143b98..8ebe497 100644 --- a/src/util.rs +++ b/src/util.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::io::Read; - use super::constants::*; use super::error::Error; use super::error::ParseErrorCode; @@ -66,6 +64,13 @@ fn parse_escaped_string<'a>( idx: &mut usize, str_buf: &mut String, ) -> Result<&'a [u8], Error> { + if data.is_empty() { + return Err(Error::Syntax( + ParseErrorCode::UnexpectedEndOfHexEscape, + *idx, + )); + } + let byte = data[0]; *idx += 1; data = &data[1..]; @@ -79,77 +84,70 @@ fn parse_escaped_string<'a>( b'r' => str_buf.push(RR), b't' => str_buf.push(TT), b'u' => { - let mut numbers = vec![0; UNICODE_LEN]; - if data[0] == b'{' { - data = &data[1..]; - data.read_exact(numbers.as_mut_slice())?; - if data[0] != b'}' { - return Err(Error::Syntax( - ParseErrorCode::UnexpectedEndOfHexEscape, - *idx, - )); - } - data = &data[1..]; - *idx += 6; - } else { - data.read_exact(numbers.as_mut_slice())?; - *idx += 4; - } - let hex = decode_hex_escape(numbers.clone(), idx)?; + let mut numbers = [0u8; UNICODE_LEN]; + // Parse the first Unicode escape sequence + data = parse_unicode_escape(data, idx, &mut numbers)?; + let hex = decode_hex_escape(&numbers, idx)?; let c = match hex { 0xDC00..=0xDFFF => { - encode_invalid_unicode(numbers, str_buf); + // Low surrogate without preceding high surrogate + encode_invalid_unicode(&numbers, str_buf); return Ok(data); } // Non-BMP characters are encoded as a sequence of two hex - // escapes, representing UTF-16 surrogates. If deserializing a - // utf-8 string the surrogates are required to be paired, - // whereas deserializing a byte string accepts lone surrogates. + // escapes, representing UTF-16 surrogates. n1 @ 0xD800..=0xDBFF => { + // High surrogate - check for following low surrogate if data.len() < 2 { - encode_invalid_unicode(numbers, str_buf); + encode_invalid_unicode(&numbers, str_buf); return Ok(data); } + + // Check for \u sequence if data[0] == b'\\' && data[1] == b'u' { *idx += 2; data = &data[2..]; } else { - encode_invalid_unicode(numbers, str_buf); + encode_invalid_unicode(&numbers, str_buf); return Ok(data); } - let mut lower_numbers = vec![0; UNICODE_LEN]; - if data[0] == b'{' { - data = &data[1..]; - data.read_exact(lower_numbers.as_mut_slice())?; - if data[0] != b'}' { - return Err(Error::Syntax( - ParseErrorCode::UnexpectedEndOfHexEscape, - *idx, - )); - } - data = &data[1..]; - *idx += 6; - } else { - data.read_exact(lower_numbers.as_mut_slice())?; - *idx += 4; - } - let n2 = decode_hex_escape(lower_numbers.clone(), idx)?; + + let mut lower_numbers = [0u8; UNICODE_LEN]; + // Parse the second Unicode escape sequence + data = parse_unicode_escape(data, idx, &mut lower_numbers)?; + let n2 = decode_hex_escape(&lower_numbers, idx)?; if !(0xDC00..=0xDFFF).contains(&n2) { - encode_invalid_unicode(numbers, str_buf); - encode_invalid_unicode(lower_numbers, str_buf); + encode_invalid_unicode(&numbers, str_buf); + encode_invalid_unicode(&lower_numbers, str_buf); return Ok(data); } #[allow(clippy::precedence)] let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; - char::from_u32(n).unwrap() + + match char::from_u32(n) { + Some(ch) => ch, + None => { + // Handle invalid Unicode code points gracefully + // If we somehow got an invalid code point, preserve the original escape sequence + encode_invalid_unicode(&numbers, str_buf); + encode_invalid_unicode(&lower_numbers, str_buf); + return Ok(data); + } + } } - // Every u16 outside of the surrogate ranges above is guaranteed - // to be a legal char. 
- n => char::from_u32(n as u32).unwrap(), + // Regular Unicode code points + n => match char::from_u32(n as u32) { + Some(ch) => ch, + None => { + // Handle invalid code points gracefully + encode_invalid_unicode(&numbers, str_buf); + return Ok(data); + } + }, }; str_buf.push(c); } @@ -158,14 +156,59 @@ fn parse_escaped_string<'a>( Ok(data) } +/// Parse a Unicode escape sequence and return the updated data slice +/// +/// This helper function handles both standard \uXXXX and extended \u{XXXX} formats, +/// extracting the hex digits into the provided buffer. +#[inline] +fn parse_unicode_escape<'a>( + mut data: &'a [u8], + idx: &mut usize, + numbers: &mut [u8; UNICODE_LEN], +) -> Result<&'a [u8], Error> { + if data.len() < UNICODE_LEN { + return Err(Error::Syntax( + ParseErrorCode::UnexpectedEndOfHexEscape, + *idx, + )); + } + // Handle \u{XXXX} format (with braces) + if data[0] == b'{' { + if data.len() < UNICODE_LEN + 2 { + return Err(Error::Syntax( + ParseErrorCode::UnexpectedEndOfHexEscape, + *idx, + )); + } + + numbers.copy_from_slice(&data[1..UNICODE_LEN + 1]); + if data[UNICODE_LEN + 1] != b'}' { + return Err(Error::Syntax( + ParseErrorCode::UnexpectedEndOfHexEscape, + *idx, + )); + } + + data = &data[UNICODE_LEN + 2..]; + *idx += UNICODE_LEN + 2; + } else { + // Standard \uXXXX format + numbers.copy_from_slice(&data[..UNICODE_LEN]); + data = &data[UNICODE_LEN..]; + *idx += UNICODE_LEN; + } + + Ok(data) +} + // https://datatracker.ietf.org/doc/html/rfc8259#section-8.2 // RFC8259 allow invalid Unicode #[inline] -fn encode_invalid_unicode(numbers: Vec, str_buf: &mut String) { +fn encode_invalid_unicode(numbers: &[u8], str_buf: &mut String) { str_buf.push('\\'); str_buf.push('u'); for n in numbers { - str_buf.push(n.into()); + str_buf.push((*n).into()); } } @@ -180,14 +223,214 @@ fn decode_hex_val(val: u8) -> Option { } #[inline] -fn decode_hex_escape(numbers: Vec, idx: &usize) -> Result { +fn decode_hex_escape(numbers: &[u8], idx: &usize) -> Result { let mut n = 0; for number in numbers { - if let Some(hex) = decode_hex_val(number) { + if let Some(hex) = decode_hex_val(*number) { n = (n << 4) + hex; } else { - return Err(Error::Syntax(ParseErrorCode::InvalidHex(number), *idx)); + return Err(Error::Syntax(ParseErrorCode::InvalidHex(*number), *idx)); } } Ok(n) } + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + use std::fmt::Write; + + #[test] + fn test_parse_string() { + // Test cases with expected results + let test_cases = vec![ + // Basic strings + ("hello", "hello"), + ("", ""), + ("123", "123"), + // Escaped characters + (r#"hello\nworld"#, "hello\nworld"), + (r#"\"\\\b\f\n\r\t"#, "\"\\\u{8}\u{c}\n\r\t"), + (r#"escaped \"quotes\""#, "escaped \"quotes\""), + (r#"forward\/slash"#, "forward/slash"), + // Unicode escapes - Basic + (r#"\u0041\u0042\u0043"#, "ABC"), + (r#"Unicode: \u00A9 \u00AE"#, "Unicode: © ®"), + // Unicode escapes - Braces syntax + (r#"\u{0041}\u{0042}\u{0043}"#, "ABC"), + (r#"Unicode: \u{00A9} \u{00AE}"#, "Unicode: © ®"), + // Unicode escapes - Surrogate pairs + (r#"\uD834\uDD1E"#, "𝄞"), // G-clef (musical symbol) + (r#"\u{D834}\u{DD1E}"#, "𝄞"), // Same with braces + // Mixed content + (r#"Mixed: \u0041\n\t\"test\""#, "Mixed: A\n\t\"test\""), + (r#"CJK: \u4E2D\u6587"#, "CJK: 中文"), + // Edge cases + (r#"\u007F"#, "\u{7F}"), // DEL character + (r#"\u0000"#, "\u{0}"), // NULL character + ]; + + // Run all test cases + for (input, expected) in test_cases { + let input_bytes = input.as_bytes(); + let mut idx = 0; + let result = 
parse_string(input_bytes, input_bytes.len(), &mut idx); + + assert!(result.is_ok(), "Failed to parse valid string: {}", input); + assert_eq!( + result.unwrap(), + expected, + "Incorrect parsing result for: {}", + input + ); + assert_eq!( + idx, + input_bytes.len(), + "Index not advanced correctly for: {}", + input + ); + } + + // Error cases + let error_cases = vec![ + // Invalid escape sequence + r#"\z"#, + // Incomplete Unicode escape + r#"\u123"#, + // Invalid hex in Unicode escape + r#"\uGHIJ"#, + ]; + + for input in error_cases { + let input_bytes = if let Ok(s) = std::str::from_utf8(input.as_ref()) { + s.as_bytes() + } else { + input.as_ref() + }; + let mut idx = 0; + let result = parse_string(input_bytes, input_bytes.len(), &mut idx); + assert!( + result.is_err(), + "Expected error for invalid input: {:?}", + input_bytes + ); + } + } + + proptest! { + /// Property-based test for parse_string using randomly generated strings + /// + /// This test generates: + /// 1. Regular ASCII strings + /// 2. Strings with escaped characters + /// 3. Strings with Unicode characters including CJK + /// 4. Strings with Unicode escape sequences + #[test] + fn proptest_parse_string( + // Generate regular ASCII strings + s1 in r#"[a-zA-Z0-9 ]{0,50}"#, + // Generate strings with standard JSON escape sequences + s2 in r#"(\\[\"\\\/bfnrt]){0,10}"#, + // Generate Unicode characters including CJK + s3 in prop::collection::vec(prop::char::range('\u{0020}', '\u{FFFF}'), 0..20).prop_map(|chars| chars.into_iter().collect::()), + // Generate valid Unicode escape sequences + s4 in prop::collection::vec(0u16..0xD800, 0..5).prop_map(|nums| { + nums.into_iter() + .fold(String::new(), |mut output, b| { + let _ = write!(output, r#"\u{:04X}"#, b); + output + }) + }), + // Generate valid Unicode surrogate pairs + s5 in prop::collection::vec((0xD800u16..0xDC00, 0xDC00u16..0xE000), 0..3).prop_map(|pairs| { + pairs.into_iter() + .fold(String::new(), |mut output, (high, low)| { + let _ = write!(output, r#"\u{:04X}\u{:04X}"#, high, low); + output + }) + }), + ) { + // Combine all generated strings + let combined = format!("{}{}{}{}{}", s1, s2, s3, s4, s5); + + // Skip empty strings as they're already tested in the unit tests + prop_assume!(!combined.is_empty()); + + // Convert to a properly escaped JSON string + let json_string = serde_json::to_string(&combined).unwrap(); + // Remove the surrounding quotes that serde_json adds + let json_content = &json_string[1..json_string.len()-1]; + + // Parse the string using our function + let input_bytes = json_content.as_bytes(); + let mut idx = 0; + let result = parse_string(input_bytes, input_bytes.len(), &mut idx); + + // Verify parsing succeeded and produced the expected result + prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content); + prop_assert_eq!(result.unwrap(), combined, "Incorrect parsing result"); + prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly"); + } + + /// Property-based test for parse_string with focus on edge cases + /// + /// This test specifically targets edge cases like: + /// 1. Strings with many escape sequences + /// 2. Very long strings + /// 3. 
Strings with complex Unicode patterns + #[test] + fn proptest_parse_string_edge_cases( + // Generate strings with many escape sequences + heavy_escapes in prop::collection::vec( + prop::sample::select(vec![r#"\\"#, r#"\""#, r#"\n"#, r#"\t"#, r#"\b"#, r#"\f"#, r#"\r"#, r#"\/"#, r#"\u0020"#, r#"\u00A9"#]), + 1..100 + ).prop_map(|v| v.join("")), + + // Generate long regular strings + long_string in r#"[a-zA-Z0-9 ]{100,500}"#, + + // Generate strings with repeating Unicode patterns + unicode_pattern in prop::collection::vec( + prop::sample::select(vec![ + // ASCII + "ABC", + // Emoji + "😀😁😂", + // CJK + "中文日本語", + // Mixed scripts + "Latin Кириллица العربية", + // Unicode escapes + r#"\u0041\u0042\u0043"#, + // Surrogate pairs + r#"\uD834\uDD1E\uD834\uDD1F"# + ]), + 1..10 + ).prop_map(|v| v.join("")), + ) { + // Test each generated string separately + for test_str in [heavy_escapes, long_string, unicode_pattern] { + // Skip empty strings + if test_str.is_empty() { + continue; + } + + // Convert to a properly escaped JSON string + let json_string = serde_json::to_string(&test_str).unwrap(); + // Remove the surrounding quotes + let json_content = &json_string[1..json_string.len()-1]; + + // Parse the string + let input_bytes = json_content.as_bytes(); + let mut idx = 0; + let result = parse_string(input_bytes, input_bytes.len(), &mut idx); + + // Verify parsing + prop_assert!(result.is_ok(), "Failed to parse valid string: {}", json_content); + prop_assert_eq!(result.unwrap(), test_str, "Incorrect parsing result"); + prop_assert_eq!(idx, input_bytes.len(), "Index not advanced correctly"); + } + } + } +} diff --git a/tests/it/parser.rs b/tests/it/parser.rs index 9313fc1..b3fa1c3 100644 --- a/tests/it/parser.rs +++ b/tests/it/parser.rs @@ -48,10 +48,17 @@ fn test_parse_standard_ok(tests: Vec<(&str, Value<'_>)>) { fn test_parse_null() { test_parse_err(&[ ("n", "EOF while parsing a value, pos 1"), - ("nul", "EOF while parsing a value, pos 3"), + ("nul", "expected ident, pos 2"), ("nulla", "trailing characters, pos 5"), ]); + test_parse_ok(vec![("null", Value::Null)]); + // Extended JSON null syntax, allow uppercase letters and empty string is treated as NULL + test_parse_ok(vec![ + (" ", Value::Null), + ("NULL", Value::Null), + ("Null", Value::Null), + ]); } #[test] @@ -71,6 +78,13 @@ fn test_parse_boolean() { ("false", Value::Bool(false)), (" false ", Value::Bool(false)), ]); + // Extended JSON boolean syntax, allow uppercase letters + test_parse_ok(vec![ + ("TRUE", Value::Bool(true)), + (" True ", Value::Bool(true)), + ("FALSE", Value::Bool(false)), + (" falSE ", Value::Bool(false)), + ]); } #[test] @@ -79,7 +93,6 @@ fn test_parse_number_errors() { ("+", "invalid number, pos 1"), (".", "invalid number, pos 1"), ("-", "invalid number, pos 1"), - ("0x80", "trailing characters, pos 2"), ("\\0", "expected value, pos 1"), ("1.a", "trailing characters, pos 3"), ("1e", "invalid number, pos 2"), @@ -88,8 +101,8 @@ fn test_parse_number_errors() { ]); test_parse_standard_err(&[ - ("+", "invalid number, pos 1"), - (".", "invalid number, pos 1"), + ("+", "expected value, pos 1"), + (".", "expected value, pos 1"), ("-", "invalid number, pos 1"), ("0x80", "trailing characters, pos 2"), ("\\0", "expected value, pos 1"), @@ -98,9 +111,9 @@ fn test_parse_number_errors() { ("1e+", "invalid number, pos 3"), ("1a", "trailing characters, pos 2"), // Extended JSON number syntax return error in standard mode - ("+1", "invalid number, pos 1"), + ("+1", "expected value, pos 1"), ("00", "invalid number, pos 2"), - (".0", 
"invalid number, pos 1"), + (".0", "expected value, pos 1"), ("0.", "invalid number, pos 3"), ("1.", "invalid number, pos 3"), ("1.e1", "invalid number, pos 3"), @@ -281,6 +294,18 @@ fn test_parse_f64() { ("0.", Value::Number(Number::UInt64(0))), ("1.", Value::Number(Number::UInt64(1))), ("1.e1", Value::Number(Number::Float64(10.0))), + ("nan", Value::Number(Number::Float64(f64::NAN))), + ("+infinity", Value::Number(Number::Float64(f64::INFINITY))), + ("INFINITY", Value::Number(Number::Float64(f64::INFINITY))), + ( + "-INFINITY", + Value::Number(Number::Float64(f64::NEG_INFINITY)), + ), + ("0xdecaf", Value::Number(Number::UInt64(912559))), + ( + "0xdecaf.124", + Value::Number(Number::Float64(912559.0712890625)), + ), ]; test_parse_ok(extended_tests); } @@ -344,6 +369,10 @@ fn test_parse_string() { ), (r#""⚠\u{fe0f}""#, Value::String(Cow::from("⚠\u{fe0f}"))), ]); + + // Extended JSON string syntax + let extended_tests = vec![("'abcd'", Value::String(Cow::from("abcd")))]; + test_parse_ok(extended_tests); } #[test] @@ -448,7 +477,7 @@ fn test_parse_object() { test_parse_err(&[ ("{", "EOF while parsing a value, pos 1"), ("{ ", "EOF while parsing a value, pos 2"), - ("{1", "key must be a string, pos 2"), + ("{1", "object attribute name cannot be a number, pos 2"), ("{ \"a\"", "EOF while parsing a value, pos 5"), ("{\"a\"", "EOF while parsing a value, pos 4"), ("{\"a\" ", "EOF while parsing a value, pos 5"), @@ -478,17 +507,28 @@ fn test_parse_object() { obj4.insert("c".to_string(), Value::Null); let mut obj5 = Object::new(); obj5.insert("d".to_string(), Value::Number(Number::UInt64(5))); + let mut obj6 = Object::new(); + obj6.insert("_test123中文".to_string(), Value::Number(Number::UInt64(6))); test_parse_ok(vec![ (r#"{}"#, Value::Object(Object::new())), (r#"{ }"#, Value::Object(Object::new())), (r#"{"a":3}"#, Value::Object(obj1.clone())), - (r#"{ "a" : 3 }"#, Value::Object(obj1)), + (r#"{ "a" : 3 }"#, Value::Object(obj1.clone())), (r#"{"a":3,"b":4}"#, Value::Object(obj2.clone())), (r#" { "a" : 3 , "b" : 4 } "#, Value::Object(obj2)), - (r#"{"a": {"b": 3, "c": 4}}"#, Value::Object(obj3)), + (r#"{"a": {"b": 3, "c": 4}}"#, Value::Object(obj3.clone())), (r#"{"c":null}"#, Value::Object(obj4)), (r#"{\t\n\r "d": 5}"#, Value::Object(obj5.clone())), - (r#"{ \x0C "d": 5}"#, Value::Object(obj5)), + (r#"{ \x0C "d": 5}"#, Value::Object(obj5.clone())), ]); + + // Extended JSON string syntax + let extended_tests = vec![ + ("{'a':3}", Value::Object(obj1)), + ("{a:{b:3, c:4}}", Value::Object(obj3)), + ("{d:5}", Value::Object(obj5)), + ("{_test123中文 :6}", Value::Object(obj6)), + ]; + test_parse_ok(extended_tests); }