Open
Description
AB#1117209
This is the API proposal for Utf8String, an immutable, heap-allocated representation of UTF-8 string data. See dotnet/corefxlab#2368 for the scenarios and design philosophy behind this proposal.
Also included are APIs to improve text processing across the framework as a whole, including changes to existing types like String and CultureInfo.
// n.b. System namespace
namespace System
{
// New APIs added to System.String
public sealed partial class String
{
    // Scalar lookup by position; declaration-only stub.
    // NOTE(review): presumably 'index' is a UTF-16 code-unit offset - confirm against the implementation.
    public UnicodeScalar GetScalarAt(int index) => throw null;

    // Try-pattern counterpart of GetScalarAt: reports the outcome via the bool return
    // and hands the scalar back through 'value'.
    public bool TryGetScalarAt(int index, out UnicodeScalar value) => throw null;
}
// Represents a string whose internal representation consists of UTF-8 subsequences.
// Like the String class, developers are *strongly discouraged* from creating instances of
// this type that have invalid UTF-8 subsequences (and our APIs try to encourage good hygiene
// in this regard), but instances of this type are *not guaranteed* to consist only of well-
// formed UTF-8 subsequences. The APIs hanging off this type have well-defined, predictable
// behavior regardless of whether the UTF-8 string contains invalid subsequences.
//
// The class isn't directly enumerable (it does expose a code-unit indexer), instead relying on
// the developer to go through one of the AsBytes / AsSpan / GetScalars APIs.
//
// Whenever length / index / offset / count / etc. occurs in these APIs, it's in terms of number
// of Utf8Char elements. (Or, "byte length" if you prefer.)
public sealed unsafe class Utf8String : IEquatable<Utf8String>
{
    /*
     * CONSTRUCTORS
     * All public ctors are validating ctors.
     * Complexity is O(n) for memcpy and O(n) for validation.
     * Behavior given invalid input: bad sequences replaced with U+FFFD.
     * Scroll down further in the file for static factories that suppress validation.
     */

    // For null-terminated UTF-8 and UTF-16 sequences.
    // If not null-terminated, wrap (ptr, length) in a Span and call the Span-based ctors.
    // (No per-member 'unsafe' modifier is needed: the enclosing class is already 'unsafe'.)
    public Utf8String(byte* value) { }
    public Utf8String(char* value) { }

    // For non null-terminated UTF-8 and UTF-16 sequences.
    public Utf8String(ReadOnlySpan<byte> value) { }
    public Utf8String(ReadOnlySpan<char> value) { }
    public Utf8String(byte[] value, int startIndex, int length) { }
    public Utf8String(char[] value, int startIndex, int length) { }

    // For discoverability / ease of use, equivalent to ROS<char>-based ctor
    public Utf8String(String value) { }

    // No Utf8String(ReadOnlySpan<Utf8Char>) or similar ctor due to complexity of plumbing this through
    // the VM, but can call ROS<Utf8Char>.ToUtf8String() extension method for now as workaround.

    /*
     * COMPARISON
     * All equality / comparison methods which don't explicitly take a StringComparison
     * are ordinal by default. This differs slightly from System.String but is self-consistent
     * within the Utf8String class.
     */
    public static bool operator ==(Utf8String a, Utf8String b) => throw null;
    public static bool operator !=(Utf8String a, Utf8String b) => throw null;

    /*
     * PROJECTION
     * n.b. No implicit or explicit cast from Utf8String <-> String.
     * Reason for this is that the cast would have O(n) complexity, which would be
     * potentially surprising for developers. Use ToString() / ToUtf8String() instead.
     */
    public static implicit operator ReadOnlySpan<Utf8Char>(Utf8String value) => throw null;

    // static readonly field, not property or const, to match String.Empty
    public static readonly Utf8String Empty;

    // Length (in UTF-8 code units)
    public int Length { get => throw null; }

    // Indexer (in UTF-8 code units)
    // Returns 'ref readonly' since enables more scenarios for callers
    public ref readonly Utf8Char this[int index] => throw null;

    /*
     * CONCAT
     * This set of overloads may change based on how language and compiler support for '+' works
     * with Utf8String instances, including whether struct-based builder types come online.
     * Let's go with this for now pending how those other features shake out.
     */
    public static Utf8String Concat(Utf8String str0, Utf8String str1) => throw null;
    public static Utf8String Concat(Utf8String str0, Utf8String str1, Utf8String str2) => throw null;
    public static Utf8String Concat(Utf8String str0, Utf8String str1, Utf8String str2, Utf8String str3) => throw null;

    // Contains: overloads which don't take a StringComparison assume Ordinal.
    public bool Contains(char value) => throw null;
    public bool Contains(char value, StringComparison comparisonType) => throw null;
    public bool Contains(Utf8String value) => throw null;
    public bool Contains(Utf8String value, StringComparison comparisonType) => throw null;
    public bool Contains(UnicodeScalar value) => throw null;
    public bool Contains(UnicodeScalar value, StringComparison comparisonType) => throw null;

    // CopyTo: copies this instance's code units into a caller-supplied buffer.
    public void CopyTo(Span<byte> destination) => throw null;
    public void CopyTo(Span<Utf8Char> destination) => throw null;

    // The static factories below allow the developer to control how (or even whether) validation is performed
    // against the incoming data. Developers should exercise caution when calling this API with the "allow invalid
    // data" flag, taking into account such considerations as:
    // (a) whether the input came from a trustworthy source,
    // (b) which component the constructed instance will be passed to, and
    // (c) the behavior such component might exhibit if faced with invalid sequences.
    //
    // As an example of such a scenario that requires further scrutiny, consider a forum that allows users to
    // sign up for new accounts and post messages. Forum administrators use a web interface to perform such
    // tasks as deleting abusive accounts, moving messages, and so forth. If a malicious user attempts to sign
    // up with a username that contains an invalid UTF-8 sequence, and if such sequence round-trips through the
    // messages database, the username that appears in the page's HTML (as a string) might be different than the
    // username that actually exists in the database (as an arbitrary byte sequence). One potential consequence
    // of this is that if such user starts posting abusive messages, admins will be powerless to do anything via
    // the web interface since the "delete account" API will return "user does not exist", instead requiring the
    // IT administrator to go directly to the database and purge the abuser's account.
    public static Utf8String Create(ReadOnlySpan<byte> value, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.ReplaceInvalidSequence) => throw null;
    public static Utf8String Create(ReadOnlySpan<Utf8Char> value, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.ReplaceInvalidSequence) => throw null;
    // NOTE(review): this byte-based Create<TState> overload coexists with CreateFromBytes below and appears to
    // reintroduce the untyped-lambda ambiguity that the rename was meant to avoid - confirm which one should stay.
    public static Utf8String Create<TState>(int length, TState state, System.Buffers.SpanAction<byte, TState> action, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.ReplaceInvalidSequence) => throw null;

    // "CreateFromBytes" is renamed so that type inference doesn't fail if the developer
    // passes an untyped lambda as the third parameter. O(n) for memcpy + O(n) for validation.
    // Behavior given invalid input: fixes up invalid sequences on-the-fly.
    public static Utf8String Create<TState>(int length, TState state, System.Buffers.SpanAction<Utf8Char, TState> action) => throw null;
    public static Utf8String CreateFromBytes<TState>(int length, TState state, System.Buffers.SpanAction<byte, TState> action) => throw null;

    // EndsWith: for simplicity, only implemented as Ordinal for now.
    public bool EndsWith(char value) => throw null;
    public bool EndsWith(UnicodeScalar value) => throw null;
    public bool EndsWith(Utf8String value) => throw null;

    /*
     * EQUALS
     * The Equals(object) overload only matches Utf8String, not String.
     *
     * OPEN QUESTION: Do we need an Equals(Utf8String, String) overload? The performance of that method could be
     * somewhat rough and might involve a transcoding operation, which may surprise the developer.
     *
     * When transcoding is required, comparison is by ordinal scalar, and invalid subsequences immediately return failure.
     * Example: the UTF-8 string [ C1 80 ] will *never* match any UTF-16 string.
     */
    public override bool Equals(object obj) => throw null;
    public bool Equals(Utf8String value) => throw null;
    public static bool Equals(Utf8String a, Utf8String b) => throw null;
    public static bool Equals(Utf8String a, Utf8String b, StringComparison comparisonType) => throw null;

    // GetHashCode() must be overridden alongside Equals(object) and operator ==/!= so that equal
    // instances hash equally (otherwise the compiler reports CS0659 / CS0661).
    // NOTE(review): presumably ordinal, matching the COMPARISON note above - confirm.
    public override int GetHashCode() => throw null;
    public int GetHashCode(StringComparison comparisonType) => throw null;
    public static int GetHashCode(ReadOnlySpan<byte> value) => throw null;
    public static int GetHashCode(ReadOnlySpan<byte> value, StringComparison comparisonType) => throw null;
    public static int GetHashCode(ReadOnlySpan<Utf8Char> value) => throw null;
    public static int GetHashCode(ReadOnlySpan<Utf8Char> value, StringComparison comparisonType) => throw null;

    // Used for pinning. Typed as 'byte' instead of 'Utf8Char' because the scenario for calling this
    // is p/invoke, and we don't want to require a reinterpret_cast.
    [System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
    public ref readonly byte GetPinnableReference() => throw null;

    // Scalar access by index (index is in UTF-8 code units, like every other index on this type).
    public UnicodeScalar GetScalarAt(int index) => throw null;
    public bool TryGetScalarAt(int index, out UnicodeScalar scalar) => throw null;

    // GetStream: Returns a read-only Stream which wraps this instance. Useful for networking and other i/o scenarios.
    // ** OPEN QUESTION ** Should we ditch this and simply have a ReadOnlyMemory<byte>.GetStream() extension method?
    public System.IO.Stream GetStream() => throw null;

    // Literal: A stopgap measure to support literal UTF-8 values until we get first-class compiler support.
    // Syntax is Utf8String theString = Utf8String.Literal("I am a literal string.");
    // JIT will special-case this call and optimize it just as it would've done with a String literal.
    public static Utf8String Literal(string value) => throw null;

    // IndexOf / LastIndexOf: Ordinal for simplicity for now.
    public int IndexOf(char value) => throw null;
    public int IndexOf(char value, int startIndex) => throw null;
    public int IndexOf(char value, int startIndex, int count) => throw null;
    public int IndexOf(UnicodeScalar value) => throw null;
    public int IndexOf(UnicodeScalar value, int startIndex) => throw null;
    public int IndexOf(UnicodeScalar value, int startIndex, int count) => throw null;
    public int IndexOf(Utf8String value) => throw null;
    public int IndexOf(Utf8String value, int startIndex) => throw null;
    public int IndexOf(Utf8String value, int startIndex, int count) => throw null;
    public int LastIndexOf(char value) => throw null;
    public int LastIndexOf(char value, int startIndex) => throw null;
    public int LastIndexOf(char value, int startIndex, int count) => throw null;
    public int LastIndexOf(UnicodeScalar value) => throw null;
    public int LastIndexOf(UnicodeScalar value, int startIndex) => throw null;
    public int LastIndexOf(UnicodeScalar value, int startIndex, int count) => throw null;
    public int LastIndexOf(Utf8String value) => throw null;
    public int LastIndexOf(Utf8String value, int startIndex) => throw null;
    public int LastIndexOf(Utf8String value, int startIndex, int count) => throw null;

    // Null / empty / whitespace checks, for Utf8String instances and for raw spans.
    public static bool IsNullOrEmpty(Utf8String value) => throw null;
    public static bool IsNullOrWhiteSpace(Utf8String value) => throw null;
    public static bool IsEmptyOrWhiteSpace(ReadOnlySpan<byte> value) => throw null;
    public static bool IsEmptyOrWhiteSpace(ReadOnlySpan<Utf8Char> value) => throw null;

    // Replace: Ordinal only for now for simplicity.
    public Utf8String Replace(Utf8String oldValue, Utf8String newValue) => throw null;

    // n.b. Utf8String.Split returns its results in an array, just like String.Split. There will be non-allocating
    // Split APIs hanging off of ROM<Utf8Char> / ROS<Utf8Char> and other types for more advanced use cases.
    public Utf8String[] Split(UnicodeScalar separator) => throw null;
    public Utf8String[] Split(UnicodeScalar separator, int count) => throw null;
    public Utf8String[] Split(UnicodeScalar separator, int count, StringSplitOptions options) => throw null;
    public Utf8String[] Split(ReadOnlySpan<UnicodeScalar> separator) => throw null;
    public Utf8String[] Split(ReadOnlySpan<UnicodeScalar> separator, int count) => throw null;
    public Utf8String[] Split(ReadOnlySpan<UnicodeScalar> separator, int count, StringSplitOptions options) => throw null;
    public Utf8String[] Split(Utf8String separator) => throw null;
    public Utf8String[] Split(Utf8String separator, int count) => throw null;
    public Utf8String[] Split(Utf8String separator, int count, StringSplitOptions options) => throw null;

    // StartsWith
    public bool StartsWith(UnicodeScalar value) => throw null;
    public bool StartsWith(Utf8String value, StringComparison comparisonType) => throw null;

    // The natural way to use Substring is first to call IndexOf(...), then to substring on the index
    // that is returned. Since the parameter passed to IndexOf is generally a literal or some other value
    // under the developer's control, this means that the natural way of calling Substring shouldn't
    // inadvertently lead to splitting the string in the middle of a UTF-8 sequence. (This same argument
    // holds for the String class.)
    //
    // If the developer wants to go out of their way to substring a valid string in such a way that the
    // result is invalid UTF-8, we won't stop them.
    public Utf8String Substring(int startIndex) => throw null;
    public Utf8String Substring(int startIndex, int length) => throw null;

    // No ToLower() method - method name contains 'invariant' or culture must be specified
    public Utf8String ToLowerInvariant() => throw null;
    public Utf8String ToLower(System.Globalization.CultureInfo culture) => throw null;
    public override string ToString() => throw null;
    public Utf8String ToUpperInvariant() => throw null;
    public Utf8String ToUpper(System.Globalization.CultureInfo culture) => throw null;

    // Trim: only trims whitespace (not arbitrary characters) for now for simplicity.
    public Utf8String Trim() => throw null;
    public Utf8String TrimEnd() => throw null;
    public Utf8String TrimStart() => throw null;

    // IsWellFormed: Determines whether a given input is well-formed UTF-8
    public static bool IsWellFormed(Utf8String value) => throw null;
    public static bool IsWellFormed(ReadOnlySpan<byte> span) => throw null;
    public static bool IsWellFormed(ReadOnlySpan<Utf8Char> span) => throw null;
}
// New APIs added to System.MemoryExtensions
public static partial class MemoryExtensions
{
    // Utf8String -> ROS<Utf8Char> (textual view) or ROS<byte> (binary view).
    public static ReadOnlySpan<Utf8Char> AsSpan(this Utf8String text) => throw null;
    public static ReadOnlySpan<Utf8Char> AsSpan(this Utf8String text, int start) => throw null;
    public static ReadOnlySpan<Utf8Char> AsSpan(this Utf8String text, int start, int length) => throw null;

    public static ReadOnlySpan<byte> AsBytes(this Utf8String value) => throw null;
    public static ReadOnlySpan<byte> AsBytes(this Utf8String value, int start) => throw null;
    public static ReadOnlySpan<byte> AsBytes(this Utf8String value, int start, int length) => throw null;

    // {ReadOnly}Span<Utf8Char> -> ReadOnlySpan<byte>.
    //
    // This projection is deliberately one-way. Treating UTF-8 text as binary data in order to bit-blast
    // it across i/o is fine, but developers should not be nudged toward treating arbitrary incoming binary
    // data as structured UTF-8 text. To go the other direction, either use an API such as Utf8Parser that
    // is designed to accept ROS<byte> input (avoiding the T/U conversion altogether), or use an unsafe API
    // such as MemoryMarshal.Cast to move back and forth between the two representations.
    public static ReadOnlySpan<byte> AsBytes(this Span<Utf8Char> value) => throw null;
    public static ReadOnlySpan<byte> AsBytes(this ReadOnlySpan<Utf8Char> value) => throw null;

    // Utf8String -> ROM<Utf8Char> or ROM<byte>.
    // With this change ROM<T> can be backed by any one of four things:
    //   1. T[],
    //   2. MemoryManager<T>,
    //   3. String (only if T = char), or
    //   4. Utf8String (only if T = byte or T = Utf8Char) ** NEW **
    //
    // As a consequence the ReadOnlyMemory<byte>.Span property getter has to check for Utf8String in addition
    // to byte[]. Based on the results in https://github.com/dotnet/coreclr/pull/20386 a 5%-ish performance
    // regression is expected in the ROM<byte>.Span getter, but because that code path has been optimized so
    // aggressively in recent weeks the extra check shouldn't be very impactful.
    //
    // Unlike ROS<Utf8Char> -> ROS<byte>, ROM<Utf8Char> and ROM<byte> *cannot* be converted back and forth.
    // The caller is expected to know which T representation (text or binary?) applies to their scenario and
    // to call the matching AsMemory API. This mirrors existing ROM<T> behavior: for example, reinterpret
    // casting between Span<ushort> and Span<char> is valid (and APIs are provided for it), but reinterpret
    // casting between Memory<ushort> and Memory<char> is forbidden.
    public static ReadOnlyMemory<Utf8Char> AsMemory(this Utf8String text) => throw null;
    public static ReadOnlyMemory<Utf8Char> AsMemory(this Utf8String text, int start) => throw null;
    public static ReadOnlyMemory<Utf8Char> AsMemory(this Utf8String text, int start, int length) => throw null;

    public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text) => throw null;
    public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start) => throw null;
    public static ReadOnlyMemory<byte> AsMemoryBytes(this Utf8String text, int start, int length) => throw null;
}
}
namespace System.Runtime.InteropServices
{
    // Additions to the existing System.Runtime.InteropServices.MemoryMarshal type.
    public static partial class MemoryMarshal
    {
        // Utf8String counterparts of the existing TryGetString method, with the same behavior:
        // one overload for the binary (byte) view and one for the textual (Utf8Char) view.
        public static bool TryGetUtf8String(ReadOnlyMemory<byte> memory, out Utf8String text, out int start, out int length) => throw null;
        public static bool TryGetUtf8String(ReadOnlyMemory<Utf8Char> memory, out Utf8String text, out int start, out int length) => throw null;
    }
}
namespace System.Runtime.CompilerServices
{
    // New APIs added to System.Runtime.CompilerServices.RuntimeHelpers.
    // Declared 'partial' (like the other "new APIs added to ..." types in this proposal) because
    // RuntimeHelpers already exists; a non-partial declaration would clash with it.
    public static partial class RuntimeHelpers
    {
        // For compiler use, provides support for constant (literal) Utf8String values
        public static Utf8String GetUtf8StringLiteral(string s) => throw null;
    }
}
namespace System.Text
{
// Represents the fundamental elemental type of UTF-8 textual data and is distinct
// from System.Byte, similar to how System.Char is the fundamental elemental type
// of UTF-16 textual data and is distinct from System.UInt16.
//
// Ideally the compiler would support various syntaxes for this, like:
// Utf8Char theChar = 63; // Implicit assignment of const to local of type Utf8Char
public readonly struct Utf8Char : IComparable<Utf8Char>, IEquatable<Utf8Char>
{
    private readonly int _dummy;

    // IComparable<T> / IEquatable<T> / object members.
    public int CompareTo(Utf8Char other) => throw null;
    public bool Equals(Utf8Char other) => throw null;
    public override bool Equals(object obj) => throw null;
    public override int GetHashCode() => throw null;
    public override string ToString() => throw null;

    // Equality and relational operators.
    public static bool operator ==(Utf8Char a, Utf8Char b) => throw null;
    public static bool operator !=(Utf8Char a, Utf8Char b) => throw null;
    public static bool operator <(Utf8Char a, Utf8Char b) => throw null;
    public static bool operator >(Utf8Char a, Utf8Char b) => throw null;
    public static bool operator <=(Utf8Char a, Utf8Char b) => throw null;
    public static bool operator >=(Utf8Char a, Utf8Char b) => throw null;

    // Instances are constructed by casting. Every cast is overflow-checked but not
    // correctness-checked: casting -1 fails with an OverflowException, whereas casting 0xFF
    // succeeds even though 0xFF is never a valid UTF-8 code unit.
    //
    // The Byte -> Utf8Char cast can never overflow, yet it is still explicit: devs shouldn't
    // get into the habit of treating arbitrary integral types as equivalent to textual data
    // types. The compiler already behaves this way for Byte -> Char, which is a widening
    // operation but has only an explicit cast, not an implicit one.
    public static explicit operator Utf8Char(byte value) => throw null;
    public static explicit operator Utf8Char(ushort value) => throw null;
    public static explicit operator Utf8Char(uint value) => throw null;
    public static explicit operator Utf8Char(ulong value) => throw null;
    public static explicit operator Utf8Char(sbyte value) => throw null;
    public static explicit operator Utf8Char(short value) => throw null;
    public static explicit operator Utf8Char(int value) => throw null;
    public static explicit operator Utf8Char(long value) => throw null;
    public static explicit operator Utf8Char(char value) => throw null;

    // Casts out to the primitive integral types are implicit ...
    public static implicit operator byte(Utf8Char value) => throw null;
    public static implicit operator ushort(Utf8Char value) => throw null;
    public static implicit operator uint(Utf8Char value) => throw null;
    public static implicit operator ulong(Utf8Char value) => throw null;
    public static implicit operator short(Utf8Char value) => throw null;
    public static implicit operator int(Utf8Char value) => throw null;
    public static implicit operator long(Utf8Char value) => throw null;

    // ... with two explicit exceptions:
    //  - SByte, because the cast could result in an OverflowException.
    //  - Char, for the same reason the Byte-to-Utf8Char cast is explicit.
    public static explicit operator sbyte(Utf8Char value) => throw null;
    public static explicit operator char(Utf8Char value) => throw null;
}
// Represents a Unicode scalar value ([ U+0000..U+D7FF ], inclusive; or [ U+E000..U+10FFFF ], inclusive).
// This type's ctors are guaranteed to validate the input, and consumers can call the APIs assuming
// that the input is well-formed.
//
// This type's ctors validate, but that shouldn't be a terrible imposition because very few components
// are going to need to create instances of this type. UnicodeScalar instances will almost always be
// created as a result of enumeration over a UTF-8 or UTF-16 sequence, or instances will be created
// by the compiler from known good constants in source. In both cases validation can be elided, which
// means that there's *no runtime check at all* - not in the ctors nor in the instance methods hanging
// off this type. This gives improved performance over APIs which require the consumer to call an
// IsValid method before operating on instances of this type, and it means that we can get away without
// potentially expensive branching logic in many of our property getters.
public readonly partial struct UnicodeScalar : IComparable<UnicodeScalar>, IEquatable<UnicodeScalar>
{
    private readonly int _dummyPrimitive;

    // U+FFFD
    public static UnicodeScalar ReplacementChar => throw null;

    // Constructors: each one throws when the argument is out of range.
    public UnicodeScalar(char ch) { throw null; } // UTF-16 code unit; must not be a surrogate
    public UnicodeScalar(int scalarValue) { throw null; }
    public UnicodeScalar(uint scalarValue) { throw null; }

    // Try pattern: on failure yields (false, default(UnicodeScalar)).
    public static bool TryCreate(char value, out UnicodeScalar result) => throw null;
    public static bool TryCreate(int value, out UnicodeScalar result) => throw null;
    public static bool TryCreate(uint value, out UnicodeScalar result) => throw null;

    // The value is checked, so the conversion operators are explicit.
    public static explicit operator UnicodeScalar(char value) => throw null;
    public static explicit operator UnicodeScalar(uint value) => throw null;
    public static explicit operator UnicodeScalar(int value) => throw null;

    // Equality and relational operators.
    public static bool operator ==(UnicodeScalar a, UnicodeScalar b) => throw null;
    public static bool operator !=(UnicodeScalar a, UnicodeScalar b) => throw null;
    public static bool operator <(UnicodeScalar a, UnicodeScalar b) => throw null;
    public static bool operator <=(UnicodeScalar a, UnicodeScalar b) => throw null;
    public static bool operator >(UnicodeScalar a, UnicodeScalar b) => throw null;
    public static bool operator >=(UnicodeScalar a, UnicodeScalar b) => throw null;

    // Properties of the scalar value.
    public bool IsAscii => throw null;            // true iff Value <= 0x7F
    public bool IsBmp => throw null;              // true iff Value <= 0xFFFF
    public int Plane => throw null;               // 0 .. 16
    public int Utf16SequenceLength => throw null; // 1 .. 2
    public int Utf8SequenceLength => throw null;  // 1 .. 4
    public uint Value => throw null;

    // Tests whether an arbitrary integer is a valid Unicode scalar value.
    // Static rather than instance because 'this' is always assumed to be valid.
    public static bool IsValid(int value) => throw null;
    public static bool IsValid(uint value) => throw null;

    // IComparable<T> / IEquatable<T> / object members.
    public int CompareTo(UnicodeScalar other) => throw null;
    public override bool Equals(object obj) => throw null;
    public bool Equals(UnicodeScalar other) => throw null;
    public override int GetHashCode() => throw null;

    // Produce the scalar as a standalone UTF-16 string or standalone UTF-8 string,
    // or write it into a UTF-16 span or a UTF-8 span.
    public override string ToString() => throw null;
    public Utf8String ToUtf8String() => throw null;
    public int ToUtf16(Span<char> output) => throw null;
    public int ToUtf8(Span<Utf8Char> output) => throw null;

    // Analogs of the corresponding APIs on System.Char.
    public static double GetNumericValue(UnicodeScalar s) => throw null;
    public static System.Globalization.UnicodeCategory GetUnicodeCategory(UnicodeScalar s) => throw null;
    public static bool IsControl(UnicodeScalar s) => throw null;
    public static bool IsDigit(UnicodeScalar s) => throw null;
    public static bool IsLetter(UnicodeScalar s) => throw null;
    public static bool IsLetterOrDigit(UnicodeScalar s) => throw null;
    public static bool IsLower(UnicodeScalar s) => throw null;
    public static bool IsNumber(UnicodeScalar s) => throw null;
    public static bool IsPunctuation(UnicodeScalar s) => throw null;
    public static bool IsSeparator(UnicodeScalar s) => throw null;
    public static bool IsSymbol(UnicodeScalar s) => throw null;
    public static bool IsUpper(UnicodeScalar s) => throw null;
    public static bool IsWhiteSpace(UnicodeScalar s) => throw null;
    public static UnicodeScalar ToLowerInvariant(UnicodeScalar s) => throw null;
    public static UnicodeScalar ToLower(UnicodeScalar s, System.Globalization.CultureInfo culture) => throw null;
    public static UnicodeScalar ToUpperInvariant(UnicodeScalar s) => throw null;
    public static UnicodeScalar ToUpper(UnicodeScalar s, System.Globalization.CultureInfo culture) => throw null;
}
// Allows enumerating UnicodeScalar instances from an underlying UTF-16 or UTF-8 string or span.
//
// **OPEN QUESTION**
// It would be a lot easier if we just used UnicodeScalar directly as our elemental type of enumeration,
// but this has two important consequences. First, it's possible that developers may care about the StartIndex
// of the element (though 'foreach' really doesn't lend itself well to this). Second, if we return U+FFFD when
// we see an invalid sequence, callers won't be able to tell the difference between an invalid sequence and a
// valid sequence that really did read U+FFFD.
public static partial class UnicodeExtensions
{
    // First / last scalar of a UTF-16 span. The tuple carries the validity verdict, the scalar and
    // the length of the sequence that was examined.
    public static (SequenceValidity Validity, UnicodeScalar Scalar, int SequenceLength) GetFirstScalar(ReadOnlySpan<char> span) => throw null;
    public static (SequenceValidity Validity, UnicodeScalar Scalar, int SequenceLength) GetLastScalar(ReadOnlySpan<char> span) => throw null;

    // Same, for a UTF-8 span.
    public static (SequenceValidity Validity, UnicodeScalar Scalar, int SequenceLength) GetFirstScalar(ReadOnlySpan<Utf8Char> span) => throw null;
    public static (SequenceValidity Validity, UnicodeScalar Scalar, int SequenceLength) GetLastScalar(ReadOnlySpan<Utf8Char> span) => throw null;

    // Scalar enumeration entry points: heap-backed inputs first, then span inputs.
    public static StringScalarEnumerator GetScalars(string value) => throw null;
    public static Utf8StringScalarEnumerator GetScalars(Utf8String value) => throw null;
    public static CharSpanScalarEnumerator GetScalars(ReadOnlySpan<char> value) => throw null;
    public static Utf8CharSpanScalarEnumerator GetScalars(ReadOnlySpan<Utf8Char> value) => throw null;

    // Enumerator over a String. It is both the enumerable and the enumerator, so it can be used
    // directly in 'foreach' and also through the IEnumerable<T> / IEnumerator<T> interfaces.
    public struct StringScalarEnumerator
        : System.Collections.Generic.IEnumerable<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>
        , System.Collections.Generic.IEnumerator<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>
    {
        private int _dummy;

        // Pattern-based members used by 'foreach'.
        [System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
        public StringScalarEnumerator GetEnumerator() => throw null;
        public bool MoveNext() => throw null;
        public (UnicodeScalar? ScalarValue, int StartIndex, int Length) Current => throw null;

        // Explicit interface implementations.
        System.Collections.Generic.IEnumerator<(UnicodeScalar? ScalarValue, int StartIndex, int Length)> System.Collections.Generic.IEnumerable<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>.GetEnumerator() => throw null;
        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => throw null;
        object System.Collections.IEnumerator.Current => Current;
        void System.Collections.IEnumerator.Reset() { }
        void IDisposable.Dispose() { }
    }

    // Enumerator over a Utf8String; same shape as StringScalarEnumerator.
    public struct Utf8StringScalarEnumerator
        : System.Collections.Generic.IEnumerable<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>
        , System.Collections.Generic.IEnumerator<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>
    {
        private int _dummy;

        // Pattern-based members used by 'foreach'.
        [System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
        public Utf8StringScalarEnumerator GetEnumerator() => throw null;
        public bool MoveNext() => throw null;
        public (UnicodeScalar? ScalarValue, int StartIndex, int Length) Current => throw null;

        // Explicit interface implementations.
        System.Collections.Generic.IEnumerator<(UnicodeScalar? ScalarValue, int StartIndex, int Length)> System.Collections.Generic.IEnumerable<(UnicodeScalar? ScalarValue, int StartIndex, int Length)>.GetEnumerator() => throw null;
        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() => throw null;
        object System.Collections.IEnumerator.Current => Current;
        void System.Collections.IEnumerator.Reset() { }
        void IDisposable.Dispose() { }
    }

    // Enumerator over a ReadOnlySpan<char>. A ref struct, so it exposes only the
    // pattern-based 'foreach' members and implements no interfaces.
    public ref struct CharSpanScalarEnumerator
    {
        private int _dummy;

        [System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
        public CharSpanScalarEnumerator GetEnumerator() => throw null;
        public bool MoveNext() => throw null;
        public (UnicodeScalar? ScalarValue, int StartIndex, int Length) Current => throw null;
    }

    // Enumerator over a ReadOnlySpan<Utf8Char>; same shape as CharSpanScalarEnumerator.
    public ref struct Utf8CharSpanScalarEnumerator
    {
        private int _dummy;

        [System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
        public Utf8CharSpanScalarEnumerator GetEnumerator() => throw null;
        public bool MoveNext() => throw null;
        public (UnicodeScalar? ScalarValue, int StartIndex, int Length) Current => throw null;
    }
}
// When creating / transcoding UTF-8 data, controls what happens when an invalid sequence is detected
// in the input stream.
public enum InvalidSequenceBehavior
{
    // Stop at once and report failure. How the failure surfaces is up to the caller: a constructor
    // might throw an exception, whereas an OperationStatus-returning method might return InvalidData.
    Fail = 0,

    // Substitute U+FFFD for each invalid sequence. This is what Encoding.UTF8 does today, and it is
    // what the Unicode Consortium recommends for scenarios where the application should attempt to
    // continue.
    ReplaceInvalidSequence = 1,

    // "Garbage in - garbage out." Invalid sequences are passed through untouched and unvalidated.
    // Not every scenario accepts this option; for example, APIs which transcode between UTF-8 and
    // UTF-16 cannot use this value.
    LeaveUnchanged = 2
}
// Represents the validity of a UTF-8 or UTF-16 code unit sequence.
public enum SequenceValidity
{
    // Well-formed input: the sequence unambiguously represents a Unicode scalar value.
    // Examples:
    //   UTF-8 [ CE A9 ] is well-formed; it unambiguously represents the scalar value U+03A9.
    //   UTF-8 [ F2 AB B3 9E ] is well-formed; it unambiguously represents the scalar value U+ABCDE.
    Valid = 0,

    // Ill-formed input: the sequence does not correspond to a valid Unicode scalar value.
    // Examples:
    //   UTF-8 [ C0 ] is not well-formed.
    //   UTF-8 [ C2 20 ] is not well-formed.
    //   UTF-8 [ ED A0 80 ] is not well-formed.
    Invalid = 1,

    // Incomplete (or empty) input: not valid on its own, but possibly the start of a longer valid
    // sequence. The caller should supply more input data if available; if no further input data is
    // available, the sequence should be treated as not well-formed.
    // Examples:
    //   UTF-8 [ C2 ] is incomplete.
    //   UTF-8 [ F2 AB B3 ] is incomplete.
    Incomplete = 2
}
// APIs for fast transcoding of data between different UTF-* representations.
// These make up for some shortcomings in the Encoding class when dealing with streaming data, such as needing to know
// ahead of time how large the output buffer should be (which potentially requires two passes over the input).
public static partial class Transcode
{
    // There are byte and Utf8Char versions of the below APIs because we want to support the scenario where the
    // caller is writing directly to or reading directly from a network stream. APIs which are designed to be used
    // against any possible input (including potentially malicious or malformed input) and which properly perform
    // validation can be written in terms of byte for convenience.
    //
    // Each method reports how much input it consumed and how much output it produced through its two 'out'
    // parameters, and signals the outcome through the returned OperationStatus. 'behavior' selects what happens
    // when an invalid sequence is encountered and defaults to InvalidSequenceBehavior.Fail.
    //
    // Bodies are 'throw null' stubs like the rest of this proposal; a non-abstract, non-extern, non-partial
    // method must declare a body.

    // UTF-16 -> UTF-8
    public static System.Buffers.OperationStatus ToUtf8(ReadOnlySpan<char> source, Span<byte> destination, bool isFinalBlock, out int charsRead, out int bytesWritten, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.Fail) => throw null;
    public static System.Buffers.OperationStatus ToUtf8(ReadOnlySpan<char> source, Span<Utf8Char> destination, bool isFinalBlock, out int charsRead, out int utf8CharsWritten, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.Fail) => throw null;

    // UTF-8 -> UTF-16
    public static System.Buffers.OperationStatus FromUtf8(ReadOnlySpan<byte> source, Span<char> destination, bool isFinalBlock, out int bytesRead, out int charsWritten, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.Fail) => throw null;
    public static System.Buffers.OperationStatus FromUtf8(ReadOnlySpan<Utf8Char> source, Span<char> destination, bool isFinalBlock, out int utf8CharsRead, out int charsWritten, InvalidSequenceBehavior behavior = InvalidSequenceBehavior.Fail) => throw null;
}
}
Edits:
Nov. 8 - 9, 2018 - Updated API proposals in preparation for upcoming review.