Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 59 additions & 76 deletions tables.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,83 +9,66 @@ var propertyWidths = [5]int{
_Emoji: 2,
}

// asciiWidths is a lookup table for single-byte character widths. Printable
// ASCII characters have width 1, control characters have width 0.
// asciiWidths is a 256-bit bitmask stored as four uint64 words; bit i holds
// the width (0 or 1) of byte value i.
//
// It is intended for valid single-byte UTF-8, which means <128.
//
// If you look up an index >= 128, that is either:
// - invalid UTF-8, or
// - a multi-byte UTF-8 sequence, in which case you should be operating on
// the grapheme cluster, and not using this table
//
// We will return a default value of 1 in those cases, so as not to panic.
var asciiWidths = [256]int8{
// Control characters (0x00-0x1F): width 0
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Printable ASCII (0x20-0x7E): width 1
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// DEL (0x7F): width 0
0,
// >= 128
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// Layout (bit i of the table corresponds to byte value i):
// - asciiWidths[0]: bits 0-63 (bytes 0x00-0x3F)
// - asciiWidths[1]: bits 64-127 (bytes 0x40-0x7F)
// - asciiWidths[2]: bits 128-191 (bytes 0x80-0xBF, outside ASCII)
// - asciiWidths[3]: bits 192-255 (bytes 0xC0-0xFF, outside ASCII)
var asciiWidths = [4]uint64{
	// Word 0 holds bytes 0x00-0x3F. The low 32 bits (control characters
	// 0x00-0x1F) are clear; the high 32 bits (printable 0x20-0x3F) are set.
	0: 0xFFFF_FFFF_0000_0000,

	// Word 1 holds bytes 0x40-0x7F. Every bit is set except the top one,
	// which belongs to DEL (0x7F).
	1: 0x7FFF_FFFF_FFFF_FFFF,

	// Words 2 and 3 hold bytes 0x80-0xFF. Valid single-byte UTF-8 is always
	// below 128, so these words should not be consulted for well-formed
	// input. They exist so that every byte value has an entry, and every bit
	// is set so that such a lookup yields the fallback width of 1.
	2: ^uint64(0),
	3: ^uint64(0),
}

// asciiProperties is a lookup table for single-byte character properties.
// It is intended for valid single-byte UTF-8, which means <128.
//
// If you look up an index >= 128, that is either:
// - invalid UTF-8, or
// - a multi-byte UTF-8 sequence, in which case you should be operating on
// the grapheme cluster, and not using this table
//
// We will return a default value of _Default in those cases, so as not to
// panic.
var asciiProperties = [256]property{
// Control characters (0x00-0x1F): _Zero_Width
_Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width,
_Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width,
_Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width,
_Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width, _Zero_Width,
// Printable ASCII (0x20-0x7E): _Default
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default,
// DEL (0x7F): _Zero_Width
_Zero_Width,
// >= 128
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
_Default, _Default, _Default, _Default, _Default, _Default, _Default, _Default,
// asciiWidth reports the display width, 0 or 1, that the asciiWidths bitmask
// records for the byte b.
func asciiWidth(b byte) int {
	const bitsPerWord = 64

	// The top two bits of b choose one of the four 64-bit words.
	word := asciiWidths[b>>6]
	// The low six bits of b choose the bit inside that word.
	shift := b & (bitsPerWord - 1)

	return int(word>>shift) & 1
}

// asciiProperty reports the property of the byte b, derived from the same
// asciiWidths bitmask that asciiWidth reads.
func asciiProperty(b byte) property {
	// A set bit in asciiWidths means width 1 and a clear bit means width 0.
	// The property value is the opposite bit, which is only correct because
	// _Default happens to be 0 and _Zero_Width happens to be 1.

	// The top two bits of b choose the word; the low six choose the bit.
	word := asciiWidths[b>>6]
	widthBit := (word >> (b & 0x3F)) & 1

	// Flip the width bit to obtain the property.
	return property(widthBit ^ 1)
}

var (
// Compile-time guards for asciiProperty, which depends on _Default being 0
// and _Zero_Width being 1.
//
// Each guard indexes a one-element array, whose only valid index is 0.
// Assuming _Default and _Zero_Width are constants (their declarations are
// not in this part of the file), the compiler rejects any constant index
// outside that range, so a wrong value fails the build. The results are
// assigned to the blank identifier and never used.

// Index is _Default itself: in range only when _Default == 0.
_ = [1]int{}[_Default]

// Index is _Zero_Width-1:
//   - _Zero_Width == 0 gives index -1, out of bounds
//   - _Zero_Width == 1 gives index 0, the only valid index
//   - _Zero_Width > 1 gives an index >= 1, out of bounds
_ = [1]int{}[_Zero_Width-1]
)
114 changes: 114 additions & 0 deletions tables_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package displaywidth

import (
"testing"
)

// TestAsciiWidth checks asciiWidth against hand-picked bytes from every range
// encoded in the bitmask: control characters, printable ASCII, DEL, and
// bytes >= 128.
func TestAsciiWidth(t *testing.T) {
	tests := []struct {
		name     string // subtest name
		b        byte   // input byte
		expected int    // width asciiWidth must return
		desc     string // human-readable note shown on failure
	}{
		// Control characters (0x00-0x1F): width 0
		{"null", 0x00, 0, "NULL character"},
		{"bell", 0x07, 0, "BEL (bell)"},
		{"backspace", 0x08, 0, "BS (backspace)"},
		{"tab", 0x09, 0, "TAB"},
		{"newline", 0x0A, 0, "LF (newline)"},
		{"carriage return", 0x0D, 0, "CR (carriage return)"},
		{"escape", 0x1B, 0, "ESC (escape)"},
		{"last control", 0x1F, 0, "Last control character"},

		// Printable ASCII (0x20-0x7E): width 1
		{"space", 0x20, 1, "Space (first printable)"},
		{"exclamation", 0x21, 1, "!"},
		{"zero", 0x30, 1, "0"},
		{"nine", 0x39, 1, "9"},
		{"A", 0x41, 1, "A"},
		{"Z", 0x5A, 1, "Z"},
		{"a", 0x61, 1, "a"},
		{"z", 0x7A, 1, "z"},
		{"tilde", 0x7E, 1, "~ (last printable)"},

		// DEL (0x7F): width 0
		{"delete", 0x7F, 0, "DEL (delete)"},

		// >= 128: width 1 (default, though shouldn't be used for valid UTF-8)
		{"0x80", 0x80, 1, "First byte >= 128"},
		{"0xFF", 0xFF, 1, "Last byte value"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := asciiWidth(tt.b)
			if got != tt.expected {
				// %q prints the byte as an escaped character literal, so
				// control characters such as NUL, BEL or ESC never reach the
				// test output unescaped.
				t.Errorf("asciiWidth(0x%02X %q) = %d, want %d (%s)",
					tt.b, tt.b, got, tt.expected, tt.desc)
			}
		})
	}
}

// TestAsciiProperty checks asciiProperty against hand-picked bytes from every
// range encoded in the bitmask: control characters, printable ASCII, DEL, and
// bytes >= 128.
func TestAsciiProperty(t *testing.T) {
	tests := []struct {
		name     string   // subtest name
		b        byte     // input byte
		expected property // property asciiProperty must return
		desc     string   // human-readable note shown on failure
	}{
		// Control characters (0x00-0x1F): _Zero_Width (1)
		{"null", 0x00, _Zero_Width, "NULL character"},
		{"bell", 0x07, _Zero_Width, "BEL (bell)"},
		{"backspace", 0x08, _Zero_Width, "BS (backspace)"},
		{"tab", 0x09, _Zero_Width, "TAB"},
		{"newline", 0x0A, _Zero_Width, "LF (newline)"},
		{"carriage return", 0x0D, _Zero_Width, "CR (carriage return)"},
		{"escape", 0x1B, _Zero_Width, "ESC (escape)"},
		{"last control", 0x1F, _Zero_Width, "Last control character"},

		// Printable ASCII (0x20-0x7E): _Default (0)
		{"space", 0x20, _Default, "Space (first printable)"},
		{"exclamation", 0x21, _Default, "!"},
		{"zero", 0x30, _Default, "0"},
		{"nine", 0x39, _Default, "9"},
		{"A", 0x41, _Default, "A"},
		{"Z", 0x5A, _Default, "Z"},
		{"a", 0x61, _Default, "a"},
		{"z", 0x7A, _Default, "z"},
		{"tilde", 0x7E, _Default, "~ (last printable)"},

		// DEL (0x7F): _Zero_Width (1)
		{"delete", 0x7F, _Zero_Width, "DEL (delete)"},

		// >= 128: _Default (0) (default, though shouldn't be used for valid UTF-8)
		{"0x80", 0x80, _Default, "First byte >= 128"},
		{"0xFF", 0xFF, _Default, "Last byte value"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := asciiProperty(tt.b)
			if got != tt.expected {
				// %q prints the byte as an escaped character literal, so
				// control characters such as NUL, BEL or ESC never reach the
				// test output unescaped.
				t.Errorf("asciiProperty(0x%02X %q) = %d, want %d (%s)",
					tt.b, tt.b, got, tt.expected, tt.desc)
			}
		})
	}
}

// TestAsciiPropertyEnums pins the numeric values of _Default and _Zero_Width.
// asciiProperty produces its result by inverting a width bit, which is only
// correct while _Default is 0 and _Zero_Width is 1.
func TestAsciiPropertyEnums(t *testing.T) {
	if got := _Default; got != 0 {
		t.Errorf("_Default = %d, want 0", got)
	}

	if got := _Zero_Width; got != 1 {
		t.Errorf("_Zero_Width = %d, want 1", got)
	}
}
13 changes: 7 additions & 6 deletions width.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ func (options Options) String(s string) int {
case 0:
return 0
case 1:
return int(asciiWidths[s[0]])
return asciiWidth(s[0])
}

width := 0
Expand All @@ -60,7 +60,7 @@ func (options Options) Bytes(s []byte) int {
case 0:
return 0
case 1:
return int(asciiWidths[s[0]])
return asciiWidth(s[0])
}

width := 0
Expand Down Expand Up @@ -90,7 +90,7 @@ func Rune(r rune) int {
// Iterating over runes to measure width is incorrect in many cases.
func (options Options) Rune(r rune) int {
if r < utf8.RuneSelf {
return int(asciiWidths[byte(r)])
return asciiWidth(byte(r))
}

// Surrogates (U+D800-U+DFFF) are invalid UTF-8.
Expand All @@ -113,7 +113,7 @@ func graphemeWidth[T stringish.Interface](s T, options Options) int {
case 0:
return 0
case 1:
return int(asciiWidths[s[0]])
return asciiWidth(s[0])
}

return lookupProperties(s).width(options)
Expand All @@ -138,8 +138,9 @@ func isVS16[T stringish.Interface](s T) bool {
// optimization, and to reduce the scope of this function.
func lookupProperties[T stringish.Interface](s T) property {
l := len(s)
b := s[0]

if s[0] < utf8.RuneSelf {
if b < utf8.RuneSelf {
// Check for variation selector after ASCII (e.g., keycap sequences like 1️⃣)
if l >= 4 {
// Subslice may help eliminate bounds checks
Expand All @@ -151,7 +152,7 @@ func lookupProperties[T stringish.Interface](s T) property {
// VS15 (0x8E) requests text presentation but does not affect width,
// in my reading of Unicode TR51. Falls through to _Default.
}
return asciiProperties[s[0]]
return asciiProperty(b)
}

// Regional indicator pair (flag)
Expand Down