mirror of
https://github.com/odin-lang/Odin.git
synced 2026-04-18 06:20:08 -04:00
unicode.is_graphic has been modified to use the generated Unicode tables. The tables Me, Mn, Mc, and Zs were added as needed by is_graphic.
555 lines
13 KiB
Odin
555 lines
13 KiB
Odin
package unicode
|
|
|
|
import "base:runtime"
|
|
|
|
MAX_RUNE :: '\U0010ffff' // Maximum valid Unicode code point (U+10FFFF)
|
|
REPLACEMENT_CHAR :: '\ufffd' // Represents an invalid code point (U+FFFD)
|
|
MAX_ASCII :: '\u007f' // Maximum ASCII value (U+007F)
|
|
MAX_LATIN1 :: '\u00ff' // Maximum Latin-1 value (U+00FF)
|
|
|
|
ZERO_WIDTH_SPACE :: '\u200B' // U+200B ZERO WIDTH SPACE
|
|
ZERO_WIDTH_NON_JOINER :: '\u200C' // U+200C ZERO WIDTH NON-JOINER
|
|
ZERO_WIDTH_JOINER :: '\u200D' // U+200D ZERO WIDTH JOINER
|
|
WORD_JOINER :: '\u2060' // U+2060 WORD JOINER
|
|
|
|
/*
Searches a sorted, strided table for the last record whose key is less than
or equal to `c`.

The table is treated as `length` records of `stride` elements each; only the
first element of every record (the key) is compared against `c`.

Inputs:
- c: The value to look for.
- table: The flat table holding `length * stride` elements, sorted by record key.
- length: The number of records in `table`.
- stride: The number of elements per record.
- loc: The caller location reported if the up-front bounds check fails.

Returns:
The element index (not the record index) of the key of the last record whose
key is `<= c`, or `-1` when no such record exists or when `length <= 0`.
*/
@(require_results)
binary_search :: proc(c: $T, table: []T, length, stride: int, loc := #caller_location) -> int #no_bounds_check {
	// Nothing to search. Report "not found" instead of handing the bounds
	// check below an index of `length*stride-1 == -1`.
	if length <= 0 {
		return -1
	}

	// The body runs without bounds checks, so validate the furthest element
	// that could be touched once, up front.
	runtime.bounds_check_error_loc(loc, length*stride-1, len(table))

	n := length // records still under consideration
	t := 0      // element index of the first record under consideration
	for n > 1 {
		m := n / 2
		p := t + m*stride
		if c >= table[p] {
			// Key at the midpoint is not past `c`: keep the upper half.
			t = p
			n = n-m
		} else {
			// Key at the midpoint is past `c`: keep the lower half.
			n = m
		}
	}

	// Exactly one candidate record remains here.
	if c >= table[t] {
		return t
	}
	return -1
}
|
|
|
|
/*
Maps the rune `r` to its lower-case form using the `to_lower_ranges` and
`to_lower_singlets` tables.

Inputs:
- r: The rune to convert.

Returns:
The converted rune, or `r` unchanged when neither table has an entry for it.
*/
@(require_results)
to_lower :: proc(r: rune) -> rune #no_bounds_check {
	code := i32(r)

	// Range table: records of (first, last, delta). The stored delta has 500
	// added to it, which is subtracted again here.
	if i := binary_search(code, to_lower_ranges[:], len(to_lower_ranges)/3, 3); i >= 0 {
		if to_lower_ranges[i] <= code && code <= to_lower_ranges[i+1] {
			return rune(code + to_lower_ranges[i+2] - 500)
		}
	}

	// Singlet table: records of (code point, delta), same +500 offset.
	if i := binary_search(code, to_lower_singlets[:], len(to_lower_singlets)/2, 2); i >= 0 {
		if to_lower_singlets[i] == code {
			return rune(code + to_lower_singlets[i+1] - 500)
		}
	}

	return rune(code)
}
|
|
/*
Maps the rune `r` to its upper-case form using the `to_upper_ranges` and
`to_upper_singlets` tables.

Inputs:
- r: The rune to convert.

Returns:
The converted rune, or `r` unchanged when neither table has an entry for it.
*/
@(require_results)
to_upper :: proc(r: rune) -> rune #no_bounds_check {
	code := i32(r)

	// Range table: records of (first, last, delta). The stored delta has 500
	// added to it, which is subtracted again here.
	if i := binary_search(code, to_upper_ranges[:], len(to_upper_ranges)/3, 3); i >= 0 {
		if to_upper_ranges[i] <= code && code <= to_upper_ranges[i+1] {
			return rune(code + to_upper_ranges[i+2] - 500)
		}
	}

	// Singlet table: records of (code point, delta), same +500 offset.
	if i := binary_search(code, to_upper_singlets[:], len(to_upper_singlets)/2, 2); i >= 0 {
		if to_upper_singlets[i] == code {
			return rune(code + to_upper_singlets[i+1] - 500)
		}
	}

	return rune(code)
}
|
|
/*
Maps the rune `r` to its title-case form using the `to_title_singlets` table.

Inputs:
- r: The rune to convert.

Returns:
The converted rune, or `r` unchanged when the table has no entry for it.
*/
@(require_results)
to_title :: proc(r: rune) -> rune #no_bounds_check {
	c := i32(r)
	// Search, key comparison and delta lookup must all use the same table.
	// Previously the search and comparison used `to_upper_singlets` while the
	// record count and delta came from `to_title_singlets`.
	p := binary_search(c, to_title_singlets[:], len(to_title_singlets)/2, 2)
	if p >= 0 && c == to_title_singlets[p] {
		// Records are (code point, delta) with the delta stored offset by 500.
		return rune(c + to_title_singlets[p+1] - 500)
	}
	return rune(c)
}
|
|
|
|
|
|
/*
Reports whether the rune `r` is lower case.

ASCII runes are tested against 'a'..'z'; everything else is looked up in
`ll_ranges` and `other_lowercase_ranges`.
*/
@(require_results)
is_lower :: proc(r: rune) -> bool #no_bounds_check {
	if r > MAX_ASCII {
		return in_range(r, ll_ranges) || in_range(r, other_lowercase_ranges)
	}
	// Unsigned subtraction wraps for anything below 'a', so one comparison
	// covers both ends of the 'a'..'z' interval.
	offset := u32(r) - 'a'
	return offset < 26
}
|
|
|
|
/*
Reports whether the rune `r` is upper case.

ASCII runes are tested against 'A'..'Z'; everything else is looked up in
`lu_ranges` and `other_uppercase_ranges`.
*/
@(require_results)
is_upper :: proc(r: rune) -> bool #no_bounds_check {
	if r > MAX_ASCII {
		return in_range(r, lu_ranges) || in_range(r, other_uppercase_ranges)
	}
	// Unsigned subtraction wraps for anything below 'A', so one comparison
	// covers both ends of the 'A'..'Z' interval.
	offset := u32(r) - 'A'
	return offset < 26
}
|
|
|
|
is_alpha :: is_letter // Alias: `is_alpha` is the same procedure as `is_letter`.
|
|
|
|
/*
Returns true if the rune `r` is a letter. Being a letter means that the rune has
the Unicode general category property of L. In practice, the character will have
a general category property of Ll, Lm, Lo, Lt, or Lu.

NOTE(review): for runes above Latin-1 this also returns true whenever
`is_upper` or `is_lower` does, and those additionally consult
`other_uppercase_ranges` / `other_lowercase_ranges`. Confirm that runes found
only in those tables are meant to count as letters.

Inputs:
- r: The rune which will be checked for having the property of being a letter.

Returns:
`true` when the rune `r` is a letter. `false` will be returned in all other cases.
*/
@(require_results)
is_letter :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) <= MAX_LATIN1 {
		return char_properties[u8(r)]&pLmask != 0
	}

	// Above Latin-1, `is_upper`/`is_lower` already test `lu_ranges` and
	// `ll_ranges`, so those tables do not need a second lookup afterwards.
	if is_upper(r) || is_lower(r) {
		return true
	}

	return in_range(r, lo_ranges) || in_range(r, lt_ranges) || in_range(r, lm_ranges)
}
|
|
|
|
/*
Reports whether the rune `r` is a title-case letter (general category Lt).

Inputs:
- r: The rune to check.

Returns:
`true` if `r` is found in `lt_ranges`, `false` otherwise.
*/
@(require_results)
is_title :: proc(r: rune) -> bool {
	// The previous test, `is_upper(r) && is_lower(r)`, needs a rune to be
	// upper and lower case at once. The ASCII fast paths in those procedures
	// are disjoint, and no code point is both upper- and lower-case in Unicode.
	// Title-case letters have their own table.
	return in_range(r, lt_ranges)
}
|
|
|
|
/*
Reports whether the rune `r` is in the general category Nd.

Inputs:
- r: The rune to check for membership of the general category Nd.

Returns:
`true` if the rune is in the general category Nd and `false` otherwise.
*/
is_decimal :: proc(r: rune) -> bool {
	in_nd := in_range(r, nd_ranges)
	return in_nd
}
|
|
|
|
/*
Determines whether a rune is a digit. To be a digit the character has
a Numeric_Type of either Digit or Decimal.

Inputs:
- r: The rune to check if it is a digit.

Returns:
`true` if the rune `r` is a digit, `false` in all other cases.
*/
@(require_results)
is_digit :: proc(r: rune) -> bool {
	if r > MAX_LATIN1 {
		return in_range(r, nd_ranges) || in_range(r, extra_digits_ranges)
	}

	// Up to Latin-1 only '0'..'9', U+00B2, U+00B3 and U+00B9 are accepted.
	switch r {
	case '0' ..= '9', 0x00B2, 0x00B3, 0x00B9:
		return true
	}
	return false
}
|
|
|
|
|
|
is_white_space :: is_space // Alias: `is_white_space` is the same procedure as `is_space`.
|
|
/*
Reports whether the rune `r` is white space.

Runes up to Latin-1 are matched against a fixed list; all others are looked
up in `space_ranges`.
*/
@(require_results)
is_space :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		// `space_ranges` holds (first, last) pairs.
		code := i32(r)
		idx := binary_search(code, space_ranges[:], len(space_ranges)/2, 2)
		return idx >= 0 && space_ranges[idx] <= code && code <= space_ranges[idx+1]
	}

	switch r {
	case '\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xa0:
		return true
	}
	return false
}
|
|
|
|
/*
Reports whether the rune `r` lies in one of these fixed code point ranges:
U+0300..U+036F, U+1AB0..U+1AFF, U+1DC0..U+1DFF, U+20D0..U+20FF or
U+FE20..U+FE2F.
*/
@(require_results)
is_combining :: proc(r: rune) -> bool {
	switch i32(r) {
	case 0x0300 ..= 0x036f,
	     0x1ab0 ..= 0x1aff,
	     0x1dc0 ..= 0x1dff,
	     0x20d0 ..= 0x20ff,
	     0xfe20 ..= 0xfe2f:
		return true
	}
	return false
}
|
|
|
|
|
|
|
|
/*
Reports whether the rune `r` is a graphic character.

Runes up to Latin-1 are answered from `char_properties`. Above that, a rune is
graphic when it is a letter, number, punctuation or symbol, or when it is
found in `zs_ranges`, `mc_ranges`, `me_ranges` or `mn_ranges`.
*/
@(require_results)
is_graphic :: proc(r: rune) -> bool {
	if u32(r) <= MAX_LATIN1 {
		return char_properties[u8(r)]&pg != 0
	}

	return is_letter(r) || is_number(r) || is_punct(r) || is_symbol(r) ||
	       in_range(r, zs_ranges) ||
	       in_range(r, mc_ranges) || in_range(r, me_ranges) || in_range(r, mn_ranges)
}
|
|
|
|
/*
Reports whether the rune `r` has the `pp` bit set in `char_properties`.

Only runes up to Latin-1 are consulted; every rune above `MAX_LATIN1` is
reported as `false`.
*/
@(require_results)
is_print :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	return char_properties[u8(r)]&pp != 0
}
|
|
|
|
/*
Reports whether the rune `r` has the `pC` bit set in `char_properties`.

Only runes up to Latin-1 are consulted; every rune above `MAX_LATIN1` is
reported as `false`.
*/
@(require_results)
is_control :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return false
	}
	return char_properties[u8(r)]&pC != 0
}
|
|
|
|
/*
Checks whether the rune `r` is a number, meaning that it is a member of the
general category Nd, Nl, or No.

Inputs:
- r: The rune to check if it is a number.

Returns:
`true` if the rune belongs to the general category Nd, Nl, or No. `false`
is returned in all other cases.
*/
@(require_results)
is_number :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) > MAX_LATIN1 {
		return in_range(r, nd_ranges) || in_range(r, nl_ranges) || in_range(r, no_ranges)
	}
	// Latin-1 and below are answered from the property table.
	return char_properties[u8(r)]&pN != 0
}
|
|
|
|
/*
Reports whether the rune `r` is punctuation.

Runes up to Latin-1 are answered from `char_properties`; all others are looked
up in the `pc`, `pd`, `pe`, `pf`, `pi`, `po` and `ps` range tables.
*/
@(require_results)
is_punct :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) <= MAX_LATIN1 {
		return char_properties[u8(r)]&pP != 0
	}

	return in_range(r, pc_ranges) || in_range(r, pd_ranges) || in_range(r, pe_ranges) ||
	       in_range(r, pf_ranges) || in_range(r, pi_ranges) || in_range(r, po_ranges) ||
	       in_range(r, ps_ranges)
}
|
|
|
|
/*
Reports whether the rune `r` is a symbol.

Runes up to Latin-1 are answered from `char_properties`; all others are looked
up in the `sc`, `sm`, `so` and `sk` range tables.
*/
@(require_results)
is_symbol :: proc(r: rune) -> bool #no_bounds_check {
	if u32(r) <= MAX_LATIN1 {
		return char_properties[u8(r)]&pS != 0
	}

	return in_range(r, sc_ranges) || in_range(r, sm_ranges) ||
	       in_range(r, so_ranges) || in_range(r, sk_ranges)
}
|
|
|
|
//
|
|
// The procedures below are accurate as of Unicode 15.1.0.
|
|
//
|
|
|
|
// Emoji_Modifier
//
// Reports whether `r` lies in U+1F3FB..U+1F3FF.
@(require_results)
is_emoji_modifier :: proc(r: rune) -> bool {
	switch r {
	case 0x1F3FB ..= 0x1F3FF:
		return true
	}
	return false
}
|
|
|
|
// Regional_Indicator
//
// Reports whether `r` lies in U+1F1E6..U+1F1FF.
@(require_results)
is_regional_indicator :: proc(r: rune) -> bool {
	switch r {
	case 0x1F1E6 ..= 0x1F1FF:
		return true
	}
	return false
}
|
|
|
|
// General_Category=Enclosing_Mark
//
// Reports whether `r` is one of the enclosing-mark code points listed below.
@(require_results)
is_enclosing_mark :: proc(r: rune) -> bool {
	if r == 0x0488 || r == 0x0489 || r == 0x1ABE {
		return true
	}
	if 0x20DD <= r && r <= 0x20E0 {
		return true
	}
	if 0x20E2 <= r && r <= 0x20E4 {
		return true
	}
	return 0xA670 <= r && r <= 0xA672
}
|
|
|
|
// Prepended_Concatenation_Mark
//
// Reports whether `r` is one of the code points listed below.
@(require_results)
is_prepended_concatenation_mark :: proc(r: rune) -> bool {
	switch r {
	case 0x00600 ..= 0x00605, 0x00890 ..= 0x00891:
		return true
	case 0x006DD, 0x0070F, 0x008E2, 0x110BD, 0x110CD:
		return true
	}
	return false
}
|
|
|
|
// General_Category=Spacing_Mark
//
// Looks `r` up in `spacing_mark_ranges`, a table of (first, last) pairs.
@(require_results)
is_spacing_mark :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, spacing_mark_ranges[:], len(spacing_mark_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return spacing_mark_ranges[idx] <= code && code <= spacing_mark_ranges[idx+1]
}
|
|
|
|
// General_Category=Nonspacing_Mark
//
// Looks `r` up in `nonspacing_mark_ranges`, a table of (first, last) pairs.
@(require_results)
is_nonspacing_mark :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, nonspacing_mark_ranges[:], len(nonspacing_mark_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return nonspacing_mark_ranges[idx] <= code && code <= nonspacing_mark_ranges[idx+1]
}
|
|
|
|
// Extended_Pictographic
//
// Looks `r` up in `emoji_extended_pictographic_ranges`, a table of
// (first, last) pairs.
@(require_results)
is_emoji_extended_pictographic :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, emoji_extended_pictographic_ranges[:], len(emoji_extended_pictographic_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return emoji_extended_pictographic_ranges[idx] <= code && code <= emoji_extended_pictographic_ranges[idx+1]
}
|
|
|
|
// Grapheme_Extend
//
// Looks `r` up in `grapheme_extend_ranges`, a table of (first, last) pairs.
@(require_results)
is_grapheme_extend :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, grapheme_extend_ranges[:], len(grapheme_extend_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return grapheme_extend_ranges[idx] <= code && code <= grapheme_extend_ranges[idx+1]
}
|
|
|
|
|
|
// Hangul_Syllable_Type=Leading_Jamo
//
// Reports whether `r` lies in U+1100..U+115F or U+A960..U+A97C.
@(require_results)
is_hangul_syllable_leading :: proc(r: rune) -> bool {
	switch r {
	case 0x1100 ..= 0x115F, 0xA960 ..= 0xA97C:
		return true
	}
	return false
}
|
|
|
|
// Hangul_Syllable_Type=Vowel_Jamo
//
// Reports whether `r` lies in U+1160..U+11A7 or U+D7B0..U+D7C6.
@(require_results)
is_hangul_syllable_vowel :: proc(r: rune) -> bool {
	switch r {
	case 0x1160 ..= 0x11A7, 0xD7B0 ..= 0xD7C6:
		return true
	}
	return false
}
|
|
|
|
// Hangul_Syllable_Type=Trailing_Jamo
//
// Reports whether `r` lies in U+11A8..U+11FF or U+D7CB..U+D7FB.
@(require_results)
is_hangul_syllable_trailing :: proc(r: rune) -> bool {
	switch r {
	case 0x11A8 ..= 0x11FF, 0xD7CB ..= 0xD7FB:
		return true
	}
	return false
}
|
|
|
|
// Hangul_Syllable_Type=LV_Syllable
//
// Looks `r` up in `hangul_syllable_lv_singlets`, a table of single code
// points (stride 1); only an exact match counts.
@(require_results)
is_hangul_syllable_lv :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, hangul_syllable_lv_singlets[:], len(hangul_syllable_lv_singlets), 1)
	if idx < 0 {
		return false
	}
	return hangul_syllable_lv_singlets[idx] == code
}
|
|
|
|
// Hangul_Syllable_Type=LVT_Syllable
//
// Looks `r` up in `hangul_syllable_lvt_ranges`, a table of (first, last) pairs.
@(require_results)
is_hangul_syllable_lvt :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, hangul_syllable_lvt_ranges[:], len(hangul_syllable_lvt_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return hangul_syllable_lvt_ranges[idx] <= code && code <= hangul_syllable_lvt_ranges[idx+1]
}
|
|
|
|
|
|
// Indic_Syllabic_Category=Consonant_Preceding_Repha
//
// Reports whether `r` is one of U+0D4E, U+11941, U+11D46 or U+11F02.
@(require_results)
is_indic_consonant_preceding_repha :: proc(r: rune) -> bool {
	return r == 0x00D4E || r == 0x11941 || r == 0x11D46 || r == 0x11F02
}
|
|
|
|
// Indic_Syllabic_Category=Consonant_Prefixed
//
// Reports whether `r` lies in U+111C2..U+111C3 or U+11A84..U+11A89, or is
// U+1193F or U+11A3A.
@(require_results)
is_indic_consonant_prefixed :: proc(r: rune) -> bool {
	if 0x111C2 <= r && r <= 0x111C3 {
		return true
	}
	if 0x11A84 <= r && r <= 0x11A89 {
		return true
	}
	return r == 0x1193F || r == 0x11A3A
}
|
|
|
|
// Indic_Conjunct_Break=Linker
//
// Reports whether `r` is one of U+094D, U+09CD, U+0ACD, U+0B4D, U+0C4D or
// U+0D4D.
@(require_results)
is_indic_conjunct_break_linker :: proc(r: rune) -> bool {
	switch r {
	case 0x094D, 0x09CD, 0x0ACD, 0x0B4D, 0x0C4D, 0x0D4D:
		return true
	}
	return false
}
|
|
|
|
// Indic_Conjunct_Break=Consonant
//
// Looks `r` up in `indic_conjunct_break_consonant_ranges`, a table of
// (first, last) pairs.
@(require_results)
is_indic_conjunct_break_consonant :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, indic_conjunct_break_consonant_ranges[:], len(indic_conjunct_break_consonant_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_consonant_ranges[idx] <= code && code <= indic_conjunct_break_consonant_ranges[idx+1]
}
|
|
|
|
// Indic_Conjunct_Break=Extend
//
// Looks `r` up in `indic_conjunct_break_extend_ranges`, a table of
// (first, last) pairs.
@(require_results)
is_indic_conjunct_break_extend :: proc(r: rune) -> bool #no_bounds_check {
	code := i32(r)
	idx := binary_search(code, indic_conjunct_break_extend_ranges[:], len(indic_conjunct_break_extend_ranges)/2, 2)
	if idx < 0 {
		return false
	}
	return indic_conjunct_break_extend_ranges[idx] <= code && code <= indic_conjunct_break_extend_ranges[idx+1]
}
|
|
|
|
|
|
/*
Grapheme cluster break class "Prepend", for grapheme text segmentation.
From Unicode TR 29 Rev 43, a rune belongs to this class when:

```
Indic_Syllabic_Category = Consonant_Preceding_Repha, or
Indic_Syllabic_Category = Consonant_Prefixed, or
Prepended_Concatenation_Mark = Yes
```
*/
@(require_results)
is_gcb_prepend_class :: proc(r: rune) -> bool {
	if is_indic_consonant_preceding_repha(r) {
		return true
	}
	if is_indic_consonant_prefixed(r) {
		return true
	}
	return is_prepended_concatenation_mark(r)
}
|
|
|
|
/*
Grapheme cluster break class "Extend", for grapheme text segmentation.
From Unicode TR 29 Rev 43, a rune belongs to this class when:

```
Grapheme_Extend = Yes, or
Emoji_Modifier = Yes

This includes:
General_Category = Nonspacing_Mark
General_Category = Enclosing_Mark
U+200C ZERO WIDTH NON-JOINER

plus a few General_Category = Spacing_Mark needed for canonical equivalence.
```
*/
@(require_results)
is_gcb_extend_class :: proc(r: rune) -> bool {
	if is_grapheme_extend(r) {
		return true
	}
	return is_emoji_modifier(r)
}
|
|
|
|
// Return values:
//
// - 2 if East_Asian_Width=F or W, or
// - 0 if non-printable / zero-width, or
// - 1 in all other cases.
//
// Control runes and the zero-width runes named in the body yield 0. Runes up
// to U+10FF otherwise yield 1 without a table lookup. For everything else the
// width is the third element of the matching record in
// `normalized_east_asian_width_ranges`, or 1 when no record matches.
@(require_results)
normalized_east_asian_width :: proc(r: rune) -> int #no_bounds_check {
	// This is a different interpretation of the BOM which occurs in the middle of text.
	ZERO_WIDTH_NO_BREAK_SPACE :: '\uFEFF'

	if is_control(r) {
		return 0
	}
	if r <= 0x10FF {
		// Easy early out for low runes.
		return 1
	}

	switch r {
	case ZERO_WIDTH_NO_BREAK_SPACE, ZERO_WIDTH_SPACE, ZERO_WIDTH_NON_JOINER, ZERO_WIDTH_JOINER, WORD_JOINER:
		return 0
	}

	// Table records are (first, last, width).
	code := i32(r)
	idx := binary_search(code, normalized_east_asian_width_ranges[:], len(normalized_east_asian_width_ranges)/3, 3)
	if idx < 0 {
		return 1
	}
	if code < normalized_east_asian_width_ranges[idx] || code > normalized_east_asian_width_ranges[idx+1] {
		return 1
	}
	return int(normalized_east_asian_width_ranges[idx+2])
}
|
|
|
|
//
|
|
// End of Unicode 15.1.0 block.
|
|
//
|