Files
opencloud/services/collaboration/pkg/connector/utf7/utf7.go
Juan Pablo Villafáñez 94d191f624 refactor: simplify code
2024-07-31 12:50:41 +02:00

309 lines
9.1 KiB
Go

package utf7
import (
"bytes"
"encoding/base64"
"encoding/binary"
"errors"
"strings"
"unicode"
"unicode/utf16"
)
// Names used to tag the ranges produced while analyzing a string.
const (
	// rangeASCII tags a range whose content is used as it is
	rangeASCII = "ascii"
	// rangeUTF7 tags a range that has to be encoded to (or decoded from) UTF-7
	rangeUTF7 = "utf7"
)
// Range is a named interval delimited by a lower and an upper bound.
// The name identifies what kind of content the interval covers.
type Range struct {
	// Name identifies the kind of range
	Name string
	// Low and High are the lower and upper bounds of the range
	Low, High int
}
// utf7AsciiRT is the range table holding the ASCII chars that belong to the
// "direct character" group of UTF-7: letters, digits and "'(),-./:?".
var utf7AsciiRT = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x27, Hi: 0x29, Stride: 1}, // ' ( )
		{Lo: 0x2c, Hi: 0x2f, Stride: 1}, // , - . /
		{Lo: 0x30, Hi: 0x39, Stride: 1}, // 0-9
		{Lo: 0x3a, Hi: 0x3f, Stride: 5}, // only ':' and '?' thanks to the stride
		{Lo: 0x41, Hi: 0x5a, Stride: 1}, // A-Z
		{Lo: 0x61, Hi: 0x7a, Stride: 1}, // a-z
	},
}
// EncodeString encodes the provided UTF-8 string into UTF-7.
//
// Peculiarities of this encoder:
//   - Only the "direct characters" are kept intact: "a-z", "A-Z", "0-9" and
//     "'(),-./:?". Everything else is encoded, including the "optional direct
//     chars" such as the space.
//   - The "+" char is encoded like any other char, so it becomes "+ACs-"
//     instead of the "+-" shortcut.
//   - Consecutive chars that need encoding are encoded together as a single
//     group. For example, "こんにちは" becomes "+MFMwkzBrMGEwbw-".
//   - Every encoded group is enclosed between "+" and "-".
func EncodeString(s string) string {
	allRunes := []rune(s)

	var out strings.Builder
	// twice the size of the input is usually enough room for the result
	out.Grow(2 * len(s))

	for _, rg := range analyzeRunes(allRunes) {
		chunk := allRunes[rg.Low:rg.High]
		if rg.Name != rangeASCII {
			// anything that isn't a run of direct chars goes through the encoder
			out.Write(convertToUtf7(chunk))
			continue
		}
		// direct chars are copied verbatim
		out.WriteString(string(chunk))
	}
	return out.String()
}
// DecodeString decodes the provided UTF-7 string into UTF-8.
//
// The input doesn't need to come from EncodeString: strings such as "a+-b"
// or "a+AD0.b" are accepted even though EncodeString never produces them.
//
// An error is returned if the string contains non-ASCII chars (UTF-7 is
// ASCII-only) or if an encoded part of the string can't be decoded.
func DecodeString(s string) (string, error) {
	raw := []byte(s)
	ranges, err := analyzeUtf7(raw)
	if err != nil {
		return "", err
	}

	var out strings.Builder
	out.Grow(len(raw))
	for _, rg := range ranges {
		segment := raw[rg.Low:rg.High]
		switch rg.Name {
		case rangeASCII:
			// plain ascii needs no conversion
			out.Write(segment)
		default:
			// utf7-encoded segment, decode it into the builder
			if err := convertRangeFromUtf7(segment, &out); err != nil {
				return "", err
			}
		}
	}
	return out.String(), nil
}
// analyzeRunes splits the runes into a list of consecutive ranges.
// Each range has a name and a low and high index. The low index is included
// in the range but the high one isn't ("[0,12)"), so the runes of a range can
// be extracted with "runes[r.Low:r.High]".
//
// Only the following names are used for the returned ranges:
//   - "ascii" for runes belonging to the "direct characters" group of UTF-7
//     (those that can be used without encoding them). Note that not every
//     ASCII char belongs to that group.
//   - "utf7" for runes that must be encoded for UTF-7.
//
// Adjacent runes of the same kind share a range, so the names of the returned
// ranges alternate. An empty input produces an empty (non-nil) list.
//
// This function is intended to detect which parts of a string need to be
// encoded to UTF-7.
func analyzeRunes(runes []rune) []Range {
	ranges := make([]Range, 0)
	if len(runes) == 0 {
		// Nothing to classify. Returning an unnamed empty range here would
		// make the encoder emit a spurious "+-", which decodes to "+".
		return ranges
	}

	var current Range
	for i, r := range runes {
		name := rangeUTF7
		if unicode.Is(utf7AsciiRT, r) {
			name = rangeASCII
		}

		switch current.Name {
		case name:
			// same kind of rune, the open range keeps growing
		case "":
			// first rune: it decides the kind of the first range
			current = Range{Name: name, Low: i}
		default:
			// the kind changed: close the open range and start a new one
			current.High = i
			ranges = append(ranges, current)
			current = Range{Name: name, Low: i}
		}
	}

	// close the last range
	current.High = len(runes)
	return append(ranges, current)
}
// analyzeUtf7 splits the provided UTF-7 byte sequence into a list of ranges.
// UTF-7 is ASCII-only, so an error is returned as soon as a non-ASCII byte
// is found.
//
// Each returned range is named either "ascii" or "utf7". "ascii" ranges can
// be used directly, while "utf7" ranges are UTF-7 encoded and need decoding.
// A "utf7" range starts at a "+" and ends at the first "-" (which is part of
// the range) or right before the first byte outside the base64 alphabet.
// Empty ranges are never returned.
//
// This function is intended to detect which parts of a string need to be
// decoded from UTF-7.
func analyzeUtf7(byteArray []byte) ([]Range, error) {
	base64Alphabet := []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")

	found := make([]Range, 0)
	open := Range{Name: rangeASCII}

	// closeAt ends the open range at the given index, keeping it only if it
	// isn't empty
	closeAt := func(end int) {
		open.High = end
		if open.Low != open.High {
			found = append(found, open)
		}
	}

	for idx, b := range byteArray {
		if b > unicode.MaxASCII {
			return nil, errors.New("Byte sequence contains a non-ASCII char")
		}

		inEncoded := open.Name == rangeUTF7
		switch {
		case b == '+' && !inEncoded:
			// a "+" outside an encoded range starts a new encoded range
			closeAt(idx)
			open = Range{Name: rangeUTF7, Low: idx}
		case b == '-':
			// the "-" belongs to the range it closes
			closeAt(idx + 1)
			open = Range{Name: rangeASCII, Low: idx + 1}
		case inEncoded && bytes.IndexByte(base64Alphabet, b) == -1:
			// a byte outside the base64 alphabet implicitly ends the encoded
			// range, and it's the first byte of the next ascii range
			closeAt(idx)
			open = Range{Name: rangeASCII, Low: idx}
		}
	}

	// whatever is still open ends with the input
	closeAt(len(byteArray))
	return found, nil
}
// convertToUtf7 encodes the provided runes as a single UTF-7 group: the runes
// are converted to big-endian UTF-16, encoded as unpadded base64 and enclosed
// between "+" and "-".
// Every provided rune is encoded, no matter whether it needs it or not.
func convertToUtf7(runes []rune) []byte {
	units := utf16.Encode(runes)

	// serialize the UTF-16 code units, most significant byte first
	raw := make([]byte, 2*len(units))
	for i, unit := range units {
		binary.BigEndian.PutUint16(raw[2*i:], unit)
	}

	enc := base64.RawStdEncoding
	payloadLen := enc.EncodedLen(len(raw))

	// 2 extra bytes for the "+" and "-" delimiters
	out := make([]byte, payloadLen+2)
	out[0] = '+'
	enc.Encode(out[1:payloadLen+1], raw)
	out[payloadLen+1] = '-'
	return out
}
// convertRangeFromUtf7 decodes a single UTF-7 encoded range and appends the
// result to the provided string builder. Nothing else in the builder is
// modified.
//
// The range is expected to start with "+" and should (but might not) end
// with "-". The "+-" range is a special case that decodes to a plain "+".
//
// An error is returned if the range is shorter than 2 bytes (a "+" that
// isn't followed by base64 data nor by "-" is ill-formed), or if the encoded
// data can't be decoded.
func convertRangeFromUtf7(utf7ByteRange []byte, sb *strings.Builder) error {
	if len(utf7ByteRange) < 2 {
		// without this guard, stripping the delimiters below would slice
		// out of bounds and panic on inputs such as "+" or "a+ b"
		return errors.New("ill-formed utf7 sequence: missing data after the \"+\" char")
	}

	if len(utf7ByteRange) == 2 && utf7ByteRange[0] == '+' && utf7ByteRange[1] == '-' {
		// special case for the "+-" sequence -> just write "+" as replacement
		sb.WriteByte('+')
		return nil
	}

	// strip the leading "+" and, if present, the trailing "-" before decoding
	toDecode := utf7ByteRange[1:]
	if last := len(toDecode) - 1; toDecode[last] == '-' {
		toDecode = toDecode[:last]
	}

	runeArray, err := convertFromUtf7(toDecode)
	if err != nil {
		return err
	}
	for _, r := range runeArray {
		sb.WriteRune(r)
	}
	return nil
}
// convertFromUtf7 decodes the content of a UTF-7 encoded group (without the
// "+" and "-" delimiters) into runes. The bytes are decoded as unpadded
// base64, and the result is read as a sequence of big-endian UTF-16 code
// units.
// The returned runes can be easily converted to a regular string.
// An error is returned if the base64 decoding fails or if the decoded data
// has an odd number of bytes, so it can't be split into UTF-16 code units.
func convertFromUtf7(byteArray []byte) ([]rune, error) {
	enc := base64.RawStdEncoding

	raw := make([]byte, enc.DecodedLen(len(byteArray)))
	if _, err := enc.Decode(raw, byteArray); err != nil {
		return []rune{}, err
	}

	if len(raw)%2 == 1 {
		// each UTF-16 code unit needs exactly 2 bytes
		return []rune{}, errors.New("some utf7 data can't be represented as utf16")
	}

	units := make([]uint16, len(raw)/2)
	for i := range units {
		units[i] = binary.BigEndian.Uint16(raw[2*i:])
	}
	return utf16.Decode(units), nil
}