mirror of
https://github.com/opencloud-eu/opencloud.git
synced 2026-04-07 17:08:10 -04:00
309 lines
9.1 KiB
Go
309 lines
9.1 KiB
Go
package utf7
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/base64"
|
|
"encoding/binary"
|
|
"errors"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf16"
|
|
)
|
|
|
|
// Names given to the ranges produced while analyzing a string.
const (
	// rangeASCII tags a range whose content is copied as it is.
	rangeASCII = "ascii"
	// rangeUTF7 tags a range whose content goes through the UTF-7
	// conversion (encoding or decoding, depending on the direction).
	rangeUTF7 = "utf7"
)
|
|
|
|
// Range is a named span of indexes into a sequence.
//
// Name identifies the kind of content the span covers. Low is the index of
// the first element of the span and High is the index right after the last
// one, so the span can be extracted with "seq[r.Low:r.High]".
type Range struct {
	Name      string
	Low, High int
}
|
|
|
|
// utf7AsciiRT is the range table of the ASCII chars that belong to the
// "direct character" group of UTF-7.
var utf7AsciiRT = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '\'', Hi: ')', Stride: 1}, // ' ( )
		{Lo: ',', Hi: '/', Stride: 1},  // , - . /
		{Lo: '0', Hi: '9', Stride: 1},  // digits
		{Lo: ':', Hi: '?', Stride: 5},  // only ':' and '?', the stride skips ; < = >
		{Lo: 'A', Hi: 'Z', Stride: 1},  // upper case letters
		{Lo: 'a', Hi: 'z', Stride: 1},  // lower case letters
	},
}
|
|
|
|
// EncodeString will encode the provided UTF-8 string into UTF-7 format
|
|
//
|
|
// The encoding process will have the following peculiarities
|
|
// * Any char outside the "direct characters" will be encoded. This means that
|
|
// only "a-z", "A-Z", "0-9" and "'(),-.:?" chars will remain intact while the
|
|
// rest will be encoded. "Optional direct chars" (such as the space) will
|
|
// be encoded.
|
|
// * The "+" char will be encoded as any other character, so the result will
|
|
// be "+ACs-", not "+-"
|
|
// * Sequences of chars will be encoded as a single group. For example,
|
|
// "こんにちは" will be encoded as "+MFMwkzBrMGEwbw-"
|
|
// * All encoded sequences will be enclosed between "+" and "-"
|
|
func EncodeString(s string) string {
|
|
runes := []rune(s)
|
|
|
|
ranges := analyzeRunes(runes)
|
|
|
|
var sb strings.Builder
|
|
// doubling the number of bytes of the string is usually enough
|
|
sb.Grow(len(s) * 2)
|
|
|
|
for _, v := range ranges {
|
|
if v.Name == rangeASCII {
|
|
for _, v := range runes[v.Low:v.High] {
|
|
sb.WriteRune(v)
|
|
}
|
|
} else {
|
|
utf7Bytes := convertToUtf7(runes[v.Low:v.High])
|
|
sb.Write(utf7Bytes)
|
|
}
|
|
}
|
|
return sb.String()
|
|
}
|
|
|
|
// DecodeString will decode the provided UTF-7 string into UTF-8.
|
|
//
|
|
// Any valid UTF-7 string can be decoded, not just the ones returned by
|
|
// the EncodeString function.
|
|
// In particular, UTF-7 strings such as "a+-b" or "a+AD0.b" can be decoded
|
|
// even if the EncodeString function won't generate the corresponding
|
|
// strings that way.
|
|
//
|
|
// Note that this function requires the string to contain only ASCII chars
|
|
// (as per UTF-7), otherwise an error will be returned.
|
|
// Illegal char sequences in the encoded parts of the string will also trigger
|
|
// errors.
|
|
func DecodeString(s string) (string, error) {
|
|
byteArray := []byte(s)
|
|
|
|
ranges, err := analyzeUtf7(byteArray)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
var sb strings.Builder
|
|
sb.Grow(len(byteArray))
|
|
|
|
for _, v := range ranges {
|
|
if v.Name == rangeASCII {
|
|
// if it's an ascii range, just copy it
|
|
sb.Write(byteArray[v.Low:v.High])
|
|
} else {
|
|
// utf7 range
|
|
utf7ByteRange := byteArray[v.Low:v.High]
|
|
if err := convertRangeFromUtf7(utf7ByteRange, &sb); err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
}
|
|
return sb.String(), nil
|
|
}
|
|
|
|
// analyzeRunes will analyze the array of runes and provide a list of ranges.
|
|
// Each range will be defined by a name and a low and high index. For example,
|
|
// an "ascii" range could go from index 0 to 12 and "utf7" range from 12 to 25.
|
|
// The range includes the low index but not the high "[0,12)". This means it
|
|
// be easily extracted with something like "runes[r.Low:r.High]".
|
|
//
|
|
// The list of ranges will only include the following names:
|
|
// * "ascii" for runes belonging to the "direct characters" group of UTF-7
|
|
// (those that can be used directly without encoding them). Note that
|
|
// it won't consider every ASCII character.
|
|
// * "utf7" for runes that should be encoded for UTF-7.
|
|
//
|
|
// As said, runes in the ranges marked as "utf7" should be encoded for UTF-7,
|
|
// while the others can be used without changes.
|
|
//
|
|
// This method is intended to be used to detect which ranges need to be
|
|
// encoded to UTF-7
|
|
func analyzeRunes(runes []rune) []Range {
|
|
ranges := make([]Range, 0)
|
|
|
|
var currentRange Range
|
|
for k, v := range runes {
|
|
if unicode.Is(utf7AsciiRT, v) {
|
|
if currentRange.Name == "" {
|
|
// take control of the current range
|
|
currentRange.Name = rangeASCII
|
|
currentRange.Low = k
|
|
} else if currentRange.Name != rangeASCII {
|
|
// close current range and open a new one
|
|
currentRange.High = k
|
|
ranges = append(ranges, currentRange)
|
|
currentRange = Range{
|
|
Name: rangeASCII,
|
|
Low: k,
|
|
}
|
|
}
|
|
} else {
|
|
if currentRange.Name == "" {
|
|
// take control of the current range
|
|
currentRange.Name = rangeUTF7
|
|
currentRange.Low = k
|
|
} else if currentRange.Name != rangeUTF7 {
|
|
// close current range and open a new one
|
|
currentRange.High = k
|
|
ranges = append(ranges, currentRange)
|
|
currentRange = Range{
|
|
Name: rangeUTF7,
|
|
Low: k,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// close the last range
|
|
currentRange.High = len(runes)
|
|
ranges = append(ranges, currentRange)
|
|
|
|
return ranges
|
|
}
|
|
|
|
// analyzeUtf7 will analyze the provided byte sequence and return a list of
|
|
// ranges.
|
|
// The byte sequence is considered as UTF-7, so if there is a non-ASCII char
|
|
// in the sequence, an error will be returned (it isn't a valid UTF-7 string).
|
|
//
|
|
// Each returned range will have either "ascii" or "utf7" as name for the range.
|
|
// "ascii" ranges won't require any change and can be used directly. "utf7"
|
|
// ranges are encoded in UTF-7 and will require decoding.
|
|
//
|
|
// This method is intended to be used to detect which ranges need to be
|
|
// decoded from UTF-7
|
|
func analyzeUtf7(byteArray []byte) ([]Range, error) {
|
|
base64chars := "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
|
|
base64ByteArray := []byte(base64chars)
|
|
|
|
ranges := make([]Range, 0)
|
|
|
|
currentRange := Range{
|
|
Name: rangeASCII,
|
|
Low: 0,
|
|
}
|
|
|
|
for k, v := range byteArray {
|
|
if v > unicode.MaxASCII {
|
|
return nil, errors.New("Byte sequence contains a non-ASCII char")
|
|
}
|
|
|
|
if v == '+' && currentRange.Name != rangeUTF7 {
|
|
// start utf7-encoded range
|
|
currentRange.High = k
|
|
ranges = append(ranges, currentRange)
|
|
currentRange = Range{
|
|
Name: rangeUTF7,
|
|
Low: k,
|
|
}
|
|
} else if v == '-' {
|
|
// close utf7-encoded range
|
|
currentRange.High = k + 1 // the '-' char is part of the range
|
|
ranges = append(ranges, currentRange)
|
|
currentRange = Range{
|
|
Name: rangeASCII,
|
|
Low: k + 1,
|
|
}
|
|
} else if bytes.IndexByte(base64ByteArray, v) == -1 && currentRange.Name == rangeUTF7 {
|
|
// found invalid base64 char, so need to close the utf7 range
|
|
currentRange.High = k
|
|
ranges = append(ranges, currentRange)
|
|
currentRange = Range{
|
|
Name: rangeASCII,
|
|
Low: k,
|
|
}
|
|
}
|
|
}
|
|
|
|
// close the last range
|
|
currentRange.High = len(byteArray)
|
|
ranges = append(ranges, currentRange)
|
|
|
|
// there might be empty ranges we need to clear
|
|
// empty ranges have Low = High
|
|
realRanges := make([]Range, 0, len(ranges))
|
|
for _, v := range ranges {
|
|
if v.Low != v.High {
|
|
realRanges = append(realRanges, v)
|
|
}
|
|
}
|
|
|
|
return realRanges, nil
|
|
}
|
|
|
|
// convertToUtf7 converts the provided runes into a single UTF-7 encoded
// group: a "+", the unpadded base64 of the runes as big-endian UTF-16, and
// a closing "-".
// All the provided runes are encoded; the function doesn't check whether
// any of them could have been used directly.
func convertToUtf7(runes []rune) []byte {
	units := utf16.Encode(runes)

	// serialize the UTF-16 code units as big-endian bytes
	payload := make([]byte, 2*len(units))
	for i, u := range units {
		binary.BigEndian.PutUint16(payload[2*i:], u)
	}

	enc := base64.RawStdEncoding
	out := make([]byte, enc.EncodedLen(len(payload))+2)
	last := len(out) - 1
	out[0], out[last] = '+', '-'
	enc.Encode(out[1:last], payload)
	return out
}
|
|
|
|
// convertRangeFromUtf7 will convert an utf7 byte range (enclosed in
|
|
// the "+" and "-" chars) and write the result in the provided string builder.
|
|
// The string builder won't be modified other than to append the result.
|
|
// An error might be returned if there is any problem with the conversion.
|
|
func convertRangeFromUtf7(utf7ByteRange []byte, sb *strings.Builder) error {
|
|
if len(utf7ByteRange) == 2 && utf7ByteRange[0] == '+' && utf7ByteRange[1] == '-' {
|
|
// special case for the "+-" sequence -> just write "+" as replacement
|
|
sb.WriteByte('+')
|
|
} else {
|
|
// utf7 range must start with "+" and should (but might not) end with "-"
|
|
// we need to remove those chars before decoding
|
|
toDecode := utf7ByteRange[1 : len(utf7ByteRange)-1]
|
|
if utf7ByteRange[len(utf7ByteRange)-1] != '-' {
|
|
toDecode = utf7ByteRange[1:]
|
|
}
|
|
runeArray, err := convertFromUtf7(toDecode)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, r := range runeArray {
|
|
sb.WriteRune(r)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// convertFromUtf7 converts the sequence of bytes to runes. The sequence of
// bytes is expected to be the content of an UTF-7 encoded group (without the
// "+" and "-" limiters): unpadded base64 of big-endian UTF-16 code units.
//
// The returned runes are Unicode code points, so they can be converted to a
// regular string with "string(runes)".
//
// An error is returned if the bytes aren't valid base64 or if the decoded
// data doesn't contain a whole number of UTF-16 code units. No runes are
// returned in that case.
func convertFromUtf7(byteArray []byte) ([]rune, error) {
	dst := make([]byte, base64.RawStdEncoding.DecodedLen(len(byteArray)))

	n, err := base64.RawStdEncoding.Decode(dst, byteArray)
	if err != nil {
		return nil, err
	}
	// Only the first n bytes were written: the decoder skips line breaks, so
	// n can be lower than the estimated length. The rest of the buffer must
	// not be treated as decoded data.
	dst = dst[:n]

	if len(dst)%2 != 0 {
		// some data can't be represented as utf16, and can't be decoded
		return nil, errors.New("some utf7 data can't be represented as utf16")
	}

	u16array := make([]uint16, 0, len(dst)/2)
	for i := 0; i < len(dst); i += 2 {
		u16array = append(u16array, binary.BigEndian.Uint16(dst[i:]))
	}
	return utf16.Decode(u16array), nil
}
|