mirror of
https://github.com/42wim/matterbridge
synced 2024-11-05 12:00:27 +00:00
302 lines
7.5 KiB
Go
302 lines
7.5 KiB
Go
// Package charset implements translation between character sets.
|
|
// It uses Unicode as the intermediate representation.
|
|
// Because it can be large, the character set data is separated
|
|
// from the charset package. It can be embedded in the Go
|
|
// executable by importing the data package:
|
|
//
|
|
// import _ "github.com/paulrosania/go-charset/data"
|
|
//
|
|
// It can also be made available in a data directory (by setting CharsetDir).
|
|
package charset
|
|
|
|
import (
	"io"
	"sort"
	"strings"
	"unicode/utf8"
)
|
|
|
|
// Charset describes a single character set.
type Charset struct {
	// Name is the canonical name of the character set.
	Name string
	// Aliases lists the known aliases of the character set.
	Aliases []string
	// Desc is a description of the character set.
	Desc string
	// NoFrom is true when it is not possible to translate from this charset.
	NoFrom bool
	// NoTo is true when it is not possible to translate to this charset.
	NoTo bool
}
|
|
|
|
// Translator is a character set converter.
//
// Translate converts data and returns n, the number of bytes of data
// that were consumed; cdata, the converted bytes; and err, any
// conversion error. The cdata slice may be overwritten by the next
// call to Translate, so callers must use or copy it before calling
// again. A true eof tells the translator that data holds the final
// bytes of the input.
type Translator interface {
	Translate(data []byte, eof bool) (n int, cdata []byte, err error)
}
|
|
|
|
// A Factory can be used to make character set translators.
|
|
type Factory interface {
|
|
// TranslatorFrom creates a translator that will translate from the named character
|
|
// set to UTF-8.
|
|
TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to.
|
|
|
|
// TranslatorTo creates a translator that will translate from UTF-8 to the named character set.
|
|
TranslatorTo(name string) (Translator, error) // Create a Translator To this character set.
|
|
|
|
// Names returns all the character set names accessibile through the factory.
|
|
Names() []string
|
|
|
|
// Info returns information on the named character set. It returns nil if the
|
|
// factory doesn't recognise the given name.
|
|
Info(name string) *Charset
|
|
}
|
|
|
|
// factories is the list of registered factories. The package-level lookup
// functions walk it in order; it starts out holding a single localFactory
// and grows through Register.
var factories = []Factory{localFactory{}}
|
|
|
|
// Register registers a new Factory which will be consulted when NewReader
|
|
// or NewWriter needs a character set translator for a given name.
|
|
func Register(factory Factory) {
|
|
factories = append(factories, factory)
|
|
}
|
|
|
|
// NewReader returns a new Reader that translates from the named
|
|
// character set to UTF-8 as it reads r.
|
|
func NewReader(charset string, r io.Reader) (io.Reader, error) {
|
|
tr, err := TranslatorFrom(charset)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return NewTranslatingReader(r, tr), nil
|
|
}
|
|
|
|
// NewWriter returns a new WriteCloser writing to w. It converts writes
|
|
// of UTF-8 text into writes on w of text in the named character set.
|
|
// The Close is necessary to flush any remaining partially translated
|
|
// characters to the output.
|
|
func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) {
|
|
tr, err := TranslatorTo(charset)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return NewTranslatingWriter(w, tr), nil
|
|
}
|
|
|
|
// Info returns information about a character set, or nil
|
|
// if the character set is not found.
|
|
func Info(name string) *Charset {
|
|
for _, f := range factories {
|
|
if info := f.Info(name); info != nil {
|
|
return info
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Names returns the canonical names of all supported character sets, in alphabetical order.
|
|
func Names() []string {
|
|
// TODO eliminate duplicates
|
|
var names []string
|
|
for _, f := range factories {
|
|
names = append(names, f.Names()...)
|
|
}
|
|
return names
|
|
}
|
|
|
|
// TranslatorFrom returns a translator that will translate from
|
|
// the named character set to UTF-8.
|
|
func TranslatorFrom(charset string) (Translator, error) {
|
|
var err error
|
|
var tr Translator
|
|
for _, f := range factories {
|
|
tr, err = f.TranslatorFrom(charset)
|
|
if err == nil {
|
|
break
|
|
}
|
|
}
|
|
if tr == nil {
|
|
return nil, err
|
|
}
|
|
return tr, nil
|
|
}
|
|
|
|
// TranslatorTo returns a translator that will translate from UTF-8
|
|
// to the named character set.
|
|
func TranslatorTo(charset string) (Translator, error) {
|
|
var err error
|
|
var tr Translator
|
|
for _, f := range factories {
|
|
tr, err = f.TranslatorTo(charset)
|
|
if err == nil {
|
|
break
|
|
}
|
|
}
|
|
if tr == nil {
|
|
return nil, err
|
|
}
|
|
return tr, nil
|
|
}
|
|
|
|
// normalizedChar maps the ASCII capitals 'A' to 'Z' to their lower-case
// forms and '_' to '-'. Every other rune is returned unchanged.
func normalizedChar(c rune) rune {
	if 'A' <= c && c <= 'Z' {
		return c + ('a' - 'A')
	}
	if c == '_' {
		return '-'
	}
	return c
}
|
|
|
|
// NormalisedName returns s with all Roman capitals
|
|
// mapped to lower case, and '_' mapped to '-'
|
|
func NormalizedName(s string) string {
|
|
return strings.Map(normalizedChar, s)
|
|
}
|
|
|
|
type translatingWriter struct {
|
|
w io.Writer
|
|
tr Translator
|
|
buf []byte // unconsumed data from writer.
|
|
}
|
|
|
|
// NewTranslatingWriter returns a new WriteCloser writing to w.
|
|
// It passes the written bytes through the given Translator.
|
|
func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser {
|
|
return &translatingWriter{w: w, tr: tr}
|
|
}
|
|
|
|
func (w *translatingWriter) Write(data []byte) (rn int, rerr error) {
|
|
wdata := data
|
|
if len(w.buf) > 0 {
|
|
w.buf = append(w.buf, data...)
|
|
wdata = w.buf
|
|
}
|
|
n, cdata, err := w.tr.Translate(wdata, false)
|
|
if err != nil {
|
|
// TODO
|
|
}
|
|
if n > 0 {
|
|
_, err = w.w.Write(cdata)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
}
|
|
w.buf = w.buf[:0]
|
|
if n < len(wdata) {
|
|
w.buf = append(w.buf, wdata[n:]...)
|
|
}
|
|
return len(data), nil
|
|
}
|
|
|
|
func (p *translatingWriter) Close() error {
|
|
for {
|
|
n, data, err := p.tr.Translate(p.buf, true)
|
|
p.buf = p.buf[n:]
|
|
if err != nil {
|
|
// TODO
|
|
}
|
|
// If the Translator produces no data
|
|
// at EOF, then assume that it never will.
|
|
if len(data) == 0 {
|
|
break
|
|
}
|
|
n, err = p.w.Write(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if n < len(data) {
|
|
return io.ErrShortWrite
|
|
}
|
|
if len(p.buf) == 0 {
|
|
break
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type translatingReader struct {
|
|
r io.Reader
|
|
tr Translator
|
|
cdata []byte // unconsumed data from converter.
|
|
rdata []byte // unconverted data from reader.
|
|
err error // final error from reader.
|
|
}
|
|
|
|
// NewTranslatingReader returns a new Reader that
|
|
// translates data using the given Translator as it reads r.
|
|
func NewTranslatingReader(r io.Reader, tr Translator) io.Reader {
|
|
return &translatingReader{r: r, tr: tr}
|
|
}
|
|
|
|
func (r *translatingReader) Read(buf []byte) (int, error) {
|
|
for {
|
|
if len(r.cdata) > 0 {
|
|
n := copy(buf, r.cdata)
|
|
r.cdata = r.cdata[n:]
|
|
return n, nil
|
|
}
|
|
if r.err == nil {
|
|
r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf))
|
|
n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)])
|
|
// Guard against non-compliant Readers.
|
|
if n == 0 && err == nil {
|
|
err = io.EOF
|
|
}
|
|
r.rdata = r.rdata[0 : len(r.rdata)+n]
|
|
r.err = err
|
|
} else if len(r.rdata) == 0 {
|
|
break
|
|
}
|
|
nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil)
|
|
if cvterr != nil {
|
|
// TODO
|
|
}
|
|
r.cdata = cdata
|
|
|
|
// Ensure that we consume all bytes at eof
|
|
// if the converter refuses them.
|
|
if nc == 0 && r.err != nil {
|
|
nc = len(r.rdata)
|
|
}
|
|
|
|
// Copy unconsumed data to the start of the rdata buffer.
|
|
r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])]
|
|
}
|
|
return 0, r.err
|
|
}
|
|
|
|
// ensureCap returns a slice with the same contents and length as s and
// a capacity of at least n bytes. When s is already large enough it is
// returned as is; otherwise the contents are copied into newly
// allocated storage.
func ensureCap(s []byte, n int) []byte {
	have := cap(s)
	if have >= n {
		return s
	}

	// Growth policy adapted from appendslice1 in the runtime: an empty
	// slice gets exactly n; otherwise double while below 1024 and grow
	// by a quarter after that, until n fits.
	want := n
	if have != 0 {
		want = have
		for want < n {
			if want < 1024 {
				want *= 2
			} else {
				want += want / 4
			}
		}
	}

	grown := make([]byte, len(s), want)
	copy(grown, s)
	return grown
}
|
|
|
|
func appendRune(buf []byte, r rune) []byte {
|
|
n := len(buf)
|
|
buf = ensureCap(buf, n+utf8.UTFMax)
|
|
nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)
|
|
return buf[0 : n+nu]
|
|
}
|