vendor update for E2E framework

Signed-off-by: Madhu Rajanna <madhupr007@gmail.com>
This commit is contained in:
Madhu Rajanna
2019-05-31 15:15:11 +05:30
parent 9bb23e4e32
commit d300da19b7
2149 changed files with 598692 additions and 14107 deletions

View File

@ -53,9 +53,9 @@ type Builder struct {
// A Tailoring builds a collation table based on another collation table.
// The table is defined by specifying tailorings to the underlying table.
// See http://unicode.org/reports/tr35/ for an overview of tailoring
// See https://unicode.org/reports/tr35/ for an overview of tailoring
// collation tables. The CLDR contains pre-defined tailorings for a variety
// of languages (See http://www.unicode.org/Public/cldr/<version>/core.zip.)
// of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
type Tailoring struct {
id string
builder *Builder
@ -93,7 +93,7 @@ func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
// a slice of runes to a sequence of collation elements.
// A collation element is specified as list of weights: []int{primary, secondary, ...}.
// The entries are typically obtained from a collation element table
// as defined in http://www.unicode.org/reports/tr10/#Data_Table_Format.
// as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
// Note that the collation elements specified by colelems are only used
// as a guide. The actual weights generated by Builder may differ.
// The argument variables is a list of indices into colelems that should contain
@ -219,8 +219,8 @@ func (t *Tailoring) SetAnchorBefore(anchor string) error {
// will cause the collation elements corresponding to extend to be appended
// to the collation elements generated for the entry added by Insert.
// This has the same net effect as sorting str after the string anchor+extend.
// See http://www.unicode.org/reports/tr10/#Tailoring_Example for details
// on parametric tailoring and http://unicode.org/reports/tr35/#Collation_Elements
// See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
// on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
// for full details on LDML.
//
// Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
@ -262,7 +262,7 @@ func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
a := t.anchor
// Find the first element after the anchor which differs at a level smaller or
// equal to the given level. Then insert at this position.
// See http://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
// See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
e.before = t.before
if t.before {
t.before = false

View File

@ -105,7 +105,7 @@ func makeExpansionHeader(n int) (uint32, error) {
// - v* is the replacement tertiary weight for the first rune,
// - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
const (
decompID = 0xF0000000
)
@ -121,7 +121,7 @@ func makeDecompose(t1, t2 int) (uint32, error) {
}
const (
// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
minUnified rune = 0x4E00
maxUnified = 0x9FFF
minCompatibility = 0xF900
@ -140,7 +140,7 @@ const (
// implicitPrimary returns the primary weight for the a rune
// for which there is no entry for the rune in the collation table.
// We take a different approach from the one specified in
// http://unicode.org/reports/tr10/#Implicit_Weights,
// https://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
if unicode.Is(unicode.Ideographic, r) {
@ -165,7 +165,7 @@ func implicitPrimary(r rune) int {
// [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
// We will rewrite these characters to a single CE.
// We assume the CJK values start at 0x8000.
// See http://unicode.org/reports/tr10/#Implicit_Weights
// See https://unicode.org/reports/tr10/#Implicit_Weights
func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
const (
cjkPrimaryStart = 0xFB40

View File

@ -18,7 +18,7 @@ import (
// the necessary tables.
// Any Unicode Collation Algorithm (UCA) table entry that has more than
// one rune one the left-hand side is called a contraction.
// See http://www.unicode.org/reports/tr10/#Contractions for more details.
// See https://www.unicode.org/reports/tr10/#Contractions for more details.
//
// We define the following terms:
// initial: a rune that appears as the first rune in a contraction.

View File

@ -26,7 +26,7 @@ const (
// entry is used to keep track of a single entry in the collation element table
// during building. Examples of entries can be found in the Default Unicode
// Collation Element Table.
// See http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
// See https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt.
type entry struct {
str string // same as string(runes)
runes []rune

View File

@ -193,7 +193,7 @@ func (c *Collator) compare() int {
// The returned slice will point to an allocation in Buffer and will remain
// valid until the next call to buf.Reset().
func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
return c.key(buf, c.getColElems(str))
}
@ -203,7 +203,7 @@ func (c *Collator) Key(buf *Buffer, str []byte) []byte {
// The returned slice will point to an allocation in Buffer and will retain
// valid until the next call to buf.ResetKeys().
func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
buf.init()
return c.key(buf, c.getColElemsString(str))
}

View File

@ -195,7 +195,7 @@ func openArchive() *zip.Reader {
}
// parseUCA parses a Default Unicode Collation Element Table of the format
// specified in http://www.unicode.org/reports/tr10/#File_Format.
// specified in https://www.unicode.org/reports/tr10/#File_Format.
// It returns the variable top.
func parseUCA(builder *build.Builder) {
var r io.ReadCloser

View File

@ -165,7 +165,7 @@ var (
IgnoreWidth Option = ignoreWidth
ignoreWidth = Option{2, ignoreWidthF}
// Loose sets the collator to ignore diacritics, case and weight.
// Loose sets the collator to ignore diacritics, case and width.
Loose Option = loose
loose = Option{4, looseF}
@ -217,7 +217,7 @@ func Reorder(s ...string) Option {
// alternateHandling identifies the various ways in which variables are handled.
// A rune with a primary weight lower than the variable top is considered a
// variable.
// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
// See https://www.unicode.org/reports/tr10/#Variable_Weighting for details.
type alternateHandling int
const (

249
vendor/golang.org/x/text/encoding/charmap/charmap.go generated vendored Normal file
View File

@ -0,0 +1,249 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go
// Package charmap provides simple character encodings such as IBM Code Page 437
// and Windows 1252.
package charmap // import "golang.org/x/text/encoding/charmap"
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// These encodings vary only in the way clients should interpret them. Their
// coded character set is identical and a single implementation can be shared.
var (
// ISO8859_6E is the ISO 8859-6E encoding.
ISO8859_6E encoding.Encoding = &iso8859_6E
// ISO8859_6I is the ISO 8859-6I encoding.
ISO8859_6I encoding.Encoding = &iso8859_6I
// ISO8859_8E is the ISO 8859-8E encoding.
ISO8859_8E encoding.Encoding = &iso8859_8E
// ISO8859_8I is the ISO 8859-8I encoding.
ISO8859_8I encoding.Encoding = &iso8859_8I
// Each unexported value below reuses ISO8859_6 or ISO8859_8 as its Encoding
// and differs from that base only in its Name and MIB identifier.
iso8859_6E = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6E",
MIB: identifier.ISO88596E,
}
iso8859_6I = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6I",
MIB: identifier.ISO88596I,
}
iso8859_8E = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8E",
MIB: identifier.ISO88598E,
}
iso8859_8I = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8I",
MIB: identifier.ISO88598I,
}
)
// All is a list of all defined encodings in this package.
//
// It aliases listAll, which maketables.go emits into the generated tables.go.
var All []encoding.Encoding = listAll
// TODO: implement these encodings, in order of importance.
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
// ISO8859_9: Close to Windows 1254.
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
// len is the number of bytes of data that are in use.
len uint8
// data holds the UTF-8 bytes. Three bytes are enough because the table
// generator (maketables.go) rejects any rune whose encoding is longer.
data [3]byte
}
// Charmap is an 8-bit character set encoding.
type Charmap struct {
// name is the encoding's name.
name string
// mib is the encoding type of this encoder.
mib identifier.MIB
// asciiSuperset states whether the encoding is a superset of ASCII.
asciiSuperset bool
// low is the lower bound of the encoded byte for a non-ASCII rune. If
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
// It is also the index at which the binary search over encode starts.
low uint8
// replacement is the encoded replacement character.
replacement byte
// decode is the map from encoded byte to UTF-8.
decode [256]utf8Enc
// encode is the map from runes to encoded bytes. Each entry is a
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
// the rune. The table entries are sorted by ascending rune.
encode [256]uint32
}
// NewDecoder returns a Decoder that turns bytes of this charmap into UTF-8.
// It implements the encoding.Encoding interface.
func (m *Charmap) NewDecoder() *encoding.Decoder {
	dec := charmapDecoder{charmap: m}
	return &encoding.Decoder{Transformer: dec}
}
// NewEncoder returns an Encoder that turns UTF-8 into bytes of this charmap.
// It implements the encoding.Encoding interface.
func (m *Charmap) NewEncoder() *encoding.Encoder {
	enc := charmapEncoder{charmap: m}
	return &encoding.Encoder{Transformer: enc}
}
// String reports the name of the Charmap.
func (m *Charmap) String() string {
	name := m.name
	return name
}
// ID reports the Charmap's MIB identifier; the second result is always empty.
// It implements an internal interface.
func (m *Charmap) ID() (mib identifier.MIB, other string) {
	mib, other = m.mib, ""
	return mib, other
}
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
// NOTE(review): NopResetter presumably supplies a no-op Reset, which fits a
// decoder that keeps no state between calls — confirm in package transform.
transform.NopResetter
// charmap supplies the decode table and the asciiSuperset flag.
charmap *Charmap
}
// Transform decodes charmap bytes from src into UTF-8 in dst. Each source
// byte is translated independently, so atEOF is not consulted. When dst
// cannot hold the next decoded rune, it returns transform.ErrShortDst with
// nSrc pointing at the byte that did not fit.
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		c := src[nSrc]

		// ASCII bytes of an ASCII-superset charmap are copied verbatim.
		if m.charmap.asciiSuperset && c < utf8.RuneSelf {
			if nDst >= len(dst) {
				return nDst, nSrc, transform.ErrShortDst
			}
			dst[nDst] = c
			nDst++
			nSrc++
			continue
		}

		enc := &m.charmap.decode[c]
		width := int(enc.len)
		if nDst+width > len(dst) {
			return nDst, nSrc, transform.ErrShortDst
		}
		// A manual loop is kept here: the original measured it as faster
		// than copy for these tiny slices.
		for k := 0; k < width; k++ {
			dst[nDst+k] = enc.data[k]
		}
		nDst += width
		nSrc++
	}
	return nDst, nSrc, nil
}
// DecodeByte returns the rune that the Charmap assigns to the byte b. The
// rune is rebuilt from the one-, two- or three-byte UTF-8 sequence stored in
// the decode table; any stored length other than 1 or 2 is treated as 3.
func (m *Charmap) DecodeByte(b byte) rune {
	e := &m.decode[b]
	lead := rune(e.data[0])
	if e.len == 1 {
		return lead
	}
	second := rune(e.data[1] & 0x3f)
	if e.len == 2 {
		return (lead&0x1f)<<6 | second
	}
	third := rune(e.data[2] & 0x3f)
	return (lead&0x0f)<<12 | second<<6 | third
}
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
// NOTE(review): NopResetter presumably supplies a no-op Reset, which fits an
// encoder that keeps no state between calls — confirm in package transform.
transform.NopResetter
// charmap supplies the encode table, its lower search bound, the
// asciiSuperset flag and the replacement byte.
charmap *Charmap
}
// Transform encodes UTF-8 text from src into dst, writing one byte per rune.
//
// It stops with transform.ErrShortDst when dst is full and with
// transform.ErrShortSrc when src ends inside a rune and atEOF is false.
// Invalid UTF-8, or a rune missing from the encode table, stops it with a
// repertoire error built from the charmap's replacement byte; nSrc is then
// left at the start of the offending input.
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for nSrc < len(src) {
// Every encoded rune needs exactly one byte of dst.
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
if m.charmap.asciiSuperset {
// ASCII maps to itself, so the table lookup is skipped.
nSrc++
dst[nDst] = uint8(r)
nDst++
continue
}
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
} else {
err = internal.RepertoireError(m.charmap.replacement)
}
break
}
}
// Binary search in [low, high) for that rune in the m.charmap.encode table.
for low, high := int(m.charmap.low), 0x100; ; {
if low >= high {
// Not in the table: leave the outer loop without consuming the rune.
err = internal.RepertoireError(m.charmap.replacement)
break loop
}
mid := (low + high) / 2
got := m.charmap.encode[mid]
// The low 24 bits of an entry hold the rune, the high 8 bits the byte.
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
dst[nDst] = byte(got >> 24)
nDst++
break
}
}
nSrc += size
}
return nDst, nSrc, err
}
// EncodeRune returns the byte that the Charmap uses for the rune r. ok
// reports whether r belongs to the Charmap's repertoire; when it does not, b
// is the Charmap's replacement byte, which is often the ASCII substitute
// character '\x1a'.
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
	if m.asciiSuperset && r < utf8.RuneSelf {
		return byte(r), true
	}

	// Each table entry keeps the rune in its low 24 bits and the encoded
	// byte in its high 8 bits; entries are sorted by rune.
	const runeMask = 1<<24 - 1
	lo, hi := int(m.low), len(m.encode)
	for lo < hi {
		mid := (lo + hi) / 2
		entry := m.encode[mid]
		switch cur := rune(entry & runeMask); {
		case cur < r:
			lo = mid + 1
		case cur > r:
			hi = mid
		default:
			return byte(entry >> 24), true
		}
	}
	return m.replacement, false
}

556
vendor/golang.org/x/text/encoding/charmap/maketables.go generated vendored Normal file
View File

@ -0,0 +1,556 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/internal/gen"
)
// ascii holds the 128 ASCII characters in code point order. getWHATWG
// prepends it to the upper half it downloads, and main treats any mapping
// that starts with it as an ASCII superset.
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./0123456789:;<=>?` +
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
var encodings = []struct {
name string
mib string
comment string
varName string
replacement byte
mapping string
}{
{
"IBM Code Page 037",
"IBM037",
"",
"CodePage037",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
},
{
"IBM Code Page 437",
"PC8CodePage437",
"",
"CodePage437",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
},
{
"IBM Code Page 850",
"PC850Multilingual",
"",
"CodePage850",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
},
{
"IBM Code Page 852",
"PCp852",
"",
"CodePage852",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
},
{
"IBM Code Page 855",
"IBM855",
"",
"CodePage855",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
},
{
"Windows Code Page 858", // PC latin1 with Euro
"IBM00858",
"",
"CodePage858",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
},
{
"IBM Code Page 860",
"IBM860",
"",
"CodePage860",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
},
{
"IBM Code Page 862",
"PC862LatinHebrew",
"",
"CodePage862",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
},
{
"IBM Code Page 863",
"IBM863",
"",
"CodePage863",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
},
{
"IBM Code Page 865",
"IBM865",
"",
"CodePage865",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
},
{
"IBM Code Page 866",
"IBM866",
"",
"CodePage866",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-ibm866.txt",
},
{
"IBM Code Page 1047",
"IBM1047",
"",
"CodePage1047",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
},
{
"IBM Code Page 1140",
"IBM01140",
"",
"CodePage1140",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
},
{
"ISO 8859-1",
"ISOLatin1",
"",
"ISO8859_1",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
},
{
"ISO 8859-2",
"ISOLatin2",
"",
"ISO8859_2",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
},
{
"ISO 8859-3",
"ISOLatin3",
"",
"ISO8859_3",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
},
{
"ISO 8859-4",
"ISOLatin4",
"",
"ISO8859_4",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
},
{
"ISO 8859-5",
"ISOLatinCyrillic",
"",
"ISO8859_5",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
},
{
"ISO 8859-6",
"ISOLatinArabic",
"",
"ISO8859_6,ISO8859_6E,ISO8859_6I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
},
{
"ISO 8859-7",
"ISOLatinGreek",
"",
"ISO8859_7",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
},
{
"ISO 8859-8",
"ISOLatinHebrew",
"",
"ISO8859_8,ISO8859_8E,ISO8859_8I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
},
{
"ISO 8859-9",
"ISOLatin5",
"",
"ISO8859_9",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
},
{
"ISO 8859-10",
"ISOLatin6",
"",
"ISO8859_10",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
},
{
"ISO 8859-13",
"ISO885913",
"",
"ISO8859_13",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
},
{
"ISO 8859-14",
"ISO885914",
"",
"ISO8859_14",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
},
{
"ISO 8859-15",
"ISO885915",
"",
"ISO8859_15",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
},
{
"ISO 8859-16",
"ISO885916",
"",
"ISO8859_16",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
},
{
"KOI8-R",
"KOI8R",
"",
"KOI8R",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
},
{
"KOI8-U",
"KOI8U",
"",
"KOI8U",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
},
{
"Macintosh",
"Macintosh",
"",
"Macintosh",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-macintosh.txt",
},
{
"Macintosh Cyrillic",
"MacintoshCyrillic",
"",
"MacintoshCyrillic",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
},
{
"Windows 874",
"Windows874",
"",
"Windows874",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-874.txt",
},
{
"Windows 1250",
"Windows1250",
"",
"Windows1250",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
},
{
"Windows 1251",
"Windows1251",
"",
"Windows1251",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
},
{
"Windows 1252",
"Windows1252",
"",
"Windows1252",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
},
{
"Windows 1253",
"Windows1253",
"",
"Windows1253",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
},
{
"Windows 1254",
"Windows1254",
"",
"Windows1254",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
},
{
"Windows 1255",
"Windows1255",
"",
"Windows1255",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
},
{
"Windows 1256",
"Windows1256",
"",
"Windows1256",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
},
{
"Windows 1257",
"Windows1257",
"",
"Windows1257",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
},
{
"Windows 1258",
"Windows1258",
"",
"Windows1258",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
},
{
"X-User-Defined",
"XUserDefined",
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
"XUserDefined",
encoding.ASCIISub,
ascii +
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
},
}
// getWHATWG downloads a WHATWG single-byte index file from url and returns
// the complete mapping as a string: the ASCII range followed by the 128
// upper-half runes read from the index.
//
// Index positions that the file does not mention, and entries that map to
// the range [0x80, 0xa0), are left as U+FFFD. Any download, status, read or
// parse failure terminates the program through log.Fatalf.
func getWHATWG(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// Reject error responses up front instead of feeding an error page to
	// the index parser.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 128)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		x, y := 0, 0
		if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		if x < 0 || 128 <= x {
			log.Fatalf("code %d is out of range", x)
		}
		if 0x80 <= y && y < 0xa0 {
			// We diverge from the WHATWG spec by mapping control characters
			// in the range [0x80, 0xa0) to U+FFFD.
			continue
		}
		mapping[x] = rune(y)
	}
	// Scan returns false on a read error as well as on EOF; without this
	// check a truncated download would silently produce a partial table.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: read: %v", url, err)
	}
	return ascii + string(mapping)
}
// getUCM downloads an ICU .ucm mapping file from url and returns the
// 256-rune mapping it describes as a string.
//
// Only lines of the form `<Uxxxx> \xNN |0` are used; every other line is
// ignored, and bytes without such a line stay U+FFFD. The program is
// terminated through log.Fatalf on a download, status or read failure, or
// when fewer than 200 mappings are found.
func getUCM(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// Reject error responses up front so the failure names the real cause.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 256)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	charsFound := 0
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		var c byte
		var r rune
		// Lines that do not match the mapping format are skipped on purpose.
		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
			continue
		}
		mapping[c] = r
		charsFound++
	}
	// Scan returns false on a read error as well as on EOF; without this
	// check a truncated download could pass as a complete table.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: read: %v", url, err)
	}

	if charsFound < 200 {
		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
	}

	return string(mapping)
}
// main generates tables.go for package charmap. For every entry of encodings
// it obtains the 256-rune mapping (downloading it when the entry holds a
// URL), then emits a Charmap literal with the decode table, the
// rune-sorted encode table and the entry's metadata. It finishes by emitting
// listAll, the list of every generated variable name.
func main() {
	mibs := map[string]bool{}
	all := []string{}

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "charmap")

	printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }

	printf("import (\n")
	printf("\t\"golang.org/x/text/encoding\"\n")
	printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
	printf(")\n\n")
	for _, e := range encodings {
		// Only the first name gets a Charmap; all names go into listAll.
		varNames := strings.Split(e.varName, ",")
		all = append(all, varNames...)
		varName := varNames[0]
		switch {
		case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
			e.mapping = getWHATWG(e.mapping)
		case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
			e.mapping = getUCM(e.mapping)
		}

		asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
		if asciiSuperset {
			low = 0x80
		}
		// The unexported variable name lower-cases the leading letter, or
		// the whole "ISO"/"KOI" prefix.
		lvn := 1
		if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
			lvn = 3
		}
		lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
		printf("// %s is the %s encoding.\n", varName, e.name)
		if e.comment != "" {
			printf("//\n// %s\n", e.comment)
		}
		printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
			varName, lowerVarName, lowerVarName, e.name)
		if mibs[e.mib] {
			log.Fatalf("MIB type %q declared multiple times.", e.mib)
		}
		// Record the MIB; without this the duplicate check above never fires.
		mibs[e.mib] = true
		printf("mib: identifier.%s,\n", e.mib)
		printf("asciiSuperset: %t,\n", asciiSuperset)
		printf("low: 0x%02x,\n", low)
		printf("replacement: 0x%02x,\n", e.replacement)

		printf("decode: [256]utf8Enc{\n")
		i, backMapping := 0, map[rune]byte{}
		for _, c := range e.mapping {
			// The first byte that maps to a rune wins; U+FFFD is never encodable.
			if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
				backMapping[c] = byte(i)
			}
			var buf [8]byte
			n := utf8.EncodeRune(buf[:], c)
			if n > 3 {
				panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
			}
			printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
			if i%2 == 1 {
				printf("\n")
			}
			i++
		}
		printf("},\n")

		printf("encode: [256]uint32{\n")
		encode := make([]uint32, 0, 256)
		for c, i := range backMapping {
			encode = append(encode, uint32(i)<<24|uint32(c))
		}
		sort.Sort(byRune(encode))
		// Pad to 256 entries by repeating the last one so the table stays sorted.
		for len(encode) < cap(encode) {
			encode = append(encode, encode[len(encode)-1])
		}
		for i, enc := range encode {
			printf("0x%08x,", enc)
			if i%8 == 7 {
				printf("\n")
			}
		}
		printf("},\n}\n")

		// Add an estimate of the size of a single Charmap{} struct value, which
		// includes two 256 elem arrays of 4 bytes and some extra fields, which
		// align to 3 uint64s on 64-bit architectures.
		w.Size += 2*4*256 + 3*8
	}
	// TODO: add proper line breaking.
	printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
}
// byRune sorts encode-table entries by the rune held in their low 24 bits.
// It implements sort.Interface.
type byRune []uint32

// Len reports the number of entries.
func (b byRune) Len() int {
	return len(b)
}

// Less reports whether entry i holds a smaller rune than entry j; the
// encoded byte in the high 8 bits is ignored.
func (b byRune) Less(i, j int) bool {
	const runeMask = 0xffffff
	left, right := b[i]&runeMask, b[j]&runeMask
	return left < right
}

// Swap exchanges entries i and j.
func (b byRune) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}

7410
vendor/golang.org/x/text/encoding/charmap/tables.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

335
vendor/golang.org/x/text/encoding/encoding.go generated vendored Normal file
View File

@ -0,0 +1,335 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"
import (
"errors"
"io"
"strconv"
"unicode/utf8"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// TODO:
// - There seems to be some inconsistency in when decoders return errors
// and when not. Also documentation seems to suggest they shouldn't return
// errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
// normal form. Perhaps add an example how users could prepare their output.
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
// NewDecoder returns a Decoder, which converts bytes of this encoding
// to UTF-8.
NewDecoder() *Decoder
// NewEncoder returns an Encoder, which converts UTF-8 to bytes of this
// encoding.
NewEncoder() *Encoder
}
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Source bytes that do not belong to the encoding are not reported as an
// error per se. Each byte that cannot be transcoded is represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	transform.Transformer

	// The blank field makes unkeyed struct literals impossible for external
	// creators of Decoders, so fields can be added later without breaking
	// their code.
	_ struct{}
}

// Bytes decodes b to UTF-8. On success it returns the converted bytes; on
// any error it returns nil together with that error.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
	out, _, err := transform.Bytes(d, b)
	if err == nil {
		return out, nil
	}
	return nil, err
}

// String decodes s to UTF-8. On success it returns the converted string; on
// any error it returns "" together with that error.
func (d *Decoder) String(s string) (string, error) {
	out, _, err := transform.String(d, s)
	if err == nil {
		return out, nil
	}
	return "", err
}

// Reader wraps r so that the bytes read from it are decoded.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
	decoding := transform.NewReader(r, d)
	return decoding
}
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source bytes up to, but not including, the
// offending rune. Source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	transform.Transformer

	// The blank field makes unkeyed struct literals impossible for external
	// creators of Encoders, so fields can be added later without breaking
	// their code.
	_ struct{}
}

// Bytes encodes the UTF-8 bytes b. On success it returns the converted
// bytes; on any error it returns nil together with that error.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
	out, _, err := transform.Bytes(e, b)
	if err == nil {
		return out, nil
	}
	return nil, err
}

// String encodes the UTF-8 string s. On success it returns the converted
// string; on any error it returns "" together with that error.
func (e *Encoder) String(s string) (string, error) {
	out, _, err := transform.String(e, s)
	if err == nil {
		return out, nil
	}
	return "", err
}

// Writer wraps w so that UTF-8 written to the result reaches w encoded.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
	encoding := transform.NewWriter(w, e)
	return encoding
}
// ASCIISub is the ASCII substitute character (byte 0x1a), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
// Nop is the nop encoding. Bytes pass through it unchanged in both
// directions; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}

// nop is the stateless implementation behind Nop.
type nop struct{}

// NewDecoder returns a Decoder backed by transform.Nop.
func (nop) NewDecoder() *Decoder {
	dec := new(Decoder)
	dec.Transformer = transform.Nop
	return dec
}

// NewEncoder returns an Encoder backed by transform.Nop.
func (nop) NewEncoder() *Encoder {
	enc := new(Encoder)
	enc.Transformer = transform.Nop
	return enc
}
// Replacement is the replacement encoding. Decoding from it yields a single
// '\uFFFD' replacement rune. Encoding UTF-8 to it reproduces the source
// bytes, except that invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}

// replacement is the stateless implementation behind Replacement.
type replacement struct{}

// NewDecoder returns a Decoder backed by replacementDecoder.
func (replacement) NewDecoder() *Decoder {
	dec := new(Decoder)
	dec.Transformer = replacementDecoder{}
	return dec
}

// NewEncoder returns an Encoder backed by replacementEncoder.
func (replacement) NewEncoder() *Encoder {
	enc := new(Encoder)
	enc.Transformer = replacementEncoder{}
	return enc
}

// ID reports identifier.Replacement as the MIB; the second result is always
// empty.
func (replacement) ID() (mib identifier.MIB, other string) {
	mib, other = identifier.Replacement, ""
	return mib, other
}
// replacementDecoder is the Transformer used by the replacement encoding's
// Decoder. It has no fields of its own.
type replacementDecoder struct{ transform.NopResetter }
// Transform discards all of src and, once atEOF is set, writes a single
// U+FFFD to dst. It returns transform.ErrShortDst without consuming anything
// when dst has room for fewer than three bytes, regardless of atEOF.
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	const fffd = "\ufffd"
	if len(dst) < len(fffd) {
		return 0, 0, transform.ErrShortDst
	}
	if atEOF {
		nDst = copy(dst, fffd)
	}
	return nDst, len(src), nil
}
// replacementEncoder is the Transformer used by the replacement encoding's
// Encoder. It has no fields of its own.
type replacementEncoder struct{ transform.NopResetter }
// Transform copies src to dst rune by rune, substituting U+FFFD for every
// byte that is not part of valid UTF-8. It returns transform.ErrShortSrc
// when src ends inside a rune and atEOF is false, and transform.ErrShortDst
// when the next rune does not fit in dst; in both cases nSrc points at the
// start of the rune that was not processed.
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		rest := src[nSrc:]

		// Start from the single-byte interpretation and only run the full
		// decoder for bytes outside ASCII.
		r, width := rune(rest[0]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(rest)
			if width == 1 {
				// A width of 1 here means invalid UTF-8 or a rune that has
				// not fully arrived yet.
				if !atEOF && !utf8.FullRune(rest) {
					return nDst, nSrc, transform.ErrShortSrc
				}
				r = '\ufffd'
			}
		}

		if nDst+utf8.RuneLen(r) > len(dst) {
			return nDst, nSrc, transform.ErrShortDst
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
		nSrc += width
	}
	return nDst, nSrc, nil
}
// HTMLEscapeUnsupported returns an Encoder that wraps e and replaces source
// runes outside the repertoire of the destination encoding with HTML escape
// sequences.
//
// This wrapper exists to comply with URL and HTML forms that require a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss because they are indistinguishable from legitimate input. To avoid
// this issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
	h := &errorHandler{Encoder: e, handler: errorToHTML}
	return &Encoder{Transformer: h}
}
// ReplaceUnsupported wraps an encoder so that source runes outside the
// repertoire of the destination encoding are written as the
// encoding-specific replacement byte instead of causing an error.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
	wrapped := &errorHandler{Encoder: e, handler: errorToReplacement}
	return &Encoder{Transformer: wrapped}
}
// errorHandler wraps an Encoder and recovers from repertoire errors by
// asking handler to write a substitute for the offending rune.
type errorHandler struct {
*Encoder
// handler writes a substitute for r into dst. It returns the number of
// bytes written and ok == false when dst is too small.
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
// TODO: consider making this error public in some form.
// repertoireError is implemented by errors that carry a single-byte
// replacement for a rune the destination encoding cannot represent.
type repertoireError interface {
Replacement() byte
}
// Transform runs the wrapped Transformer and, each time it stops with an error
// that implements repertoireError, lets h.handler write a substitute for the
// rune at the failure position and then resumes on the remaining input.
// Any other error is returned unchanged. If the handler reports that dst is
// too small, transform.ErrShortDst is returned with the offending rune left
// unconsumed.
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
for err != nil {
rerr, ok := err.(repertoireError)
if !ok {
// Not a repertoire problem: propagate as is.
return nDst, nSrc, err
}
// Decode the rune the wrapped transformer stopped at.
r, sz := utf8.DecodeRune(src[nSrc:])
n, ok := h.handler(dst[nDst:], r, rerr)
if !ok {
return nDst, nSrc, transform.ErrShortDst
}
err = nil
nDst += n
// Skip the substituted rune and continue with whatever input is left.
if nSrc += sz; nSrc < len(src) {
var dn, sn int
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
nDst += dn
nSrc += sn
}
}
return nDst, nSrc, err
}
// errorToHTML writes the HTML decimal character reference for r ("&#NNN;")
// to the start of dst. It returns the number of bytes written and ok == true,
// or 0 and ok == false when dst cannot hold the whole sequence. The err
// argument is unused; it is present so the function matches the handler
// signature of errorHandler.
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	buf := [8]byte{}
	b := strconv.AppendUint(buf[:0], uint64(r), 10)
	// The sequence occupies exactly n bytes (the last write is at index n-1),
	// so a destination of length n is sufficient.
	if n = len(b) + len("&#;"); n > len(dst) {
		return 0, false
	}
	dst[0] = '&'
	dst[1] = '#'
	dst[copy(dst[2:], b)+2] = ';'
	return n, true
}
// errorToReplacement stores the replacement byte carried by err in dst[0].
// It reports one byte written and ok == true, or 0 and ok == false when dst
// is empty. The rune r itself is not inspected.
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
	if len(dst) > 0 {
		dst[0] = err.Replacement()
		n, ok = 1, true
	}
	return n, ok
}
// ErrInvalidUTF8 is reported when a transformer encounters invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")

// UTF8Validator is a transformer that copies its input unchanged and returns
// ErrInvalidUTF8 at the first input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}

// utf8Validator is the stateless implementation behind UTF8Validator.
type utf8Validator struct{ transform.NopResetter }

// Transform copies src to dst for as long as src is valid UTF-8. It stops
// with ErrInvalidUTF8 at an invalid byte, with transform.ErrShortSrc at an
// incomplete trailing rune when atEOF is false, and with
// transform.ErrShortDst when dst cannot hold the next rune or the whole input.
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// Never look past what dst can hold.
	limit := len(src)
	if len(dst) < limit {
		limit = len(dst)
	}
	for pos := 0; pos < limit; {
		if b := src[pos]; b < utf8.RuneSelf {
			dst[pos] = b
			pos++
			continue
		}
		_, width := utf8.DecodeRune(src[pos:])
		switch {
		case width == 1:
			// A non-ASCII byte decoded with width 1 is either invalid
			// UTF-8 or the start of a rune that has not fully arrived.
			if !atEOF && !utf8.FullRune(src[pos:]) {
				return pos, pos, transform.ErrShortSrc
			}
			return pos, pos, ErrInvalidUTF8
		case pos+width > len(dst):
			return pos, pos, transform.ErrShortDst
		}
		pos += copy(dst[pos:], src[pos:pos+width])
	}
	if len(src) > len(dst) {
		err = transform.ErrShortDst
	}
	return limit, limit, err
}

173
vendor/golang.org/x/text/encoding/htmlindex/gen.go generated vendored Normal file
View File

@ -0,0 +1,173 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
// group is the shape of one element of the top-level JSON array decoded from
// encodings.json: a list of encodings, each with a name and its labels.
type group struct {
Encodings []struct {
Labels []string
Name string
}
}
// main downloads encodings.json from the WHATWG Encoding Standard and writes
// tables.go for package htmlindex: the htmlEncoding constants, the canonical
// name table, the label-to-encoding map and the locale default table.
func main() {
	gen.Init()

	src := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
	var groups []group
	if err := json.NewDecoder(src).Decode(&groups); err != nil {
		log.Fatalf("Error reading encodings.json: %v", err)
	}

	out := &bytes.Buffer{}

	// Constant block: one htmlEncoding per encoding, in file order.
	fmt.Fprintln(out, "type htmlEncoding byte")
	fmt.Fprintln(out, "const (")
	for gi, g := range groups {
		for _, enc := range g.Encodings {
			canon := strings.ToLower(enc.Name)
			ident := consts[canon]
			if ident == "" {
				log.Fatalf("No const defined for %s.", canon)
			}
			if gi != 0 {
				fmt.Fprintf(out, "%s\n", ident)
				continue
			}
			fmt.Fprintf(out, "%s htmlEncoding = iota\n", ident)
		}
	}
	fmt.Fprintln(out, "numEncodings")
	fmt.Fprint(out, ")\n\n")

	// Canonical names, indexed by htmlEncoding.
	fmt.Fprintln(out, "var canonical = [numEncodings]string{")
	for _, g := range groups {
		for _, enc := range g.Encodings {
			fmt.Fprintf(out, "%q,\n", strings.ToLower(enc.Name))
		}
	}
	fmt.Fprint(out, "}\n\n")

	// Every label maps to the constant of the encoding that lists it.
	fmt.Fprintln(out, "var nameMap = map[string]htmlEncoding{")
	for _, g := range groups {
		for _, enc := range g.Encodings {
			ident := consts[strings.ToLower(enc.Name)]
			for _, label := range enc.Labels {
				fmt.Fprintf(out, "%q: %s,\n", label, ident)
			}
		}
	}
	fmt.Fprint(out, "}\n\n")

	// Locale defaults, plus the space-separated tag list in the same order.
	tags := make([]string, 0, len(locales))
	fmt.Fprintln(out, "var localeMap = []htmlEncoding{")
	for _, loc := range locales {
		tags = append(tags, loc.tag)
		fmt.Fprintf(out, "%s, // %s \n", consts[loc.name], loc.tag)
	}
	fmt.Fprint(out, "}\n\n")
	fmt.Fprintf(out, "const locales = %q\n", strings.Join(tags, " "))

	gen.WriteGoFile("tables.go", "htmlindex", out.Bytes())
}
// consts maps canonical encoding name to internal constant.
//
// Keys are lower-cased encoding names as they appear in encodings.json;
// values are the Go identifiers emitted into the generated tables. main
// aborts when it meets an encoding name that has no entry here.
var consts = map[string]string{
"utf-8": "utf8",
"ibm866": "ibm866",
"iso-8859-2": "iso8859_2",
"iso-8859-3": "iso8859_3",
"iso-8859-4": "iso8859_4",
"iso-8859-5": "iso8859_5",
"iso-8859-6": "iso8859_6",
"iso-8859-7": "iso8859_7",
"iso-8859-8": "iso8859_8",
"iso-8859-8-i": "iso8859_8I",
"iso-8859-10": "iso8859_10",
"iso-8859-13": "iso8859_13",
"iso-8859-14": "iso8859_14",
"iso-8859-15": "iso8859_15",
"iso-8859-16": "iso8859_16",
"koi8-r": "koi8r",
"koi8-u": "koi8u",
"macintosh": "macintosh",
"windows-874": "windows874",
"windows-1250": "windows1250",
"windows-1251": "windows1251",
"windows-1252": "windows1252",
"windows-1253": "windows1253",
"windows-1254": "windows1254",
"windows-1255": "windows1255",
"windows-1256": "windows1256",
"windows-1257": "windows1257",
"windows-1258": "windows1258",
"x-mac-cyrillic": "macintoshCyrillic",
"gbk": "gbk",
"gb18030": "gb18030",
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
"big5": "big5",
"euc-jp": "eucjp",
"iso-2022-jp": "iso2022jp",
"shift_jis": "shiftJIS",
"euc-kr": "euckr",
"replacement": "replacement",
"utf-16be": "utf16be",
"utf-16le": "utf16le",
"x-user-defined": "xUserDefined",
}
// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
//
// Each entry pairs a language tag with the canonical name of its default
// encoding; the name must be a key of consts. The order of the entries
// determines the order of the generated localeMap and locales tables.
var locales = []struct{ tag, name string }{
// The default value. Explicitly state latin to benefit from the exact
// script option, while still making 1252 the default encoding for languages
// written in Latin script.
{"und_Latn", "windows-1252"},
{"ar", "windows-1256"},
{"ba", "windows-1251"},
{"be", "windows-1251"},
{"bg", "windows-1251"},
{"cs", "windows-1250"},
{"el", "iso-8859-7"},
{"et", "windows-1257"},
{"fa", "windows-1256"},
{"he", "windows-1255"},
{"hr", "windows-1250"},
{"hu", "iso-8859-2"},
{"ja", "shift_jis"},
{"kk", "windows-1251"},
{"ko", "euc-kr"},
{"ku", "windows-1254"},
{"ky", "windows-1251"},
{"lt", "windows-1257"},
{"lv", "windows-1257"},
{"mk", "windows-1251"},
{"pl", "iso-8859-2"},
{"ru", "windows-1251"},
{"sah", "windows-1251"},
{"sk", "windows-1250"},
{"sl", "iso-8859-2"},
{"sr", "windows-1251"},
{"tg", "windows-1251"},
{"th", "windows-874"},
{"tr", "windows-1254"},
{"tt", "windows-1251"},
{"uk", "windows-1251"},
{"vi", "windows-1258"},
{"zh-hans", "gb18030"},
{"zh-hant", "big5"},
}

View File

@ -0,0 +1,86 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package htmlindex maps character set encoding names to Encodings as
// recommended by the W3C for use in HTML 5. See http://www.w3.org/TR/encoding.
package htmlindex
// TODO: perhaps have a "bare" version of the index (used by this package) that
// is not pre-loaded with all encodings. Global variables in encodings prevent
// the linker from being able to purge unneeded tables. This means that
// referencing all encodings, as this package does for the default index, links
// in all encodings unconditionally.
//
// This issue can be solved by either solving the linking issue (see
// https://github.com/golang/go/issues/6330) or refactoring the encoding tables
// (e.g. moving the tables to internal packages that do not use global
// variables).
// TODO: allow canonicalizing names
import (
"errors"
"strings"
"sync"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/language"
)
// Errors returned by the lookup functions of this package.
var (
// errInvalidName is returned by Get when the name is not in nameMap.
errInvalidName = errors.New("htmlindex: invalid encoding name")
// errUnknown is returned by Name when the Encoding does not implement
// identifier.Interface or reports a zero MIB.
errUnknown = errors.New("htmlindex: unknown Encoding")
// errUnsupported is returned by Name when the MIB is not in mibMap.
errUnsupported = errors.New("htmlindex: this encoding is not supported")
)
// matcher is built on first use by LanguageDefault; matcherOnce guards its
// initialization.
var (
	matcherOnce sync.Once
	matcher     language.Matcher
)

// LanguageDefault returns the canonical name of the default encoding for a
// given language.
func LanguageDefault(tag language.Tag) string {
	matcherOnce.Do(func() {
		// locales is a space-separated list whose order matches localeMap.
		names := strings.Split(locales, " ")
		supported := make([]language.Tag, len(names))
		for i, name := range names {
			supported[i] = language.MustParse(name)
		}
		matcher = language.NewMatcher(supported, language.PreferSameScript(true))
	})
	_, index, _ := matcher.Match(tag)
	// Index 0 is the und_Latn entry, i.e. windows-1252.
	return canonical[localeMap[index]]
}
// Get returns an Encoding for one of the names listed in
// http://www.w3.org/TR/encoding using the Default Index. Surrounding white
// space is ignored and matching is case-insensitive. An unknown name yields
// errInvalidName.
func Get(name string) (encoding.Encoding, error) {
	label := strings.ToLower(strings.TrimSpace(name))
	if index, found := nameMap[label]; found {
		return encodings[index], nil
	}
	return nil, errInvalidName
}
// Name reports the canonical name of the given Encoding. It returns
// errUnknown when e does not implement identifier.Interface or reports a zero
// MIB, and errUnsupported when the MIB is not part of this index.
func Name(e encoding.Encoding) (string, error) {
	if id, ok := e.(identifier.Interface); ok {
		if mib, _ := id.ID(); mib != 0 {
			if index, found := mibMap[mib]; found {
				return canonical[index], nil
			}
			return "", errUnsupported
		}
	}
	return "", errUnknown
}

105
vendor/golang.org/x/text/encoding/htmlindex/map.go generated vendored Normal file
View File

@ -0,0 +1,105 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package htmlindex
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)
// mibMap maps a MIB identifier to an htmlEncoding index.
// Name uses it to translate the MIB reported by an Encoding into an index
// into the canonical name table.
var mibMap = map[identifier.MIB]htmlEncoding{
identifier.UTF8: utf8,
identifier.UTF16BE: utf16be,
identifier.UTF16LE: utf16le,
identifier.IBM866: ibm866,
identifier.ISOLatin2: iso8859_2,
identifier.ISOLatin3: iso8859_3,
identifier.ISOLatin4: iso8859_4,
identifier.ISOLatinCyrillic: iso8859_5,
identifier.ISOLatinArabic: iso8859_6,
identifier.ISOLatinGreek: iso8859_7,
identifier.ISOLatinHebrew: iso8859_8,
identifier.ISO88598I: iso8859_8I,
identifier.ISOLatin6: iso8859_10,
identifier.ISO885913: iso8859_13,
identifier.ISO885914: iso8859_14,
identifier.ISO885915: iso8859_15,
identifier.ISO885916: iso8859_16,
identifier.KOI8R: koi8r,
identifier.KOI8U: koi8u,
identifier.Macintosh: macintosh,
identifier.MacintoshCyrillic: macintoshCyrillic,
identifier.Windows874: windows874,
identifier.Windows1250: windows1250,
identifier.Windows1251: windows1251,
identifier.Windows1252: windows1252,
identifier.Windows1253: windows1253,
identifier.Windows1254: windows1254,
identifier.Windows1255: windows1255,
identifier.Windows1256: windows1256,
identifier.Windows1257: windows1257,
identifier.Windows1258: windows1258,
identifier.XUserDefined: xUserDefined,
identifier.GBK: gbk,
identifier.GB18030: gb18030,
identifier.Big5: big5,
identifier.EUCPkdFmtJapanese: eucjp,
identifier.ISO2022JP: iso2022jp,
identifier.ShiftJIS: shiftJIS,
identifier.EUCKR: euckr,
identifier.Replacement: replacement,
}
// encodings maps the internal htmlEncoding to an Encoding.
// Get indexes this array with the value found in nameMap. Both UTF-16
// variants are constructed with unicode.IgnoreBOM.
// TODO: consider using a reusable index in encoding/internal.
var encodings = [numEncodings]encoding.Encoding{
utf8: unicode.UTF8,
ibm866: charmap.CodePage866,
iso8859_2: charmap.ISO8859_2,
iso8859_3: charmap.ISO8859_3,
iso8859_4: charmap.ISO8859_4,
iso8859_5: charmap.ISO8859_5,
iso8859_6: charmap.ISO8859_6,
iso8859_7: charmap.ISO8859_7,
iso8859_8: charmap.ISO8859_8,
iso8859_8I: charmap.ISO8859_8I,
iso8859_10: charmap.ISO8859_10,
iso8859_13: charmap.ISO8859_13,
iso8859_14: charmap.ISO8859_14,
iso8859_15: charmap.ISO8859_15,
iso8859_16: charmap.ISO8859_16,
koi8r: charmap.KOI8R,
koi8u: charmap.KOI8U,
macintosh: charmap.Macintosh,
windows874: charmap.Windows874,
windows1250: charmap.Windows1250,
windows1251: charmap.Windows1251,
windows1252: charmap.Windows1252,
windows1253: charmap.Windows1253,
windows1254: charmap.Windows1254,
windows1255: charmap.Windows1255,
windows1256: charmap.Windows1256,
windows1257: charmap.Windows1257,
windows1258: charmap.Windows1258,
macintoshCyrillic: charmap.MacintoshCyrillic,
gbk: simplifiedchinese.GBK,
gb18030: simplifiedchinese.GB18030,
big5: traditionalchinese.Big5,
eucjp: japanese.EUCJP,
iso2022jp: japanese.ISO2022JP,
shiftJIS: japanese.ShiftJIS,
euckr: korean.EUCKR,
replacement: encoding.Replacement,
utf16be: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
utf16le: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
xUserDefined: charmap.XUserDefined,
}

353
vendor/golang.org/x/text/encoding/htmlindex/tables.go generated vendored Normal file
View File

@ -0,0 +1,353 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package htmlindex
type htmlEncoding byte
const (
utf8 htmlEncoding = iota
ibm866
iso8859_2
iso8859_3
iso8859_4
iso8859_5
iso8859_6
iso8859_7
iso8859_8
iso8859_8I
iso8859_10
iso8859_13
iso8859_14
iso8859_15
iso8859_16
koi8r
koi8u
macintosh
windows874
windows1250
windows1251
windows1252
windows1253
windows1254
windows1255
windows1256
windows1257
windows1258
macintoshCyrillic
gbk
gb18030
big5
eucjp
iso2022jp
shiftJIS
euckr
replacement
utf16be
utf16le
xUserDefined
numEncodings
)
var canonical = [numEncodings]string{
"utf-8",
"ibm866",
"iso-8859-2",
"iso-8859-3",
"iso-8859-4",
"iso-8859-5",
"iso-8859-6",
"iso-8859-7",
"iso-8859-8",
"iso-8859-8-i",
"iso-8859-10",
"iso-8859-13",
"iso-8859-14",
"iso-8859-15",
"iso-8859-16",
"koi8-r",
"koi8-u",
"macintosh",
"windows-874",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"x-mac-cyrillic",
"gbk",
"gb18030",
"big5",
"euc-jp",
"iso-2022-jp",
"shift_jis",
"euc-kr",
"replacement",
"utf-16be",
"utf-16le",
"x-user-defined",
}
var nameMap = map[string]htmlEncoding{
"unicode-1-1-utf-8": utf8,
"utf-8": utf8,
"utf8": utf8,
"866": ibm866,
"cp866": ibm866,
"csibm866": ibm866,
"ibm866": ibm866,
"csisolatin2": iso8859_2,
"iso-8859-2": iso8859_2,
"iso-ir-101": iso8859_2,
"iso8859-2": iso8859_2,
"iso88592": iso8859_2,
"iso_8859-2": iso8859_2,
"iso_8859-2:1987": iso8859_2,
"l2": iso8859_2,
"latin2": iso8859_2,
"csisolatin3": iso8859_3,
"iso-8859-3": iso8859_3,
"iso-ir-109": iso8859_3,
"iso8859-3": iso8859_3,
"iso88593": iso8859_3,
"iso_8859-3": iso8859_3,
"iso_8859-3:1988": iso8859_3,
"l3": iso8859_3,
"latin3": iso8859_3,
"csisolatin4": iso8859_4,
"iso-8859-4": iso8859_4,
"iso-ir-110": iso8859_4,
"iso8859-4": iso8859_4,
"iso88594": iso8859_4,
"iso_8859-4": iso8859_4,
"iso_8859-4:1988": iso8859_4,
"l4": iso8859_4,
"latin4": iso8859_4,
"csisolatincyrillic": iso8859_5,
"cyrillic": iso8859_5,
"iso-8859-5": iso8859_5,
"iso-ir-144": iso8859_5,
"iso8859-5": iso8859_5,
"iso88595": iso8859_5,
"iso_8859-5": iso8859_5,
"iso_8859-5:1988": iso8859_5,
"arabic": iso8859_6,
"asmo-708": iso8859_6,
"csiso88596e": iso8859_6,
"csiso88596i": iso8859_6,
"csisolatinarabic": iso8859_6,
"ecma-114": iso8859_6,
"iso-8859-6": iso8859_6,
"iso-8859-6-e": iso8859_6,
"iso-8859-6-i": iso8859_6,
"iso-ir-127": iso8859_6,
"iso8859-6": iso8859_6,
"iso88596": iso8859_6,
"iso_8859-6": iso8859_6,
"iso_8859-6:1987": iso8859_6,
"csisolatingreek": iso8859_7,
"ecma-118": iso8859_7,
"elot_928": iso8859_7,
"greek": iso8859_7,
"greek8": iso8859_7,
"iso-8859-7": iso8859_7,
"iso-ir-126": iso8859_7,
"iso8859-7": iso8859_7,
"iso88597": iso8859_7,
"iso_8859-7": iso8859_7,
"iso_8859-7:1987": iso8859_7,
"sun_eu_greek": iso8859_7,
"csiso88598e": iso8859_8,
"csisolatinhebrew": iso8859_8,
"hebrew": iso8859_8,
"iso-8859-8": iso8859_8,
"iso-8859-8-e": iso8859_8,
"iso-ir-138": iso8859_8,
"iso8859-8": iso8859_8,
"iso88598": iso8859_8,
"iso_8859-8": iso8859_8,
"iso_8859-8:1988": iso8859_8,
"visual": iso8859_8,
"csiso88598i": iso8859_8I,
"iso-8859-8-i": iso8859_8I,
"logical": iso8859_8I,
"csisolatin6": iso8859_10,
"iso-8859-10": iso8859_10,
"iso-ir-157": iso8859_10,
"iso8859-10": iso8859_10,
"iso885910": iso8859_10,
"l6": iso8859_10,
"latin6": iso8859_10,
"iso-8859-13": iso8859_13,
"iso8859-13": iso8859_13,
"iso885913": iso8859_13,
"iso-8859-14": iso8859_14,
"iso8859-14": iso8859_14,
"iso885914": iso8859_14,
"csisolatin9": iso8859_15,
"iso-8859-15": iso8859_15,
"iso8859-15": iso8859_15,
"iso885915": iso8859_15,
"iso_8859-15": iso8859_15,
"l9": iso8859_15,
"iso-8859-16": iso8859_16,
"cskoi8r": koi8r,
"koi": koi8r,
"koi8": koi8r,
"koi8-r": koi8r,
"koi8_r": koi8r,
"koi8-ru": koi8u,
"koi8-u": koi8u,
"csmacintosh": macintosh,
"mac": macintosh,
"macintosh": macintosh,
"x-mac-roman": macintosh,
"dos-874": windows874,
"iso-8859-11": windows874,
"iso8859-11": windows874,
"iso885911": windows874,
"tis-620": windows874,
"windows-874": windows874,
"cp1250": windows1250,
"windows-1250": windows1250,
"x-cp1250": windows1250,
"cp1251": windows1251,
"windows-1251": windows1251,
"x-cp1251": windows1251,
"ansi_x3.4-1968": windows1252,
"ascii": windows1252,
"cp1252": windows1252,
"cp819": windows1252,
"csisolatin1": windows1252,
"ibm819": windows1252,
"iso-8859-1": windows1252,
"iso-ir-100": windows1252,
"iso8859-1": windows1252,
"iso88591": windows1252,
"iso_8859-1": windows1252,
"iso_8859-1:1987": windows1252,
"l1": windows1252,
"latin1": windows1252,
"us-ascii": windows1252,
"windows-1252": windows1252,
"x-cp1252": windows1252,
"cp1253": windows1253,
"windows-1253": windows1253,
"x-cp1253": windows1253,
"cp1254": windows1254,
"csisolatin5": windows1254,
"iso-8859-9": windows1254,
"iso-ir-148": windows1254,
"iso8859-9": windows1254,
"iso88599": windows1254,
"iso_8859-9": windows1254,
"iso_8859-9:1989": windows1254,
"l5": windows1254,
"latin5": windows1254,
"windows-1254": windows1254,
"x-cp1254": windows1254,
"cp1255": windows1255,
"windows-1255": windows1255,
"x-cp1255": windows1255,
"cp1256": windows1256,
"windows-1256": windows1256,
"x-cp1256": windows1256,
"cp1257": windows1257,
"windows-1257": windows1257,
"x-cp1257": windows1257,
"cp1258": windows1258,
"windows-1258": windows1258,
"x-cp1258": windows1258,
"x-mac-cyrillic": macintoshCyrillic,
"x-mac-ukrainian": macintoshCyrillic,
"chinese": gbk,
"csgb2312": gbk,
"csiso58gb231280": gbk,
"gb2312": gbk,
"gb_2312": gbk,
"gb_2312-80": gbk,
"gbk": gbk,
"iso-ir-58": gbk,
"x-gbk": gbk,
"gb18030": gb18030,
"big5": big5,
"big5-hkscs": big5,
"cn-big5": big5,
"csbig5": big5,
"x-x-big5": big5,
"cseucpkdfmtjapanese": eucjp,
"euc-jp": eucjp,
"x-euc-jp": eucjp,
"csiso2022jp": iso2022jp,
"iso-2022-jp": iso2022jp,
"csshiftjis": shiftJIS,
"ms932": shiftJIS,
"ms_kanji": shiftJIS,
"shift-jis": shiftJIS,
"shift_jis": shiftJIS,
"sjis": shiftJIS,
"windows-31j": shiftJIS,
"x-sjis": shiftJIS,
"cseuckr": euckr,
"csksc56011987": euckr,
"euc-kr": euckr,
"iso-ir-149": euckr,
"korean": euckr,
"ks_c_5601-1987": euckr,
"ks_c_5601-1989": euckr,
"ksc5601": euckr,
"ksc_5601": euckr,
"windows-949": euckr,
"csiso2022kr": replacement,
"hz-gb-2312": replacement,
"iso-2022-cn": replacement,
"iso-2022-cn-ext": replacement,
"iso-2022-kr": replacement,
"replacement": replacement,
"utf-16be": utf16be,
"utf-16": utf16le,
"utf-16le": utf16le,
"x-user-defined": xUserDefined,
}
var localeMap = []htmlEncoding{
windows1252, // und_Latn
windows1256, // ar
windows1251, // ba
windows1251, // be
windows1251, // bg
windows1250, // cs
iso8859_7, // el
windows1257, // et
windows1256, // fa
windows1255, // he
windows1250, // hr
iso8859_2, // hu
shiftJIS, // ja
windows1251, // kk
euckr, // ko
windows1254, // ku
windows1251, // ky
windows1257, // lt
windows1257, // lv
windows1251, // mk
iso8859_2, // pl
windows1251, // ru
windows1251, // sah
windows1250, // sk
iso8859_2, // sl
windows1251, // sr
windows1251, // tg
windows874, // th
windows1254, // tr
windows1251, // tt
windows1251, // uk
windows1258, // vi
gb18030, // zh-hans
big5, // zh-hant
}
const locales = "und_Latn ar ba be bg cs el et fa he hr hu ja kk ko ku ky lt lv mk pl ru sah sk sl sr tg th tr tt uk vi zh-hans zh-hant"

View File

@ -0,0 +1,142 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
// registry mirrors the parts of the IANA character-sets.xml document that the
// generator reads: the top-level <registry> element, its nested <registry>
// elements, and the <record> entries inside them.
type registry struct {
XMLName xml.Name `xml:"registry"`
Updated string `xml:"updated"`
Registry []struct {
ID string `xml:"id,attr"`
Record []struct {
Name string `xml:"name"`
Xref []struct {
Type string `xml:"type,attr"`
Data string `xml:"data,attr"`
} `xml:"xref"`
// Desc keeps the raw inner XML of <description> so that main can
// re-tokenize it and handle embedded <xref> elements itself.
Desc struct {
Data string `xml:",innerxml"`
// Any []struct {
// Data string `xml:",chardata"`
// } `xml:",any"`
// Data string `xml:",chardata"`
} `xml:"description,"`
MIB string `xml:"value"`
Alias []string `xml:"alias"`
MIME string `xml:"preferred_alias"`
} `xml:"record"`
} `xml:"registry"`
}
// main reads the IANA character set registry and writes mib.go for package
// identifier: one MIB constant per record, preceded by a comment built from
// the record's name, preferred MIME alias, description and references.
func main() {
	r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
	reg := &registry{}
	if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
		log.Fatalf("Error decoding charset registry: %v", err)
	}
	// Check for an empty registry separately: indexing Registry[0] to build
	// the error message would otherwise panic instead of reporting the problem.
	if len(reg.Registry) == 0 {
		log.Fatal("No registry found in character-sets.xml")
	}
	if id := reg.Registry[0].ID; id != "character-sets-1" {
		log.Fatalf("Unexpected ID %s", id)
	}

	w := &bytes.Buffer{}
	fmt.Fprintf(w, "const (\n")
	for _, rec := range reg.Registry[0].Record {
		// The constant name is derived from the record's "cs" alias; when
		// several aliases qualify, the last one wins.
		constName := ""
		for _, a := range rec.Alias {
			if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 {
				// Some of the constant definitions have comments in them. Strip those.
				constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
			}
		}
		if constName == "" {
			switch rec.MIB {
			case "2085":
				constName = "HZGB2312" // Not listed as alias for some reason.
			default:
				log.Fatalf("No cs alias defined for %s.", rec.MIB)
			}
		}
		if rec.MIME != "" {
			rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME)
		}
		fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME)
		if len(rec.Desc.Data) > 0 {
			// Re-tokenize the raw description so that text and embedded
			// <xref> elements can be flattened into comment lines.
			fmt.Fprint(w, "// ")
			d := xml.NewDecoder(strings.NewReader(rec.Desc.Data))
			inElem := true
			attr := ""
			for {
				t, err := d.Token()
				if err != nil {
					if err != io.EOF {
						log.Fatal(err)
					}
					break
				}
				switch x := t.(type) {
				case xml.CharData:
					attr = "" // Don't need attribute info.
					a := bytes.Split([]byte(x), []byte("\n"))
					for i, b := range a {
						if b = bytes.TrimSpace(b); len(b) != 0 {
							if !inElem && i > 0 {
								fmt.Fprint(w, "\n// ")
							}
							inElem = false
							fmt.Fprintf(w, "%s ", string(b))
						}
					}
				case xml.StartElement:
					if x.Name.Local == "xref" {
						inElem = true
						use := false
						for _, a := range x.Attr {
							if a.Name.Local == "type" {
								use = use || a.Value != "person"
							}
							if a.Name.Local == "data" && use {
								// Patch up URLs to use https. From some links, the
								// https version is different from the http one.
								s := a.Value
								s = strings.Replace(s, "http://", "https://", -1)
								s = strings.Replace(s, "/unicode/", "/", -1)
								attr = s + " "
							}
						}
					}
				case xml.EndElement:
					inElem = false
					fmt.Fprint(w, attr)
				}
			}
			fmt.Fprint(w, "\n")
		}
		for _, x := range rec.Xref {
			switch x.Type {
			case "rfc":
				fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data))
			case "uri":
				fmt.Fprintf(w, "// Reference: %s\n", x.Data)
			}
		}
		fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB)
		fmt.Fprintln(w)
	}
	fmt.Fprintln(w, ")")

	gen.WriteGoFile("mib.go", "identifier", w.Bytes())
}

View File

@ -0,0 +1,81 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and
// commonly used subset. This allows Index implementations to include all
// available encodings without explicitly referencing or knowing about them.
package identifier
// Note: this package is internal, but could be made public if there is a need
// for writing third-party Indexes and Encodings.
// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5
// Interface can be implemented by Encodings to define the CCS or CES for which
// it implements conversions.
type Interface interface {
// ID returns an encoding identifier. Exactly one of the mib and other
// values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)
// NOTE: the restrictions on the other string are to allow extending the
// syntax with additional information such as versions, vendors and other
// variants.
}
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
// some identifiers for some encodings that are not covered by the IANA
// standard.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16
// These additional MIB types are not defined in IANA. They are added because
// they are common and defined within the text repo.
// The values are consecutive, starting at 10000 for Unofficial.
const (
// Unofficial marks the start of encodings not registered by IANA.
Unofficial MIB = 10000 + iota
// Replacement is the WhatWG replacement encoding.
Replacement
// XUserDefined is the code for x-user-defined.
XUserDefined
// MacintoshCyrillic is the code for x-mac-cyrillic.
MacintoshCyrillic
)

File diff suppressed because it is too large Load Diff

75
vendor/golang.org/x/text/encoding/internal/internal.go generated vendored Normal file
View File

@ -0,0 +1,75 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package internal contains code that is shared among encoding implementations.
package internal
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// Encoding wraps an existing encoding.Encoding and gives it a name (returned
// by String) and a MIB identifier (returned by ID).
type Encoding struct {
	encoding.Encoding
	Name string
	MIB  identifier.MIB
}

// Compile-time check that *Encoding satisfies identifier.Interface.
var _ identifier.Interface = (*Encoding)(nil)

// String returns the Name of the encoding.
func (e *Encoding) String() string { return e.Name }

// ID returns the MIB of the encoding; the other string is always empty.
func (e *Encoding) ID() (mib identifier.MIB, other string) {
	mib = e.MIB
	return mib, other
}
// SimpleEncoding is an Encoding built from a fixed pair of Transformers. The
// same Transformer values are handed out on every call.
type SimpleEncoding struct {
	Decoder transform.Transformer
	Encoder transform.Transformer
}

// NewDecoder returns a Decoder that uses e.Decoder.
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
	dec := new(encoding.Decoder)
	dec.Transformer = e.Decoder
	return dec
}

// NewEncoder returns an Encoder that uses e.Encoder.
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
	enc := new(encoding.Encoder)
	enc.Transformer = e.Encoder
	return enc
}
// FuncEncoding is an Encoding built from a pair of factory functions. Each
// call to NewDecoder or NewEncoder invokes the matching function to obtain a
// Transformer.
type FuncEncoding struct {
	Decoder func() transform.Transformer
	Encoder func() transform.Transformer
}

// NewDecoder returns a Decoder around the Transformer produced by e.Decoder.
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
	dec := new(encoding.Decoder)
	dec.Transformer = e.Decoder()
	return dec
}

// NewEncoder returns an Encoder around the Transformer produced by e.Encoder.
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
	enc := new(encoding.Encoder)
	enc.Transformer = e.Encoder()
	return enc
}
// A RepertoireError indicates a rune is not in the repertoire of a destination
// encoding. It is associated with an encoding-specific suggested replacement
// byte, which is the value of the RepertoireError itself.
type RepertoireError byte
// Error implements the error interface.
func (r RepertoireError) Error() string {
return "encoding: rune not supported by encoding."
}
// Replacement returns the replacement byte associated with this error.
func (r RepertoireError) Replacement() byte { return byte(r) }
// ErrASCIIReplacement is the RepertoireError whose replacement byte is
// encoding.ASCIISub.
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)

12
vendor/golang.org/x/text/encoding/japanese/all.go generated vendored Normal file
View File

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package: EUCJP, ISO2022JP
// and ShiftJIS.
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}

225
vendor/golang.org/x/text/encoding/japanese/eucjp.go generated vendored Normal file
View File

@ -0,0 +1,225 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// EUCJP is the EUC-JP encoding.
var EUCJP encoding.Encoding = &eucJP

// eucJP pairs the EUC-JP transformers with the encoding's name and MIB.
var eucJP = internal.Encoding{
	Encoding: &internal.SimpleEncoding{
		Decoder: eucJPDecoder{},
		Encoder: eucJPEncoder{},
	},
	Name: "EUC-JP",
	MIB:  identifier.EUCPkdFmtJapanese,
}
// eucJPDecoder converts EUC-JP bytes to UTF-8. It keeps no state between
// calls.
type eucJPDecoder struct{ transform.NopResetter }
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
//
// Transform decodes one character per iteration, selecting the form by its
// lead byte. Undecodable input is replaced by utf8.RuneError. A sequence that
// is cut off by the end of src yields transform.ErrShortSrc unless atEOF is
// set; a full dst yields transform.ErrShortDst. In both cases the offending
// bytes are left unconsumed.
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
// ASCII passes through unchanged.
r, size = rune(c0), 1
case c0 == 0x8e:
// Two-byte form: 0x8e followed by a byte in 0xa1..0xdf, which maps
// to the rune c1 + (0xff61 - 0xa1).
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
switch {
case c1 < 0xa1:
r, size = utf8.RuneError, 1
case c1 > 0xdf:
r, size = utf8.RuneError, 2
if c1 == 0xff {
size = 1
}
default:
r, size = rune(c1)+(0xff61-0xa1), 2
}
case c0 == 0x8f:
// Three-byte form: 0x8f followed by two bytes in 0xa1..0xfe, looked
// up in jis0212Decode.
if nSrc+2 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// Truncated at end of input: consume the lead byte and, if a
// second byte in 0xa1..0xfd is present, that byte as well.
r, size = utf8.RuneError, 1
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe {
size = 2
}
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
c2 := src[nSrc+2]
if c2 < 0xa1 || 0xfe < c2 {
r, size = utf8.RuneError, 2
break
}
r, size = utf8.RuneError, 3
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
// A zero table entry means the code point is unassigned.
if r == 0 {
r = utf8.RuneError
}
}
case 0xa1 <= c0 && c0 <= 0xfe:
// Two-byte form: both bytes in 0xa1..0xfe, looked up in
// jis0208Decode.
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
r, size = utf8.RuneError, 2
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
// A zero table entry means the code point is unassigned.
if r == 0 {
r = utf8.RuneError
}
}
default:
// Any other lead byte (0x80..0x8d, 0x90..0xa0, 0xff) is invalid.
r, size = utf8.RuneError, 1
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// eucJPEncoder transforms UTF-8 into EUC-JP. It embeds transform.NopResetter
// and declares no fields of its own.
type eucJPEncoder struct{ transform.NopResetter }
// Transform encodes UTF-8 from src as EUC-JP in dst and returns the number of
// bytes written to dst and consumed from src.
//
// It stops with transform.ErrShortSrc when src ends inside a UTF-8 sequence
// and atEOF is false, with transform.ErrShortDst when the encoded bytes do
// not fit in dst, and with internal.ErrASCIIReplacement when a rune has no
// entry in the encode tables. In each case the offending rune is not counted
// in nSrc.
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
// On a hit, r is overwritten with the table value; zero means the rune
// has no mapping.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2or3
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2or3
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2or3
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2or3
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2or3
}
case encode5Low <= r && r < encode5High:
// U+FF61..U+FF9F bypass the table and use the 0x8e two-byte form.
if 0xff61 <= r && r < 0xffa0 {
goto write2
}
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2or3
}
}
// No table covers the rune, or its table entry is zero.
err = internal.ErrASCIIReplacement
break
}
// ASCII: one byte, copied through.
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2or3:
// r holds a table value. When its bits above tableShift equal jis0208 the
// code is written as two bytes; otherwise it is written as three bytes
// with a leading 0x8f.
if r>>tableShift == jis0208 {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
} else {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = 0x8f
nDst++
}
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
nDst += 2
continue
write2:
// r is still the original rune in U+FF61..U+FF9F; it is written as 0x8e
// followed by the rune shifted down into the 0xa1.. byte range.
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
	// The encoder's switch statement is written by hand with one case per
	// generated encode table, so the table count must match it exactly.
	const switchCases = 6
	if numEncodeTables == switchCases {
		return
	}
	panic("bad numEncodeTables")
}

299
vendor/golang.org/x/text/encoding/japanese/iso2022jp.go generated vendored Normal file
View File

@ -0,0 +1,299 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ISO2022JP is the ISO-2022-JP encoding.
var ISO2022JP encoding.Encoding = &iso2022JP
// iso2022JP is the value ISO2022JP points to. Its decoder and encoder are
// created on demand through iso2022JPNewDecoder and iso2022JPNewEncoder.
var iso2022JP = internal.Encoding{
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
"ISO-2022-JP",
identifier.ISO2022JP,
}
// iso2022JPNewDecoder returns a newly allocated ISO-2022-JP decoder whose
// state is the zero value of iso2022JPDecoder.
func iso2022JPNewDecoder() transform.Transformer {
	var dec iso2022JPDecoder
	return &dec
}

// iso2022JPNewEncoder returns a newly allocated ISO-2022-JP encoder whose
// state is the zero value of iso2022JPEncoder.
func iso2022JPNewEncoder() transform.Transformer {
	var enc iso2022JPEncoder
	return &enc
}
// States of the ISO-2022-JP decoder and encoder. Each state names the
// character set selected by the most recent escape sequence; asciiState is
// the zero value.
const (
asciiState = iota
katakanaState
jis0208State
jis0212State
)
// asciiEsc is the ESC byte that starts every escape sequence.
const asciiEsc = 0x1b
// iso2022JPDecoder transforms ISO-2022-JP into UTF-8. Its value is the
// current state: asciiState, katakanaState, jis0208State or jis0212State.
type iso2022JPDecoder int
// Reset returns the decoder to asciiState.
func (d *iso2022JPDecoder) Reset() {
*d = asciiState
}
// Transform decodes ISO-2022-JP bytes from src into UTF-8 in dst and returns
// the number of bytes written to dst and consumed from src.
//
// Escape sequences change the decoder state and produce no output. It
// returns transform.ErrShortSrc when an escape sequence or two-byte character
// is cut off and atEOF is false, and transform.ErrShortDst when the next rune
// does not fit in dst. Undecodable input is emitted as U+FFFD.
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
// Bytes with the high bit set are rejected in every state.
if c0 >= utf8.RuneSelf {
r, size = '\ufffd', 1
goto write
}
if c0 == asciiEsc {
// Every recognized escape sequence is at least three bytes long.
if nSrc+2 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
// TODO: is it correct to only skip 1??
r, size = '\ufffd', 1
goto write
}
size = 3
c1 := src[nSrc+1]
c2 := src[nSrc+2]
switch {
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
*d = jis0208State
continue
case c1 == '$' && c2 == '(': // 0x24 0x28
// Four-byte sequence; only ESC $ ( D is recognized.
if nSrc+3 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 4
if src[nSrc+3] == 'D' {
*d = jis0212State
continue
}
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
*d = asciiState
continue
case c1 == '(' && c2 == 'I': // 0x28 0x49
*d = katakanaState
continue
}
// Unrecognized escape sequence: report the ESC byte alone as an error
// and resume decoding at the following byte.
r, size = '\ufffd', 1
goto write
}
switch *d {
case asciiState:
r, size = rune(c0), 1
case katakanaState:
// Bytes in [0x21, 0x5f] are shifted onto U+FF61..U+FF9F; anything else
// is an error.
if c0 < 0x21 || 0x60 <= c0 {
r, size = '\ufffd', 1
goto write
}
r, size = rune(c0)+(0xff61-0x21), 1
default:
// jis0208State or jis0212State: characters are two bytes wide.
// A line feed switches back to asciiState and is passed through.
if c0 == 0x0a {
*d = asciiState
r, size = rune(c0), 1
goto write
}
if nSrc+1 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 2
c1 := src[nSrc+1]
// c0 and c1 are bytes, so a value below 0x21 wraps around on subtraction
// and yields an index that fails the length checks below.
i := int(c0-0x21)*94 + int(c1-0x21)
if *d == jis0208State && i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
} else if *d == jis0212State && i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
} else {
r = '\ufffd'
goto write
}
// A zero table entry is treated as "no mapping".
if r == 0 {
r = '\ufffd'
}
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// iso2022JPEncoder transforms UTF-8 into ISO-2022-JP. Its value is the state
// selected by the last escape sequence it wrote: asciiState, katakanaState or
// jis0208State.
type iso2022JPEncoder int
// Reset returns the encoder to asciiState.
func (e *iso2022JPEncoder) Reset() {
*e = asciiState
}
// Transform encodes UTF-8 from src as ISO-2022-JP in dst and returns the
// number of bytes written to dst and consumed from src.
//
// An escape sequence is written whenever the character set needed for the
// next rune differs from the current state, and ESC ( B is appended at EOF
// if the encoder is not in asciiState. It stops with transform.ErrShortSrc,
// transform.ErrShortDst, or internal.ErrASCIIReplacement (rune has no JIS
// 0208 mapping); in each case the offending rune is not counted in nSrc.
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
//
// https://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
// is not used by the iso-2022-jp encoder due to lack of widespread support".
//
// TODO: do we have to special-case U+00A5 and U+203E, as per
// https://encoding.spec.whatwg.org/#iso-2022-jp
// Doing so would mean that "\u00a5" would not be preserved
// after an encode-decode round trip.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode5Low <= r && r < encode5High:
// U+FF61..U+FF9F bypass the table and use the katakana state.
if 0xff61 <= r && r < 0xffa0 {
goto writeKatakana
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto writeJIS
}
}
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
err = internal.ErrASCIIReplacement
break
}
// ASCII rune: switch to asciiState first if necessary. The four-byte check
// covers the three-byte escape sequence plus the ASCII byte itself.
if *e != asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
writeJIS:
// r holds a JIS 0208 table value. The five-byte check covers the escape
// sequence ESC $ B plus the two code bytes.
if *e != jis0208State {
if nDst+5 > len(dst) {
err = transform.ErrShortDst
break
}
*e = jis0208State
dst[nDst+0] = asciiEsc
dst[nDst+1] = '$'
dst[nDst+2] = 'B'
nDst += 3
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0x21 + uint8(r)&codeMask
nDst += 2
continue
writeKatakana:
// r is still the original rune in U+FF61..U+FF9F. The four-byte check
// covers the escape sequence ESC ( I plus the single code byte.
if *e != katakanaState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = katakanaState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'I'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r - (0xff61 - 0x21))
nDst++
continue
}
// At the end of error-free input, leave the output in asciiState.
if atEOF && err == nil && *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
} else {
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
}
return nDst, nSrc, err
}

View File

@ -0,0 +1,161 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
// TODO: Emoji extensions?
// https://www.unicode.org/faq/emoji_dingbats.html
// https://www.unicode.org/Public/UNIDATA/EmojiSources.txt
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
// entry is one slot of the reverse (Unicode to JIS) map built by main.
// jisCode is the index read from the source table and table is the position
// of that table in main's tables slice; main uses table == -1 to mark a code
// point that no table maps.
type entry struct {
jisCode, table int
}
// main downloads the WHATWG index-jis0208 and index-jis0212 files and prints
// the Go source for tables.go on standard output: one decode table per index
// file, followed by the Unicode-to-JIS encode tables and their constants.
//
// Any download, HTTP status, parse or range error terminates the program via
// log.Fatalf.
func main() {
	fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
	fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
	fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")

	// reverse maps a Unicode code point to the first JIS code seen for it;
	// table == -1 marks a code point that has no mapping yet.
	reverse := [65536]entry{}
	for i := range reverse {
		reverse[i].table = -1
	}

	tables := []struct {
		url  string
		name string
	}{
		{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
		{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
	}
	for i, table := range tables {
		res, err := http.Get(table.url)
		if err != nil {
			log.Fatalf("%q: Get: %v", table.url, err)
		}
		// Reject error pages up front instead of failing later with a
		// misleading parse error.
		if res.StatusCode != http.StatusOK {
			res.Body.Close()
			log.Fatalf("%q: Get: unexpected status %s", table.url, res.Status)
		}

		mapping := [65536]uint16{}

		scanner := bufio.NewScanner(res.Body)
		for scanner.Scan() {
			s := strings.TrimSpace(scanner.Text())
			if s == "" || s[0] == '#' {
				continue
			}
			x, y := 0, uint16(0)
			if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
				log.Fatalf("%q: could not parse %q", table.url, s)
			}
			if x < 0 || 120*94 <= x {
				log.Fatalf("%q: JIS code %d is out of range", table.url, x)
			}
			mapping[x] = y
			// Keep the first JIS code seen for each code point.
			if reverse[y].table == -1 {
				reverse[y] = entry{jisCode: x, table: i}
			}
		}
		// Close the body as soon as this table has been read. A defer here
		// would keep every response open until main returns.
		err = scanner.Err()
		res.Body.Close()
		if err != nil {
			log.Fatalf("%q: scanner error: %v", table.url, err)
		}

		fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
			table.name, table.name, table.url)
		fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
		for i, m := range mapping {
			if m != 0 {
				fmt.Printf("\t%d: 0x%04X,\n", i, m)
			}
		}
		fmt.Printf("}\n\n")
	}

	// Any run of at least separation continuous zero entries in the reverse map will
	// be a separate encode table.
	const separation = 1024

	intervals := []interval(nil)
	low, high := -1, -1
	for i, v := range reverse {
		if v.table == -1 {
			continue
		}
		if low < 0 {
			low = i
		} else if i-high >= separation {
			if high >= 0 {
				intervals = append(intervals, interval{low, high})
			}
			low = i
		}
		high = i + 1
	}
	if high >= 0 {
		intervals = append(intervals, interval{low, high})
	}
	sort.Sort(byDecreasingLength(intervals))

	fmt.Printf("const (\n")
	fmt.Printf("\tjis0208    = 1\n")
	fmt.Printf("\tjis0212    = 2\n")
	fmt.Printf("\tcodeMask   = 0x7f\n")
	fmt.Printf("\tcodeShift  = 7\n")
	fmt.Printf("\ttableShift = 14\n")
	fmt.Printf(")\n\n")

	fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
	fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
	fmt.Printf("// sorted by decreasing length.\n")
	for i, v := range intervals {
		fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
	}
	fmt.Printf("//\n")
	fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
	fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
	fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
	fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
	fmt.Printf("\n")

	for i, v := range intervals {
		fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
		fmt.Printf("var encode%d = [...]uint16{\n", i)
		for j := v.low; j < v.high; j++ {
			x := reverse[j]
			if x.table == -1 {
				continue
			}
			fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
				j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
		}
		fmt.Printf("}\n\n")
	}
}
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many integers the interval spans.
func (iv interval) len() int {
	return iv.high - iv.low
}

// byDecreasingLength implements sort.Interface, ordering intervals from
// longest to shortest.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether the interval at index a is longer than the one at b.
func (s byDecreasingLength) Less(a, b int) bool {
	return s[b].len() < s[a].len()
}

// Swap exchanges the intervals at indices a and b.
func (s byDecreasingLength) Swap(a, b int) {
	s[a], s[b] = s[b], s[a]
}

189
vendor/golang.org/x/text/encoding/japanese/shiftjis.go generated vendored Normal file
View File

@ -0,0 +1,189 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
// Windows-31J.
var ShiftJIS encoding.Encoding = &shiftJIS
// shiftJIS is the value ShiftJIS points to. It pairs shiftJISDecoder with
// shiftJISEncoder and carries the name "Shift JIS" and the identifier
// identifier.ShiftJIS.
var shiftJIS = internal.Encoding{
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
"Shift JIS",
identifier.ShiftJIS,
}
// shiftJISDecoder transforms Shift JIS into UTF-8. It embeds
// transform.NopResetter and declares no fields of its own.
type shiftJISDecoder struct{ transform.NopResetter }
// Transform decodes Shift JIS bytes from src into UTF-8 in dst and returns
// the number of bytes written to dst and consumed from src.
//
// It stops with transform.ErrShortSrc when a two-byte character is cut off
// and atEOF is false, and with transform.ErrShortDst when the next rune does
// not fit in dst. Undecodable input is emitted as U+FFFD.
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
// ASCII is copied through unchanged.
r, size = rune(c0), 1
case 0xa1 <= c0 && c0 < 0xe0:
// Single bytes in [0xa1, 0xdf] are shifted onto U+FF61..U+FF9F.
r, size = rune(c0)+(0xff61-0xa1), 1
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
// Lead byte of a two-byte character. Fold the two lead-byte ranges into
// one contiguous range and turn the result into a row number for the
// 94-column jis0208Decode table.
if c0 <= 0x9f {
c0 -= 0x70
} else {
c0 -= 0xb0
}
c0 = 2*c0 - 0x21
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = '\ufffd', 1
goto write
}
c1 := src[nSrc+1]
// The trail byte selects the column and whether the row computed above
// must be decremented by one.
switch {
case c1 < 0x40:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x7f:
c0--
c1 -= 0x40
case c1 == 0x7f:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x9f:
c0--
c1 -= 0x41
case c1 < 0xfd:
c1 -= 0x9f
default:
r, size = '\ufffd', 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
// A zero table entry is treated as "no mapping".
if r == 0 {
r = '\ufffd'
}
}
case c0 == 0x80:
// The byte 0x80 decodes to U+0080.
r, size = 0x80, 1
default:
r, size = '\ufffd', 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// shiftJISEncoder transforms UTF-8 into Shift JIS. It embeds
// transform.NopResetter and declares no fields of its own.
type shiftJISEncoder struct{ transform.NopResetter }
// Transform encodes UTF-8 from src as Shift JIS in dst and returns the number
// of bytes written to dst and consumed from src.
//
// It stops with transform.ErrShortSrc when src ends inside a UTF-8 sequence
// and atEOF is false, with transform.ErrShortDst when the encoded bytes do
// not fit in dst, and with internal.ErrASCIIReplacement when a rune has no
// JIS 0208 mapping. In each case the offending rune is not counted in nSrc.
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
}
}
// func init checks that the switch covers all tables.
// Only table values tagged jis0208 can be written; other values fall
// through to the error below.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto write2
}
case encode5Low <= r && r < encode5High:
// U+FF61..U+FF9F map directly onto the single bytes 0xa1..0xdf.
if 0xff61 <= r && r < 0xffa0 {
r -= 0xff61 - 0xa1
goto write1
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
// Single-byte output: ASCII, or a byte computed in the encode5 case.
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
// r holds a JIS 0208 table value; j1 and j2 are its two 7-bit code fields.
j1 := uint8(r>>codeShift) & codeMask
j2 := uint8(r) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
// Lead byte: two values of j1 share one lead byte, and the lead bytes are
// split into a low range (j1 <= 61) and a high range.
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
// Trail byte: even j1 uses the range starting at 64, with j2/63 skipping
// the byte 0x7f; odd j1 uses the range starting at 159.
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue
}
return nDst, nSrc, err
}

26971
vendor/golang.org/x/text/encoding/japanese/tables.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

177
vendor/golang.org/x/text/encoding/korean/euckr.go generated vendored Normal file
View File

@ -0,0 +1,177 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package korean
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCKR}
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
var EUCKR encoding.Encoding = &eucKR
// eucKR is the value EUCKR points to. It pairs eucKRDecoder with
// eucKREncoder and carries the name "EUC-KR" and the identifier
// identifier.EUCKR.
var eucKR = internal.Encoding{
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
"EUC-KR",
identifier.EUCKR,
}
// eucKRDecoder transforms EUC-KR into UTF-8. It embeds transform.NopResetter
// and declares no fields of its own.
type eucKRDecoder struct{ transform.NopResetter }
// Transform decodes EUC-KR bytes from src into UTF-8 in dst and returns the
// number of bytes written to dst and consumed from src.
//
// It stops with transform.ErrShortSrc when a two-byte character is cut off
// and atEOF is false, and with transform.ErrShortDst when the next rune does
// not fit in dst. Undecodable input is emitted as utf8.RuneError.
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
// ASCII is copied through unchanged.
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
// Lead byte of a two-byte character.
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
size = 2
// r is first computed as an index into the decode table.
if c0 < 0xc7 {
// Rows below 0xc7 hold 178 cells: 26 for trail bytes 0x41..0x5a, 26 for
// 0x61..0x7a and 126 for 0x81..0xfe.
r = 178 * rune(c0-0x81)
switch {
case 0x41 <= c1 && c1 < 0x5b:
r += rune(c1) - (0x41 - 0*26)
case 0x61 <= c1 && c1 < 0x7b:
r += rune(c1) - (0x61 - 1*26)
case 0x81 <= c1 && c1 < 0xff:
r += rune(c1) - (0x81 - 2*26)
default:
goto decError
}
} else if 0xa1 <= c1 && c1 < 0xff {
// Rows from 0xc7 upward hold 94 cells, for trail bytes 0xa1..0xfe.
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
} else {
goto decError
}
if int(r) < len(decode) {
r = rune(decode[r])
// A non-zero entry is a valid mapping; leave the switch and write it.
if r != 0 {
break
}
}
decError:
// An ASCII trail byte is not consumed with the error, so it is decoded
// on its own in the next iteration.
r = utf8.RuneError
if c1 < utf8.RuneSelf {
size = 1
}
default:
r, size = utf8.RuneError, 1
break
}
// This break is outside the switch, so it ends the for loop.
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// eucKREncoder transforms UTF-8 into EUC-KR. It embeds transform.NopResetter
// and declares no fields of its own.
type eucKREncoder struct{ transform.NopResetter }
// Transform encodes UTF-8 from src as EUC-KR in dst and returns the number of
// bytes written to dst and consumed from src.
//
// It stops with transform.ErrShortSrc when src ends inside a UTF-8 sequence
// and atEOF is false, with transform.ErrShortDst when the encoded bytes do
// not fit in dst, and with internal.ErrASCIIReplacement when a rune has no
// entry in the encode tables. In each case the offending rune is not counted
// in nSrc.
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
// ASCII is copied through unchanged.
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
// On a hit, r is overwritten with the table value; zero means the rune
// has no mapping.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
// r holds a table value: the lead byte in its high 8 bits and the trail
// byte in its low 8 bits.
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
	// The encoder's switch statement is written by hand with one case per
	// generated encode table, so the table count must match it exactly.
	const switchCases = 7
	if numEncodeTables == switchCases {
		return
	}
	panic("bad numEncodeTables")
}

143
vendor/golang.org/x/text/encoding/korean/maketables.go generated vendored Normal file
View File

@ -0,0 +1,143 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
// main downloads the WHATWG index-euc-kr file and prints the Go source for
// tables.go on standard output: the decode table followed by the
// Unicode-to-EUC-KR encode tables and their constants.
//
// Any download, HTTP status, parse or range error terminates the program via
// log.Fatalf.
func main() {
	fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
	fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
	fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")

	res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
	if err != nil {
		log.Fatalf("Get: %v", err)
	}
	defer res.Body.Close()
	// Reject error pages up front instead of failing later with a misleading
	// parse error.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("Get: unexpected status %s", res.Status)
	}

	// mapping is indexed by EUC-KR table index and holds the code point.
	// reverse is indexed by code point and holds the two encoded bytes.
	mapping := [65536]uint16{}
	reverse := [65536]uint16{}

	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		// x is parsed as an int so that the range check below is meaningful:
		// with an unsigned x the "x < 0" test could never be true.
		x, y := 0, uint16(0)
		if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
			log.Fatalf("EUC-KR code %d is out of range", x)
		}
		mapping[x] = y
		// Keep the first encoding seen for each code point.
		if reverse[y] == 0 {
			c0, c1 := uint16(0), uint16(0)
			if x < 178*(0xc7-0x81) {
				// Rows below lead byte 0xc7 hold 178 cells, split over the
				// trail-byte ranges 0x41.., 0x61.. and 0x81...
				c0 = uint16(x/178) + 0x81
				c1 = uint16(x % 178)
				switch {
				case c1 < 1*26:
					c1 += 0x41
				case c1 < 2*26:
					c1 += 0x47
				default:
					c1 += 0x4d
				}
			} else {
				// Rows from lead byte 0xc7 upward hold 94 cells, with trail
				// bytes starting at 0xa1.
				x -= 178 * (0xc7 - 0x81)
				c0 = uint16(x/94) + 0xc7
				c1 = uint16(x%94) + 0xa1
			}
			reverse[y] = c0<<8 | c1
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanner error: %v", err)
	}

	fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
	fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
	fmt.Printf("var decode = [...]uint16{\n")
	for i, v := range mapping {
		if v != 0 {
			fmt.Printf("\t%d: 0x%04X,\n", i, v)
		}
	}
	fmt.Printf("}\n\n")

	// Any run of at least separation continuous zero entries in the reverse map will
	// be a separate encode table.
	const separation = 1024

	intervals := []interval(nil)
	low, high := -1, -1
	for i, v := range reverse {
		if v == 0 {
			continue
		}
		if low < 0 {
			low = i
		} else if i-high >= separation {
			if high >= 0 {
				intervals = append(intervals, interval{low, high})
			}
			low = i
		}
		high = i + 1
	}
	if high >= 0 {
		intervals = append(intervals, interval{low, high})
	}
	sort.Sort(byDecreasingLength(intervals))

	fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
	fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
	fmt.Printf("// sorted by decreasing length.\n")
	for i, v := range intervals {
		fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
	}
	fmt.Printf("\n")

	for i, v := range intervals {
		fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
		fmt.Printf("var encode%d = [...]uint16{\n", i)
		for j := v.low; j < v.high; j++ {
			x := reverse[j]
			if x == 0 {
				continue
			}
			fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
		}
		fmt.Printf("}\n\n")
	}
}
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many integers the interval spans.
func (iv interval) len() int {
	return iv.high - iv.low
}

// byDecreasingLength implements sort.Interface, ordering intervals from
// longest to shortest.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether the interval at index a is longer than the one at b.
func (s byDecreasingLength) Less(a, b int) bool {
	return s[b].len() < s[a].len()
}

// Swap exchanges the intervals at indices a and b.
func (s byDecreasingLength) Swap(a, b int) {
	s[a], s[b] = s[b], s[a]
}

34152
vendor/golang.org/x/text/encoding/korean/tables.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package: GB18030, GBK and
// HZGB2312.
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}

View File

@ -0,0 +1,269 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
var (
// GB18030 is the GB18030 encoding.
GB18030 encoding.Encoding = &gbk18030
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
// and is also known as Code Page 936.
GBK encoding.Encoding = &gbk
)
// gbk is the value GBK points to. Its decoder and encoder run with the
// gb18030 flag off.
var gbk = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: false},
gbkEncoder{gb18030: false},
},
"GBK",
identifier.GBK,
}
// gbk18030 is the value GB18030 points to. It uses the same decoder and
// encoder types as gbk, with the gb18030 flag on.
var gbk18030 = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: true},
gbkEncoder{gb18030: true},
},
"GB18030",
identifier.GB18030,
}
// gbkDecoder transforms GBK or GB18030 into UTF-8. The gb18030 field enables
// decoding of four-byte sequences.
type gbkDecoder struct {
transform.NopResetter
gb18030 bool
}
// Transform decodes GBK (or GB18030, when d.gb18030 is set) bytes from src
// into UTF-8 in dst and returns the number of bytes written to dst and
// consumed from src.
//
// It stops with transform.ErrShortSrc when a multi-byte sequence is cut off
// and atEOF is false, and with transform.ErrShortDst when the next rune does
// not fit in dst. Undecodable input is emitted as utf8.RuneError.
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at https://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
case c0 == 0x80:
r, size = '€', 1
case c0 < 0xff:
// Lead byte in [0x81, 0xfe] of a two-byte or four-byte sequence.
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0x80 <= c1 && c1 < 0xff:
c1 -= 0x41
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
// Four-byte GB18030 sequence: the second and fourth bytes are ASCII
// digits and the third is in [0x81, 0xfe].
if nSrc+3 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// The second byte here is always ASCII, so we can set size
// to 1 in all cases.
r, size = utf8.RuneError, 1
goto write
}
c2 := src[nSrc+2]
if c2 < 0x81 || 0xff <= c2 {
r, size = utf8.RuneError, 1
goto write
}
c3 := src[nSrc+3]
if c3 < 0x30 || 0x3a <= c3 {
r, size = utf8.RuneError, 1
goto write
}
size = 4
// Combine the four bytes into a single linear index.
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
if r < 39420 {
// Binary search the gb18030 table for the last row whose first column
// is <= r, then shift r by the difference between that row's columns.
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][0]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[1]) - rune(dec[0])
goto write
}
// Indexes from 189000 upward map linearly onto U+10000..U+10FFFF; any
// other index is an error.
r -= 189000
if 0 <= r && r < 0x100000 {
r += 0x10000
} else {
r, size = utf8.RuneError, 1
}
goto write
default:
r, size = utf8.RuneError, 1
goto write
}
// Two-byte sequence: look it up in the decode table, 190 cells per row.
r, size = '\ufffd', 2
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
r = rune(decode[i])
// A zero table entry is treated as "no mapping".
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// gbkEncoder transforms UTF-8 into GBK or GB18030. The gb18030 field enables
// four-byte output for runes that have no two-byte encoding.
type gbkEncoder struct {
transform.NopResetter
gb18030 bool
}
// Transform encodes UTF-8 from src as GBK (or GB18030, when e.gb18030 is set)
// in dst and returns the number of bytes written to dst and consumed from
// src.
//
// It stops with transform.ErrShortSrc when src ends inside a UTF-8 sequence
// and atEOF is false, with transform.ErrShortDst when the encoded bytes do
// not fit in dst, and with internal.ErrASCIIReplacement when a rune cannot be
// encoded. In each case the offending rune is not counted in nSrc.
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
// r is the rune being encoded; r2 receives the two-byte table value so that
// r stays available for the four-byte fallback.
r, r2, size := rune(0), rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at https://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
if r == '€' {
r = 0x80
goto write1
}
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
goto write2
}
}
// No two-byte encoding exists. In GB18030 mode, fall back to four bytes.
if e.gb18030 {
if r < 0x10000 {
// Binary search the gb18030 table for the last row whose second column
// is <= r, then shift r by the difference between that row's columns.
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][1]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[0]) - rune(dec[1])
goto write4
} else if r < 0x110000 {
// U+10000..U+10FFFF map linearly onto indexes starting at 189000.
r += 189000 - 0x10000
goto write4
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
// Single-byte output: ASCII, or 0x80 for the euro sign.
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
// r2 holds the lead byte in its high 8 bits and the trail byte in its low
// 8 bits.
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r2 >> 8)
dst[nDst+1] = uint8(r2)
nDst += 2
continue
write4:
// r holds a linear index; split it into the four bytes from last to first
// using the radixes 10, 126, 10.
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+3] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+2] = uint8(r%126 + 0x81)
r /= 126
dst[nDst+1] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+0] = uint8(r + 0x81)
nDst += 4
continue
}
return nDst, nSrc, err
}
func init() {
	// The encoder's switch statement is written by hand with one case per
	// generated encode table, so the table count must match it exactly.
	const switchCases = 5
	if numEncodeTables == switchCases {
		return
	}
	panic("bad numEncodeTables")
}

View File

@ -0,0 +1,245 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// HZGB2312 is the HZ-GB2312 encoding.
var HZGB2312 encoding.Encoding = &hzGB2312
// hzGB2312 is the value HZGB2312 points to. Its decoder and encoder are
// created on demand through hzGB2312NewDecoder and hzGB2312NewEncoder.
var hzGB2312 = internal.Encoding{
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
"HZ-GB2312",
identifier.HZGB2312,
}
// hzGB2312NewDecoder returns a newly allocated HZ-GB2312 decoder whose state
// is the zero value of hzGB2312Decoder.
func hzGB2312NewDecoder() transform.Transformer {
	var dec hzGB2312Decoder
	return &dec
}

// hzGB2312NewEncoder returns a newly allocated HZ-GB2312 encoder whose state
// is the zero value of hzGB2312Encoder.
func hzGB2312NewEncoder() transform.Transformer {
	var enc hzGB2312Encoder
	return &enc
}
// States of the HZ-GB2312 decoder and encoder. asciiState is the zero value;
// gbState is entered by "~{" and left by "~}".
const (
asciiState = iota
gbState
)
// hzGB2312Decoder transforms HZ-GB2312 into UTF-8. Its value is the current
// state, asciiState or gbState.
type hzGB2312Decoder int

// Reset returns the decoder to asciiState.
func (d *hzGB2312Decoder) Reset() {
	*d = asciiState
}

// Transform decodes HZ-GB2312 bytes from src into UTF-8 in dst and returns
// the number of bytes written to dst and consumed from src.
//
// The two-byte sequences "~{" and "~}" switch between gbState and
// asciiState, "~~" produces a literal '~', and '~' followed by a newline
// produces nothing. It stops with transform.ErrShortSrc when an escape or a
// two-byte character is cut off and atEOF is false, and with
// transform.ErrShortDst when the next rune does not fit in dst. Undecodable
// input is emitted as utf8.RuneError.
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	r, size := rune(0), 0
loop:
	for ; nSrc < len(src); nSrc += size {
		c0 := src[nSrc]
		// Bytes with the high bit set are rejected in every state.
		if c0 >= utf8.RuneSelf {
			r, size = utf8.RuneError, 1
			goto write
		}

		if c0 == '~' {
			if nSrc+1 >= len(src) {
				if !atEOF {
					err = transform.ErrShortSrc
					break loop
				}
				// A lone '~' at the very end of the input is an error that
				// consumes exactly one byte. size must be set here: reusing
				// the previous iteration's value would either never advance
				// (size 0) or step nSrc past len(src) (size 2).
				r, size = utf8.RuneError, 1
				goto write
			}
			size = 2
			switch src[nSrc+1] {
			case '{':
				*d = gbState
				continue
			case '}':
				*d = asciiState
				continue
			case '~':
				if nDst >= len(dst) {
					err = transform.ErrShortDst
					break loop
				}
				dst[nDst] = '~'
				nDst++
				continue
			case '\n':
				continue
			default:
				// Unknown escape: both bytes are consumed as one error.
				r = utf8.RuneError
				goto write
			}
		}

		if *d == asciiState {
			r, size = rune(c0), 1
		} else {
			// gbState: characters are two bytes wide.
			if nSrc+1 >= len(src) {
				if !atEOF {
					err = transform.ErrShortSrc
					break loop
				}
				r, size = utf8.RuneError, 1
				goto write
			}
			size = 2
			c1 := src[nSrc+1]
			if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
				// error
			} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
				// The offsets address the shared decode table, 190 cells per
				// row. A zero entry is treated as "no mapping".
				r = rune(decode[i])
				if r != 0 {
					goto write
				}
			}
			// NOTE(review): this comparison excludes 0x80 itself; confirm
			// whether ">=" was intended.
			if c1 > utf8.RuneSelf {
				// Be consistent and always treat non-ASCII as a single error.
				size = 1
			}
			r = utf8.RuneError
		}

	write:
		if nDst+utf8.RuneLen(r) > len(dst) {
			err = transform.ErrShortDst
			break loop
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
	}
	return nDst, nSrc, err
}
// hzGB2312Encoder transforms UTF-8 into HZ-GB2312. Its value is the current
// state, asciiState or gbState.
type hzGB2312Encoder int

// Reset returns the encoder to asciiState.
func (e *hzGB2312Encoder) Reset() {
	*e = asciiState
}
// Transform encodes UTF-8 from src into HZ-GB2312 in dst.
//
// ASCII bytes are written as-is, preceded by "~}" if the encoder is in the GB
// state; '~' is always written as "~~". Other runes are looked up in the
// encodeN tables and written as two bytes, preceded by "~{" if the encoder is
// in the ASCII state.
//
// A rune that cannot be encoded stops the transform with
// internal.ErrASCIIReplacement, after the encoder has switched back to the
// ASCII state. It returns transform.ErrShortSrc when src ends in an incomplete
// rune and atEOF is false, and transform.ErrShortDst when dst is too small.
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	r, size := rune(0), 0
	for ; nSrc < len(src); nSrc += size {
		r = rune(src[nSrc])

		// Decode a 1-byte rune.
		if r < utf8.RuneSelf {
			size = 1
			if r == '~' {
				if nDst+2 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst+0] = '~'
				dst[nDst+1] = '~'
				nDst += 2
				continue
			} else if *e != asciiState {
				// Room is needed for "~}" plus the ASCII byte itself.
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				*e = asciiState
				dst[nDst+0] = '~'
				dst[nDst+1] = '}'
				nDst += 2
			} else if nDst >= len(dst) {
				err = transform.ErrShortDst
				break
			}
			dst[nDst] = uint8(r)
			nDst += 1
			continue

		}

		// Decode a multi-byte rune.
		r, size = utf8.DecodeRune(src[nSrc:])
		if size == 1 {
			// All valid runes of size 1 (those below utf8.RuneSelf) were
			// handled above. We have invalid UTF-8 or we haven't seen the
			// full character yet.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}
		}

		// func init checks that the switch covers all tables.
		switch {
		case encode0Low <= r && r < encode0High:
			if r = rune(encode0[r-encode0Low]); r != 0 {
				goto writeGB
			}
		case encode1Low <= r && r < encode1High:
			if r = rune(encode1[r-encode1Low]); r != 0 {
				goto writeGB
			}
		case encode2Low <= r && r < encode2High:
			if r = rune(encode2[r-encode2Low]); r != 0 {
				goto writeGB
			}
		case encode3Low <= r && r < encode3High:
			if r = rune(encode3[r-encode3Low]); r != 0 {
				goto writeGB
			}
		case encode4Low <= r && r < encode4High:
			if r = rune(encode4[r-encode4Low]); r != 0 {
				goto writeGB
			}
		}

	terminateInASCIIState:
		// Switch back to ASCII state in case of error so that an ASCII
		// replacement character can be written in the correct state.
		if *e != asciiState {
			if nDst+2 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			dst[nDst+0] = '~'
			dst[nDst+1] = '}'
			nDst += 2
			// Record the shift that was just written. Without this, a later
			// call would think it was still in the GB state and write GB
			// pairs without the "~{" prefix.
			*e = asciiState
		}
		err = internal.ErrASCIIReplacement
		break

	writeGB:
		// The tables hold GBK codes; only those whose two bytes land in the
		// GB2312 range after subtracting 0x80 can be written in HZ.
		c0 := uint8(r>>8) - 0x80
		c1 := uint8(r) - 0x80
		if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
			goto terminateInASCIIState
		}
		if *e == asciiState {
			// Room is needed for "~{" plus the two-byte character.
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = gbState
			dst[nDst+0] = '~'
			dst[nDst+1] = '{'
			nDst += 2
		} else if nDst+2 > len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst+0] = c0
		dst[nDst+1] = c1
		nDst += 2
		continue
	}
	// TODO: should one always terminate in ASCII state to make it safe to
	// concatenate two HZ-GB2312-encoded strings?
	return nDst, nSrc, err
}

View File

@ -0,0 +1,161 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
// main writes the generated file's header to standard output and then emits
// the GB18030 and GBK tables.
func main() {
	header := []string{
		"// generated by go run maketables.go; DO NOT EDIT\n\n",
		"// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n",
		`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n",
	}
	for _, line := range header {
		fmt.Print(line)
	}

	printGB18030()
	printGBK()
}
// printGB18030 downloads the WHATWG gb18030 index and prints it as the
// gb18030 table of [2]uint16 pairs. Blank lines and '#' comments are skipped,
// and only entries where both values fit in 16 bits are printed. Any
// download, parse or read failure aborts the program.
func printGB18030() {
	res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
	if err != nil {
		log.Fatalf("Get: %v", err)
	}
	defer res.Body.Close()
	// Refuse to build a table out of an error page.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("Get: unexpected status %q", res.Status)
	}

	fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
	fmt.Printf("var gb18030 = [...][2]uint16{\n")
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		x, y := uint32(0), uint32(0)
		if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		if x < 0x10000 && y < 0x10000 {
			fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
		}
	}
	// A read error would otherwise silently truncate the table.
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanner error: %v", err)
	}
	fmt.Printf("}\n\n")
}
// printGBK downloads the WHATWG gbk index and prints the decode table (GBK
// index to Unicode) followed by the encodeN tables (Unicode to GBK code).
//
// The reverse mapping is split into separate encode tables wherever it has a
// run of at least `separation` empty entries, and the tables are printed in
// order of decreasing length. Any download, parse or read failure aborts the
// program.
func printGBK() {
	res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
	if err != nil {
		log.Fatalf("Get: %v", err)
	}
	defer res.Body.Close()
	// Refuse to build tables out of an error page.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("Get: unexpected status %q", res.Status)
	}

	mapping := [65536]uint16{}
	reverse := [65536]uint16{}

	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		x, y := uint16(0), uint16(0)
		if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		// x is unsigned, so only the upper bound needs checking.
		if 126*190 <= x {
			log.Fatalf("GBK code %d is out of range", x)
		}
		mapping[x] = y
		// Keep the first index seen for each code point.
		if reverse[y] == 0 {
			c0, c1 := x/190, x%190
			if c1 >= 0x3f {
				c1++
			}
			reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanner error: %v", err)
	}

	fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
	fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
	fmt.Printf("var decode = [...]uint16{\n")
	for i, v := range mapping {
		if v != 0 {
			fmt.Printf("\t%d: 0x%04X,\n", i, v)
		}
	}
	fmt.Printf("}\n\n")

	// Any run of at least separation continuous zero entries in the reverse map will
	// be a separate encode table.
	const separation = 1024

	intervals := []interval(nil)
	low, high := -1, -1
	for i, v := range reverse {
		if v == 0 {
			continue
		}
		if low < 0 {
			low = i
		} else if i-high >= separation {
			if high >= 0 {
				intervals = append(intervals, interval{low, high})
			}
			low = i
		}
		high = i + 1
	}
	if high >= 0 {
		intervals = append(intervals, interval{low, high})
	}
	sort.Sort(byDecreasingLength(intervals))

	fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
	fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
	fmt.Printf("// sorted by decreasing length.\n")
	for i, v := range intervals {
		fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
	}
	fmt.Printf("\n")

	for i, v := range intervals {
		fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
		fmt.Printf("var encode%d = [...]uint16{\n", i)
		for j := v.low; j < v.high; j++ {
			x := reverse[j]
			if x == 0 {
				continue
			}
			fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
		}
		fmt.Printf("}\n\n")
	}
}
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many integers the interval covers.
func (iv interval) len() int {
	return iv.high - iv.low
}

// byDecreasingLength implements sort.Interface, ordering intervals from the
// longest to the shortest.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether interval a is strictly longer than interval b.
func (s byDecreasingLength) Less(a, b int) bool {
	return s[b].len() < s[a].len()
}

// Swap exchanges intervals a and b.
func (s byDecreasingLength) Swap(a, b int) {
	s[a], s[b] = s[b], s[a]
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,199 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package traditionalchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{Big5}
// Big5 is the Big5 encoding, also known as Code Page 950.
var Big5 encoding.Encoding = &big5
// big5 is the value behind Big5. It pairs the stateless big5Decoder and
// big5Encoder with the name "Big5" and the identifier.Big5 identifier.
var big5 = internal.Encoding{
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
"Big5",
identifier.Big5,
}
// big5Decoder decodes Big5 into UTF-8. It has no fields of its own and embeds
// transform.NopResetter.
type big5Decoder struct{ transform.NopResetter }
// Transform decodes Big5 bytes from src into UTF-8 in dst.
//
// ASCII bytes are copied through. A lead byte in [0x81, 0xff) followed by a
// trail byte in [0x40, 0x7f) or [0xa1, 0xff) is looked up in the decode table;
// four table indexes decode to two-rune strings instead of a single rune.
// Anything else decodes to utf8.RuneError.
//
// It returns transform.ErrShortSrc when a lead byte is the last byte of src
// and atEOF is false, and transform.ErrShortDst when dst cannot hold the next
// output.
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size, s := rune(0), 0, ""
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
// ASCII decodes to itself.
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
// Lead byte of a two-byte sequence.
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// Lead byte with nothing after it at end of input.
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
// Map the two valid trail-byte ranges onto 0..156, matching the
// 157 entries per lead byte used in the index computation below.
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0xa1 <= c1 && c1 < 0xff:
c1 -= 0x62
case c1 < 0x40:
// Invalid trail byte below 0x40: consume only the lead byte so the
// trail byte is decoded again on the next iteration.
r, size = utf8.RuneError, 1
goto write
default:
// Any other invalid trail byte is consumed along with the lead byte.
r, size = utf8.RuneError, 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
if 1133 <= i && i < 1167 {
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
switch i {
case 1133:
s = "\u00CA\u0304"
goto writeStr
case 1135:
s = "\u00CA\u030C"
goto writeStr
case 1164:
s = "\u00EA\u0304"
goto writeStr
case 1166:
s = "\u00EA\u030C"
goto writeStr
}
}
r = rune(decode[i])
// A zero table entry means the index is unassigned.
if r == 0 {
r = '\ufffd'
}
}
default:
// 0x80 and 0xff are not valid lead bytes.
r, size = utf8.RuneError, 1
}
write:
// Emit the single rune r.
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
continue loop
writeStr:
// Emit the multi-rune string s.
if nDst+len(s) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += copy(dst[nDst:], s)
continue loop
}
return nDst, nSrc, err
}
// big5Encoder encodes UTF-8 as Big5. It has no fields of its own and embeds
// transform.NopResetter.
type big5Encoder struct{ transform.NopResetter }

// Transform encodes UTF-8 from src into Big5 in dst.
//
// ASCII bytes are copied through. Every other rune is looked up in the
// encodeN tables and written as two bytes, high byte first. A rune with no
// table entry stops the transform with internal.ErrASCIIReplacement. It
// returns transform.ErrShortSrc when src ends in an incomplete rune and atEOF
// is false, and transform.ErrShortDst when dst is too small.
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	var (
		r    rune
		size int
	)
	for ; nSrc < len(src); nSrc += size {
		// Single-byte (ASCII) input is written unchanged.
		if b := src[nSrc]; b < utf8.RuneSelf {
			size = 1
			if nDst >= len(dst) {
				err = transform.ErrShortDst
				break
			}
			dst[nDst] = b
			nDst++
			continue
		}

		// Multi-byte input. A size of 1 here means either invalid UTF-8 or a
		// rune whose remaining bytes have not arrived yet; only the latter
		// is worth waiting for.
		r, size = utf8.DecodeRune(src[nSrc:])
		if size == 1 && !atEOF && !utf8.FullRune(src[nSrc:]) {
			err = transform.ErrShortSrc
			break
		}

		// func init checks that the switch covers all tables.
		switch {
		case encode0Low <= r && r < encode0High:
			if r = rune(encode0[r-encode0Low]); r != 0 {
				goto write2
			}
		case encode1Low <= r && r < encode1High:
			if r = rune(encode1[r-encode1Low]); r != 0 {
				goto write2
			}
		case encode2Low <= r && r < encode2High:
			if r = rune(encode2[r-encode2Low]); r != 0 {
				goto write2
			}
		case encode3Low <= r && r < encode3High:
			if r = rune(encode3[r-encode3Low]); r != 0 {
				goto write2
			}
		case encode4Low <= r && r < encode4High:
			if r = rune(encode4[r-encode4Low]); r != 0 {
				goto write2
			}
		case encode5Low <= r && r < encode5High:
			if r = rune(encode5[r-encode5Low]); r != 0 {
				goto write2
			}
		case encode6Low <= r && r < encode6High:
			if r = rune(encode6[r-encode6Low]); r != 0 {
				goto write2
			}
		case encode7Low <= r && r < encode7High:
			if r = rune(encode7[r-encode7Low]); r != 0 {
				goto write2
			}
		}
		// No table holds this rune.
		err = internal.ErrASCIIReplacement
		break

	write2:
		if nDst+2 > len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst+0] = uint8(r >> 8)
		dst[nDst+1] = uint8(r)
		nDst += 2
		continue
	}
	return nDst, nSrc, err
}
// init panics if the generated tables no longer match the number of cases
// hard-coded in the encoder's switch.
func init() {
	const wantTables = 8
	if numEncodeTables == wantTables {
		return
	}
	panic("bad numEncodeTables")
}

View File

@ -0,0 +1,140 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
// main downloads the WHATWG big5 index and prints the generated file: the
// package header, the decode table (Big5 index to Unicode) and the encodeN
// tables (Unicode to Big5 code).
//
// The reverse mapping is split into separate encode tables wherever it has a
// run of at least `separation` empty entries, and the tables are printed in
// order of decreasing length. Any download, parse or read failure aborts the
// program.
func main() {
	fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
	fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
	fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")

	res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
	if err != nil {
		log.Fatalf("Get: %v", err)
	}
	defer res.Body.Close()
	// Refuse to build tables out of an error page.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("Get: unexpected status %q", res.Status)
	}

	mapping := [65536]uint32{}
	reverse := [65536 * 4]uint16{}

	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		x, y := uint16(0), uint32(0)
		if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
			log.Fatalf("could not parse %q", s)
		}
		// x is unsigned, so only the upper bound needs checking.
		if 126*157 <= x {
			log.Fatalf("Big5 code %d is out of range", x)
		}
		mapping[x] = y

		// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
		// "The index pointer for code point in index is the first pointer
		// corresponding to code point in index", which would normally mean
		// that the code below should be guarded by "if reverse[y] == 0", but
		// last instead of first seems to match the behavior of
		// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
		// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
		// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
		// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
		c0, c1 := x/157, x%157
		if c1 < 0x3f {
			c1 += 0x40
		} else {
			c1 += 0x62
		}
		reverse[y] = (0x81+c0)<<8 | c1
	}
	if err := scanner.Err(); err != nil {
		log.Fatalf("scanner error: %v", err)
	}

	fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
	fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
	fmt.Printf("var decode = [...]uint32{\n")
	for i, v := range mapping {
		if v != 0 {
			fmt.Printf("\t%d: 0x%08X,\n", i, v)
		}
	}
	fmt.Printf("}\n\n")

	// Any run of at least separation continuous zero entries in the reverse map will
	// be a separate encode table.
	const separation = 1024

	intervals := []interval(nil)
	low, high := -1, -1
	for i, v := range reverse {
		if v == 0 {
			continue
		}
		if low < 0 {
			low = i
		} else if i-high >= separation {
			if high >= 0 {
				intervals = append(intervals, interval{low, high})
			}
			low = i
		}
		high = i + 1
	}
	if high >= 0 {
		intervals = append(intervals, interval{low, high})
	}
	sort.Sort(byDecreasingLength(intervals))

	fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
	fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
	fmt.Printf("// sorted by decreasing length.\n")
	for i, v := range intervals {
		fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
	}
	fmt.Printf("\n")

	for i, v := range intervals {
		fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
		fmt.Printf("var encode%d = [...]uint16{\n", i)
		for j := v.low; j < v.high; j++ {
			x := reverse[j]
			if x == 0 {
				continue
			}
			fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
		}
		fmt.Printf("}\n\n")
	}
}
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many integers the interval covers.
func (iv interval) len() int {
	return iv.high - iv.low
}

// byDecreasingLength implements sort.Interface, ordering intervals from the
// longest to the shortest.
type byDecreasingLength []interval

// Len returns the number of intervals.
func (s byDecreasingLength) Len() int {
	return len(s)
}

// Less reports whether interval a is strictly longer than interval b.
func (s byDecreasingLength) Less(a, b int) bool {
	return s[b].len() < s[a].len()
}

// Swap exchanges intervals a and b.
func (s byDecreasingLength) Swap(a, b int) {
	s[a], s[b] = s[b], s[a]
}

File diff suppressed because it is too large Load Diff

82
vendor/golang.org/x/text/encoding/unicode/override.go generated vendored Normal file
View File

@ -0,0 +1,82 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
import (
"golang.org/x/text/transform"
)
// BOMOverride returns a new decoder transformer that behaves like fallback
// unless the input starts with a Byte Order Mark, in which case it switches to
// the Unicode decoding that BOM indicates. Only the BOMs for UTF-8, UTF-16BE
// and UTF-16LE are considered.
//
// Unlike ExpectBOM, a BOM may select UTF-8 as well as the UTF-16 variants,
// and input without a BOM may fall back to any encoding scheme.
//
// This technique is recommended by the W3C for use in HTML 5: "For
// compatibility with deployed content, the byte order mark (also known as BOM)
// is considered more authoritative than anything else."
// http://www.w3.org/TR/encoding/#specification-hooks
//
// BOMOverride is mostly intended for cases where the first characters of the
// fallback encoding are known not to be a BOM, for example valid HTML and
// most encodings.
func BOMOverride(fallback transform.Transformer) transform.Transformer {
	// TODO: possibly allow a variadic argument of unicode encodings to allow
	// specifying details of which fallbacks are supported as well as
	// specifying the details of the implementations. This would also allow for
	// support for UTF-32, which should not be supported by default.
	d := new(bomOverride)
	d.fallback = fallback
	return d
}
// bomOverride is the Transformer returned by BOMOverride.
type bomOverride struct {
// fallback is used when the input does not start with a recognized BOM.
fallback transform.Transformer
// current is the transformer chosen for this input; it is nil until
// Transform has inspected the start of the input.
current transform.Transformer
}
// Reset forgets the chosen transformer, so the next Transform call looks for
// a BOM again, and resets the fallback.
func (d *bomOverride) Reset() {
d.current = nil
d.fallback.Reset()
}
// utf16le and utf16be are the encodings bomOverride switches to when it sees
// a UTF-16 BOM. Both use IgnoreBOM because bomOverride strips the BOM itself.
var (
// TODO: we could use decode functions here, instead of allocating a new
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
utf16le = UTF16(LittleEndian, IgnoreBOM)
utf16be = UTF16(BigEndian, IgnoreBOM)
)
// utf8BOM is U+FEFF as a string; bomOverride compares its first three bytes
// with the start of the input.
const utf8BOM = "\ufeff"
// Transform picks a decoder on the first call that has enough input and hands
// all later calls straight to it.
//
// Until three bytes are available (or atEOF is set) it returns
// transform.ErrShortSrc. It then selects UTF-16LE for a leading FF FE,
// UTF-16BE for a leading FE FF, transform.Nop for a leading UTF-8 BOM, and the
// fallback otherwise. The BOM is skipped, the rest of src is passed to the
// selected transformer, and the skipped bytes are counted in the returned
// nSrc.
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if cur := d.current; cur != nil {
		return cur.Transform(dst, src, atEOF)
	}
	if !atEOF && len(src) < 3 {
		return 0, 0, transform.ErrShortSrc
	}

	skip := 0 // length of the BOM to drop from the front of src
	switch {
	case len(src) >= 2 && src[0] == 0xFF && src[1] == 0xFE:
		d.current, skip = utf16le.NewDecoder(), 2
	case len(src) >= 2 && src[0] == 0xFE && src[1] == 0xFF:
		d.current, skip = utf16be.NewDecoder(), 2
	case len(src) >= 3 && src[0] == utf8BOM[0] && src[1] == utf8BOM[1] && src[2] == utf8BOM[2]:
		d.current, skip = transform.Nop, 3
	default:
		d.current = d.fallback
	}

	if skip < len(src) {
		nDst, nSrc, err = d.current.Transform(dst, src[skip:], atEOF)
	}
	return nDst, nSrc + skip, err
}

434
vendor/golang.org/x/text/encoding/unicode/unicode.go generated vendored Normal file
View File

@ -0,0 +1,434 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package unicode provides Unicode encodings such as UTF-16.
package unicode // import "golang.org/x/text/encoding/unicode"
import (
"errors"
"unicode/utf16"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/internal/utf8internal"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
)
// TODO: I think the Transformers really should return errors on unmatched
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
// which leaves it open, but is suggested by WhatWG. It will allow for all error
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
// the introduction of some kind of error type for conveying the erroneous code
// point.
// UTF8 is the UTF-8 encoding.
var UTF8 encoding.Encoding = utf8enc
// utf8enc is the value behind UTF8. It decodes with utf8Decoder and encodes
// with runes.ReplaceIllFormed().
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
// utf8Decoder copies UTF-8 through, replacing ill-formed input. It has no
// fields of its own and embeds transform.NopResetter.
type utf8Decoder struct{ transform.NopResetter }
// Transform copies src to dst, replacing each ill-formed subsequence with
// "\ufffd".
//
// Well-formed input is only scanned, not copied, until an ill-formed sequence
// or the end of the scan forces the pending run src[pSrc:nSrc] to be copied.
// It returns transform.ErrShortDst when dst cannot hold the output, and
// transform.ErrShortSrc when src ends in a possibly incomplete sequence and
// atEOF is false.
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var pSrc int // point from which to start copy in src
var accept utf8internal.AcceptRange
// The decoder can only make the input larger, not smaller.
n := len(src)
if len(dst) < n {
// Scan only as much as dst can hold. The input is then no longer complete
// from this call's point of view, so atEOF is cleared.
err = transform.ErrShortDst
n = len(dst)
atEOF = false
}
for nSrc < n {
c := src[nSrc]
if c < utf8.RuneSelf {
nSrc++
continue
}
first := utf8internal.First[c]
size := int(first & utf8internal.SizeMask)
if first == utf8internal.FirstInvalid {
goto handleInvalid // invalid starter byte
}
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
if nSrc+size > n {
if !atEOF {
// We may stop earlier than necessary here if the short sequence
// has invalid bytes. Not checking for this simplifies the code
// and may avoid duplicate computations in certain conditions.
if err == nil {
err = transform.ErrShortSrc
}
break
}
// Determine the maximal subpart of an ill-formed subsequence.
switch {
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
size = 1
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
size = 2
default:
size = 3 // As we are short, the maximum is 3.
}
goto handleInvalid
}
// The whole sequence is available: check each continuation byte. The
// empty branches stop the chain once all `size` bytes are verified.
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
size = 1
goto handleInvalid // invalid continuation byte
} else if size == 2 {
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 2
goto handleInvalid // invalid continuation byte
} else if size == 3 {
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 3
goto handleInvalid // invalid continuation byte
}
nSrc += size
continue
handleInvalid:
// Copy the scanned input so far.
nDst += copy(dst[nDst:], src[pSrc:nSrc])
// Append RuneError to the destination.
const runeError = "\ufffd"
if nDst+len(runeError) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += copy(dst[nDst:], runeError)
// Skip the maximal subpart of an ill-formed subsequence according to
// the W3C standard way instead of the Go way. This Transform is
// probably the only place in the text repo where it is warranted.
nSrc += size
pSrc = nSrc
// Recompute the maximum source length.
if sz := len(dst) - nDst; sz < len(src)-nSrc {
err = transform.ErrShortDst
n = nSrc + sz
atEOF = false
}
}
// Copy whatever well-formed input is still pending.
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
}
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
// order mark (BOM) policy.
//
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
// the endianness used for decoding, and will instead be output as their
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8 output.
// Instead, it overrides the default endianness e for the remainder of the
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
// affect the endianness used, and will instead be output as their standard
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
// with the default Endianness. For ExpectBOM, in that case, the transformation
// will return early with an ErrMissingBOM error.
//
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
// be inserted. The UTF-8 input does not need to contain a BOM.
//
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
// and consumed in a greater context that implies a certain endianness, use
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
//
// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// corresponds to "Where the precise type of the data stream is known... the
// BOM should not be used" and ExpectBOM corresponds to "A particular
// protocol... may require use of the BOM".
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
	return utf16Encoding{
		config: config{endianness: e, bomPolicy: b},
		mib:    mibValue[e][b&bomMask],
	}
}
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
// some configurations map to the same MIB identifier. RFC 2781 has requirements
// and recommendations. Some of the "configurations" are merely recommendations,
// so multiple configurations could match.
//
// Policies that are not listed are left at the zero value of identifier.MIB.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
BigEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16BE,
UseBOM: identifier.UTF16, // BigEndian default is preferred by RFC 2781.
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
},
LittleEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16LE,
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
},
// ExpectBOM is not widely used and has no valid MIB identifier.
}
// All lists UTF8 followed by a configuration for each IANA-defined UTF-16
// variant.
var All = []encoding.Encoding{
UTF8,
UTF16(BigEndian, UseBOM),
UTF16(BigEndian, IgnoreBOM),
UTF16(LittleEndian, IgnoreBOM),
}
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8
const (
// The bits below are combined to form the exported policies.
// writeBOM: the encoder writes a BOM at the start of its output.
writeBOM BOMPolicy = 0x01
// acceptBOM: the decoder lets a leading BOM select the endianness.
acceptBOM BOMPolicy = 0x02
// requireBOM: the decoder reports ErrMissingBOM if there is no leading BOM.
requireBOM BOMPolicy = 0x04
// bomMask covers all of the policy bits above.
bomMask BOMPolicy = 0x07
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
numBOMValues = 8 + 1
// IgnoreBOM means to ignore any byte order marks.
IgnoreBOM BOMPolicy = 0
// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
// UseBOM means that the UTF-16 form may start with a byte order mark, which
// will be used to override the default encoding.
UseBOM BOMPolicy = writeBOM | acceptBOM
// Common and RFC 2781-compliant interpretation for UTF-16.
// ExpectBOM means that the UTF-16 form must start with a byte order mark,
// which will be used to override the default encoding.
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
// Used in Java as Unicode (not to be confused with Java's UTF-16) and
// ICU's UTF-16,version=1. Not compliant with RFC 2781.
// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
// (UnicodeBig and UnicodeLittle in Java)
// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
// acceptBOM | strictBOM (e.g. assigned to CheckBOM).
// This addition would be consistent with supporting ExpectBOM.
)
// Endianness is a UTF-16 encoding's default endianness. Its zero value is
// BigEndian.
type Endianness bool
const (
// BigEndian is UTF-16BE.
BigEndian Endianness = false
// LittleEndian is UTF-16LE.
LittleEndian Endianness = true
)
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
// starting byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
// utf16Encoding is the Encoding returned by UTF16.
type utf16Encoding struct {
config
// mib is the identifier reported by ID.
mib identifier.MIB
}
// config holds the two settings that define a UTF-16 variant.
type config struct {
endianness Endianness
bomPolicy BOMPolicy
}
// NewDecoder returns a decoder whose initial and current configuration are
// both this encoding's configuration.
func (u utf16Encoding) NewDecoder() *encoding.Decoder {
	dec := &utf16Decoder{initial: u.config, current: u.config}
	return &encoding.Decoder{Transformer: dec}
}
// NewEncoder returns an encoder that uses this encoding's endianness and
// starts out with its BOM policy.
func (u utf16Encoding) NewEncoder() *encoding.Encoder {
	enc := &utf16Encoder{
		endianness:       u.endianness,
		initialBOMPolicy: u.bomPolicy,
		currentBOMPolicy: u.bomPolicy,
	}
	return &encoding.Encoder{Transformer: enc}
}
// ID returns the MIB identifier stored for this encoding; other is always
// empty.
func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
	mib = u.mib
	return mib, other
}
// String describes the encoding as "UTF-16BE" or "UTF-16LE" followed by its
// BOM policy in parentheses, for example "UTF-16LE (Use BOM)".
func (u utf16Encoding) String() string {
	endian := "B"
	if u.endianness == LittleEndian {
		endian = "L"
	}

	var policy string
	switch u.bomPolicy {
	case IgnoreBOM:
		policy = "Ignore"
	case UseBOM:
		policy = "Use"
	case ExpectBOM:
		policy = "Expect"
	}

	return "UTF-16" + endian + "E (" + policy + " BOM)"
}
// utf16Decoder decodes UTF-16 into UTF-8.
type utf16Decoder struct {
// initial is the configuration restored by Reset.
initial config
// current is the configuration in effect; Transform updates it once the
// start of the input has been examined for a BOM.
current config
}
// Reset restores the configuration the decoder was created with.
func (u *utf16Decoder) Reset() {
u.current = u.initial
}
// Transform decodes UTF-16 from src into UTF-8 in dst.
//
// If the current policy accepts a BOM, the first two bytes of the input are
// examined once: a BOM sets the endianness and is consumed, and a missing BOM
// yields ErrMissingBOM when the policy requires one. After that the policy is
// IgnoreBOM for the rest of the stream.
//
// Surrogate pairs are combined into one rune. Unpaired surrogates, and a
// single trailing byte at EOF, decode to utf8.RuneError. It returns
// transform.ErrShortSrc when src ends mid-sequence and atEOF is false, and
// transform.ErrShortDst when dst cannot hold the next rune.
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// The input has ended without room for a BOM.
	if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
		return 0, 0, ErrMissingBOM
	}
	if len(src) == 0 {
		return 0, 0, nil
	}
	// Only look for a BOM once two bytes are available. With a single byte
	// the policy is left untouched and the loop below either asks for more
	// input or, at EOF, reports the stray byte as an error rune. Returning
	// ErrShortSrc at EOF here would make callers loop forever.
	if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
		switch {
		case src[0] == 0xfe && src[1] == 0xff:
			u.current.endianness = BigEndian
			nSrc = 2
		case src[0] == 0xff && src[1] == 0xfe:
			u.current.endianness = LittleEndian
			nSrc = 2
		default:
			if u.current.bomPolicy&requireBOM != 0 {
				return 0, 0, ErrMissingBOM
			}
		}
		u.current.bomPolicy = IgnoreBOM
	}

	var r rune
	var dSize, sSize int
	for nSrc < len(src) {
		if nSrc+1 < len(src) {
			x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
			if u.current.endianness == LittleEndian {
				x = x>>8 | x<<8
			}
			r, sSize = rune(x), 2
			if utf16.IsSurrogate(r) {
				if nSrc+3 < len(src) {
					x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
					if u.current.endianness == LittleEndian {
						x = x>>8 | x<<8
					}
					// Save for next iteration if it is not a high surrogate.
					if isHighSurrogate(rune(x)) {
						r, sSize = utf16.DecodeRune(r, rune(x)), 4
					}
				} else if !atEOF {
					err = transform.ErrShortSrc
					break
				}
			}
			// A surrogate left unpaired has no UTF-8 form.
			if dSize = utf8.RuneLen(r); dSize < 0 {
				r, dSize = utf8.RuneError, 3
			}
		} else if atEOF {
			// Single trailing byte.
			r, dSize, sSize = utf8.RuneError, 3, 1
		} else {
			err = transform.ErrShortSrc
			break
		}
		if nDst+dSize > len(dst) {
			err = transform.ErrShortDst
			break
		}
		nDst += utf8.EncodeRune(dst[nDst:], r)
		nSrc += sSize
	}
	return nDst, nSrc, err
}
// isHighSurrogate reports whether r lies in [0xDC00, 0xDFFF].
//
// NOTE(review): despite the name, this is the range Unicode calls the low
// (trailing) surrogates, which is what the second unit of a pair must be.
func isHighSurrogate(r rune) bool {
	if r < 0xDC00 {
		return false
	}
	return r < 0xE000
}
// utf16Encoder encodes UTF-8 as UTF-16.
type utf16Encoder struct {
// endianness is the byte order of the output.
endianness Endianness
// initialBOMPolicy is the policy restored by Reset.
initialBOMPolicy BOMPolicy
// currentBOMPolicy is the policy in effect; Transform sets it to IgnoreBOM
// once the BOM has been written.
currentBOMPolicy BOMPolicy
}
// Reset restores the BOM policy the encoder was created with.
func (u *utf16Encoder) Reset() {
u.currentBOMPolicy = u.initialBOMPolicy
}
// Transform encodes UTF-8 from src into UTF-16 in dst.
//
// If the current policy asks for a BOM, one is written first and the policy
// becomes IgnoreBOM. Runes up to 0xffff are written as one 16-bit unit and
// larger runes as a surrogate pair. Output is produced big-endian and, for a
// little-endian encoder, byte-swapped in place before returning. It returns
// transform.ErrShortSrc when src ends in an incomplete rune and atEOF is
// false, and transform.ErrShortDst when dst is too small.
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	if u.currentBOMPolicy&writeBOM != 0 {
		if len(dst) < 2 {
			return 0, 0, transform.ErrShortDst
		}
		dst[0], dst[1] = 0xfe, 0xff
		u.currentBOMPolicy = IgnoreBOM
		nDst = 2
	}

	for nSrc < len(src) {
		// Assume a 1-byte rune and decode properly only if it is not.
		r, size := rune(src[nSrc]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(src[nSrc:])
			// A size of 1 here means either invalid UTF-8 or a rune whose
			// remaining bytes have not arrived yet; only the latter is
			// worth waiting for.
			if size == 1 && !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}
		}

		if r > 0xffff {
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			hi, lo := utf16.EncodeRune(r)
			dst[nDst+0] = uint8(hi >> 8)
			dst[nDst+1] = uint8(hi)
			dst[nDst+2] = uint8(lo >> 8)
			dst[nDst+3] = uint8(lo)
			nDst += 4
		} else {
			if nDst+2 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			dst[nDst+0] = uint8(r >> 8)
			dst[nDst+1] = uint8(r)
			nDst += 2
		}
		nSrc += size
	}

	if u.endianness == LittleEndian {
		// Swap the two bytes of every unit written by this call.
		for i := 0; i < nDst; i += 2 {
			dst[i], dst[i+1] = dst[i+1], dst[i]
		}
	}
	return nDst, nSrc, err
}

View File

@ -327,13 +327,13 @@ func splitExpandIndex(ce Elem) (index int) {
// - v* is the replacement tertiary weight for the first rune,
// - w* is the replacement tertiary weight for the second rune,
// Tertiary weights of subsequent runes should be replaced with maxTertiary.
// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
func splitDecompose(ce Elem) (t1, t2 uint8) {
	// t1 is the lowest byte of ce and t2 the byte above it.
	t1 = uint8(ce)
	t2 = uint8(ce >> 8)
	return t1, t2
}
const (
// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
minUnified rune = 0x4E00
maxUnified = 0x9FFF
minCompatibility = 0xF900
@ -352,7 +352,7 @@ const (
// implicitPrimary returns the primary weight for the a rune
// for which there is no entry for the rune in the collation table.
// We take a different approach from the one specified in
// http://unicode.org/reports/tr10/#Implicit_Weights,
// https://unicode.org/reports/tr10/#Implicit_Weights,
// but preserve the resulting relative ordering of the runes.
func implicitPrimary(r rune) int {
if unicode.Is(unicode.Ideographic, r) {

View File

@ -130,7 +130,7 @@ type numberConverter struct {
// init completes initialization of a numberConverter and prepares it for adding
// more digits. elems is assumed to have a digit starting at oldLen.
func (nc *numberConverter) init(elems []Elem, oldLen int, isZero bool) {
// Insert a marker indicating the start of a number and and a placeholder
// Insert a marker indicating the start of a number and a placeholder
// for the number of digits.
if isZero {
elems = append(elems[:oldLen], nc.w.numberStart, 0)

View File

@ -48,7 +48,7 @@ func NewCodeWriter() *CodeWriter {
}
// WriteGoFile appends the buffer with the total size of all created structures
// and writes it as a Go file to the the given file with the given package name.
// and writes it as a Go file to the given file with the given package name.
func (w *CodeWriter) WriteGoFile(filename, pkg string) {
f, err := os.Create(filename)
if err != nil {
@ -61,12 +61,14 @@ func (w *CodeWriter) WriteGoFile(filename, pkg string) {
}
// WriteVersionedGoFile appends the buffer with the total size of all created
// structures and writes it as a Go file to the the given file with the given
// structures and writes it as a Go file to the given file with the given
// package name and build tags for the current Unicode version,
func (w *CodeWriter) WriteVersionedGoFile(filename, pkg string) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
pattern := fileToPattern(filename)
updateBuildTags(pattern)
filename = fmt.Sprintf(pattern, UnicodeVersion())
}
f, err := os.Create(filename)
if err != nil {
@ -79,10 +81,12 @@ func (w *CodeWriter) WriteVersionedGoFile(filename, pkg string) {
}
// WriteGo appends the buffer with the total size of all created structures and
// writes it as a Go file to the the given writer with the given package name.
// writes it as a Go file to the given writer with the given package name.
func (w *CodeWriter) WriteGo(out io.Writer, pkg, tags string) (n int, err error) {
sz := w.Size
w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
if sz > 0 {
w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
}
defer w.buf.Reset()
return WriteGo(out, pkg, tags, w.buf.Bytes())
}
@ -199,7 +203,6 @@ func (w *CodeWriter) writeValue(v reflect.Value) {
// WriteString writes a string literal.
func (w *CodeWriter) WriteString(s string) {
s = strings.Replace(s, `\`, `\\`, -1)
io.WriteString(w.Hash, s) // content hash
w.Size += len(s)
@ -250,6 +253,9 @@ func (w *CodeWriter) WriteString(s string) {
out = fmt.Sprintf("\\U%08x", r)
}
chars = len(out)
} else if r == '\\' {
out = "\\" + string(r)
chars = 2
}
if n -= chars; n < 0 {
nLines++

View File

@ -7,7 +7,7 @@
//
// This package defines command line flags that are common to most generation
// tools. The flags allow for specifying specific Unicode and CLDR versions
// in the public Unicode data repository (http://www.unicode.org/Public).
// in the public Unicode data repository (https://www.unicode.org/Public).
//
// A local Unicode data mirror can be set through the flag -local or the
// environment variable UNICODE_DIR. The former takes precedence. The local
@ -31,6 +31,7 @@ import (
"os"
"path"
"path/filepath"
"regexp"
"strings"
"sync"
"unicode"
@ -40,7 +41,7 @@ import (
var (
url = flag.String("url",
"http://www.unicode.org/Public",
"https://www.unicode.org/Public",
"URL of Unicode database directory")
iana = flag.String("iana",
"http://www.iana.org",
@ -83,25 +84,21 @@ func CLDRVersion() string {
}
var tags = []struct{ version, buildTags string }{
{"10.0.0", "go1.10"},
{"", "!go1.10"},
{"9.0.0", "!go1.10"},
{"10.0.0", "go1.10,!go1.13"},
{"11.0.0", "go1.13"},
}
// buildTags reports the build tags used for the current Unicode version.
func buildTags() string {
v := UnicodeVersion()
for _, x := range tags {
// We should do a numeric comparison, but including the collate package
// would create an import cycle. We approximate it by assuming that
// longer version strings are later.
if len(x.version) <= len(v) {
return x.buildTags
}
if len(x.version) == len(v) && x.version <= v {
return x.buildTags
for _, e := range tags {
if e.version == v {
return e.buildTags
}
}
return tags[0].buildTags
log.Fatalf("Unknown build tags for Unicode version %q.", v)
return ""
}
// IsLocal reports whether data files are available locally.
@ -269,12 +266,29 @@ func WriteGoFile(filename, pkg string, b []byte) {
}
}
func insertVersion(filename, version string) string {
func fileToPattern(filename string) string {
suffix := ".go"
if strings.HasSuffix(filename, "_test.go") {
suffix = "_test.go"
}
return fmt.Sprint(filename[:len(filename)-len(suffix)], version, suffix)
prefix := filename[:len(filename)-len(suffix)]
return fmt.Sprint(prefix, "%s", suffix)
}
// updateBuildTags rewrites the "// +build" line of previously generated
// versioned files so that each carries the build tags currently listed in the
// tags table for its Unicode version.
//
// pattern is a fmt pattern containing a single %s verb that is replaced by the
// version of each tags entry to obtain a file name. Files that cannot be read
// are skipped; a failure to write an updated file is fatal.
func updateBuildTags(pattern string) {
	// Compile the expression once instead of once per table entry.
	buildLine := regexp.MustCompile(`// \+build .*`)
	for _, t := range tags {
		oldFile := fmt.Sprintf(pattern, t.version)
		b, err := ioutil.ReadFile(oldFile)
		if err != nil {
			// Best effort: presumably no file was generated for this
			// version, so there is nothing to update.
			continue
		}
		build := fmt.Sprintf("// +build %s", t.buildTags)
		b = buildLine.ReplaceAll(b, []byte(build))
		if err := ioutil.WriteFile(oldFile, b, 0644); err != nil {
			log.Fatal(err)
		}
	}
}
// WriteVersionedGoFile prepends a standard file comment, adds build tags to
@ -282,16 +296,16 @@ func insertVersion(filename, version string) string {
// the given bytes, applies gofmt, and writes them to a file with the given
// name. It will call log.Fatal if there are any errors.
func WriteVersionedGoFile(filename, pkg string, b []byte) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
}
pattern := fileToPattern(filename)
updateBuildTags(pattern)
filename = fmt.Sprintf(pattern, UnicodeVersion())
w, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer w.Close()
if _, err = WriteGo(w, pkg, tags, b); err != nil {
if _, err = WriteGo(w, pkg, buildTags(), b); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}

View File

@ -4,13 +4,13 @@ package language
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
// AliasType is the type of an alias in AliasMap.
type AliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
Deprecated AliasType = iota
Macro
Legacy
langAliasTypeUnknown langAliasType = -1
AliasTypeUnknown AliasType = -1
)

29
vendor/golang.org/x/text/internal/language/compact.go generated vendored Normal file
View File

@ -0,0 +1,29 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// CompactCoreInfo is a compact integer with the three core tags encoded.
// As built by GetCompactCore, the language ID is stored shifted left by 20
// bits, the script ID shifted left by 12 bits, and the region ID in the low
// bits.
type CompactCoreInfo uint32
// GetCompactCore packs the language, script and region IDs of t into one
// CompactCoreInfo that is intended to be unique for different language,
// script and region values.
//
// ok is false (and the result 0) when the language ID of t is greater than
// langNoIndexOffset.
func GetCompactCore(t Tag) (cci CompactCoreInfo, ok bool) {
	if t.LangID > langNoIndexOffset {
		return 0, false
	}
	lang := CompactCoreInfo(t.LangID) << (8 + 12)
	script := CompactCoreInfo(t.ScriptID) << 12
	region := CompactCoreInfo(t.RegionID)
	return lang | script | region, true
}
// Tag unpacks c into a Tag whose language, script and region IDs are taken
// from the corresponding bit ranges of c; all other fields are left zero.
func (c CompactCoreInfo) Tag() Tag {
	var t Tag
	t.LangID = Language(c >> 20)
	t.ScriptID = Script(c>>12) & 0xff
	t.RegionID = Region(c & 0x3ff)
	return t
}

View File

@ -0,0 +1,61 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package compact defines a compact representation of language tags.
//
// Common language tags (at least all for which locale information is defined
// in CLDR) are assigned a unique index. Each Tag is associated with such an
// ID for selecting language-related resources (such as translations) as well
// as one for selecting regional defaults (currency, number formatting, etc.)
//
// It may want to export this functionality at some point, but at this point
// this is only available for use within x/text.
package compact // import "golang.org/x/text/internal/language/compact"
import (
"sort"
"strings"
"golang.org/x/text/internal/language"
)
// ID is an integer identifying a single tag.
// Values below len(coreTags) index coreTags; larger values index specialTags
// after subtracting len(coreTags).
type ID uint16
// getCoreIndex looks up the compact core encoding of t in coreTags using a
// binary search. It returns the position of the match as an ID, or (0, false)
// when t has no compact core encoding or the encoding is not in the table.
func getCoreIndex(t language.Tag) (id ID, ok bool) {
	key, found := language.GetCompactCore(t)
	if !found {
		return 0, false
	}
	pos := sort.Search(len(coreTags), func(k int) bool {
		return coreTags[k] >= key
	})
	if pos < len(coreTags) && coreTags[pos] == key {
		return ID(pos), true
	}
	return 0, false
}
// Parent returns the ID of the parent or the root ID if id is already the root.
// It is a direct lookup in the parents table; an id outside that table causes
// an index-out-of-range panic.
func (id ID) Parent() ID {
return parents[id]
}
// Tag converts id to an internal language Tag. IDs that fall inside coreTags
// are decoded from their compact core encoding; all other IDs are looked up in
// specialTags at offset id-len(coreTags).
func (id ID) Tag() language.Tag {
	if n := len(coreTags); int(id) >= n {
		return specialTags[int(id)-n]
	}
	return coreTags[id].Tag()
}
var specialTags []language.Tag
// init fills specialTags by parsing each space-separated entry of
// specialTagsStr, preserving their order.
func init() {
	names := strings.Split(specialTagsStr, " ")
	parsed := make([]language.Tag, 0, len(names))
	for _, name := range names {
		parsed = append(parsed, language.MustParse(name))
	}
	specialTags = parsed
}

View File

@ -0,0 +1,64 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Language tag table generator.
// Data read from the web.
package main
import (
"flag"
"fmt"
"log"
"golang.org/x/text/internal/gen"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output",
"tables.go",
"output file for generated tables")
)
// main generates the compact language tag tables and writes them as a Go file
// in package "compact".
//
// The destination is taken from the -output flag (default "tables.go"), which
// was previously declared but ignored in favor of a hard-coded name.
func main() {
	gen.Init()

	w := gen.NewCodeWriter()
	// The deferred call's arguments are evaluated here, after gen.Init.
	// NOTE(review): assumes gen.Init has parsed the command line; if it has
	// not, the flag's default "tables.go" is used, matching the old behavior.
	defer w.WriteGoFile(*outputFile, "compact")

	fmt.Fprintln(w, `import "golang.org/x/text/internal/language"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.writeCompactIndex()
}
// builder bundles the output writer with the decoded CLDR data that the
// table generators read from.
type builder struct {
w *gen.CodeWriter // destination for generated code
data *cldr.CLDR // decoded CLDR core data
supp *cldr.SupplementalData // data.Supplemental(), cached by newBuilder
}
// newBuilder opens the CLDR core zip, decodes it, and returns a builder that
// writes to w. Any decoding error terminates the program via log.Fatal.
func newBuilder(w *gen.CodeWriter) *builder {
	zip := gen.OpenCLDRCoreZip()
	defer zip.Close()

	var dec cldr.Decoder
	data, err := dec.DecodeZip(zip)
	if err != nil {
		log.Fatal(err)
	}
	return &builder{
		w:    w,
		data: data,
		supp: data.Supplemental(),
	}
}

View File

@ -0,0 +1,113 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file generates derivative tables based on the language package itself.
import (
"fmt"
"log"
"sort"
"strings"
"golang.org/x/text/internal/language"
)
// Compact indices:
// Note -va-X variants only apply to localization variants.
// BCP variants only ever apply to language.
// The only ambiguity between tags is with regions.
//
// writeCompactIndex collects every tag named by a CLDR locale or by the
// supplemental plural rules, splits them into "core" tags (no variants and no
// extensions) and "special" tags (everything else), and writes:
//   - the constant NumCompactTags,
//   - one ID constant per tag, core tags first in sorted order followed by the
//     sorted special tags,
//   - the coreTags table and the space-separated specialTagsStr constant.
func (b *builder) writeCompactIndex() {
// Collect all language tags for which we have any data in CLDR.
m := map[language.Tag]bool{}
for _, lang := range b.data.Locales() {
// We include all locales unconditionally to be consistent with en_US.
// We want en_US, even though it has no data associated with it.
// TODO: put any of the languages for which no data exists at the end
// of the index. This allows all components based on ICU to use that
// as the cutoff point.
// if x := data.RawLDML(lang); false ||
// x.LocaleDisplayNames != nil ||
// x.Characters != nil ||
// x.Delimiters != nil ||
// x.Measurement != nil ||
// x.Dates != nil ||
// x.Numbers != nil ||
// x.Units != nil ||
// x.ListPatterns != nil ||
// x.Collations != nil ||
// x.Segmentations != nil ||
// x.Rbnf != nil ||
// x.Annotations != nil ||
// x.Metadata != nil {
// TODO: support POSIX natively, albeit non-standard.
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
m[tag] = true
// }
}
// TODO: plural rules are also defined for the deprecated tags:
// iw mo sh tl
// Consider removing these as compact tags.
// Include locales for plural rules, which uses a different structure.
for _, plurals := range b.supp.Plurals {
for _, rules := range plurals.PluralRules {
for _, lang := range strings.Split(rules.Locales, " ") {
m[language.Make(lang)] = true
}
}
}
var coreTags []language.CompactCoreInfo
var special []string
for t := range m {
// The only extension accepted is the POSIX variant introduced above.
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
log.Fatalf("Unexpected extension %v in %v", x, t)
}
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
cci, ok := language.GetCompactCore(t)
if !ok {
log.Fatalf("Locale for non-basic language %q", t)
}
coreTags = append(coreTags, cci)
} else {
special = append(special, t.String())
}
}
w := b.w
// Map iteration order is random; sort both lists so the output is stable.
sort.Slice(coreTags, func(i, j int) bool { return coreTags[i] < coreTags[j] })
sort.Strings(special)
w.WriteComment(`
NumCompactTags is the number of common tags. The maximum tag is
NumCompactTags-1.`)
w.WriteConst("NumCompactTags", len(m))
fmt.Fprintln(w, "const (")
for i, t := range coreTags {
fmt.Fprintf(w, "%s ID = %d\n", ident(t.Tag().String()), i)
}
// Special tags are numbered directly after the core tags.
for i, t := range special {
fmt.Fprintf(w, "%s ID = %d\n", ident(t), i+len(coreTags))
}
fmt.Fprintln(w, ")")
w.WriteVar("coreTags", coreTags)
w.WriteConst("specialTagsStr", strings.Join(special, " "))
}
// ident converts a tag string into the name of its generated ID constant by
// removing every "-" and appending "Index", e.g. "en-US" becomes "enUSIndex".
func ident(s string) string {
	parts := strings.Split(s, "-")
	return strings.Join(parts, "") + "Index"
}

View File

@ -0,0 +1,54 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"log"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/language"
"golang.org/x/text/internal/language/compact"
"golang.org/x/text/unicode/cldr"
)
// main generates parents.go: a table that maps the compact index of each CLDR
// locale to the compact index of its closest ancestor that has a compact
// index of its own. Entries that are never assigned keep the zero value.
func main() {
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer w.WriteGoFile("parents.go", "compact")
// Create parents table.
// NOTE(review): a local ID type is used for the table elements instead of
// compact.ID, presumably so the generated file spells the element type as
// the unqualified "ID" — verify against gen.CodeWriter.WriteVar.
type ID uint16
parents := make([]ID, compact.NumCompactTags)
for _, loc := range data.Locales() {
tag := language.MustParse(loc)
index, ok := compact.FromTag(tag)
if !ok {
// Only locales that map exactly to a compact index get an entry.
continue
}
parentIndex := compact.ID(0) // und
// Walk up the parent chain until an ancestor with an exact compact index
// is found; fall back to und (0) otherwise.
for p := tag.Parent(); p != language.Und; p = p.Parent() {
if x, ok := compact.FromTag(p); ok {
parentIndex = x
break
}
}
parents[index] = ID(parentIndex)
}
w.WriteComment(`
parents maps a compact index of a tag to the compact index of the parent of
this tag.`)
w.WriteVar("parents", parents)
}

View File

@ -0,0 +1,260 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_index.go -output tables.go
//go:generate go run gen_parents.go
package compact
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"strings"
"golang.org/x/text/internal/language"
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
//
// A Tag is normally described by two compact IDs. The full field is set by
// Make only when those IDs do not represent the original tag exactly.
type Tag struct {
// NOTE: exported tags will become part of the public API.
language ID // compact ID of the tag with any "rg" override removed
locale ID // compact ID after applying the "rg" region override, if any
full fullTag // always a language.Tag for now.
}
const _und = 0
// fullTag is the set of operations Tag needs from the uncompacted tag stored
// in Tag.full.
type fullTag interface {
IsRoot() bool
Parent() language.Tag
}
// Make a compact Tag from a fully specified internal language Tag.
//
// If t carries a "-u-rg-XXzzzz" region override whose first two characters
// parse as a region, the language ID is derived from t without the override
// and the locale ID from t with its region replaced by the override.
// Otherwise both IDs are the closest match for t. Whenever a lookup is not
// exact the original tag is retained in the full field.
func Make(t language.Tag) (tag Tag) {
if region := t.TypeForKey("rg"); len(region) == 6 && region[2:] == "zzzz" {
if r, err := language.ParseRegion(region[:2]); err == nil {
// Keep the unmodified tag; t is rewritten in the steps below.
tFull := t
t, _ = t.SetTypeForKey("rg", "")
// TODO: should we not consider "va" for the language tag?
var exact1, exact2 bool
tag.language, exact1 = FromTag(t)
// Re-run the lookup with the override region to obtain the locale ID.
t.RegionID = r
tag.locale, exact2 = FromTag(t)
if !exact1 || !exact2 {
tag.full = tFull
}
return tag
}
}
// No usable region override: language and locale are the same ID.
lang, ok := FromTag(t)
tag.language = lang
tag.locale = lang
if !ok {
tag.full = t
}
return tag
}
// Tag returns an internal language Tag version of this tag. A retained full
// tag is returned as is. Otherwise the tag is rebuilt from the language ID,
// and when the locale ID differs, a "rg" key is set to the lower-cased locale
// region followed by "zzzz".
func (t Tag) Tag() language.Tag {
	if t.full != nil {
		return t.full.(language.Tag)
	}
	base := t.language.Tag()
	if t.language == t.locale {
		return base
	}
	region := strings.ToLower(t.locale.Tag().RegionID.String())
	base, _ = base.SetTypeForKey("rg", region+"zzzz")
	return base
}
// IsCompact reports whether this tag is fully defined in terms of ID, that is,
// whether no full tag had to be retained alongside the compact IDs.
func (t *Tag) IsCompact() bool {
return t.full == nil
}
// MayHaveVariants reports whether a tag may have variants. A false result
// guarantees that the tag has none: it is returned only when no full tag is
// retained and the language ID lies within coreTags.
func (t Tag) MayHaveVariants() bool {
	if t.full != nil {
		return true
	}
	return int(t.language) >= len(coreTags)
}
// MayHaveExtensions reports whether a tag may have extensions. A false result
// guarantees that the tag has none: it is returned only when no full tag is
// retained, the language ID lies within coreTags, and the language and locale
// IDs are equal.
func (t Tag) MayHaveExtensions() bool {
	switch {
	case t.full != nil:
		return true
	case int(t.language) >= len(coreTags):
		return true
	default:
		return t.language != t.locale
	}
}
// IsRoot returns true if t is equal to language "und". When a full tag is
// retained the question is delegated to it; otherwise the language ID is
// compared against _und.
func (t Tag) IsRoot() bool {
	if t.full == nil {
		return t.language == _und
	}
	return t.full.IsRoot()
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
func (t Tag) Parent() Tag {
	switch {
	case t.full != nil:
		// Delegate to the retained full tag and compact the result again.
		return Make(t.full.Parent())
	case t.language != t.locale:
		// Simulate stripping -u-rg-xxxxxx
		return Tag{language: t.language, locale: t.language}
	}
	// TODO: use parent lookup table once cycle from internal package is
	// removed. Probably by internalizing the table and declaring this fast
	// enough.
	// lang := compactID(internal.Parent(uint16(t.language)))
	parentID, _ := FromTag(t.language.Tag().Parent())
	return Tag{language: parentID, locale: parentID}
}
// nextToken skips the first byte of s (the leading separator) and returns the
// token that follows it together with the rest of the string. The rest starts
// at the next "-" and is empty when there is none. It panics if s is empty.
func nextToken(s string) (t, tail string) {
	rest := s[1:]
	end := strings.IndexByte(rest, '-')
	if end < 0 {
		return rest, ""
	}
	return rest[:end], rest[end:]
}
// LanguageID returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over
// time and should not be stored in persistent storage. If t does not match a
// compact index, exact will be false and the compact index will be returned
// for the first match after repeatedly taking the Parent of t.
//
// exact is true exactly when t retains no full tag.
func LanguageID(t Tag) (id ID, exact bool) {
return t.language, t.full == nil
}
// RegionalID returns the ID for the regional variant of this tag. This index is
// used to indicate region-specific overrides, such as default currency, default
// calendar and week data, default time cycle, and default measurement system
// and unit preferences.
//
// For instance, the tag en-GB-u-rg-uszzzz specifies British English with US
// settings for currency, number formatting, etc. The CompactIndex for this tag
// will be that for en-GB, while the RegionalID will be the one corresponding to
// en-US.
//
// exact is true exactly when t retains no full tag.
func RegionalID(t Tag) (id ID, exact bool) {
return t.locale, t.full == nil
}
// LanguageTag returns t stripped of regional variant indicators.
//
// At the moment this means it is stripped of a regional and variant subtag "rg"
// and "va" in the "u" extension.
func (t Tag) LanguageTag() Tag {
	if t.full == nil {
		return Tag{language: t.language, locale: t.language}
	}
	tt := t.Tag()
	// SetTypeForKey hands back the updated tag, as every other call site in
	// this file relies on; the result must be kept or nothing is stripped.
	// A key that cannot be cleared leaves tt as it was.
	for _, key := range []string{"rg", "va"} {
		if stripped, err := tt.SetTypeForKey(key, ""); err == nil {
			tt = stripped
		}
	}
	return Make(tt)
}
// RegionalTag returns the regional variant of the tag.
//
// At the moment this means that the region is set from the regional subtag
// "rg" in the "u" extension.
//
// NOTE(review): when t retains a full tag, the Builder assembled below is
// never turned into a Tag and the function returns t unchanged. Only the
// compact case (t.full == nil) currently yields a tag based on the locale ID.
// Confirm the intended result before relying on the full-tag path.
func (t Tag) RegionalTag() Tag {
rt := Tag{language: t.locale, locale: t.locale}
if t.full == nil {
return rt
}
b := language.Builder{}
tag := t.Tag()
// tag, _ = tag.SetTypeForKey("rg", "")
b.SetTag(t.locale.Tag())
if v := tag.Variants(); v != "" {
for _, v := range strings.Split(v, "-") {
b.AddVariant(v)
}
}
for _, e := range tag.Extensions() {
b.AddExt(e)
}
// b is not used past this point; see the review note above.
return t
}
// FromTag reports closest matching ID for an internal language Tag.
//
// exact is true only when the returned ID was found without dropping any part
// of t. The lookup proceeds in this order:
//  1. private-use tags yield (0, false);
//  2. tags with variants or a "va" type are searched in specialTags;
//  3. the core language/script/region triple is searched in coreTags;
//  4. if a region but no script is set, the maximized tag is tried;
//  5. the chain of parents is walked until the root is reached.
//
// If nothing matches, ID 0 is returned.
func FromTag(t language.Tag) (id ID, exact bool) {
// TODO: perhaps give more frequent tags a lower index.
// TODO: we could make the indexes stable. This will excluded some
// possibilities for optimization, so don't do this quite yet.
exact = true
b, s, r := t.Raw()
if t.HasString() {
if t.IsPrivateUse() {
// We have no entries for user-defined tags.
return 0, false
}
hasExtra := false
if t.HasVariants() {
if t.HasExtensions() {
// Rebuild the tag from its core parts and variants only, which drops
// the extensions; the match can therefore no longer be exact.
build := language.Builder{}
build.SetTag(language.Tag{LangID: b, ScriptID: s, RegionID: r})
build.AddVariant(t.Variants())
exact = false
t = build.Make()
}
hasExtra = true
} else if _, ok := t.Extension('u'); ok {
// TODO: va may mean something else. Consider not considering it.
// Strip all but the 'va' entry.
old := t
variant := t.TypeForKey("va")
t = language.Tag{LangID: b, ScriptID: s, RegionID: r}
if variant != "" {
t, _ = t.SetTypeForKey("va", variant)
hasExtra = true
}
// Exact only if stripping changed nothing.
exact = old == t
} else {
// Has a string part but neither variants nor a -u extension.
exact = false
}
if hasExtra {
// We have some variants.
for i, s := range specialTags {
if s == t {
return ID(i + len(coreTags)), exact
}
}
exact = false
}
}
if x, ok := getCoreIndex(t); ok {
return x, exact
}
// Every result from here on is an approximation.
exact = false
if r != 0 && s == 0 {
// Deal with cases where an extra script is inserted for the region.
// Note that t is shadowed here; the parent walk below still starts from
// the unmaximized tag.
t, _ := t.Maximize()
if x, ok := getCoreIndex(t); ok {
return x, exact
}
}
for t = t.Parent(); t != root; t = t.Parent() {
// No variants specified: just compare core components.
// The key has the form lllssrrr, where l, s, and r are nibbles for
// respectively the langID, scriptID, and regionID.
if x, ok := getCoreIndex(t); ok {
return x, exact
}
}
return 0, exact
}
var root = language.Tag{}

View File

@ -0,0 +1,120 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package compact
// parents maps a compact index of a tag to the compact index of the parent of
// this tag.
var parents = []ID{ // 775 elements
// Entry 0 - 3F
0x0000, 0x0000, 0x0001, 0x0001, 0x0000, 0x0004, 0x0000, 0x0006,
0x0000, 0x0008, 0x0000, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a,
0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x000a, 0x0000,
0x0000, 0x0028, 0x0000, 0x002a, 0x0000, 0x002c, 0x0000, 0x0000,
0x002f, 0x002e, 0x002e, 0x0000, 0x0033, 0x0000, 0x0035, 0x0000,
0x0037, 0x0000, 0x0039, 0x0000, 0x003b, 0x0000, 0x0000, 0x003e,
// Entry 40 - 7F
0x0000, 0x0040, 0x0040, 0x0000, 0x0043, 0x0043, 0x0000, 0x0046,
0x0000, 0x0048, 0x0000, 0x0000, 0x004b, 0x004a, 0x004a, 0x0000,
0x004f, 0x004f, 0x004f, 0x004f, 0x0000, 0x0054, 0x0054, 0x0000,
0x0057, 0x0000, 0x0059, 0x0000, 0x005b, 0x0000, 0x005d, 0x005d,
0x0000, 0x0060, 0x0000, 0x0062, 0x0000, 0x0064, 0x0000, 0x0066,
0x0066, 0x0000, 0x0069, 0x0000, 0x006b, 0x006b, 0x006b, 0x006b,
0x006b, 0x006b, 0x006b, 0x0000, 0x0073, 0x0000, 0x0075, 0x0000,
0x0077, 0x0000, 0x0000, 0x007a, 0x0000, 0x007c, 0x0000, 0x007e,
// Entry 80 - BF
0x0000, 0x0080, 0x0080, 0x0000, 0x0083, 0x0083, 0x0000, 0x0086,
0x0087, 0x0087, 0x0087, 0x0086, 0x0088, 0x0087, 0x0087, 0x0087,
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088,
0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087, 0x0088, 0x0087,
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0087, 0x0086,
// Entry C0 - FF
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0088, 0x0087,
0x0087, 0x0088, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087,
0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0086, 0x0086, 0x0087,
0x0087, 0x0086, 0x0087, 0x0087, 0x0087, 0x0087, 0x0087, 0x0000,
0x00ef, 0x0000, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2,
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f1, 0x00f1,
// Entry 100 - 13F
0x00f2, 0x00f2, 0x00f1, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f1,
0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x00f2, 0x0000, 0x010e,
0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0114, 0x0000,
0x0117, 0x0117, 0x0117, 0x0117, 0x0000, 0x011c, 0x0000, 0x011e,
0x0000, 0x0120, 0x0120, 0x0000, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
// Entry 140 - 17F
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123, 0x0123,
0x0123, 0x0123, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x015c, 0x015c,
0x0000, 0x0160, 0x0000, 0x0000, 0x0163, 0x0000, 0x0165, 0x0000,
0x0167, 0x0167, 0x0167, 0x0000, 0x016b, 0x0000, 0x016d, 0x0000,
0x016f, 0x0000, 0x0171, 0x0171, 0x0000, 0x0174, 0x0000, 0x0176,
0x0000, 0x0178, 0x0000, 0x017a, 0x0000, 0x017c, 0x0000, 0x017e,
// Entry 180 - 1BF
0x0000, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0184, 0x0184,
0x0184, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x018e,
0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x0000, 0x0195, 0x0000,
0x0197, 0x0000, 0x0000, 0x019a, 0x0000, 0x0000, 0x019d, 0x0000,
0x019f, 0x0000, 0x01a1, 0x0000, 0x01a3, 0x0000, 0x01a5, 0x0000,
0x01a7, 0x0000, 0x01a9, 0x0000, 0x01ab, 0x0000, 0x01ad, 0x0000,
0x01af, 0x0000, 0x01b1, 0x01b1, 0x0000, 0x01b4, 0x0000, 0x01b6,
0x0000, 0x01b8, 0x0000, 0x01ba, 0x0000, 0x01bc, 0x0000, 0x0000,
// Entry 1C0 - 1FF
0x01bf, 0x0000, 0x01c1, 0x0000, 0x01c3, 0x0000, 0x01c5, 0x0000,
0x01c7, 0x0000, 0x01c9, 0x0000, 0x01cb, 0x01cb, 0x01cb, 0x01cb,
0x0000, 0x01d0, 0x0000, 0x01d2, 0x01d2, 0x0000, 0x01d5, 0x0000,
0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x0000, 0x01dd, 0x0000,
0x01df, 0x01df, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
0x0000, 0x01f0, 0x0000, 0x0000, 0x01f3, 0x0000, 0x01f5, 0x01f5,
0x01f5, 0x0000, 0x01f9, 0x0000, 0x01fb, 0x0000, 0x01fd, 0x0000,
// Entry 200 - 23F
0x01ff, 0x0000, 0x0000, 0x0202, 0x0000, 0x0204, 0x0204, 0x0000,
0x0207, 0x0000, 0x0209, 0x0209, 0x0000, 0x020c, 0x020c, 0x0000,
0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x020f, 0x0000,
0x0217, 0x0000, 0x0219, 0x0000, 0x021b, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0221, 0x0000, 0x0000, 0x0224, 0x0000, 0x0226,
0x0226, 0x0000, 0x0229, 0x0000, 0x022b, 0x022b, 0x0000, 0x0000,
0x022f, 0x022e, 0x022e, 0x0000, 0x0000, 0x0234, 0x0000, 0x0236,
0x0000, 0x0238, 0x0000, 0x0244, 0x023a, 0x0244, 0x0244, 0x0244,
// Entry 240 - 27F
0x0244, 0x0244, 0x0244, 0x0244, 0x023a, 0x0244, 0x0244, 0x0000,
0x0247, 0x0247, 0x0247, 0x0000, 0x024b, 0x0000, 0x024d, 0x0000,
0x024f, 0x024f, 0x0000, 0x0252, 0x0000, 0x0254, 0x0254, 0x0254,
0x0254, 0x0254, 0x0254, 0x0000, 0x025b, 0x0000, 0x025d, 0x0000,
0x025f, 0x0000, 0x0261, 0x0000, 0x0263, 0x0000, 0x0265, 0x0000,
0x0000, 0x0268, 0x0268, 0x0268, 0x0000, 0x026c, 0x0000, 0x026e,
0x0000, 0x0270, 0x0000, 0x0000, 0x0000, 0x0274, 0x0273, 0x0273,
0x0000, 0x0278, 0x0000, 0x027a, 0x0000, 0x027c, 0x0000, 0x0000,
// Entry 280 - 2BF
0x0000, 0x0000, 0x0281, 0x0000, 0x0000, 0x0284, 0x0000, 0x0286,
0x0286, 0x0286, 0x0286, 0x0000, 0x028b, 0x028b, 0x028b, 0x0000,
0x028f, 0x028f, 0x028f, 0x028f, 0x028f, 0x0000, 0x0295, 0x0295,
0x0295, 0x0295, 0x0000, 0x0000, 0x0000, 0x0000, 0x029d, 0x029d,
0x029d, 0x0000, 0x02a1, 0x02a1, 0x02a1, 0x02a1, 0x0000, 0x0000,
0x02a7, 0x02a7, 0x02a7, 0x02a7, 0x0000, 0x02ac, 0x0000, 0x02ae,
0x02ae, 0x0000, 0x02b1, 0x0000, 0x02b3, 0x0000, 0x02b5, 0x02b5,
0x0000, 0x0000, 0x02b9, 0x0000, 0x0000, 0x0000, 0x02bd, 0x0000,
// Entry 2C0 - 2FF
0x02bf, 0x02bf, 0x0000, 0x0000, 0x02c3, 0x0000, 0x02c5, 0x0000,
0x02c7, 0x0000, 0x02c9, 0x0000, 0x02cb, 0x0000, 0x02cd, 0x02cd,
0x0000, 0x0000, 0x02d1, 0x0000, 0x02d3, 0x02d0, 0x02d0, 0x0000,
0x0000, 0x02d8, 0x02d7, 0x02d7, 0x0000, 0x0000, 0x02dd, 0x0000,
0x02df, 0x0000, 0x02e1, 0x0000, 0x0000, 0x02e4, 0x0000, 0x02e6,
0x0000, 0x0000, 0x02e9, 0x0000, 0x02eb, 0x0000, 0x02ed, 0x0000,
0x02ef, 0x02ef, 0x0000, 0x0000, 0x02f3, 0x02f2, 0x02f2, 0x0000,
0x02f7, 0x0000, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x02f9, 0x0000,
// Entry 300 - 33F
0x02ff, 0x0300, 0x02ff, 0x0000, 0x0303, 0x0051, 0x00e6,
} // Size: 1574 bytes
// Total table size 1574 bytes (1KiB); checksum: 895AAF0B

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,91 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package compact
var (
und = Tag{}
Und Tag = Tag{}
Afrikaans Tag = Tag{language: afIndex, locale: afIndex}
Amharic Tag = Tag{language: amIndex, locale: amIndex}
Arabic Tag = Tag{language: arIndex, locale: arIndex}
ModernStandardArabic Tag = Tag{language: ar001Index, locale: ar001Index}
Azerbaijani Tag = Tag{language: azIndex, locale: azIndex}
Bulgarian Tag = Tag{language: bgIndex, locale: bgIndex}
Bengali Tag = Tag{language: bnIndex, locale: bnIndex}
Catalan Tag = Tag{language: caIndex, locale: caIndex}
Czech Tag = Tag{language: csIndex, locale: csIndex}
Danish Tag = Tag{language: daIndex, locale: daIndex}
German Tag = Tag{language: deIndex, locale: deIndex}
Greek Tag = Tag{language: elIndex, locale: elIndex}
English Tag = Tag{language: enIndex, locale: enIndex}
AmericanEnglish Tag = Tag{language: enUSIndex, locale: enUSIndex}
BritishEnglish Tag = Tag{language: enGBIndex, locale: enGBIndex}
Spanish Tag = Tag{language: esIndex, locale: esIndex}
EuropeanSpanish Tag = Tag{language: esESIndex, locale: esESIndex}
LatinAmericanSpanish Tag = Tag{language: es419Index, locale: es419Index}
Estonian Tag = Tag{language: etIndex, locale: etIndex}
Persian Tag = Tag{language: faIndex, locale: faIndex}
Finnish Tag = Tag{language: fiIndex, locale: fiIndex}
Filipino Tag = Tag{language: filIndex, locale: filIndex}
French Tag = Tag{language: frIndex, locale: frIndex}
CanadianFrench Tag = Tag{language: frCAIndex, locale: frCAIndex}
Gujarati Tag = Tag{language: guIndex, locale: guIndex}
Hebrew Tag = Tag{language: heIndex, locale: heIndex}
Hindi Tag = Tag{language: hiIndex, locale: hiIndex}
Croatian Tag = Tag{language: hrIndex, locale: hrIndex}
Hungarian Tag = Tag{language: huIndex, locale: huIndex}
Armenian Tag = Tag{language: hyIndex, locale: hyIndex}
Indonesian Tag = Tag{language: idIndex, locale: idIndex}
Icelandic Tag = Tag{language: isIndex, locale: isIndex}
Italian Tag = Tag{language: itIndex, locale: itIndex}
Japanese Tag = Tag{language: jaIndex, locale: jaIndex}
Georgian Tag = Tag{language: kaIndex, locale: kaIndex}
Kazakh Tag = Tag{language: kkIndex, locale: kkIndex}
Khmer Tag = Tag{language: kmIndex, locale: kmIndex}
Kannada Tag = Tag{language: knIndex, locale: knIndex}
Korean Tag = Tag{language: koIndex, locale: koIndex}
Kirghiz Tag = Tag{language: kyIndex, locale: kyIndex}
Lao Tag = Tag{language: loIndex, locale: loIndex}
Lithuanian Tag = Tag{language: ltIndex, locale: ltIndex}
Latvian Tag = Tag{language: lvIndex, locale: lvIndex}
Macedonian Tag = Tag{language: mkIndex, locale: mkIndex}
Malayalam Tag = Tag{language: mlIndex, locale: mlIndex}
Mongolian Tag = Tag{language: mnIndex, locale: mnIndex}
Marathi Tag = Tag{language: mrIndex, locale: mrIndex}
Malay Tag = Tag{language: msIndex, locale: msIndex}
Burmese Tag = Tag{language: myIndex, locale: myIndex}
Nepali Tag = Tag{language: neIndex, locale: neIndex}
Dutch Tag = Tag{language: nlIndex, locale: nlIndex}
Norwegian Tag = Tag{language: noIndex, locale: noIndex}
Punjabi Tag = Tag{language: paIndex, locale: paIndex}
Polish Tag = Tag{language: plIndex, locale: plIndex}
Portuguese Tag = Tag{language: ptIndex, locale: ptIndex}
BrazilianPortuguese Tag = Tag{language: ptBRIndex, locale: ptBRIndex}
EuropeanPortuguese Tag = Tag{language: ptPTIndex, locale: ptPTIndex}
Romanian Tag = Tag{language: roIndex, locale: roIndex}
Russian Tag = Tag{language: ruIndex, locale: ruIndex}
Sinhala Tag = Tag{language: siIndex, locale: siIndex}
Slovak Tag = Tag{language: skIndex, locale: skIndex}
Slovenian Tag = Tag{language: slIndex, locale: slIndex}
Albanian Tag = Tag{language: sqIndex, locale: sqIndex}
Serbian Tag = Tag{language: srIndex, locale: srIndex}
SerbianLatin Tag = Tag{language: srLatnIndex, locale: srLatnIndex}
Swedish Tag = Tag{language: svIndex, locale: svIndex}
Swahili Tag = Tag{language: swIndex, locale: swIndex}
Tamil Tag = Tag{language: taIndex, locale: taIndex}
Telugu Tag = Tag{language: teIndex, locale: teIndex}
Thai Tag = Tag{language: thIndex, locale: thIndex}
Turkish Tag = Tag{language: trIndex, locale: trIndex}
Ukrainian Tag = Tag{language: ukIndex, locale: ukIndex}
Urdu Tag = Tag{language: urIndex, locale: urIndex}
Uzbek Tag = Tag{language: uzIndex, locale: uzIndex}
Vietnamese Tag = Tag{language: viIndex, locale: viIndex}
Chinese Tag = Tag{language: zhIndex, locale: zhIndex}
SimplifiedChinese Tag = Tag{language: zhHansIndex, locale: zhHansIndex}
TraditionalChinese Tag = Tag{language: zhHantIndex, locale: zhHantIndex}
Zulu Tag = Tag{language: zuIndex, locale: zuIndex}
)

167
vendor/golang.org/x/text/internal/language/compose.go generated vendored Normal file
View File

@ -0,0 +1,167 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"sort"
"strings"
)
// A Builder allows constructing a Tag from individual components.
// Its main user is Compose in the top-level language package.
type Builder struct {
// Tag is the starting point that Make copies before attaching the variants
// and extensions collected below.
Tag Tag
private string // the x extension
// variants holds the variant subtags, each without a leading '-'.
variants []string
// extensions holds the non-private extensions; the first byte of each
// entry is its singleton.
extensions []string
}
// Make returns a new Tag from the current settings.
//
// Make sorts the variants and extensions held by the Builder in place, but it
// does not otherwise modify the Builder: the private use extension is added
// to a copy of the extension list, so calling Make repeatedly yields the same
// Tag each time.
func (b *Builder) Make() Tag {
	t := b.Tag

	if len(b.extensions) > 0 || len(b.variants) > 0 {
		sort.Sort(sortVariants(b.variants))
		sort.Strings(b.extensions)

		// The private use extension must come last. Append it to a
		// capacity-clipped view so that b.extensions itself is left
		// untouched; otherwise a second call to Make would sort the private
		// use extension among the others and append it once more.
		exts := b.extensions
		if b.private != "" {
			exts = append(exts[:len(exts):len(exts)], b.private)
		}

		n := maxCoreSize + tokenLen(b.variants...) + tokenLen(exts...)
		buf := make([]byte, n)
		p := t.genCoreBytes(buf)
		t.pVariant = byte(p)
		p += appendTokens(buf[p:], b.variants...)
		t.pExt = uint16(p)
		p += appendTokens(buf[p:], exts...)
		t.str = string(buf[:p])
		// We may not always need to remake the string, but when or when not
		// to do so is rather tricky.
		scan := makeScanner(buf[:p])
		// The parse error is deliberately ignored; the parsed tag is returned
		// regardless.
		t, _ = parse(&scan, "")
		return t
	}

	// Only a private use extension (or nothing at all) was set.
	if b.private != "" {
		t.str = b.private
		t.RemakeString()
	}
	return t
}
// SetTag replaces the language, script, region, variants and extensions of
// the Builder with those of t. Any previously set values are discarded.
func (b *Builder) SetTag(t Tag) {
	b.Tag.LangID, b.Tag.ScriptID, b.Tag.RegionID = t.LangID, t.ScriptID, t.RegionID

	// TODO: optimize
	b.variants = b.variants[:0]
	if vs := t.Variants(); vs != "" {
		// Skip the first byte (the '-' separator) before splitting.
		b.variants = append(b.variants, strings.Split(vs[1:], "-")...)
	}

	b.private = ""
	b.extensions = b.extensions[:0]
	for _, ext := range t.Extensions() {
		b.AddExt(ext)
	}
}
// AddExt adds extension e to the tag. e must be a valid extension as returned
// by Tag.Extension. If an extension with the same singleton already exists,
// e is discarded, except for a -u extension, where the remainder of e (after
// the singleton) is appended to the existing extension. Only the first
// private use (x) extension is kept.
func (b *Builder) AddExt(e string) {
	singleton := e[0]
	if singleton == 'x' {
		if b.private == "" {
			b.private = e
		}
		return
	}
	for i := range b.extensions {
		if b.extensions[i][0] != singleton {
			continue
		}
		if singleton == 'u' {
			b.extensions[i] += e[1:]
		}
		return
	}
	b.extensions = append(b.extensions, e)
}
// SetExt sets the extension e on the tag. e must be a valid extension as
// returned by Tag.Extension. An existing extension with the same singleton is
// overwritten, except for a -u extension, where e is placed in front of the
// existing key-type pairs. A private use (x) extension always replaces the
// previous one.
func (b *Builder) SetExt(e string) {
	singleton := e[0]
	if singleton == 'x' {
		b.private = e
		return
	}
	for i, cur := range b.extensions {
		if cur[0] != singleton {
			continue
		}
		replacement := e
		if singleton == 'u' {
			// Keep the old pairs, but after the new ones.
			replacement = e + cur[1:]
		}
		b.extensions[i] = replacement
		return
	}
	b.extensions = append(b.extensions, e)
}
// AddVariant appends the given variants to the Builder, skipping empty
// strings.
func (b *Builder) AddVariant(v ...string) {
	for _, variant := range v {
		if variant == "" {
			continue
		}
		b.variants = append(b.variants, variant)
	}
}
// ClearVariants drops every variant held by the Builder, whether it was added
// with AddVariant or copied from a Tag by SetTag. The backing storage is kept
// for reuse.
func (b *Builder) ClearVariants() {
	kept := b.variants[:0]
	b.variants = kept
}
// ClearExtensions drops every extension held by the Builder, including the
// private use extension and any extensions copied from a Tag by SetTag.
func (b *Builder) ClearExtensions() {
	b.extensions, b.private = b.extensions[:0], ""
}
// tokenLen returns the number of bytes needed to write all tokens, each
// preceded by a one-byte separator.
func tokenLen(token ...string) (n int) {
	total := len(token) // one separator per token
	for i := range token {
		total += len(token[i])
	}
	return total
}
// appendTokens writes each token into b, preceded by a '-', starting at
// offset 0, and returns the number of bytes the tokens occupy. b must be
// large enough to hold them; see tokenLen.
func appendTokens(b []byte, token ...string) int {
	pos := 0
	for _, tok := range token {
		b[pos] = '-'
		pos++
		copy(b[pos:], tok)
		pos += len(tok)
	}
	return pos
}
// sortVariants orders variant subtags by their value in variantIndex. It
// provides the Len, Swap and Less methods required by sort.Sort.
type sortVariants []string

func (s sortVariants) Len() int { return len(s) }

func (s sortVariants) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

func (s sortVariants) Less(i, j int) bool {
	left, right := variantIndex[s[i]], variantIndex[s[j]]
	return left < right
}

28
vendor/golang.org/x/text/internal/language/coverage.go generated vendored Normal file
View File

@ -0,0 +1,28 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// BaseLanguages returns the list of all supported base languages. The list is
// assembled by walking the package's internal tables.
func BaseLanguages() []Language {
	langs := make([]Language, 0, NumLanguages)

	// Every value below langNoIndexOffset is a language, except for the
	// non-canonical "und" entry: "und" is already covered by the value 0.
	for id := 0; id < langNoIndexOffset; id++ {
		if id == nonCanonicalUnd {
			continue
		}
		langs = append(langs, Language(id))
	}

	// langNoIndex is a bitmap, least significant bit first: each set bit
	// marks the next candidate value as a language.
	id := langNoIndexOffset
	for _, bits := range langNoIndex {
		for bit := 0; bit < 8; bit++ {
			if bits&1 == 1 {
				langs = append(langs, Language(id))
			}
			bits >>= 1
			id++
		}
	}
	return langs
}

1520
vendor/golang.org/x/text/internal/language/gen.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -8,13 +8,13 @@ package main
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
// AliasType is the type of an alias in AliasMap.
type AliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
Deprecated AliasType = iota
Macro
Legacy
langAliasTypeUnknown langAliasType = -1
AliasTypeUnknown AliasType = -1
)

596
vendor/golang.org/x/text/internal/language/language.go generated vendored Normal file
View File

@ -0,0 +1,596 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_common.go -output tables.go
package language // import "golang.org/x/text/internal/language"
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"errors"
"fmt"
"strings"
)
// Buffer sizes used when generating the string form of a Tag.
const (
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
maxCoreSize = 12
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
// is large enough to hold at least 99% of the BCP 47 tags.
max99thPercentileSize = 32
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
maxSimpleUExtensionSize = 14
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed. The zero value of Tag is Und.
type Tag struct {
// TODO: the following fields have the form TagTypeID. This name is chosen
// to allow refactoring the public package without conflicting with its
// Base, Script, and Region methods. Once the transition is fully completed
// the ID can be stripped from the name.
// LangID is the base language.
LangID Language
// RegionID is the region; 0 means no region is set.
RegionID Region
// TODO: we will soon run out of positions for ScriptID. Idea: instead of
// storing lang, region, and ScriptID codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup and speeds up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
// ScriptID is the script; 0 means no script is set.
ScriptID Script
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
// str is the string representation of the Tag. It will only be used if the
// tag has variants or extensions.
str string
}
// Make parses s like Parse but discards the error. When parsing fails, the
// Tag that Parse returned alongside the error is used as a sensible default.
func Make(s string) Tag {
	tag, _ := Parse(s) // error intentionally dropped; see doc comment
	return tag
}
// Raw returns the base language, script and region exactly as stored in t,
// without attempting to infer missing values.
// TODO: consider removing
func (t Tag) Raw() (b Language, s Script, r Region) {
	b, s, r = t.LangID, t.ScriptID, t.RegionID
	return b, s, r
}
// equalTags reports whether t and a have the same language, script and
// region. Variants and extensions are not compared.
func (t Tag) equalTags(a Tag) bool {
	if t.LangID != a.LangID || t.ScriptID != a.ScriptID {
		return false
	}
	return t.RegionID == a.RegionID
}
// IsRoot returns true if t is equal to language "und": its language, script
// and region match Und and its string holds nothing beyond the core tag.
func (t Tag) IsRoot() bool {
	hasExtra := int(t.pVariant) < len(t.str)
	return !hasExtra && t.equalTags(Und)
}
// IsPrivateUse reports whether the Tag consists solely of a private use
// tag.
func (t Tag) IsPrivateUse() bool {
	if t.str == "" {
		return false
	}
	return t.pVariant == 0
}
// RemakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
//
// It does nothing when t.str is empty.
func (t *Tag) RemakeString() {
if t.str == "" {
return
}
// extra is everything in the old string from pVariant onwards.
extra := t.str[t.pVariant:]
if t.pVariant > 0 {
// Drop the leading '-' that pVariant includes.
extra = extra[1:]
}
// A tag equal to Und whose remainder starts with "x-" is represented by
// that private use part alone, with both offsets reset to 0.
if t.equalTags(Und) && strings.HasPrefix(extra, "x-") {
t.str = extra
t.pVariant = 0
t.pExt = 0
return
}
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
b := buf[:t.genCoreBytes(buf[:])]
if extra != "" {
// diff is how far the offsets move because the regenerated core differs
// in length from the old one.
diff := len(b) - int(t.pVariant)
b = append(b, '-')
b = append(b, extra...)
t.pVariant = uint8(int(t.pVariant) + diff)
t.pExt = uint16(int(t.pExt) + diff)
} else {
// Nothing follows the core: both offsets point at the end of the string.
t.pVariant = uint8(len(b))
t.pExt = uint16(len(b))
}
t.str = string(b)
}
// genCoreBytes writes the base language, followed by "-script" and "-region"
// when those are set, to buf and returns the number of bytes written. It will
// never write more than maxCoreSize bytes.
func (t *Tag) genCoreBytes(buf []byte) int {
	written := t.LangID.StringToBuf(buf)
	if t.ScriptID != 0 {
		written += copy(buf[written:], "-")
		written += copy(buf[written:], t.ScriptID.String())
	}
	if t.RegionID != 0 {
		written += copy(buf[written:], "-")
		written += copy(buf[written:], t.RegionID.String())
	}
	return written
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
	switch {
	case t.str != "":
		// A stored string already covers the whole tag.
		return t.str
	case t.ScriptID == 0 && t.RegionID == 0:
		return t.LangID.String()
	}
	var core [maxCoreSize]byte
	n := t.genCoreBytes(core[:])
	return string(core[:n])
}
// MarshalText implements encoding.TextMarshaler. It produces the same text as
// String and never returns an error.
func (t Tag) MarshalText() (text []byte, err error) {
	switch {
	case t.str != "":
		return append(text, t.str...), nil
	case t.ScriptID == 0 && t.RegionID == 0:
		return append(text, t.LangID.String()...), nil
	default:
		var core [maxCoreSize]byte
		return core[:t.genCoreBytes(core[:])], nil
	}
}
// UnmarshalText implements encoding.TextUnmarshaler. *t is overwritten with
// the Tag returned by Parse even when Parse reports an error.
func (t *Tag) UnmarshalText(text []byte) error {
	var err error
	*t, err = Parse(string(text))
	return err
}
// Variants returns the part of the tag holding all variants, including the
// leading '-', or the empty string if there are no variants defined.
func (t Tag) Variants() string {
	if t.pVariant != 0 {
		return t.str[t.pVariant:t.pExt]
	}
	return ""
}
// VariantOrPrivateUseTags returns variants or private use tags: the part of
// the string from pVariant up to the first extension, or up to the end of the
// string when pExt is 0.
func (t Tag) VariantOrPrivateUseTags() string {
	end := len(t.str)
	if t.pExt > 0 {
		end = int(t.pExt)
	}
	return t.str[t.pVariant:end]
}
// HasString reports whether this tag carries a stored string, that is,
// whether it defines more than just the raw components.
func (t Tag) HasString() bool {
	return len(t.str) > 0
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
func (t Tag) Parent() Tag {
if t.str != "" {
// Strip the variants and extensions.
b, s, r := t.Raw()
t = Tag{LangID: b, ScriptID: s, RegionID: r}
// For a language-script pair without region, also drop the script when
// it is the one addTags would assign to the bare language.
if t.RegionID == 0 && t.ScriptID != 0 && t.LangID != 0 {
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID == t.ScriptID {
return Tag{LangID: t.LangID}
}
}
return t
}
if t.LangID != 0 {
if t.RegionID != 0 {
// maxScript is the explicit script, or the one addTags fills in when
// none is set.
maxScript := t.ScriptID
if maxScript == 0 {
max, _ := addTags(t)
maxScript = max.ScriptID
}
// Look for an explicit entry in the parents table that matches the
// language, script and region.
for i := range parents {
if Language(parents[i].lang) == t.LangID && Script(parents[i].maxScript) == maxScript {
for _, r := range parents[i].fromRegion {
if Region(r) == t.RegionID {
return Tag{
LangID: t.LangID,
ScriptID: Script(parents[i].script),
RegionID: Region(parents[i].toRegion),
}
}
}
}
}
// Strip the script if it is the default one.
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID != maxScript {
return Tag{LangID: t.LangID, ScriptID: maxScript}
}
return Tag{LangID: t.LangID}
} else if t.ScriptID != 0 {
// The parent for a base-script pair with a non-default script is
// "und" instead of the base language.
base, _ := addTags(Tag{LangID: t.LangID})
if base.ScriptID != t.ScriptID {
return Und
}
return Tag{LangID: t.LangID}
}
}
return Und
}
// ParseExtension parses s as an extension and returns it, lowercased, on
// success. It returns ErrSyntax if the first token of s is not exactly one
// character long or if s is not consumed entirely by the extension.
func ParseExtension(s string) (ext string, err error) {
	scan := makeScannerString(s)
	if len(scan.token) != 1 {
		return "", ErrSyntax
	}
	scan.toLower(0, len(scan.b))
	if end := parseExtension(&scan); end != len(s) {
		return "", ErrSyntax
	}
	return string(scan.b), nil
}
// HasVariants reports whether t has variants, i.e. whether the variant offset
// lies before the extension offset.
func (t Tag) HasVariants() bool {
	return t.pExt > uint16(t.pVariant)
}
// HasExtensions reports whether t has extensions, i.e. whether the extension
// offset lies before the end of the stored string.
func (t Tag) HasExtensions() bool {
	return len(t.str) > int(t.pExt)
}
// Extension returns the extension of type x for tag t. If t does not have the
// requested extension, it returns the empty string and false.
func (t Tag) Extension(x byte) (ext string, ok bool) {
	pos := int(t.pExt)
	for pos < len(t.str)-1 {
		var cur string
		pos, cur = getExtension(t.str, pos)
		if cur[0] == x {
			return cur, true
		}
	}
	return "", false
}
// Extensions returns all extensions of t in the order they appear in the tag.
// The result is never nil; it is empty when t has no extensions.
func (t Tag) Extensions() []string {
	exts := []string{}
	for pos := int(t.pExt); pos < len(t.str)-1; {
		var cur string
		pos, cur = getExtension(t.str, pos)
		exts = append(exts, cur)
	}
	return exts
}
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// It returns the empty string when t has no type for key.
func (t Tag) TypeForKey(key string) string {
	start, end, _ := t.findTypeForKey(key)
	if start == end {
		return ""
	}
	return t.str[start:end]
}
// Errors returned by SetTypeForKey.
var (
// errPrivateUse is returned when the tag is a private use tag.
errPrivateUse = errors.New("cannot set a key on a private use tag")
// errInvalidArguments is returned when the key or value has an invalid
// length.
errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
//
// It returns errPrivateUse if t is a private use tag and errInvalidArguments
// if key is not 2 bytes long or a non-empty value is not 3 to 8 bytes long.
// On error the unmodified tag is returned.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
if t.IsPrivateUse() {
return t, errPrivateUse
}
if len(key) != 2 {
return t, errInvalidArguments
}
// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4
// Remove a possible empty extension.
// NOTE(review): t.str[end+2] is read without checking that end+2 is
// within bounds; presumably this cannot happen for well-formed
// tags — confirm.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
// Nothing but the core tag is left, so the string is not needed.
t.str = ""
t.pVariant, t.pExt = 0, 0
} else {
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
}
}
return t, nil
}
if len(value) < 3 || len(value) > 8 {
return t, errInvalidArguments
}
var (
buf [maxCoreSize + maxSimpleUExtensionSize]byte
uStart int // start of the -u extension.
)
// Generate the tag string if needed.
if t.str == "" {
uStart = t.genCoreBytes(buf[:])
buf[uStart] = '-'
uStart++
}
// Create new key-type pair and parse it to verify.
b := buf[uStart:]
copy(b, "u-")
copy(b[2:], key)
b[4] = '-'
b = b[:5+copy(b[5:], value)]
scan := makeScanner(b)
if parseExtensions(&scan); scan.err != nil {
return t, scan.err
}
// Assemble the replacement string.
if t.str == "" {
// Both offsets point at the '-' written just before "u".
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
// The key is not present yet: insert the new pair at start.
if hasExt {
// A -u extension already exists, so drop the "u-" prefix.
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
} else {
// The key exists: replace only its type.
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
}
}
return t, nil
}
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. In the latter case start and end are equal.
// The hasExt return value reports whether a -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
p := int(t.pExt)
// Bail out for an invalid key or when pExt is 0 or at the end of the string.
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
}
s := t.str
// Find the correct extension.
for p++; s[p] != 'u'; p++ {
if s[p] > 'u' {
// This extension sorts after 'u', so a -u extension would have to be
// inserted before it.
p--
return p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
p++
// curKey is the key currently being processed.
curKey := ""
// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
if curKey > key {
// Keys are compared in order; the requested key would go here.
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
}
for ; p < max && s[p] != '-'; p++ {
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
}
return p, p, true
}
}
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns ErrSyntax if s has any other length, a ValueError if s is a
// well-formed but unknown language identifier, or another error if another
// error occurred.
func ParseBase(s string) (Language, error) {
	if len(s) < 2 || len(s) > 3 {
		return 0, ErrSyntax
	}
	var buf [3]byte
	n := copy(buf[:], s)
	return getLangID(buf[:n])
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns ErrSyntax if s is not 4 bytes long, a ValueError if s is a
// well-formed but unknown script identifier, or another error if another
// error occurred.
func ParseScript(s string) (Script, error) {
	if len(s) != 4 {
		return 0, ErrSyntax
	}
	var buf [4]byte
	n := copy(buf[:], s)
	return getScriptID(script, buf[:n])
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
	region, err := getRegionM49(r)
	return region, err
}
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
// It returns ErrSyntax if s has any other length, a ValueError if s is a
// well-formed but unknown region identifier, or another error if another
// error occurred.
func ParseRegion(s string) (Region, error) {
	if len(s) < 2 || len(s) > 3 {
		return 0, ErrSyntax
	}
	var buf [3]byte
	n := copy(buf[:], s)
	return getRegionID(buf[:n])
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR. The unspecified region, groups
// and private use regions other than XK are not countries.
func (r Region) IsCountry() bool {
	switch {
	case r == 0, r.IsGroup():
		return false
	case r.IsPrivateUse() && r != _XK:
		return false
	}
	return true
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR. The unspecified region is not
// a group.
func (r Region) IsGroup() bool {
	if r != 0 {
		return int(regionInclusion[r]) < len(regionContainment)
	}
	return false
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
	if r == c {
		return true
	}
	group := regionInclusion[r]
	if group >= nRegionGroups {
		// r is not a group, so it cannot contain another region.
		return false
	}
	mask := regionContainment[group]

	inner := regionInclusion[c]
	bits := regionInclusionBits[inner]

	// If the contained region is a group, it must strictly be a subset.
	if inner < nRegionGroups {
		return bits&^mask == 0
	}
	// A contained country may belong to multiple disjoint groups. Matching
	// any of these indicates containment.
	return bits&mask != 0
}
// errNoTLD is returned by Region.TLD for regions that are not ccTLDs.
var errNoTLD = errors.New("language: region is not a valid ccTLD")

// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
// This method may return an error for a region for which there exists a
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
// region will already be canonicalized if it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
	// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
	// difference between ISO 3166-1 and IANA ccTLD.
	tld := r
	if tld == _GB {
		tld = _UK
	}
	if tld.typ()&ccTLD == 0 {
		return 0, errNoTLD
	}
	return tld, nil
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
	replacement := normRegion(r)
	if replacement == 0 {
		// No replacement known: r is returned unchanged.
		return r
	}
	return replacement
}
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
// ID is the value stored for the variant in variantIndex.
ID uint8
// str is the lowercased variant string.
str string
}
// ParseVariant parses and returns a Variant. The lookup is case-insensitive:
// s is lowercased before it is looked up in variantIndex. A ValueError is
// returned if s is not a valid variant.
func ParseVariant(s string) (Variant, error) {
	lower := strings.ToLower(s)
	id, ok := variantIndex[lower]
	if !ok {
		return Variant{}, NewValueError([]byte(lower))
	}
	return Variant{ID: id, str: lower}, nil
}
// String returns the string representation of the variant, which is the
// lowercased form stored by ParseVariant. It is empty for the zero Variant.
func (v Variant) String() string {
	s := v.str
	return s
}

View File

@ -17,11 +17,11 @@ import (
// if it could not be found.
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
if !tag.FixCase(form, key) {
return 0, errSyntax
return 0, ErrSyntax
}
i := idx.Index(key)
if i == -1 {
return 0, mkErrInvalid(key)
return 0, NewValueError(key)
}
return i, nil
}
@ -32,38 +32,45 @@ func searchUint(imap []uint16, key uint16) int {
})
}
type langID uint16
type Language uint16
// getLangID returns the langID of s if s is a canonical subtag
// or langUnknown if s is not a canonical subtag.
func getLangID(s []byte) (langID, error) {
func getLangID(s []byte) (Language, error) {
if len(s) == 2 {
return getLangISO2(s)
}
return getLangISO3(s)
}
// TODO language normalization as well as the AliasMaps could be moved to the
// higher level package, but it is a bit tricky to separate the generation.
func (id Language) Canonicalize() (Language, AliasType) {
return normLang(id)
}
// mapLang returns the mapped langID of id according to mapping m.
func normLang(id langID) (langID, langAliasType) {
k := sort.Search(len(langAliasMap), func(i int) bool {
return langAliasMap[i].from >= uint16(id)
func normLang(id Language) (Language, AliasType) {
k := sort.Search(len(AliasMap), func(i int) bool {
return AliasMap[i].From >= uint16(id)
})
if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
return langID(langAliasMap[k].to), langAliasTypes[k]
if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
return Language(AliasMap[k].To), AliasTypes[k]
}
return id, langAliasTypeUnknown
return id, AliasTypeUnknown
}
// getLangISO2 returns the langID for the given 2-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO2(s []byte) (langID, error) {
func getLangISO2(s []byte) (Language, error) {
if !tag.FixCase("zz", s) {
return 0, errSyntax
return 0, ErrSyntax
}
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
return langID(i), nil
return Language(i), nil
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
const base = 'z' - 'a' + 1
@ -88,7 +95,7 @@ func intToStr(v uint, s []byte) {
// getLangISO3 returns the langID for the given 3-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO3(s []byte) (langID, error) {
func getLangISO3(s []byte) (Language, error) {
if tag.FixCase("und", s) {
// first try to match canonical 3-letter entries
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
@ -96,7 +103,7 @@ func getLangISO3(s []byte) (langID, error) {
// We treat "und" as special and always translate it to "unspecified".
// Note that ZZ and Zzzz are private use and are not treated as
// unspecified by default.
id := langID(i)
id := Language(i)
if id == nonCanonicalUnd {
return 0, nil
}
@ -104,26 +111,26 @@ func getLangISO3(s []byte) (langID, error) {
}
}
if i := altLangISO3.Index(s); i != -1 {
return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
}
n := strToInt(s)
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
return langID(n) + langNoIndexOffset, nil
return Language(n) + langNoIndexOffset, nil
}
// Check for non-canonical uses of ISO3.
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
return langID(i), nil
return Language(i), nil
}
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
return 0, errSyntax
return 0, ErrSyntax
}
// stringToBuf writes the string to b and returns the number of bytes
// StringToBuf writes the string to b and returns the number of bytes
// written. cap(b) must be >= 3.
func (id langID) stringToBuf(b []byte) int {
func (id Language) StringToBuf(b []byte) int {
if id >= langNoIndexOffset {
intToStr(uint(id)-langNoIndexOffset, b[:3])
return 3
@ -140,7 +147,7 @@ func (id langID) stringToBuf(b []byte) int {
// String returns the BCP 47 representation of the langID.
// Use b as variable name, instead of id, to ensure the variable
// used is consistent with that of Base in which this type is embedded.
func (b langID) String() string {
func (b Language) String() string {
if b == 0 {
return "und"
} else if b >= langNoIndexOffset {
@ -157,7 +164,7 @@ func (b langID) String() string {
}
// ISO3 returns the ISO 639-3 language code.
func (b langID) ISO3() string {
func (b Language) ISO3() string {
if b == 0 || b >= langNoIndexOffset {
return b.String()
}
@ -173,15 +180,24 @@ func (b langID) ISO3() string {
}
// IsPrivateUse reports whether this language code is reserved for private use.
func (b langID) IsPrivateUse() bool {
func (b Language) IsPrivateUse() bool {
return langPrivateStart <= b && b <= langPrivateEnd
}
type regionID uint16
// SuppressScript returns the script marked as SuppressScript in the IANA
// language tag repository, or 0 if there is no such script.
func (b Language) SuppressScript() Script {
if b < langNoIndexOffset {
return Script(suppressScript[b])
}
return 0
}
type Region uint16
// getRegionID returns the region id for s if s is a valid 2-letter region code
// or unknownRegion.
func getRegionID(s []byte) (regionID, error) {
func getRegionID(s []byte) (Region, error) {
if len(s) == 3 {
if isAlpha(s[0]) {
return getRegionISO3(s)
@ -195,34 +211,34 @@ func getRegionID(s []byte) (regionID, error) {
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO2(s []byte) (regionID, error) {
func getRegionISO2(s []byte) (Region, error) {
i, err := findIndex(regionISO, s, "ZZ")
if err != nil {
return 0, err
}
return regionID(i) + isoRegionOffset, nil
return Region(i) + isoRegionOffset, nil
}
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO3(s []byte) (regionID, error) {
func getRegionISO3(s []byte) (Region, error) {
if tag.FixCase("ZZZ", s) {
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
return regionID(i) + isoRegionOffset, nil
return Region(i) + isoRegionOffset, nil
}
}
for i := 0; i < len(altRegionISO3); i += 3 {
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
return regionID(altRegionIDs[i/3]), nil
return Region(altRegionIDs[i/3]), nil
}
}
return 0, mkErrInvalid(s)
return 0, NewValueError(s)
}
return 0, errSyntax
return 0, ErrSyntax
}
func getRegionM49(n int) (regionID, error) {
func getRegionM49(n int) (Region, error) {
if 0 < n && n <= 999 {
const (
searchBits = 7
@ -236,7 +252,7 @@ func getRegionM49(n int) (regionID, error) {
return buf[i] >= val
})
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
return regionID(r & regionMask), nil
return Region(r & regionMask), nil
}
}
var e ValueError
@ -247,13 +263,13 @@ func getRegionM49(n int) (regionID, error) {
// normRegion returns a region if r is deprecated or 0 otherwise.
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
// TODO: consider mapping split up regions to new most populous one (like CLDR).
func normRegion(r regionID) regionID {
func normRegion(r Region) Region {
m := regionOldMap
k := sort.Search(len(m), func(i int) bool {
return m[i].from >= uint16(r)
return m[i].From >= uint16(r)
})
if k < len(m) && m[k].from == uint16(r) {
return regionID(m[k].to)
if k < len(m) && m[k].From == uint16(r) {
return Region(m[k].To)
}
return 0
}
@ -264,13 +280,13 @@ const (
bcp47Region
)
func (r regionID) typ() byte {
func (r Region) typ() byte {
return regionTypes[r]
}
// String returns the BCP 47 representation for the region.
// It returns "ZZ" for an unspecified region.
func (r regionID) String() string {
func (r Region) String() string {
if r < isoRegionOffset {
if r == 0 {
return "ZZ"
@ -284,7 +300,7 @@ func (r regionID) String() string {
// ISO3 returns the 3-letter ISO code of r.
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r regionID) ISO3() string {
func (r Region) ISO3() string {
if r < isoRegionOffset {
return "ZZZ"
}
@ -301,29 +317,29 @@ func (r regionID) ISO3() string {
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r.
func (r regionID) M49() int {
func (r Region) M49() int {
return int(m49[r])
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r regionID) IsPrivateUse() bool {
func (r Region) IsPrivateUse() bool {
return r.typ()&iso3166UserAssigned != 0
}
type scriptID uint8
type Script uint8
// getScriptID returns the script id for string s. It assumes that s
// is of the format [A-Z][a-z]{3}.
func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
func getScriptID(idx tag.Index, s []byte) (Script, error) {
i, err := findIndex(idx, s, "Zzzz")
return scriptID(i), err
return Script(i), err
}
// String returns the script code in title case.
// It returns "Zzzz" for an unspecified script.
func (s scriptID) String() string {
func (s Script) String() string {
if s == 0 {
return "Zzzz"
}
@ -331,7 +347,7 @@ func (s scriptID) String() string {
}
// IsPrivateUse reports whether this script code is reserved for private use.
func (s scriptID) IsPrivateUse() bool {
func (s Script) IsPrivateUse() bool {
return _Qaaa <= s && s <= _Qabx
}
@ -389,7 +405,7 @@ func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
if v < 0 {
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
}
t.lang = langID(v)
t.LangID = Language(v)
return t, true
}
return t, false

226
vendor/golang.org/x/text/internal/language/match.go generated vendored Normal file
View File

@ -0,0 +1,226 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import "errors"
// scriptRegionFlags is an 8-bit flag type.
// NOTE(review): presumably the type of the flags field of the likely-subtag
// table entries — confirm against tables.go.
type scriptRegionFlags uint8
// Flag bits that addTags tests against the flags field of likely-subtag table
// entries.
const (
// isList marks an entry that refers to a range in a separate list table
// instead of holding a single value.
isList = 1 << iota
// scriptInFrom is checked by addTags when matching a list entry by script.
scriptInFrom
// regionInFrom: NOTE(review) presumably the region counterpart of
// scriptInFrom; its use is not visible here — confirm.
regionInFrom
)
// setUndefinedLang sets the language of t to id, but only if t does not have
// a language yet.
func (t *Tag) setUndefinedLang(id Language) {
	if t.LangID != 0 {
		return
	}
	t.LangID = id
}
// setUndefinedScript sets the script of t to id, but only if t does not have
// a script yet.
func (t *Tag) setUndefinedScript(id Script) {
	if t.ScriptID != 0 {
		return
	}
	t.ScriptID = id
}
// setUndefinedRegion sets the region of t to id if t has no region yet or if
// its current region contains id.
func (t *Tag) setUndefinedRegion(id Region) {
	if t.RegionID != 0 && !t.RegionID.Contains(id) {
		return
	}
	t.RegionID = id
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// addLikelySubtags sets subtags to their most likely value, given the locale.
// In most cases this means setting fields for unknown values, but in some
// cases it may alter a value. If addTags fails, t is returned unchanged
// together with the error. t is also returned unchanged when the expansion
// has the same language, script and region.
func (t Tag) addLikelySubtags() (Tag, error) {
	expanded, err := addTags(t)
	switch {
	case err != nil:
		return t, err
	case expanded.equalTags(t):
		return t, nil
	}
	expanded.RemakeString()
	return expanded, nil
}
// specializeRegion attempts to specialize a group region. It reports whether
// t.RegionID is a region group; the region itself is only replaced when the
// group's likely language and script both equal those of t.
func specializeRegion(t *Tag) bool {
	group := regionInclusion[t.RegionID]
	if group >= nRegionGroups {
		return false
	}
	likely := likelyRegionGroup[group]
	if t.LangID == Language(likely.lang) && t.ScriptID == Script(likely.script) {
		t.RegionID = Region(likely.region)
	}
	return true
}
// Maximize returns a new tag with missing tags filled in. It forwards the
// result of addTags unchanged, including its error.
func (t Tag) Maximize() (Tag, error) {
	maximized, err := addTags(t)
	return maximized, err
}
// addTags returns t with its undefined language, script and region subtags
// filled in from the likely-subtags tables. Private-use tags are returned
// unchanged. The lookups are tried from most to least specific:
// und-script-region, lang-script / lang-region, und-script, und-region and
// finally lang alone. ErrMissingLikelyTagsData is returned only when none of
// the earlier lookups matched and t.LangID is not below langNoIndexOffset.
func addTags(t Tag) (Tag, error) {
	// We leave private use identifiers alone.
	if t.IsPrivateUse() {
		return t, nil
	}
	if t.ScriptID != 0 && t.RegionID != 0 {
		if t.LangID != 0 {
			// already fully specified
			specializeRegion(&t)
			return t, nil
		}
		// Search matches for und-script-region. Note that for these cases
		// region will never be a group so there is no need to check for this.
		list := likelyRegion[t.RegionID : t.RegionID+1]
		if x := list[0]; x.flags&isList != 0 {
			// For list entries, lang is the start index into likelyRegionList
			// and script is the number of entries.
			list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
		}
		for _, x := range list {
			// Deviating from the spec. See match_test.go for details.
			if Script(x.script) == t.ScriptID {
				t.setUndefinedLang(Language(x.lang))
				return t, nil
			}
		}
	}
	if t.LangID != 0 {
		// Search matches for lang-script and lang-region, where lang != und.
		if t.LangID < langNoIndexOffset {
			x := likelyLang[t.LangID]
			if x.flags&isList != 0 {
				// For list entries, region is the start index into
				// likelyLangList and script is the number of entries.
				list := likelyLangList[x.region : x.region+uint16(x.script)]
				if t.ScriptID != 0 {
					for _, x := range list {
						if Script(x.script) == t.ScriptID && x.flags&scriptInFrom != 0 {
							t.setUndefinedRegion(Region(x.region))
							return t, nil
						}
					}
				} else if t.RegionID != 0 {
					count := 0
					goodScript := true
					tt := t
					for _, x := range list {
						// We visit all entries for which the script was not
						// defined, including the ones where the region was not
						// defined. This allows for proper disambiguation within
						// regions.
						if x.flags&scriptInFrom == 0 && t.RegionID.Contains(Region(x.region)) {
							tt.RegionID = Region(x.region)
							tt.setUndefinedScript(Script(x.script))
							goodScript = goodScript && tt.ScriptID == Script(x.script)
							count++
						}
					}
					if count == 1 {
						return tt, nil
					}
					// Even if we fail to find a unique Region, we might have
					// an unambiguous script.
					if goodScript {
						t.ScriptID = tt.ScriptID
					}
				}
			}
		}
	} else {
		// Search matches for und-script.
		if t.ScriptID != 0 {
			x := likelyScript[t.ScriptID]
			if x.region != 0 {
				t.setUndefinedRegion(Region(x.region))
				t.setUndefinedLang(Language(x.lang))
				return t, nil
			}
		}
		// Search matches for und-region. If und-script-region exists, it would
		// have been found earlier.
		if t.RegionID != 0 {
			if i := regionInclusion[t.RegionID]; i < nRegionGroups {
				x := likelyRegionGroup[i]
				if x.region != 0 {
					t.setUndefinedLang(Language(x.lang))
					t.setUndefinedScript(Script(x.script))
					t.RegionID = Region(x.region)
				}
			} else {
				x := likelyRegion[t.RegionID]
				if x.flags&isList != 0 {
					// Only the first entry of the list is considered here.
					x = likelyRegionList[x.lang]
				}
				if x.script != 0 && x.flags != scriptInFrom {
					t.setUndefinedLang(Language(x.lang))
					t.setUndefinedScript(Script(x.script))
					return t, nil
				}
			}
		}
	}
	// Search matches for lang.
	if t.LangID < langNoIndexOffset {
		x := likelyLang[t.LangID]
		if x.flags&isList != 0 {
			// Only the first entry of the list is considered here.
			x = likelyLangList[x.region]
		}
		if x.region != 0 {
			t.setUndefinedScript(Script(x.script))
			t.setUndefinedRegion(Region(x.region))
		}
		specializeRegion(&t)
		if t.LangID == 0 {
			t.LangID = _en // default language
		}
		return t, nil
	}
	return t, ErrMissingLikelyTagsData
}
// setTagsFrom overwrites the language, script and region of t with those of
// id. No other field of t is touched.
func (t *Tag) setTagsFrom(id Tag) {
	t.LangID, t.ScriptID, t.RegionID = id.LangID, id.ScriptID, id.RegionID
}
// minimize removes the region or script subtags from t such that
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
// The string form of the result is rebuilt only when minimizeTags succeeds.
func (t Tag) minimize() (Tag, error) {
	reduced, err := minimizeTags(t)
	if err == nil {
		reduced.RemakeString()
	}
	return reduced, err
}
// minimizeTags mimics the behavior of the ICU 51 C implementation.
// It tries, in order, lang, lang-region and lang-script, and adopts the first
// candidate whose maximized form equals the maximized form of t. If no
// candidate qualifies, t is returned as is.
func minimizeTags(t Tag) (Tag, error) {
	if t.equalTags(Und) {
		return t, nil
	}
	full, err := addTags(t)
	if err != nil {
		return t, err
	}
	candidates := [...]Tag{
		{LangID: t.LangID},
		{LangID: t.LangID, RegionID: t.RegionID},
		{LangID: t.LangID, ScriptID: t.ScriptID},
	}
	for _, cand := range candidates {
		expanded, err := addTags(cand)
		if err != nil || !full.equalTags(expanded) {
			continue
		}
		t.setTagsFrom(cand)
		break
	}
	return t, nil
}

594
vendor/golang.org/x/text/internal/language/parse.go generated vendored Normal file
View File

@ -0,0 +1,594 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"bytes"
"errors"
"fmt"
"sort"
"golang.org/x/text/internal/tag"
)
// isAlpha reports whether b is a letter rather than a digit.
// b must be an ASCII letter or digit; under that precondition every byte
// that sorts after '9' is a letter.
func isAlpha(b byte) bool {
	return '9' < b
}
// isAlphaNum reports whether s consists solely of ASCII letters and digits.
// An empty s yields true.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		lower := 'a' <= c && c <= 'z'
		upper := 'A' <= c && c <= 'Z'
		digit := '0' <= c && c <= '9'
		if !lower && !upper && !digit {
			return false
		}
	}
	return true
}
// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")
// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	// v holds the offending subtag, zero-padded to eight bytes.
	v [8]byte
}

// NewValueError creates a new ValueError for the given subtag. At most the
// first eight bytes of tag are retained.
func NewValueError(tag []byte) ValueError {
	e := ValueError{}
	copy(e.v[:], tag)
	return e
}

// tag returns the stored subtag without its zero padding.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n >= 0 {
		return e.v[:n]
	}
	return e.v[:]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	subtag := e.tag()
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	subtag := e.tag()
	return string(subtag)
}
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
// The buffer b is normalized in place while scanning: malformed tokens are
// removed and accepted tokens may be rewritten by the parsing routines.
type scanner struct {
	b     []byte                      // input buffer being scanned and rewritten
	bytes [max99thPercentileSize]byte // inline storage used by makeScannerString for inputs that fit
	token []byte                      // current token (a slice of b); nil when scan found none
	start int                         // start position of the current token
	end   int                         // end position of the current token
	next  int                         // next point for scan
	err   error                       // error recorded through setError, if any
	done  bool                        // set by scan once the input is exhausted
}
// makeScannerString returns a scanner over a private copy of s, positioned
// at the first token. Inputs that fit are copied into the scanner's inline
// array; longer inputs are converted to a freshly allocated byte slice.
func makeScannerString(s string) scanner {
	var scan scanner
	if len(s) > len(scan.bytes) {
		scan.b = []byte(s)
	} else {
		n := copy(scan.bytes[:], s)
		scan.b = scan.bytes[:n]
	}
	scan.init()
	return scan
}
// makeScanner returns a scanner using b as the input buffer, positioned at
// the first token. b is not copied and may be modified by the scanner
// routines.
func makeScanner(b []byte) scanner {
	var scan scanner
	scan.b = b
	scan.init()
	return scan
}
// init normalizes every '_' separator in the buffer to '-' and then scans
// the first token.
func (s *scanner) init() {
	for i := range s.b {
		if s.b[i] == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}
// toLower converts the ASCII upper-case letters of the buffer in the range
// [start, end) to lower case, in place. Other bytes are left untouched.
func (s *scanner) toLower(start, end int) {
	const caseDelta = 'a' - 'A'
	for i := start; i < end; i++ {
		if c := s.b[i]; c >= 'A' && c <= 'Z' {
			s.b[i] = c + caseDelta
		}
	}
}
// setError records e as the scanner's error. The first error wins, except
// that ErrSyntax replaces a previously recorded error of any other kind.
func (s *scanner) setError(e error) {
	switch {
	case s.err == nil:
		s.err = e
	case e == ErrSyntax && s.err != ErrSyntax:
		s.err = e
	}
}
// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
//
// The bytes that followed oldEnd are moved so that they follow the resized
// range; the contents of the range itself are left for the caller to fill.
// The buffer is resized in place when its capacity allows, and reallocated
// only when the new length exceeds the capacity. The earlier code chose
// between the two on the wrong condition: its in-place path appended to
// s.b[end:], which drops the bytes before the range, or panics when end
// exceeds the capacity.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			// Not enough room: allocate and carry over the prefix.
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			// Enough room: the prefix is already in place.
			b = s.b[:n]
		}
		// copy handles overlapping source and destination, so this is safe
		// for both growing and shrinking in place.
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		s.next = end + (s.next - s.end)
		s.end = end
	}
}
// replace replaces the current token with repl, resizing the buffer so that
// repl fits exactly where the token used to be.
func (s *scanner) replace(repl string) {
	size := len(repl)
	s.resizeRange(s.start, s.end, size)
	copy(s.b[s.start:], repl)
}
// gobble removes the current token from the input and records e through
// setError.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		// The token starts the buffer: shift everything from s.next (the
		// position after the token's trailing separator) to the front.
		s.b = s.b[:+copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		// Otherwise remove the token together with the byte preceding it
		// (its leading separator) by shifting the remainder down.
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	// The next scan resumes where the removed token used to start.
	s.next = s.start
}
// deleteRange removes the given range from s.b before the current token and
// shifts the scanner positions (start, end, next) down by the removed size.
func (s *scanner) deleteRange(start, end int) {
	kept := copy(s.b[start:], s.b[end:])
	s.b = s.b[:start+kept]
	removed := end - start
	s.start -= removed
	s.end -= removed
	s.next -= removed
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output. Empty tokens are treated the
// same way.
// It returns the end position of the last token consumed, i.e. the value of
// s.end before this call. When no further token exists, s.token is nil and
// s.done is set.
func (s *scanner) scan() (end int) {
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			// Last token: it extends to the end of the buffer.
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		if i < 1 || i > 8 || !isAlphaNum(token) {
			// gobble resets s.next to s.start, so the loop continues with
			// whatever followed the removed token.
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	// A dangling separator at the end of the input is an error; strip it.
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}
// acceptMinSize parses multiple tokens of the given size or greater.
// It first advances past the current token and then consumes tokens for as
// long as they are at least min bytes long.
// It returns the end position of the last token consumed; if none qualified
// this is the end of the token that was current on entry.
func (s *scanner) acceptMinSize(min int) (end int) {
	end = s.end
	for s.scan(); len(s.token) >= min; s.scan() {
		end = s.end
	}
	return end
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
//
// An empty string yields Und and ErrSyntax. Inputs short enough to be a
// grandfathered tag are first looked up, case-insensitively and with '_'
// treated as '-', in the grandfathered table.
func Parse(s string) (t Tag, err error) {
	// TODO: consider supporting old-style locale key-value pairs.
	if s == "" {
		return Und, ErrSyntax
	}
	if len(s) <= maxAltTaglen {
		var key [maxAltTaglen]byte
		for i, c := range s {
			// Generating invalid UTF-8 is okay as it won't match.
			switch {
			case 'A' <= c && c <= 'Z':
				c += 'a' - 'A'
			case c == '_':
				c = '-'
			}
			key[i] = byte(c)
		}
		if alt, ok := grandfathered(key); ok {
			return alt, nil
		}
	}
	scan := makeScannerString(s)
	return parse(&scan, s)
}
// parse builds a Tag from the tokens of scan, which must be positioned at
// the first token. s is the original input; it is reused as the Tag's string
// when, after trimming to the parsed length, it compares equal to the
// normalized buffer under tag.Compare. The returned error is the one
// recorded in the scanner, except for the early ErrSyntax returns.
func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		// A first token of at most one byte is only accepted when it is
		// 'x', i.e. the start of a private-use tag.
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		// The first token must be a 2- or 3-byte language subtag.
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan)
		if n := len(scan.token); n == 1 {
			// A single-byte token introduces the extensions.
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			// Unparsed trailing input: flag it and cut it off.
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		// The tag has variants or extensions, so its string form is kept.
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		// Nothing follows language, script and region; no string is stored.
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
//
// The scanner buffer is normalized along the way: the language and region
// tokens are replaced by their canonical string, subtags for which no ID is
// found are removed, and everything from the variants on is lower-cased.
// Lookup errors are recorded in the scanner rather than returned.
func parseTag(scan *scanner) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		lang, e := getLangID(scan.token)
		if lang != 0 {
			t.LangID = lang
			// The canonical code may be two or three letters long, so the
			// separator and the start of the token to remove are placed
			// relative to its actual length. Assuming three letters left a
			// stray byte of the previous language code in the buffer.
			langStr := lang.String()
			copy(scan.b[langStart:], langStr)
			scan.b[langStart+len(langStr)] = '-'
			scan.start = langStart + len(langStr) + 1
		}
		// Remove the extlang token (and what remains of the old language).
		scan.gobble(e)
		end = scan.scan()
	}
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	scan.toLower(scan.start, len(scan.b))
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
var separator = []byte{'-'}
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed. Unknown variants are removed from the
// buffer and recorded as a ValueError. If the variants are not in ascending
// variantIndex order they are sorted and rewritten in the buffer.
// end is the end position of the input parsed so far; the returned value is
// the end position after the variants. The parameter t is not used.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	// Fixed-size backing arrays avoid allocating for up to four variants.
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				// Out of order or duplicate: sorting is required below.
				needSort = true
				// There are no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		// Compact the sorted slices in place, keeping one of each variant.
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		// bytes.Join builds a new slice, so the tokens (which alias scan.b)
		// are not clobbered while the buffer is rewritten.
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
// variantsSort implements sort.Interface over two parallel slices: i holds
// the sort keys and v the values that travel with them.
type variantsSort struct {
	i []uint8
	v [][]byte
}

// Len returns the number of keys.
func (s variantsSort) Len() int {
	n := len(s.i)
	return n
}

// Swap exchanges both the keys and the values at positions i and j.
func (s variantsSort) Swap(i, j int) {
	keyI, keyJ := s.i[i], s.i[j]
	s.i[i], s.i[j] = keyJ, keyI
	valI, valJ := s.v[i], s.v[j]
	s.v[i], s.v[j] = valJ, valI
}

// Less orders elements by ascending key.
func (s variantsSort) Less(i, j int) bool {
	return s.i[j] > s.i[i]
}
// bytesSort implements sort.Interface for byte slices that are ordered by
// their first n bytes only. Every element must be at least n bytes long.
type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

// Len returns the number of elements.
func (b bytesSort) Len() int {
	count := len(b.b)
	return count
}

// Swap exchanges the elements at positions i and j.
func (b bytesSort) Swap(i, j int) {
	left, right := b.b[i], b.b[j]
	b.b[i], b.b[j] = right, left
}

// Less reports whether the first n bytes of element i sort before those of
// element j. Elements with equal prefixes are not less than each other.
func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		if x, y := b.b[i][k], b.b[j][k]; x != y {
			return x < y
		}
	}
	return false
}
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
//
// Extensions are reordered by their singleton byte, with a private-use
// ("x") extension always placed last. Extensions that are too short are
// flagged with ErrSyntax and left out.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			// The singleton has no (sufficiently long) value.
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			// Fast path: the first extension is also the last one, so the
			// buffer is already in its final order.
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			// Private use ends the tag; nothing is parsed beyond it.
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	// Rebuild the extension part of the buffer from the ordered pieces.
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
// parseExtension parses a single extension and returns the position of
// the extension end. The scanner must be positioned at the extension's
// singleton token. The accepted content depends on the singleton:
//
//	u: attributes (tokens longer than 2 bytes) followed by key/value groups
//	   (a 2-byte key and tokens of at least 3 bytes); both are sorted in the
//	   buffer when needed, and duplicate keys are dropped.
//	t: an optional language tag followed by field groups.
//	x: any tokens of at least 1 byte.
//	other: any tokens of at least 2 bytes.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u':
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				// bytes.Join copies, so the aliased tokens can be safely
				// overwritten with the sorted sequence.
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			keyEnd := scan.end
			end = scan.acceptMinSize(3)
			// TODO: check key value validity
			if keyEnd == end || bytes.Compare(key, last) != 1 {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart, keyEnd := scan.start, scan.end
					end = scan.acceptMinSize(3)
					if keyEnd != end {
						keys = append(keys, scan.b[keyStart:end])
					} else {
						// A key without a value is an error and is skipped.
						scan.setError(ErrSyntax)
						end = keyStart
					}
				}
				// Stable sort keeps the first occurrence of a key first.
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							// Same key with a different value.
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					// Dropped keys made the section shorter; close the gap.
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't':
		scan.scan()
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan)
			scan.toLower(start, end)
		}
		// Field groups: a 2-byte key whose second byte is a digit, followed
		// by tokens of at least 3 bytes.
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}
// getExtension returns the end position and the text of the extension that
// starts at position p of s. A leading '-' at p is skipped. A private-use
// ("x") extension always extends to the end of s; any other extension ends
// where nextExtension finds the following one. p must index into s.
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	switch s[p] {
	case 'x':
		end = len(s)
	default:
		end = nextExtension(s, p)
	}
	return end, s[p:end]
}
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p. It returns the index of the
// first '-' of that pattern, or len(s) if there is none.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	limit := len(s) - 3
	for p < limit {
		switch {
		case s[p] != '-':
			p++
		case s[p+2] == '-':
			return p
		default:
			// A separator followed by a longer token: skip past both bytes.
			p += 3
		}
	}
	return len(s)
}

3431
vendor/golang.org/x/text/internal/language/tables.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

48
vendor/golang.org/x/text/internal/language/tags.go generated vendored Normal file
View File

@ -0,0 +1,48 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func MustParse(s string) Tag {
	tag, parseErr := Parse(s)
	if parseErr == nil {
		return tag
	}
	panic(parseErr)
}
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
// It simplifies safe initialization of Language values.
func MustParseBase(s string) Language {
	base, parseErr := ParseBase(s)
	if parseErr == nil {
		return base
	}
	panic(parseErr)
}
// MustParseScript is like ParseScript, but panics if the given script cannot be
// parsed. It simplifies safe initialization of Script values.
func MustParseScript(s string) Script {
	script, parseErr := ParseScript(s)
	if parseErr == nil {
		return script
	}
	panic(parseErr)
}
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
// parsed. It simplifies safe initialization of Region values.
func MustParseRegion(s string) Region {
	region, parseErr := ParseRegion(s)
	if parseErr == nil {
		return region
	}
	panic(parseErr)
}
// Und is the root language.
var Und Tag

View File

@ -53,7 +53,7 @@
// Indexes of starter blocks in case of multiple trie roots.
//
// It is recommended that users test the generated trie by checking the returned
// value for every rune. Such exhaustive tests are possible as the the number of
// value for every rune. Such exhaustive tests are possible as the number of
// runes in Unicode is limited.
package triegen // import "golang.org/x/text/internal/triegen"

View File

@ -3,8 +3,8 @@
// license that can be found in the LICENSE file.
// Package ucd provides a parser for Unicode Character Database files, the
// format of which is defined in http://www.unicode.org/reports/tr44/. See
// http://www.unicode.org/Public/UCD/latest/ucd/ for example files.
// format of which is defined in https://www.unicode.org/reports/tr44/. See
// https://www.unicode.org/Public/UCD/latest/ucd/ for example files.
//
// It currently does not support substitutions of missing fields.
package ucd // import "golang.org/x/text/internal/ucd"

View File

@ -0,0 +1,87 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf8internal contains low-level utf8-related constants, tables, etc.
// that are used internally by the text package.
package utf8internal
// The default lowest and highest continuation byte.
const (
LoCB = 0x80 // 1000 0000
HiCB = 0xBF // 1011 1111
)
// Constants related to getting information of first bytes of UTF-8 sequences.
const (
// ASCII identifies a UTF-8 byte as ASCII.
ASCII = as
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
// sequence.
FirstInvalid = xx
// SizeMask is a mask for the size bits. Use x&SizeMask to get the size.
SizeMask = 7
// AcceptShift is the right-shift count for the first byte info byte to get
// the index into the AcceptRanges table. See AcceptRanges.
AcceptShift = 4
// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
s5 = 0x34 // accept 3, size 4
s6 = 0x04 // accept 0, size 4
s7 = 0x44 // accept 4, size 4
)
// First is information about the first byte in a UTF-8 sequence.
var First = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// AcceptRange gives the range of valid values for the second byte in a UTF-8
// sequence for any value for First that is not ASCII or FirstInvalid.
type AcceptRange struct {
Lo uint8 // lowest value for second byte.
Hi uint8 // highest value for second byte.
}
// AcceptRanges is an array of AcceptRange values. For a given byte sequence b
//
// AcceptRanges[First[b[0]]>>AcceptShift]
//
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
// at b[0].
var AcceptRanges = [...]AcceptRange{
0: {LoCB, HiCB},
1: {0xA0, HiCB},
2: {LoCB, 0x9F},
3: {0x90, HiCB},
4: {LoCB, 0x8F},
}

View File

@ -7,6 +7,8 @@ package language
import (
"fmt"
"sort"
"golang.org/x/text/internal/language"
)
// The Coverage interface is used to define the level of coverage of an
@ -44,9 +46,9 @@ type allSubtags struct{}
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" region is not returned.
func (s allSubtags) Regions() []Region {
reg := make([]Region, numRegions)
reg := make([]Region, language.NumRegions)
for i := range reg {
reg[i] = Region{regionID(i + 1)}
reg[i] = Region{language.Region(i + 1)}
}
return reg
}
@ -55,9 +57,9 @@ func (s allSubtags) Regions() []Region {
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" script is not returned.
func (s allSubtags) Scripts() []Script {
scr := make([]Script, numScripts)
scr := make([]Script, language.NumScripts)
for i := range scr {
scr[i] = Script{scriptID(i + 1)}
scr[i] = Script{language.Script(i + 1)}
}
return scr
}
@ -65,22 +67,10 @@ func (s allSubtags) Scripts() []Script {
// BaseLanguages returns the list of all supported base languages. It generates
// the list by traversing the internal structures.
func (s allSubtags) BaseLanguages() []Base {
base := make([]Base, 0, numLanguages)
for i := 0; i < langNoIndexOffset; i++ {
// We included "und" already for the value 0.
if i != nonCanonicalUnd {
base = append(base, Base{langID(i)})
}
}
i := langNoIndexOffset
for _, v := range langNoIndex {
for k := 0; k < 8; k++ {
if v&1 == 1 {
base = append(base, Base{langID(i)})
}
v >>= 1
i++
}
bs := language.BaseLanguages()
base := make([]Base, len(bs))
for i, b := range bs {
base[i] = Base{b}
}
return base
}
@ -90,7 +80,7 @@ func (s allSubtags) Tags() []Tag {
return nil
}
// coverage is used used by NewCoverage which is used as a convenient way for
// coverage is used by NewCoverage which is used as a convenient way for
// creating Coverage implementations for partially defined data. Very often a
// package will only need to define a subset of slices. coverage provides a
// convenient way to do this. Moreover, packages using NewCoverage, instead of
@ -134,7 +124,7 @@ func (s *coverage) BaseLanguages() []Base {
}
a := make([]Base, len(tags))
for i, t := range tags {
a[i] = Base{langID(t.lang)}
a[i] = Base{language.Language(t.lang())}
}
sort.Sort(bases(a))
k := 0

File diff suppressed because it is too large Load Diff

View File

@ -1,162 +0,0 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file generates derivative tables based on the language package itself.
import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"log"
"reflect"
"sort"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
draft = flag.String("draft",
"contributed",
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
)
// main generates index.go. It collects every locale found in the CLDR core
// data (plus the locales named by the plural rules), splits them into core
// tags and special tags (those with variants or extensions), and writes the
// NumCompactTags constant, the specialTags slice and the coreTags map.
func main() {
	gen.Init()

	// Read the CLDR zip file.
	r := gen.OpenCLDRCoreZip()
	defer r.Close()

	d := &cldr.Decoder{}
	data, err := d.DecodeZip(r)
	if err != nil {
		log.Fatalf("DecodeZip: %v", err)
	}

	w := gen.NewCodeWriter()
	// The output is written when main returns, after all tables have been
	// added to w.
	defer func() {
		buf := &bytes.Buffer{}

		if _, err = w.WriteGo(buf, "language", ""); err != nil {
			log.Fatalf("Error formatting file index.go: %v", err)
		}

		// Since we're generating a table for our own package we need to rewrite
		// doing the equivalent of go fmt -r 'language.b -> b'. Using
		// bytes.Replace will do.
		out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1)
		if err := ioutil.WriteFile("index.go", out, 0600); err != nil {
			log.Fatalf("Could not create file index.go: %v", err)
		}
	}()

	// m is used as a set to deduplicate the tags.
	m := map[language.Tag]bool{}
	for _, lang := range data.Locales() {
		// We include all locales unconditionally to be consistent with en_US.
		// We want en_US, even though it has no data associated with it.

		// TODO: put any of the languages for which no data exists at the end
		// of the index. This allows all components based on ICU to use that
		// as the cutoff point.
		// if x := data.RawLDML(lang); false ||
		// 	x.LocaleDisplayNames != nil ||
		// 	x.Characters != nil ||
		// 	x.Delimiters != nil ||
		// 	x.Measurement != nil ||
		// 	x.Dates != nil ||
		// 	x.Numbers != nil ||
		// 	x.Units != nil ||
		// 	x.ListPatterns != nil ||
		// 	x.Collations != nil ||
		// 	x.Segmentations != nil ||
		// 	x.Rbnf != nil ||
		// 	x.Annotations != nil ||
		// 	x.Metadata != nil {

		// TODO: support POSIX natively, albeit non-standard.
		tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
		m[tag] = true
		// }
	}
	// Include locales for plural rules, which uses a different structure.
	for _, plurals := range data.Supplemental().Plurals {
		for _, rules := range plurals.PluralRules {
			for _, lang := range strings.Split(rules.Locales, " ") {
				m[language.Make(lang)] = true
			}
		}
	}

	var core, special []language.Tag

	for t := range m {
		// The only extension the generator knows how to handle is the one
		// produced for POSIX locales above.
		if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
			log.Fatalf("Unexpected extension %v in %v", x, t)
		}
		if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
			core = append(core, t)
		} else {
			special = append(special, t)
		}
	}

	w.WriteComment(`
	NumCompactTags is the number of common tags. The maximum tag is
	NumCompactTags-1.`)
	w.WriteConst("NumCompactTags", len(core)+len(special))

	// Map iteration order is random; sorting makes the output deterministic.
	sort.Sort(byAlpha(special))
	w.WriteVar("specialTags", special)

	// TODO: order by frequency?
	sort.Sort(byAlpha(core))

	// Size computations are just an estimate.
	w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size())
	w.Size += len(core) * 6 // size of uint32 and uint16

	fmt.Fprintln(w)
	fmt.Fprintln(w, "var coreTags = map[uint32]uint16{")
	fmt.Fprintln(w, "0x0: 0, // und")
	i := len(special) + 1 // Und and special tags already written.
	for _, t := range core {
		if t == language.Und {
			continue
		}
		fmt.Fprint(w.Hash, t, i)
		b, s, r := t.Raw()
		// The map key is the concatenation of the hex indices of the base
		// language (3 nibbles), script (2 nibbles) and region (3 nibbles).
		fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n",
			getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
			getIndex(s, 2),
			getIndex(r, 3),
			i, t)
		i++
	}
	fmt.Fprintln(w, "}")
}
// getIndex prints the subtag type and extracts its index of size nibble.
// It formats x with %#v, takes the hexadecimal digits that follow the first
// "0x" (dropping the final character of the formatted value), and left-pads
// them with zeros to n digits. The digits must not be longer than n.
func getIndex(x interface{}, n int) string {
	repr := fmt.Sprintf("%#v", x) // repr is of form Type{typeID: 0x00}
	begin := strings.Index(repr, "0x") + 2
	digits := repr[begin : len(repr)-1]
	padding := strings.Repeat("0", n-len(digits))
	return padding + digits
}
// byAlpha implements sort.Interface, ordering tags by their string form.
type byAlpha []language.Tag

// Len returns the number of tags.
func (a byAlpha) Len() int {
	return len(a)
}

// Swap exchanges the tags at positions i and j.
func (a byAlpha) Swap(i, j int) {
	a[j], a[i] = a[i], a[j]
}

// Less reports whether the string form of tag i sorts before that of tag j.
func (a byAlpha) Less(i, j int) bool {
	left, right := a[i].String(), a[j].String()
	return left < right
}

View File

@ -1,783 +0,0 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package language
// NumCompactTags is the number of common tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = 768
var specialTags = []Tag{ // 2 elements
0: {lang: 0xd7, region: 0x6e, script: 0x0, pVariant: 0x5, pExt: 0xe, str: "ca-ES-valencia"},
1: {lang: 0x139, region: 0x135, script: 0x0, pVariant: 0x5, pExt: 0x5, str: "en-US-u-va-posix"},
} // Size: 72 bytes
var coreTags = map[uint32]uint16{
0x0: 0, // und
0x01600000: 3, // af
0x016000d2: 4, // af-NA
0x01600161: 5, // af-ZA
0x01c00000: 6, // agq
0x01c00052: 7, // agq-CM
0x02100000: 8, // ak
0x02100080: 9, // ak-GH
0x02700000: 10, // am
0x0270006f: 11, // am-ET
0x03a00000: 12, // ar
0x03a00001: 13, // ar-001
0x03a00023: 14, // ar-AE
0x03a00039: 15, // ar-BH
0x03a00062: 16, // ar-DJ
0x03a00067: 17, // ar-DZ
0x03a0006b: 18, // ar-EG
0x03a0006c: 19, // ar-EH
0x03a0006d: 20, // ar-ER
0x03a00097: 21, // ar-IL
0x03a0009b: 22, // ar-IQ
0x03a000a1: 23, // ar-JO
0x03a000a8: 24, // ar-KM
0x03a000ac: 25, // ar-KW
0x03a000b0: 26, // ar-LB
0x03a000b9: 27, // ar-LY
0x03a000ba: 28, // ar-MA
0x03a000c9: 29, // ar-MR
0x03a000e1: 30, // ar-OM
0x03a000ed: 31, // ar-PS
0x03a000f3: 32, // ar-QA
0x03a00108: 33, // ar-SA
0x03a0010b: 34, // ar-SD
0x03a00115: 35, // ar-SO
0x03a00117: 36, // ar-SS
0x03a0011c: 37, // ar-SY
0x03a00120: 38, // ar-TD
0x03a00128: 39, // ar-TN
0x03a0015e: 40, // ar-YE
0x04000000: 41, // ars
0x04300000: 42, // as
0x04300099: 43, // as-IN
0x04400000: 44, // asa
0x0440012f: 45, // asa-TZ
0x04800000: 46, // ast
0x0480006e: 47, // ast-ES
0x05800000: 48, // az
0x0581f000: 49, // az-Cyrl
0x0581f032: 50, // az-Cyrl-AZ
0x05857000: 51, // az-Latn
0x05857032: 52, // az-Latn-AZ
0x05e00000: 53, // bas
0x05e00052: 54, // bas-CM
0x07100000: 55, // be
0x07100047: 56, // be-BY
0x07500000: 57, // bem
0x07500162: 58, // bem-ZM
0x07900000: 59, // bez
0x0790012f: 60, // bez-TZ
0x07e00000: 61, // bg
0x07e00038: 62, // bg-BG
0x08200000: 63, // bh
0x0a000000: 64, // bm
0x0a0000c3: 65, // bm-ML
0x0a500000: 66, // bn
0x0a500035: 67, // bn-BD
0x0a500099: 68, // bn-IN
0x0a900000: 69, // bo
0x0a900053: 70, // bo-CN
0x0a900099: 71, // bo-IN
0x0b200000: 72, // br
0x0b200078: 73, // br-FR
0x0b500000: 74, // brx
0x0b500099: 75, // brx-IN
0x0b700000: 76, // bs
0x0b71f000: 77, // bs-Cyrl
0x0b71f033: 78, // bs-Cyrl-BA
0x0b757000: 79, // bs-Latn
0x0b757033: 80, // bs-Latn-BA
0x0d700000: 81, // ca
0x0d700022: 82, // ca-AD
0x0d70006e: 83, // ca-ES
0x0d700078: 84, // ca-FR
0x0d70009e: 85, // ca-IT
0x0db00000: 86, // ccp
0x0db00035: 87, // ccp-BD
0x0db00099: 88, // ccp-IN
0x0dc00000: 89, // ce
0x0dc00106: 90, // ce-RU
0x0df00000: 91, // cgg
0x0df00131: 92, // cgg-UG
0x0e500000: 93, // chr
0x0e500135: 94, // chr-US
0x0e900000: 95, // ckb
0x0e90009b: 96, // ckb-IQ
0x0e90009c: 97, // ckb-IR
0x0fa00000: 98, // cs
0x0fa0005e: 99, // cs-CZ
0x0fe00000: 100, // cu
0x0fe00106: 101, // cu-RU
0x10000000: 102, // cy
0x1000007b: 103, // cy-GB
0x10100000: 104, // da
0x10100063: 105, // da-DK
0x10100082: 106, // da-GL
0x10800000: 107, // dav
0x108000a4: 108, // dav-KE
0x10d00000: 109, // de
0x10d0002e: 110, // de-AT
0x10d00036: 111, // de-BE
0x10d0004e: 112, // de-CH
0x10d00060: 113, // de-DE
0x10d0009e: 114, // de-IT
0x10d000b2: 115, // de-LI
0x10d000b7: 116, // de-LU
0x11700000: 117, // dje
0x117000d4: 118, // dje-NE
0x11f00000: 119, // dsb
0x11f00060: 120, // dsb-DE
0x12400000: 121, // dua
0x12400052: 122, // dua-CM
0x12800000: 123, // dv
0x12b00000: 124, // dyo
0x12b00114: 125, // dyo-SN
0x12d00000: 126, // dz
0x12d00043: 127, // dz-BT
0x12f00000: 128, // ebu
0x12f000a4: 129, // ebu-KE
0x13000000: 130, // ee
0x13000080: 131, // ee-GH
0x13000122: 132, // ee-TG
0x13600000: 133, // el
0x1360005d: 134, // el-CY
0x13600087: 135, // el-GR
0x13900000: 136, // en
0x13900001: 137, // en-001
0x1390001a: 138, // en-150
0x13900025: 139, // en-AG
0x13900026: 140, // en-AI
0x1390002d: 141, // en-AS
0x1390002e: 142, // en-AT
0x1390002f: 143, // en-AU
0x13900034: 144, // en-BB
0x13900036: 145, // en-BE
0x1390003a: 146, // en-BI
0x1390003d: 147, // en-BM
0x13900042: 148, // en-BS
0x13900046: 149, // en-BW
0x13900048: 150, // en-BZ
0x13900049: 151, // en-CA
0x1390004a: 152, // en-CC
0x1390004e: 153, // en-CH
0x13900050: 154, // en-CK
0x13900052: 155, // en-CM
0x1390005c: 156, // en-CX
0x1390005d: 157, // en-CY
0x13900060: 158, // en-DE
0x13900061: 159, // en-DG
0x13900063: 160, // en-DK
0x13900064: 161, // en-DM
0x1390006d: 162, // en-ER
0x13900072: 163, // en-FI
0x13900073: 164, // en-FJ
0x13900074: 165, // en-FK
0x13900075: 166, // en-FM
0x1390007b: 167, // en-GB
0x1390007c: 168, // en-GD
0x1390007f: 169, // en-GG
0x13900080: 170, // en-GH
0x13900081: 171, // en-GI
0x13900083: 172, // en-GM
0x1390008a: 173, // en-GU
0x1390008c: 174, // en-GY
0x1390008d: 175, // en-HK
0x13900096: 176, // en-IE
0x13900097: 177, // en-IL
0x13900098: 178, // en-IM
0x13900099: 179, // en-IN
0x1390009a: 180, // en-IO
0x1390009f: 181, // en-JE
0x139000a0: 182, // en-JM
0x139000a4: 183, // en-KE
0x139000a7: 184, // en-KI
0x139000a9: 185, // en-KN
0x139000ad: 186, // en-KY
0x139000b1: 187, // en-LC
0x139000b4: 188, // en-LR
0x139000b5: 189, // en-LS
0x139000bf: 190, // en-MG
0x139000c0: 191, // en-MH
0x139000c6: 192, // en-MO
0x139000c7: 193, // en-MP
0x139000ca: 194, // en-MS
0x139000cb: 195, // en-MT
0x139000cc: 196, // en-MU
0x139000ce: 197, // en-MW
0x139000d0: 198, // en-MY
0x139000d2: 199, // en-NA
0x139000d5: 200, // en-NF
0x139000d6: 201, // en-NG
0x139000d9: 202, // en-NL
0x139000dd: 203, // en-NR
0x139000df: 204, // en-NU
0x139000e0: 205, // en-NZ
0x139000e6: 206, // en-PG
0x139000e7: 207, // en-PH
0x139000e8: 208, // en-PK
0x139000eb: 209, // en-PN
0x139000ec: 210, // en-PR
0x139000f0: 211, // en-PW
0x13900107: 212, // en-RW
0x13900109: 213, // en-SB
0x1390010a: 214, // en-SC
0x1390010b: 215, // en-SD
0x1390010c: 216, // en-SE
0x1390010d: 217, // en-SG
0x1390010e: 218, // en-SH
0x1390010f: 219, // en-SI
0x13900112: 220, // en-SL
0x13900117: 221, // en-SS
0x1390011b: 222, // en-SX
0x1390011d: 223, // en-SZ
0x1390011f: 224, // en-TC
0x13900125: 225, // en-TK
0x13900129: 226, // en-TO
0x1390012c: 227, // en-TT
0x1390012d: 228, // en-TV
0x1390012f: 229, // en-TZ
0x13900131: 230, // en-UG
0x13900133: 231, // en-UM
0x13900135: 232, // en-US
0x13900139: 233, // en-VC
0x1390013c: 234, // en-VG
0x1390013d: 235, // en-VI
0x1390013f: 236, // en-VU
0x13900142: 237, // en-WS
0x13900161: 238, // en-ZA
0x13900162: 239, // en-ZM
0x13900164: 240, // en-ZW
0x13c00000: 241, // eo
0x13c00001: 242, // eo-001
0x13e00000: 243, // es
0x13e0001f: 244, // es-419
0x13e0002c: 245, // es-AR
0x13e0003f: 246, // es-BO
0x13e00041: 247, // es-BR
0x13e00048: 248, // es-BZ
0x13e00051: 249, // es-CL
0x13e00054: 250, // es-CO
0x13e00056: 251, // es-CR
0x13e00059: 252, // es-CU
0x13e00065: 253, // es-DO
0x13e00068: 254, // es-EA
0x13e00069: 255, // es-EC
0x13e0006e: 256, // es-ES
0x13e00086: 257, // es-GQ
0x13e00089: 258, // es-GT
0x13e0008f: 259, // es-HN
0x13e00094: 260, // es-IC
0x13e000cf: 261, // es-MX
0x13e000d8: 262, // es-NI
0x13e000e2: 263, // es-PA
0x13e000e4: 264, // es-PE
0x13e000e7: 265, // es-PH
0x13e000ec: 266, // es-PR
0x13e000f1: 267, // es-PY
0x13e0011a: 268, // es-SV
0x13e00135: 269, // es-US
0x13e00136: 270, // es-UY
0x13e0013b: 271, // es-VE
0x14000000: 272, // et
0x1400006a: 273, // et-EE
0x14500000: 274, // eu
0x1450006e: 275, // eu-ES
0x14600000: 276, // ewo
0x14600052: 277, // ewo-CM
0x14800000: 278, // fa
0x14800024: 279, // fa-AF
0x1480009c: 280, // fa-IR
0x14e00000: 281, // ff
0x14e00052: 282, // ff-CM
0x14e00084: 283, // ff-GN
0x14e000c9: 284, // ff-MR
0x14e00114: 285, // ff-SN
0x15100000: 286, // fi
0x15100072: 287, // fi-FI
0x15300000: 288, // fil
0x153000e7: 289, // fil-PH
0x15800000: 290, // fo
0x15800063: 291, // fo-DK
0x15800076: 292, // fo-FO
0x15e00000: 293, // fr
0x15e00036: 294, // fr-BE
0x15e00037: 295, // fr-BF
0x15e0003a: 296, // fr-BI
0x15e0003b: 297, // fr-BJ
0x15e0003c: 298, // fr-BL
0x15e00049: 299, // fr-CA
0x15e0004b: 300, // fr-CD
0x15e0004c: 301, // fr-CF
0x15e0004d: 302, // fr-CG
0x15e0004e: 303, // fr-CH
0x15e0004f: 304, // fr-CI
0x15e00052: 305, // fr-CM
0x15e00062: 306, // fr-DJ
0x15e00067: 307, // fr-DZ
0x15e00078: 308, // fr-FR
0x15e0007a: 309, // fr-GA
0x15e0007e: 310, // fr-GF
0x15e00084: 311, // fr-GN
0x15e00085: 312, // fr-GP
0x15e00086: 313, // fr-GQ
0x15e00091: 314, // fr-HT
0x15e000a8: 315, // fr-KM
0x15e000b7: 316, // fr-LU
0x15e000ba: 317, // fr-MA
0x15e000bb: 318, // fr-MC
0x15e000be: 319, // fr-MF
0x15e000bf: 320, // fr-MG
0x15e000c3: 321, // fr-ML
0x15e000c8: 322, // fr-MQ
0x15e000c9: 323, // fr-MR
0x15e000cc: 324, // fr-MU
0x15e000d3: 325, // fr-NC
0x15e000d4: 326, // fr-NE
0x15e000e5: 327, // fr-PF
0x15e000ea: 328, // fr-PM
0x15e00102: 329, // fr-RE
0x15e00107: 330, // fr-RW
0x15e0010a: 331, // fr-SC
0x15e00114: 332, // fr-SN
0x15e0011c: 333, // fr-SY
0x15e00120: 334, // fr-TD
0x15e00122: 335, // fr-TG
0x15e00128: 336, // fr-TN
0x15e0013f: 337, // fr-VU
0x15e00140: 338, // fr-WF
0x15e0015f: 339, // fr-YT
0x16900000: 340, // fur
0x1690009e: 341, // fur-IT
0x16d00000: 342, // fy
0x16d000d9: 343, // fy-NL
0x16e00000: 344, // ga
0x16e00096: 345, // ga-IE
0x17e00000: 346, // gd
0x17e0007b: 347, // gd-GB
0x19000000: 348, // gl
0x1900006e: 349, // gl-ES
0x1a300000: 350, // gsw
0x1a30004e: 351, // gsw-CH
0x1a300078: 352, // gsw-FR
0x1a3000b2: 353, // gsw-LI
0x1a400000: 354, // gu
0x1a400099: 355, // gu-IN
0x1a900000: 356, // guw
0x1ab00000: 357, // guz
0x1ab000a4: 358, // guz-KE
0x1ac00000: 359, // gv
0x1ac00098: 360, // gv-IM
0x1b400000: 361, // ha
0x1b400080: 362, // ha-GH
0x1b4000d4: 363, // ha-NE
0x1b4000d6: 364, // ha-NG
0x1b800000: 365, // haw
0x1b800135: 366, // haw-US
0x1bc00000: 367, // he
0x1bc00097: 368, // he-IL
0x1be00000: 369, // hi
0x1be00099: 370, // hi-IN
0x1d100000: 371, // hr
0x1d100033: 372, // hr-BA
0x1d100090: 373, // hr-HR
0x1d200000: 374, // hsb
0x1d200060: 375, // hsb-DE
0x1d500000: 376, // hu
0x1d500092: 377, // hu-HU
0x1d700000: 378, // hy
0x1d700028: 379, // hy-AM
0x1e100000: 380, // id
0x1e100095: 381, // id-ID
0x1e700000: 382, // ig
0x1e7000d6: 383, // ig-NG
0x1ea00000: 384, // ii
0x1ea00053: 385, // ii-CN
0x1f500000: 386, // io
0x1f800000: 387, // is
0x1f80009d: 388, // is-IS
0x1f900000: 389, // it
0x1f90004e: 390, // it-CH
0x1f90009e: 391, // it-IT
0x1f900113: 392, // it-SM
0x1f900138: 393, // it-VA
0x1fa00000: 394, // iu
0x20000000: 395, // ja
0x200000a2: 396, // ja-JP
0x20300000: 397, // jbo
0x20700000: 398, // jgo
0x20700052: 399, // jgo-CM
0x20a00000: 400, // jmc
0x20a0012f: 401, // jmc-TZ
0x20e00000: 402, // jv
0x21000000: 403, // ka
0x2100007d: 404, // ka-GE
0x21200000: 405, // kab
0x21200067: 406, // kab-DZ
0x21600000: 407, // kaj
0x21700000: 408, // kam
0x217000a4: 409, // kam-KE
0x21f00000: 410, // kcg
0x22300000: 411, // kde
0x2230012f: 412, // kde-TZ
0x22700000: 413, // kea
0x2270005a: 414, // kea-CV
0x23400000: 415, // khq
0x234000c3: 416, // khq-ML
0x23900000: 417, // ki
0x239000a4: 418, // ki-KE
0x24200000: 419, // kk
0x242000ae: 420, // kk-KZ
0x24400000: 421, // kkj
0x24400052: 422, // kkj-CM
0x24500000: 423, // kl
0x24500082: 424, // kl-GL
0x24600000: 425, // kln
0x246000a4: 426, // kln-KE
0x24a00000: 427, // km
0x24a000a6: 428, // km-KH
0x25100000: 429, // kn
0x25100099: 430, // kn-IN
0x25400000: 431, // ko
0x254000aa: 432, // ko-KP
0x254000ab: 433, // ko-KR
0x25600000: 434, // kok
0x25600099: 435, // kok-IN
0x26a00000: 436, // ks
0x26a00099: 437, // ks-IN
0x26b00000: 438, // ksb
0x26b0012f: 439, // ksb-TZ
0x26d00000: 440, // ksf
0x26d00052: 441, // ksf-CM
0x26e00000: 442, // ksh
0x26e00060: 443, // ksh-DE
0x27400000: 444, // ku
0x28100000: 445, // kw
0x2810007b: 446, // kw-GB
0x28a00000: 447, // ky
0x28a000a5: 448, // ky-KG
0x29100000: 449, // lag
0x2910012f: 450, // lag-TZ
0x29500000: 451, // lb
0x295000b7: 452, // lb-LU
0x2a300000: 453, // lg
0x2a300131: 454, // lg-UG
0x2af00000: 455, // lkt
0x2af00135: 456, // lkt-US
0x2b500000: 457, // ln
0x2b50002a: 458, // ln-AO
0x2b50004b: 459, // ln-CD
0x2b50004c: 460, // ln-CF
0x2b50004d: 461, // ln-CG
0x2b800000: 462, // lo
0x2b8000af: 463, // lo-LA
0x2bf00000: 464, // lrc
0x2bf0009b: 465, // lrc-IQ
0x2bf0009c: 466, // lrc-IR
0x2c000000: 467, // lt
0x2c0000b6: 468, // lt-LT
0x2c200000: 469, // lu
0x2c20004b: 470, // lu-CD
0x2c400000: 471, // luo
0x2c4000a4: 472, // luo-KE
0x2c500000: 473, // luy
0x2c5000a4: 474, // luy-KE
0x2c700000: 475, // lv
0x2c7000b8: 476, // lv-LV
0x2d100000: 477, // mas
0x2d1000a4: 478, // mas-KE
0x2d10012f: 479, // mas-TZ
0x2e900000: 480, // mer
0x2e9000a4: 481, // mer-KE
0x2ed00000: 482, // mfe
0x2ed000cc: 483, // mfe-MU
0x2f100000: 484, // mg
0x2f1000bf: 485, // mg-MG
0x2f200000: 486, // mgh
0x2f2000d1: 487, // mgh-MZ
0x2f400000: 488, // mgo
0x2f400052: 489, // mgo-CM
0x2ff00000: 490, // mk
0x2ff000c2: 491, // mk-MK
0x30400000: 492, // ml
0x30400099: 493, // ml-IN
0x30b00000: 494, // mn
0x30b000c5: 495, // mn-MN
0x31b00000: 496, // mr
0x31b00099: 497, // mr-IN
0x31f00000: 498, // ms
0x31f0003e: 499, // ms-BN
0x31f000d0: 500, // ms-MY
0x31f0010d: 501, // ms-SG
0x32000000: 502, // mt
0x320000cb: 503, // mt-MT
0x32500000: 504, // mua
0x32500052: 505, // mua-CM
0x33100000: 506, // my
0x331000c4: 507, // my-MM
0x33a00000: 508, // mzn
0x33a0009c: 509, // mzn-IR
0x34100000: 510, // nah
0x34500000: 511, // naq
0x345000d2: 512, // naq-NA
0x34700000: 513, // nb
0x347000da: 514, // nb-NO
0x34700110: 515, // nb-SJ
0x34e00000: 516, // nd
0x34e00164: 517, // nd-ZW
0x35000000: 518, // nds
0x35000060: 519, // nds-DE
0x350000d9: 520, // nds-NL
0x35100000: 521, // ne
0x35100099: 522, // ne-IN
0x351000db: 523, // ne-NP
0x36700000: 524, // nl
0x36700030: 525, // nl-AW
0x36700036: 526, // nl-BE
0x36700040: 527, // nl-BQ
0x3670005b: 528, // nl-CW
0x367000d9: 529, // nl-NL
0x36700116: 530, // nl-SR
0x3670011b: 531, // nl-SX
0x36800000: 532, // nmg
0x36800052: 533, // nmg-CM
0x36a00000: 534, // nn
0x36a000da: 535, // nn-NO
0x36c00000: 536, // nnh
0x36c00052: 537, // nnh-CM
0x36f00000: 538, // no
0x37500000: 539, // nqo
0x37600000: 540, // nr
0x37a00000: 541, // nso
0x38000000: 542, // nus
0x38000117: 543, // nus-SS
0x38700000: 544, // ny
0x38900000: 545, // nyn
0x38900131: 546, // nyn-UG
0x39000000: 547, // om
0x3900006f: 548, // om-ET
0x390000a4: 549, // om-KE
0x39500000: 550, // or
0x39500099: 551, // or-IN
0x39800000: 552, // os
0x3980007d: 553, // os-GE
0x39800106: 554, // os-RU
0x39d00000: 555, // pa
0x39d05000: 556, // pa-Arab
0x39d050e8: 557, // pa-Arab-PK
0x39d33000: 558, // pa-Guru
0x39d33099: 559, // pa-Guru-IN
0x3a100000: 560, // pap
0x3b300000: 561, // pl
0x3b3000e9: 562, // pl-PL
0x3bd00000: 563, // prg
0x3bd00001: 564, // prg-001
0x3be00000: 565, // ps
0x3be00024: 566, // ps-AF
0x3c000000: 567, // pt
0x3c00002a: 568, // pt-AO
0x3c000041: 569, // pt-BR
0x3c00004e: 570, // pt-CH
0x3c00005a: 571, // pt-CV
0x3c000086: 572, // pt-GQ
0x3c00008b: 573, // pt-GW
0x3c0000b7: 574, // pt-LU
0x3c0000c6: 575, // pt-MO
0x3c0000d1: 576, // pt-MZ
0x3c0000ee: 577, // pt-PT
0x3c000118: 578, // pt-ST
0x3c000126: 579, // pt-TL
0x3c400000: 580, // qu
0x3c40003f: 581, // qu-BO
0x3c400069: 582, // qu-EC
0x3c4000e4: 583, // qu-PE
0x3d400000: 584, // rm
0x3d40004e: 585, // rm-CH
0x3d900000: 586, // rn
0x3d90003a: 587, // rn-BI
0x3dc00000: 588, // ro
0x3dc000bc: 589, // ro-MD
0x3dc00104: 590, // ro-RO
0x3de00000: 591, // rof
0x3de0012f: 592, // rof-TZ
0x3e200000: 593, // ru
0x3e200047: 594, // ru-BY
0x3e2000a5: 595, // ru-KG
0x3e2000ae: 596, // ru-KZ
0x3e2000bc: 597, // ru-MD
0x3e200106: 598, // ru-RU
0x3e200130: 599, // ru-UA
0x3e500000: 600, // rw
0x3e500107: 601, // rw-RW
0x3e600000: 602, // rwk
0x3e60012f: 603, // rwk-TZ
0x3eb00000: 604, // sah
0x3eb00106: 605, // sah-RU
0x3ec00000: 606, // saq
0x3ec000a4: 607, // saq-KE
0x3f300000: 608, // sbp
0x3f30012f: 609, // sbp-TZ
0x3fa00000: 610, // sd
0x3fa000e8: 611, // sd-PK
0x3fc00000: 612, // sdh
0x3fd00000: 613, // se
0x3fd00072: 614, // se-FI
0x3fd000da: 615, // se-NO
0x3fd0010c: 616, // se-SE
0x3ff00000: 617, // seh
0x3ff000d1: 618, // seh-MZ
0x40100000: 619, // ses
0x401000c3: 620, // ses-ML
0x40200000: 621, // sg
0x4020004c: 622, // sg-CF
0x40800000: 623, // shi
0x40857000: 624, // shi-Latn
0x408570ba: 625, // shi-Latn-MA
0x408dc000: 626, // shi-Tfng
0x408dc0ba: 627, // shi-Tfng-MA
0x40c00000: 628, // si
0x40c000b3: 629, // si-LK
0x41200000: 630, // sk
0x41200111: 631, // sk-SK
0x41600000: 632, // sl
0x4160010f: 633, // sl-SI
0x41c00000: 634, // sma
0x41d00000: 635, // smi
0x41e00000: 636, // smj
0x41f00000: 637, // smn
0x41f00072: 638, // smn-FI
0x42200000: 639, // sms
0x42300000: 640, // sn
0x42300164: 641, // sn-ZW
0x42900000: 642, // so
0x42900062: 643, // so-DJ
0x4290006f: 644, // so-ET
0x429000a4: 645, // so-KE
0x42900115: 646, // so-SO
0x43100000: 647, // sq
0x43100027: 648, // sq-AL
0x431000c2: 649, // sq-MK
0x4310014d: 650, // sq-XK
0x43200000: 651, // sr
0x4321f000: 652, // sr-Cyrl
0x4321f033: 653, // sr-Cyrl-BA
0x4321f0bd: 654, // sr-Cyrl-ME
0x4321f105: 655, // sr-Cyrl-RS
0x4321f14d: 656, // sr-Cyrl-XK
0x43257000: 657, // sr-Latn
0x43257033: 658, // sr-Latn-BA
0x432570bd: 659, // sr-Latn-ME
0x43257105: 660, // sr-Latn-RS
0x4325714d: 661, // sr-Latn-XK
0x43700000: 662, // ss
0x43a00000: 663, // ssy
0x43b00000: 664, // st
0x44400000: 665, // sv
0x44400031: 666, // sv-AX
0x44400072: 667, // sv-FI
0x4440010c: 668, // sv-SE
0x44500000: 669, // sw
0x4450004b: 670, // sw-CD
0x445000a4: 671, // sw-KE
0x4450012f: 672, // sw-TZ
0x44500131: 673, // sw-UG
0x44e00000: 674, // syr
0x45000000: 675, // ta
0x45000099: 676, // ta-IN
0x450000b3: 677, // ta-LK
0x450000d0: 678, // ta-MY
0x4500010d: 679, // ta-SG
0x46100000: 680, // te
0x46100099: 681, // te-IN
0x46400000: 682, // teo
0x464000a4: 683, // teo-KE
0x46400131: 684, // teo-UG
0x46700000: 685, // tg
0x46700124: 686, // tg-TJ
0x46b00000: 687, // th
0x46b00123: 688, // th-TH
0x46f00000: 689, // ti
0x46f0006d: 690, // ti-ER
0x46f0006f: 691, // ti-ET
0x47100000: 692, // tig
0x47600000: 693, // tk
0x47600127: 694, // tk-TM
0x48000000: 695, // tn
0x48200000: 696, // to
0x48200129: 697, // to-TO
0x48a00000: 698, // tr
0x48a0005d: 699, // tr-CY
0x48a0012b: 700, // tr-TR
0x48e00000: 701, // ts
0x49400000: 702, // tt
0x49400106: 703, // tt-RU
0x4a400000: 704, // twq
0x4a4000d4: 705, // twq-NE
0x4a900000: 706, // tzm
0x4a9000ba: 707, // tzm-MA
0x4ac00000: 708, // ug
0x4ac00053: 709, // ug-CN
0x4ae00000: 710, // uk
0x4ae00130: 711, // uk-UA
0x4b400000: 712, // ur
0x4b400099: 713, // ur-IN
0x4b4000e8: 714, // ur-PK
0x4bc00000: 715, // uz
0x4bc05000: 716, // uz-Arab
0x4bc05024: 717, // uz-Arab-AF
0x4bc1f000: 718, // uz-Cyrl
0x4bc1f137: 719, // uz-Cyrl-UZ
0x4bc57000: 720, // uz-Latn
0x4bc57137: 721, // uz-Latn-UZ
0x4be00000: 722, // vai
0x4be57000: 723, // vai-Latn
0x4be570b4: 724, // vai-Latn-LR
0x4bee3000: 725, // vai-Vaii
0x4bee30b4: 726, // vai-Vaii-LR
0x4c000000: 727, // ve
0x4c300000: 728, // vi
0x4c30013e: 729, // vi-VN
0x4c900000: 730, // vo
0x4c900001: 731, // vo-001
0x4cc00000: 732, // vun
0x4cc0012f: 733, // vun-TZ
0x4ce00000: 734, // wa
0x4cf00000: 735, // wae
0x4cf0004e: 736, // wae-CH
0x4e500000: 737, // wo
0x4e500114: 738, // wo-SN
0x4f200000: 739, // xh
0x4fb00000: 740, // xog
0x4fb00131: 741, // xog-UG
0x50900000: 742, // yav
0x50900052: 743, // yav-CM
0x51200000: 744, // yi
0x51200001: 745, // yi-001
0x51800000: 746, // yo
0x5180003b: 747, // yo-BJ
0x518000d6: 748, // yo-NG
0x51f00000: 749, // yue
0x51f38000: 750, // yue-Hans
0x51f38053: 751, // yue-Hans-CN
0x51f39000: 752, // yue-Hant
0x51f3908d: 753, // yue-Hant-HK
0x52800000: 754, // zgh
0x528000ba: 755, // zgh-MA
0x52900000: 756, // zh
0x52938000: 757, // zh-Hans
0x52938053: 758, // zh-Hans-CN
0x5293808d: 759, // zh-Hans-HK
0x529380c6: 760, // zh-Hans-MO
0x5293810d: 761, // zh-Hans-SG
0x52939000: 762, // zh-Hant
0x5293908d: 763, // zh-Hant-HK
0x529390c6: 764, // zh-Hant-MO
0x5293912e: 765, // zh-Hant-TW
0x52f00000: 766, // zu
0x52f00161: 767, // zu-ZA
}
// Total table size 4676 bytes (4KiB); checksum: 17BE3673

View File

@ -2,8 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_common.go -output tables.go
//go:generate go run gen_index.go
//go:generate go run gen.go -output tables.go
package language
@ -11,47 +10,34 @@ package language
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"errors"
"fmt"
"strings"
)
const (
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
maxCoreSize = 12
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
// is large enough to hold at least 99% of the BCP 47 tags.
max99thPercentileSize = 32
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
maxSimpleUExtensionSize = 14
"golang.org/x/text/internal/language"
"golang.org/x/text/internal/language/compact"
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag struct {
lang langID
region regionID
// TODO: we will soon run out of positions for script. Idea: instead of
// storing lang, region, and script codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup, speed up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
script scriptID
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
type Tag compact.Tag
// str is the string representation of the Tag. It will only be used if the
// tag has variants or extensions.
str string
// makeTag converts an internal language.Tag into a Tag by passing it through
// compact.Make.
func makeTag(t language.Tag) (tag Tag) {
	ct := compact.Make(t)
	return Tag(ct)
}
// tag returns the internal language.Tag obtained by viewing t as a
// compact.Tag and calling its Tag method.
func (t *Tag) tag() language.Tag {
	ct := (*compact.Tag)(t)
	return ct.Tag()
}
// isCompact reports the result of IsCompact on t viewed as a compact.Tag.
func (t *Tag) isCompact() bool {
	ct := (*compact.Tag)(t)
	return ct.IsCompact()
}
// TODO: improve performance.

// lang returns the LangID field of the tag returned by t.tag().
func (t *Tag) lang() language.Language {
	full := t.tag()
	return full.LangID
}

// region returns the RegionID field of the tag returned by t.tag().
func (t *Tag) region() language.Region {
	full := t.tag()
	return full.RegionID
}

// script returns the ScriptID field of the tag returned by t.tag().
func (t *Tag) script() language.Script {
	full := t.tag()
	return full.ScriptID
}
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
func Make(s string) Tag {
@ -68,25 +54,13 @@ func (c CanonType) Make(s string) Tag {
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
func (t Tag) Raw() (b Base, s Script, r Region) {
return Base{t.lang}, Script{t.script}, Region{t.region}
}
// equalTags compares language, script and region subtags only.
func (t Tag) equalTags(a Tag) bool {
return t.lang == a.lang && t.script == a.script && t.region == a.region
tt := t.tag()
return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
if int(t.pVariant) < len(t.str) {
return false
}
return t.equalTags(und)
}
// private reports whether the Tag consists solely of a private use tag.
func (t Tag) private() bool {
return t.str != "" && t.pVariant == 0
return compact.Tag(t).IsRoot()
}
// CanonType can be used to enable or disable various types of canonicalization.
@ -138,73 +112,73 @@ const (
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {
if c == Raw {
return t, false
}
changed := false
if c&SuppressScript != 0 {
if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
t.script = 0
if t.LangID.SuppressScript() == t.ScriptID {
t.ScriptID = 0
changed = true
}
}
if c&canonLang != 0 {
for {
if l, aliasType := normLang(t.lang); l != t.lang {
if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {
switch aliasType {
case langLegacy:
case language.Legacy:
if c&Legacy != 0 {
if t.lang == _sh && t.script == 0 {
t.script = _Latn
if t.LangID == _sh && t.ScriptID == 0 {
t.ScriptID = _Latn
}
t.lang = l
t.LangID = l
changed = true
}
case langMacro:
case language.Macro:
if c&Macro != 0 {
// We deviate here from CLDR. The mapping "nb" -> "no"
// qualifies as a typical Macro language mapping. However,
// for legacy reasons, CLDR maps "no", the macro language
// code for Norwegian, to the dominant variant "nb". This
// change is currently under consideration for CLDR as well.
// See http://unicode.org/cldr/trac/ticket/2698 and also
// http://unicode.org/cldr/trac/ticket/1790 for some of the
// See https://unicode.org/cldr/trac/ticket/2698 and also
// https://unicode.org/cldr/trac/ticket/1790 for some of the
// practical implications. TODO: this check could be removed
// if CLDR adopts this change.
if c&CLDR == 0 || t.lang != _nb {
if c&CLDR == 0 || t.LangID != _nb {
changed = true
t.lang = l
t.LangID = l
}
}
case langDeprecated:
case language.Deprecated:
if c&DeprecatedBase != 0 {
if t.lang == _mo && t.region == 0 {
t.region = _MD
if t.LangID == _mo && t.RegionID == 0 {
t.RegionID = _MD
}
t.lang = l
t.LangID = l
changed = true
// Other canonicalization types may still apply.
continue
}
}
} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
t.lang = _nb
} else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {
t.LangID = _nb
changed = true
}
break
}
}
if c&DeprecatedScript != 0 {
if t.script == _Qaai {
if t.ScriptID == _Qaai {
changed = true
t.script = _Zinh
t.ScriptID = _Zinh
}
}
if c&DeprecatedRegion != 0 {
if r := normRegion(t.region); r != 0 {
if r := t.RegionID.Canonicalize(); r != t.RegionID {
changed = true
t.region = r
t.RegionID = r
}
}
return t, changed
@ -212,11 +186,20 @@ func (t Tag) canonicalize(c CanonType) (Tag, bool) {
// Canonicalize returns the canonicalized equivalent of the tag.
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
t, changed := t.canonicalize(c)
if changed {
t.remakeString()
// First try fast path.
if t.isCompact() {
if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {
return t, nil
}
}
// It is unlikely that one will canonicalize a tag after matching. So do
// a slow but simple approach here.
if tag, changed := canonicalize(c, t.tag()); changed {
tag.RemakeString()
return makeTag(tag), nil
}
return t, nil
}
// Confidence indicates the level of certainty for a given return value.
@ -239,83 +222,21 @@ func (c Confidence) String() string {
return confName[c]
}
// remakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
func (t *Tag) remakeString() {
if t.str == "" {
return
}
extra := t.str[t.pVariant:]
if t.pVariant > 0 {
extra = extra[1:]
}
if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
t.str = extra
t.pVariant = 0
t.pExt = 0
return
}
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
b := buf[:t.genCoreBytes(buf[:])]
if extra != "" {
diff := len(b) - int(t.pVariant)
b = append(b, '-')
b = append(b, extra...)
t.pVariant = uint8(int(t.pVariant) + diff)
t.pExt = uint16(int(t.pExt) + diff)
} else {
t.pVariant = uint8(len(b))
t.pExt = uint16(len(b))
}
t.str = string(b)
}
// genCoreBytes writes a string for the base languages, script and region tags
// to the given buffer and returns the number of bytes written. It will never
// write more than maxCoreSize bytes.
func (t *Tag) genCoreBytes(buf []byte) int {
n := t.lang.stringToBuf(buf[:])
if t.script != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.script.String())
}
if t.region != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.region.String())
}
return n
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
if t.str != "" {
return t.str
}
if t.script == 0 && t.region == 0 {
return t.lang.String()
}
buf := [maxCoreSize]byte{}
return string(buf[:t.genCoreBytes(buf[:])])
return t.tag().String()
}
// MarshalText implements encoding.TextMarshaler.
func (t Tag) MarshalText() (text []byte, err error) {
if t.str != "" {
text = append(text, t.str...)
} else if t.script == 0 && t.region == 0 {
text = append(text, t.lang.String()...)
} else {
buf := [maxCoreSize]byte{}
text = buf[:t.genCoreBytes(buf[:])]
}
return text, nil
return t.tag().MarshalText()
}
// UnmarshalText implements encoding.TextUnmarshaler.
func (t *Tag) UnmarshalText(text []byte) error {
tag, err := Raw.Parse(string(text))
*t = tag
var tag language.Tag
err := tag.UnmarshalText(text)
*t = makeTag(tag)
return err
}
@ -323,15 +244,16 @@ func (t *Tag) UnmarshalText(text []byte) error {
// unspecified, an attempt will be made to infer it from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Base() (Base, Confidence) {
if t.lang != 0 {
return Base{t.lang}, Exact
if b := t.lang(); b != 0 {
return Base{b}, Exact
}
tt := t.tag()
c := High
if t.script == 0 && !(Region{t.region}).IsCountry() {
if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {
c = Low
}
if tag, err := addTags(t); err == nil && tag.lang != 0 {
return Base{tag.lang}, c
if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {
return Base{tag.LangID}, c
}
return Base{0}, No
}
@ -344,35 +266,34 @@ func (t Tag) Base() (Base, Confidence) {
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
// Note that an inferred script is never guaranteed to be the correct one. Latin is
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
// in the past. Also, the script that is commonly used may change over time.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Script() (Script, Confidence) {
if t.script != 0 {
return Script{t.script}, Exact
if scr := t.script(); scr != 0 {
return Script{scr}, Exact
}
sc, c := scriptID(_Zzzz), No
if t.lang < langNoIndexOffset {
if scr := scriptID(suppressScript[t.lang]); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if t.region == 0 {
return Script{scriptID(scr)}, High
}
sc, c = scr, High
tt := t.tag()
sc, c := language.Script(_Zzzz), No
if scr := tt.LangID.SuppressScript(); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if tt.RegionID == 0 {
return Script{scr}, High
}
sc, c = scr, High
}
if tag, err := addTags(t); err == nil {
if tag.script != sc {
sc, c = tag.script, Low
if tag, err := tt.Maximize(); err == nil {
if tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
} else {
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil && tag.script != sc {
sc, c = tag.script, Low
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
}
return Script{sc}, c
@ -382,28 +303,31 @@ func (t Tag) Script() (Script, Confidence) {
// infer a most likely candidate from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Region() (Region, Confidence) {
if t.region != 0 {
return Region{t.region}, Exact
if r := t.region(); r != 0 {
return Region{r}, Exact
}
if t, err := addTags(t); err == nil {
return Region{t.region}, Low // TODO: differentiate between high and low.
tt := t.tag()
if tt, err := tt.Maximize(); err == nil {
return Region{tt.RegionID}, Low // TODO: differentiate between high and low.
}
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil {
return Region{tag.region}, Low
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil {
return Region{tag.RegionID}, Low
}
return Region{_ZZ}, No // TODO: return world instead of undetermined?
}
// Variant returns the variants specified explicitly for this language tag.
// Variants returns the variants specified explicitly for this language tag,
// or nil if no variant was specified.
func (t Tag) Variants() []Variant {
if !compact.Tag(t).MayHaveVariants() {
return nil
}
v := []Variant{}
if int(t.pVariant) < int(t.pExt) {
for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
x, str = nextToken(str)
v = append(v, Variant{x})
}
x, str := "", t.tag().Variants()
for str != "" {
x, str = nextToken(str)
v = append(v, Variant{x})
}
return v
}
@ -411,57 +335,13 @@ func (t Tag) Variants() []Variant {
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
//
// Parent returns a tag for a less specific language that is mutually
// intelligible or Und if there is no such language. This may not be the same as
// simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"
// is "zh-Hant", and the parent of "zh-Hant" is "und".
func (t Tag) Parent() Tag {
if t.str != "" {
// Strip the variants and extensions.
t, _ = Raw.Compose(t.Raw())
if t.region == 0 && t.script != 0 && t.lang != 0 {
base, _ := addTags(Tag{lang: t.lang})
if base.script == t.script {
return Tag{lang: t.lang}
}
}
return t
}
if t.lang != 0 {
if t.region != 0 {
maxScript := t.script
if maxScript == 0 {
max, _ := addTags(t)
maxScript = max.script
}
for i := range parents {
if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
for _, r := range parents[i].fromRegion {
if regionID(r) == t.region {
return Tag{
lang: t.lang,
script: scriptID(parents[i].script),
region: regionID(parents[i].toRegion),
}
}
}
}
}
// Strip the script if it is the default one.
base, _ := addTags(Tag{lang: t.lang})
if base.script != maxScript {
return Tag{lang: t.lang, script: maxScript}
}
return Tag{lang: t.lang}
} else if t.script != 0 {
// The parent for an base-script pair with a non-default script is
// "und" instead of the base language.
base, _ := addTags(Tag{lang: t.lang})
if base.script != t.script {
return und
}
return Tag{lang: t.lang}
}
}
return und
return Tag(compact.Tag(t).Parent())
}
// returns token t and the rest of the string.
@ -487,17 +367,8 @@ func (e Extension) String() string {
// ParseExtension parses s as an extension and returns it on success.
func ParseExtension(s string) (e Extension, err error) {
scan := makeScannerString(s)
var end int
if n := len(scan.token); n != 1 {
return Extension{}, errSyntax
}
scan.toLower(0, len(scan.b))
end = parseExtension(&scan)
if end != len(s) {
return Extension{}, errSyntax
}
return Extension{string(scan.b)}, nil
ext, err := language.ParseExtension(s)
return Extension{ext}, err
}
// Type returns the one-byte extension type of e. It returns 0 for the zero
@ -518,22 +389,20 @@ func (e Extension) Tokens() []string {
// false for ok if t does not have the requested extension. The returned
// extension will be invalid in this case.
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
if ext[0] == x {
return Extension{ext}, true
}
if !compact.Tag(t).MayHaveExtensions() {
return Extension{}, false
}
return Extension{}, false
e, ok := t.tag().Extension(x)
return Extension{e}, ok
}
// Extensions returns all extensions of t.
func (t Tag) Extensions() []Extension {
if !compact.Tag(t).MayHaveExtensions() {
return nil
}
e := []Extension{}
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
for _, ext := range t.tag().Extensions() {
e = append(e, Extension{ext})
}
return e
@ -541,259 +410,105 @@ func (t Tag) Extensions() []Extension {
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
func (t Tag) TypeForKey(key string) string {
if start, end, _ := t.findTypeForKey(key); end != start {
return t.str[start:end]
if !compact.Tag(t).MayHaveExtensions() {
if key != "rg" && key != "va" {
return ""
}
}
return ""
return t.tag().TypeForKey(key)
}
var (
errPrivateUse = errors.New("cannot set a key on a private use tag")
errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
if t.private() {
return t, errPrivateUse
}
if len(key) != 2 {
return t, errInvalidArguments
}
// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4
// Remove a possible empty extension.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
t.str = ""
t.pVariant, t.pExt = 0, 0
} else {
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
}
}
return t, nil
}
if len(value) < 3 || len(value) > 8 {
return t, errInvalidArguments
}
var (
buf [maxCoreSize + maxSimpleUExtensionSize]byte
uStart int // start of the -u extension.
)
// Generate the tag string if needed.
if t.str == "" {
uStart = t.genCoreBytes(buf[:])
buf[uStart] = '-'
uStart++
}
// Create new key-type pair and parse it to verify.
b := buf[uStart:]
copy(b, "u-")
copy(b[2:], key)
b[4] = '-'
b = b[:5+copy(b[5:], value)]
scan := makeScanner(b)
if parseExtensions(&scan); scan.err != nil {
return t, scan.err
}
// Assemble the replacement string.
if t.str == "" {
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
if hasExt {
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
} else {
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
}
}
return t, nil
tt, err := t.tag().SetTypeForKey(key, value)
return makeTag(tt), err
}
// findTypeForKey returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// When the type is not found, start == end.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
p := int(t.pExt)
// Only two-character keys are searched for. The p == len(t.str) and p == 0
// cases presumably mean "no extension section" - confirm against the code
// that sets pExt.
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
}
s := t.str
// Find the correct extension.
for p++; s[p] != 'u'; p++ {
// A singleton greater than 'u' ends the search; this assumes extensions
// are stored in sorted order - TODO confirm. Step back one byte before
// reporting the insertion point.
if s[p] > 'u' {
p--
return p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
p++
// curKey is the key currently being processed.
curKey := ""
// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
// The scan stops at the first key greater than the requested one and
// reports that position as the insertion point.
if curKey > key {
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
}
// Advance p to the next hyphen or to the clamped limit.
for ; p < max && s[p] != '-'; p++ {
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
}
return p, p, true
}
}
}
// NumCompactTags is the number of compact tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = compact.NumCompactTags
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
// TODO: perhaps give more frequent tags a lower index.
// TODO: we could make the indexes stable. This will exclude some
// possibilities for optimization, so don't do this quite yet.
b, s, r := t.Raw()
if len(t.str) > 0 {
if strings.HasPrefix(t.str, "x-") {
// We have no entries for user-defined tags.
return 0, false
}
if uint16(t.pVariant) != t.pExt {
// There are no tags with variants and an u-va type.
if t.TypeForKey("va") != "" {
return 0, false
}
t, _ = Raw.Compose(b, s, r, t.Variants())
} else if _, ok := t.Extension('u'); ok {
// Strip all but the 'va' entry.
variant := t.TypeForKey("va")
t, _ = Raw.Compose(b, s, r)
t, _ = t.SetTypeForKey("va", variant)
}
if len(t.str) > 0 {
// We have some variants.
for i, s := range specialTags {
if s == t {
return i + 1, true
}
}
return 0, false
}
}
// No variants specified: just compare core components.
// The key has the form lllssrrr, where l, s, and r are nibbles for
// respectively the langID, scriptID, and regionID.
key := uint32(b.langID) << (8 + 12)
key |= uint32(s.scriptID) << 12
key |= uint32(r.regionID)
x, ok := coreTags[key]
return int(x), ok
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. If t does not match a compact
// index, exact will be false and the compact index will be returned for the
// first match after repeatedly taking the Parent of t.
func CompactIndex(t Tag) (index int, exact bool) {
id, exact := compact.LanguageID(compact.Tag(t))
return int(id), exact
}
var root = language.Tag{}
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
type Base struct {
langID
langID language.Language
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Base, error) {
if n := len(s); n < 2 || 3 < n {
return Base{}, errSyntax
}
var buf [3]byte
l, err := getLangID(buf[:copy(buf[:], s)])
l, err := language.ParseBase(s)
return Base{l}, err
}
// String yields the BCP 47 representation of the base language, as produced
// by the wrapped language ID.
func (b Base) String() string {
	id := b.langID
	return id.String()
}
// ISO3 yields the ISO 639-3 language code, as reported by the wrapped
// language ID.
func (b Base) ISO3() string {
	id := b.langID
	return id.ISO3()
}
// IsPrivateUse reports whether the wrapped language ID classifies this
// language code as reserved for private use.
func (b Base) IsPrivateUse() bool {
	id := b.langID
	return id.IsPrivateUse()
}
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
type Script struct {
scriptID
scriptID language.Script
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns a ValueError if s is a well-formed but unknown script identifier
// or another error if another error occurred.
func ParseScript(s string) (Script, error) {
if len(s) != 4 {
return Script{}, errSyntax
}
var buf [4]byte
sc, err := getScriptID(script, buf[:copy(buf[:], s)])
sc, err := language.ParseScript(s)
return Script{sc}, err
}
// String yields the script code in title case, as produced by the wrapped
// script ID. An unspecified script is rendered as "Zzzz".
func (s Script) String() string {
	id := s.scriptID
	return id.String()
}
// IsPrivateUse reports whether the wrapped script ID classifies this script
// code as reserved for private use.
func (s Script) IsPrivateUse() bool {
	id := s.scriptID
	return id.IsPrivateUse()
}
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
type Region struct {
regionID
regionID language.Region
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
rid, err := getRegionM49(r)
rid, err := language.EncodeM49(r)
return Region{rid}, err
}
@ -801,62 +516,54 @@ func EncodeM49(r int) (Region, error) {
// It returns a ValueError if s is a well-formed but unknown region identifier
// or another error if another error occurred.
func ParseRegion(s string) (Region, error) {
if n := len(s); n < 2 || 3 < n {
return Region{}, errSyntax
}
var buf [3]byte
r, err := getRegionID(buf[:copy(buf[:], s)])
r, err := language.ParseRegion(s)
return Region{r}, err
}
// String yields the BCP 47 representation for the region, as produced by the
// wrapped region ID. An unspecified region is rendered as "ZZ".
func (r Region) String() string {
	id := r.regionID
	return id.String()
}
// ISO3 yields the 3-letter ISO code of r, as reported by the wrapped region
// ID. Not every region has a 3-letter ISO code; for those regions the result
// is "ZZZ".
func (r Region) ISO3() string {
	id := r.regionID
	return id.ISO3()
}
// M49 yields the UN M.49 encoding of r, as reported by the wrapped region ID.
// The result is 0 when no such encoding is defined for r.
func (r Region) M49() int {
	id := r.regionID
	return id.M49()
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status, as
// determined by the wrapped region ID. Private-use tags that CLDR assigns and
// that this implementation uses may be included, so IsPrivateUse and
// IsCountry can both be true for the same region.
func (r Region) IsPrivateUse() bool {
	id := r.regionID
	return id.IsPrivateUse()
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR.
func (r Region) IsCountry() bool {
if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
return false
}
return true
return r.regionID.IsCountry()
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR.
func (r Region) IsGroup() bool {
if r.regionID == 0 {
return false
}
return int(regionInclusion[r.regionID]) < len(regionContainment)
return r.regionID.IsGroup()
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
return r.regionID.contains(c.regionID)
return r.regionID.Contains(c.regionID)
}
// contains reports whether c is r itself or is covered by r according to the
// region inclusion tables.
func (r regionID) contains(c regionID) bool {
	if c == r {
		return true
	}
	outer := regionInclusion[r]
	if outer >= nRegionGroups {
		// r has no containment mask, so it can only contain itself.
		return false
	}
	mask := regionContainment[outer]
	inner := regionInclusion[c]
	innerBits := regionInclusionBits[inner]
	// If the contained region is a group, it must strictly be a subset of r.
	if inner < nRegionGroups {
		return innerBits&^mask == 0
	}
	// A contained country may belong to multiple disjoint groups. Matching
	// any of these indicates containment.
	return innerBits&mask != 0
}
var errNoTLD = errors.New("language: region is not a valid ccTLD")
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
@ -865,25 +572,15 @@ var errNoTLD = errors.New("language: region is not a valid ccTLD")
// region will already be canonicalized if it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
// difference between ISO 3166-1 and IANA ccTLD.
if r.regionID == _GB {
r = Region{_UK}
}
if (r.typ() & ccTLD) == 0 {
return Region{}, errNoTLD
}
return r, nil
tld, err := r.regionID.TLD()
return Region{tld}, err
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
if cr := normRegion(r.regionID); cr != 0 {
return Region{cr}
}
return r
return Region{r.regionID.Canonicalize()}
}
// Variant represents a registered variant of a language as defined by BCP 47.
@ -894,11 +591,8 @@ type Variant struct {
// ParseVariant parses and returns a Variant. An error is returned if s is not
// a valid variant.
func ParseVariant(s string) (Variant, error) {
s = strings.ToLower(s)
if _, ok := variantIndex[s]; ok {
return Variant{s}, nil
}
return Variant{}, mkErrInvalid([]byte(s))
v, err := language.ParseVariant(s)
return Variant{v.String()}, err
}
// String returns the string representation of the variant.

View File

@ -4,7 +4,12 @@
package language
import "errors"
import (
"errors"
"strings"
"golang.org/x/text/internal/language"
)
// A MatchOption configures a Matcher.
type MatchOption func(*matcher)
@ -74,12 +79,13 @@ func NewMatcher(t []Tag, options ...MatchOption) Matcher {
}
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
var tt language.Tag
match, w, c := m.getBest(want...)
if match != nil {
t, index = match.tag, match.index
tt, index = match.tag, match.index
} else {
// TODO: this should be an option
t = m.default_.tag
tt = m.default_.tag
if m.preferSameScript {
outer:
for _, w := range want {
@ -91,7 +97,7 @@ func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
}
for i, h := range m.supported {
if script.scriptID == h.maxScript {
t, index = h.tag, i
tt, index = h.tag, i
break outer
}
}
@ -99,238 +105,45 @@ func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
}
// TODO: select first language tag based on script.
}
if w.region != 0 && t.region != 0 && t.region.contains(w.region) {
t, _ = Raw.Compose(t, Region{w.region})
if w.RegionID != tt.RegionID && w.RegionID != 0 {
if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
tt.RegionID = w.RegionID
tt.RemakeString()
} else if r := w.RegionID.String(); len(r) == 2 {
// TODO: also filter macro and deprecated.
tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
}
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
// TODO: add in alternative variants to -u-va-.
// TODO: add preferred region to -u-rg-.
if e := w.Extensions(); len(e) > 0 {
t, _ = Raw.Compose(t, e)
}
return t, index, c
}
// scriptRegionFlags is an 8-bit flag set.
// NOTE(review): the constants below are untyped and are tested against the
// flags field of likely-subtag table entries in addTags; whether that field is
// declared with this type is not visible here - confirm.
type scriptRegionFlags uint8
const (
// isList (bit 0): when set on a table entry, addTags reads the entry as a
// reference into the corresponding list table instead of as a single match.
isList = 1 << iota
// scriptInFrom (bit 1): addTags accepts a lang+script list entry only when
// this bit is set, and considers lang+region list entries only when it is
// clear. Presumably it records that the script was part of the lookup key -
// TODO confirm against the table generator.
scriptInFrom
// regionInFrom (bit 2): presumably the region counterpart of scriptInFrom;
// no use is visible in the surrounding code - TODO confirm.
regionInFrom
)
// setUndefinedLang stores id as the language of t, but only when t has no
// language yet (lang == 0). A language that is already set is left untouched.
func (t *Tag) setUndefinedLang(id langID) {
	if t.lang != 0 {
		return
	}
	t.lang = id
}
// setUndefinedScript stores id as the script of t, but only when t has no
// script yet (script == 0). A script that is already set is left untouched.
func (t *Tag) setUndefinedScript(id scriptID) {
	if t.script != 0 {
		return
	}
	t.script = id
}
func (t *Tag) setUndefinedRegion(id regionID) {
if t.region == 0 || t.region.contains(id) {
t.region = id
b := language.Builder{}
b.SetTag(tt)
for _, e := range e {
b.AddExt(e)
}
tt = b.Make()
}
return makeTag(tt), index, c
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// addLikelySubtags expands t with its most likely subtags for the locale.
// Usually this fills in fields that were unknown, though in some cases an
// existing value may be altered. If the locale cannot be expanded, t is
// returned unchanged together with the error from addTags (an
// ErrMissingLikelyTagsData error). If expansion leaves the tags equal to
// those of t, t itself is returned; otherwise the expanded tag is returned
// after its string form has been regenerated.
func (t Tag) addLikelySubtags() (Tag, error) {
	expanded, err := addTags(t)
	switch {
	case err != nil:
		return t, err
	case expanded.equalTags(t):
		return t, nil
	}
	expanded.remakeString()
	return expanded, nil
}
// specializeRegion attempts to specialize a group region. It returns false,
// leaving t untouched, when the inclusion index of t.region is not below
// nRegionGroups. Otherwise it returns true, and replaces t.region with the
// region of the matching likelyRegionGroup entry if that entry's language and
// script both equal those of t.
func specializeRegion(t *Tag) bool {
	group := regionInclusion[t.region]
	if group >= nRegionGroups {
		return false
	}
	likely := likelyRegionGroup[group]
	sameLang := langID(likely.lang) == t.lang
	if sameLang && scriptID(likely.script) == t.script {
		t.region = regionID(likely.region)
	}
	return true
}
// addTags fills in the undefined language, script and region of t from the
// likely-subtag tables and returns the result. Private use tags are returned
// as is. It returns t with ErrMissingLikelyTagsData when no earlier rule
// matched and t.lang is not below langNoIndexOffset. The string form of the
// tag is not regenerated here.
func addTags(t Tag) (Tag, error) {
// We leave private use identifiers alone.
if t.private() {
return t, nil
}
if t.script != 0 && t.region != 0 {
if t.lang != 0 {
// already fully specified
specializeRegion(&t)
return t, nil
}
// Search matches for und-script-region. Note that for these cases
// region will never be a group so there is no need to check for this.
list := likelyRegion[t.region : t.region+1]
// For an isList entry, lang holds the start index into likelyRegionList
// and script holds the number of entries.
if x := list[0]; x.flags&isList != 0 {
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
}
for _, x := range list {
// Deviating from the spec. See match_test.go for details.
if scriptID(x.script) == t.script {
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
}
if t.lang != 0 {
// Search matches for lang-script and lang-region, where lang != und.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
if x.flags&isList != 0 {
// For an isList entry, region holds the start index into likelyLangList
// and script holds the number of entries.
list := likelyLangList[x.region : x.region+uint16(x.script)]
if t.script != 0 {
for _, x := range list {
if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
t.setUndefinedRegion(regionID(x.region))
return t, nil
}
}
} else if t.region != 0 {
count := 0
goodScript := true
// tt accumulates the candidate result; t is left untouched until the
// candidates have been counted.
tt := t
for _, x := range list {
// We visit all entries for which the script was not
// defined, including the ones where the region was not
// defined. This allows for proper disambiguation within
// regions.
if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
tt.region = regionID(x.region)
tt.setUndefinedScript(scriptID(x.script))
goodScript = goodScript && tt.script == scriptID(x.script)
count++
}
}
if count == 1 {
return tt, nil
}
// Even if we fail to find a unique Region, we might have
// an unambiguous script.
if goodScript {
t.script = tt.script
}
}
}
}
} else {
// Search matches for und-script.
if t.script != 0 {
x := likelyScript[t.script]
if x.region != 0 {
t.setUndefinedRegion(regionID(x.region))
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
// Search matches for und-region. If und-script-region exists, it would
// have been found earlier.
if t.region != 0 {
if i := regionInclusion[t.region]; i < nRegionGroups {
x := likelyRegionGroup[i]
if x.region != 0 {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
t.region = regionID(x.region)
}
// No return here: control continues with the "Search matches for lang"
// step below.
} else {
x := likelyRegion[t.region]
// Only the first list entry is consulted here.
if x.flags&isList != 0 {
x = likelyRegionList[x.lang]
}
if x.script != 0 && x.flags != scriptInFrom {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
return t, nil
}
}
}
}
// Search matches for lang.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
// Only the first list entry is consulted here.
if x.flags&isList != 0 {
x = likelyLangList[x.region]
}
if x.region != 0 {
t.setUndefinedScript(scriptID(x.script))
t.setUndefinedRegion(regionID(x.region))
}
specializeRegion(&t)
if t.lang == 0 {
t.lang = _en // default language
}
return t, nil
}
return t, ErrMissingLikelyTagsData
}
// setTagsFrom overwrites the language, script and region of t with those of
// id. No other field of t is assigned.
func (t *Tag) setTagsFrom(id Tag) {
	t.lang, t.script, t.region = id.lang, id.script, id.region
}
// minimize strips the region or script subtags from t such that
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
// If minimizeTags fails, its tag and error are returned as is; otherwise the
// string form of the reduced tag is regenerated before it is returned.
func (t Tag) minimize() (Tag, error) {
	reduced, err := minimizeTags(t)
	if err != nil {
		return reduced, err
	}
	reduced.remakeString()
	return reduced, nil
}
// minimizeTags mimics the behavior of the ICU 51 C implementation.
// A tag equal to und is returned as is. Otherwise t is maximized with addTags
// and three reduced forms are tried in order: language only, language+region,
// language+script. The first form that maximizes to the same tags as t
// replaces the language, script and region of t. If none does, t is returned
// unchanged. If t itself cannot be maximized, t and the error are returned.
func minimizeTags(t Tag) (Tag, error) {
	if t.equalTags(und) {
		return t, nil
	}
	full, err := addTags(t)
	if err != nil {
		return t, err
	}
	candidates := [...]Tag{
		{lang: t.lang},
		{lang: t.lang, region: t.region},
		{lang: t.lang, script: t.script},
	}
	for _, cand := range candidates {
		expanded, err := addTags(cand)
		if err != nil || !full.equalTags(expanded) {
			continue
		}
		t.setTagsFrom(cand)
		break
	}
	return t, nil
}
// func (t *Tag) setTagsFrom(id Tag) {
// t.LangID = id.LangID
// t.ScriptID = id.ScriptID
// t.RegionID = id.RegionID
// }
// Tag Matching
// CLDR defines an algorithm for finding the best match between two sets of language
// tags. The basic algorithm defines how to score a possible match and then find
// the match with the best score
// (see http://www.unicode.org/reports/tr35/#LanguageMatching).
// (see https://www.unicode.org/reports/tr35/#LanguageMatching).
// Using scoring has several disadvantages. The scoring obfuscates the importance of
// the various factors considered, making the algorithm harder to understand. Using
// scoring also requires the full score to be computed for each pair of tags.
@ -441,7 +254,7 @@ func minimizeTags(t Tag) (Tag, error) {
type matcher struct {
default_ *haveTag
supported []*haveTag
index map[langID]*matchHeader
index map[language.Language]*matchHeader
passSettings bool
preferSameScript bool
}
@ -456,7 +269,7 @@ type matchHeader struct {
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
type haveTag struct {
tag Tag
tag language.Tag
// index of this tag in the original list of supported tags.
index int
@ -466,37 +279,37 @@ type haveTag struct {
conf Confidence
// Maximized region and script.
maxRegion regionID
maxScript scriptID
maxRegion language.Region
maxScript language.Script
// altScript may be checked as an alternative match to maxScript. If altScript
// matches, the confidence level for this match is Low. Theoretically there
// could be multiple alternative scripts. This does not occur in practice.
altScript scriptID
altScript language.Script
// nextMax is the index of the next haveTag with the same maximized tags.
nextMax uint16
}
func makeHaveTag(tag Tag, index int) (haveTag, langID) {
func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
max := tag
if tag.lang != 0 || tag.region != 0 || tag.script != 0 {
max, _ = max.canonicalize(All)
max, _ = addTags(max)
max.remakeString()
if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
max, _ = canonicalize(All, max)
max, _ = max.Maximize()
max.RemakeString()
}
return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
}
// altScript returns an alternative script that may match the given script with
// a low confidence. At the moment, the langMatch data allows for at most one
// script to map to another and we rely on this to keep the code simple.
func altScript(l langID, s scriptID) scriptID {
func altScript(l language.Language, s language.Script) language.Script {
for _, alt := range matchScript {
// TODO: also match cases where language is not the same.
if (langID(alt.wantLang) == l || langID(alt.haveLang) == l) &&
scriptID(alt.haveScript) == s {
return scriptID(alt.wantScript)
if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
language.Script(alt.haveScript) == s {
return language.Script(alt.wantScript)
}
}
return 0
@ -508,7 +321,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
h.original = h.original || exact
// Don't add new exact matches.
for _, v := range h.haveTags {
if v.tag.equalsRest(n.tag) {
if equalsRest(v.tag, n.tag) {
return
}
}
@ -517,7 +330,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
for i, v := range h.haveTags {
if v.maxScript == n.maxScript &&
v.maxRegion == n.maxRegion &&
v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
for h.haveTags[i].nextMax != 0 {
i = int(h.haveTags[i].nextMax)
}
@ -530,7 +343,7 @@ func (h *matchHeader) addIfNew(n haveTag, exact bool) {
// header returns the matchHeader for the given language. It creates one if
// it doesn't already exist.
func (m *matcher) header(l langID) *matchHeader {
func (m *matcher) header(l language.Language) *matchHeader {
if h := m.index[l]; h != nil {
return h
}
@ -554,7 +367,7 @@ func toConf(d uint8) Confidence {
// for a given tag.
func newMatcher(supported []Tag, options []MatchOption) *matcher {
m := &matcher{
index: make(map[langID]*matchHeader),
index: make(map[language.Language]*matchHeader),
preferSameScript: true,
}
for _, o := range options {
@ -567,16 +380,18 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// Add supported languages to the index. Add exact matches first to give
// them precedence.
for i, tag := range supported {
pair, _ := makeHaveTag(tag, i)
m.header(tag.lang).addIfNew(pair, true)
tt := tag.tag()
pair, _ := makeHaveTag(tt, i)
m.header(tt.LangID).addIfNew(pair, true)
m.supported = append(m.supported, &pair)
}
m.default_ = m.header(supported[0].lang).haveTags[0]
m.default_ = m.header(supported[0].lang()).haveTags[0]
// Keep these in two different loops to support the case that two equivalent
// languages are distinguished, such as iw and he.
for i, tag := range supported {
pair, max := makeHaveTag(tag, i)
if max != tag.lang {
tt := tag.tag()
pair, max := makeHaveTag(tt, i)
if max != tt.LangID {
m.header(max).addIfNew(pair, true)
}
}
@ -585,11 +400,11 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// update will only add entries to original indexes, thus not computing any
// transitive relations.
update := func(want, have uint16, conf Confidence) {
if hh := m.index[langID(have)]; hh != nil {
if hh := m.index[language.Language(have)]; hh != nil {
if !hh.original {
return
}
hw := m.header(langID(want))
hw := m.header(language.Language(want))
for _, ht := range hh.haveTags {
v := *ht
if conf < v.conf {
@ -597,7 +412,7 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
}
v.nextMax = 0 // this value needs to be recomputed
if v.altScript != 0 {
v.altScript = altScript(langID(want), v.maxScript)
v.altScript = altScript(language.Language(want), v.maxScript)
}
hw.addIfNew(v, conf == Exact && hh.original)
}
@ -618,66 +433,67 @@ func newMatcher(supported []Tag, options []MatchOption) *matcher {
// First we match deprecated equivalents. If they are perfect equivalents
// (their canonicalization simply substitutes a different language code, but
// nothing else), the match confidence is Exact, otherwise it is High.
for i, lm := range langAliasMap {
for i, lm := range language.AliasMap {
// If deprecated codes match and there is no fiddling with the script
// or region, we consider it an exact match.
conf := Exact
if langAliasTypes[i] != langMacro {
if !isExactEquivalent(langID(lm.from)) {
if language.AliasTypes[i] != language.Macro {
if !isExactEquivalent(language.Language(lm.From)) {
conf = High
}
update(lm.to, lm.from, conf)
update(lm.To, lm.From, conf)
}
update(lm.from, lm.to, conf)
update(lm.From, lm.To, conf)
}
return m
}
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
best := bestMatch{}
for i, w := range want {
var max Tag
for i, ww := range want {
w := ww.tag()
var max language.Tag
// Check for exact match first.
h := m.index[w.lang]
if w.lang != 0 {
h := m.index[w.LangID]
if w.LangID != 0 {
if h == nil {
continue
}
// Base language is defined.
max, _ = w.canonicalize(Legacy | Deprecated | Macro)
max, _ = canonicalize(Legacy|Deprecated|Macro, w)
// A region that is added through canonicalization is stronger than
// a maximized region: set it in the original (e.g. mo -> ro-MD).
if w.region != max.region {
w.region = max.region
if w.RegionID != max.RegionID {
w.RegionID = max.RegionID
}
// TODO: should we do the same for scripts?
// See test case: en, sr, nl ; sh ; sr
max, _ = addTags(max)
max, _ = max.Maximize()
} else {
// Base language is not defined.
if h != nil {
for i := range h.haveTags {
have := h.haveTags[i]
if have.tag.equalsRest(w) {
if equalsRest(have.tag, w) {
return have, w, Exact
}
}
}
if w.script == 0 && w.region == 0 {
if w.ScriptID == 0 && w.RegionID == 0 {
// We skip all tags matching und for approximate matching, including
// private tags.
continue
}
max, _ = addTags(w)
if h = m.index[max.lang]; h == nil {
max, _ = w.Maximize()
if h = m.index[max.LangID]; h == nil {
continue
}
}
pin := true
for _, t := range want[i+1:] {
if w.lang == t.lang {
if w.LangID == t.lang() {
pin = false
break
}
@ -685,11 +501,11 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
// Check for match based on maximized tag.
for i := range h.haveTags {
have := h.haveTags[i]
best.update(have, w, max.script, max.region, pin)
best.update(have, w, max.ScriptID, max.RegionID, pin)
if best.conf == Exact {
for have.nextMax != 0 {
have = h.haveTags[have.nextMax]
best.update(have, w, max.script, max.region, pin)
best.update(have, w, max.ScriptID, max.RegionID, pin)
}
return best.have, best.want, best.conf
}
@ -697,9 +513,9 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
}
if best.conf <= No {
if len(want) != 0 {
return nil, want[0], No
return nil, want[0].tag(), No
}
return nil, Tag{}, No
return nil, language.Tag{}, No
}
return best.have, best.want, best.conf
}
@ -707,9 +523,9 @@ func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
// bestMatch accumulates the best match so far.
type bestMatch struct {
have *haveTag
want Tag
want language.Tag
conf Confidence
pinnedRegion regionID
pinnedRegion language.Region
pinLanguage bool
sameRegionGroup bool
// Cached results from applying tie-breaking rules.
@ -734,19 +550,19 @@ type bestMatch struct {
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
// be false.
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID, pin bool) {
func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
// Bail if the maximum attainable confidence is below that of the current best match.
c := have.conf
if c < m.conf {
return
}
// Don't change the language once we already have found an exact match.
if m.pinLanguage && tag.lang != m.want.lang {
if m.pinLanguage && tag.LangID != m.want.LangID {
return
}
// Pin the region group if we are comparing tags for the same language.
if tag.lang == m.want.lang && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
if tag.LangID == m.want.LangID && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
if !sameGroup {
return
}
@ -756,7 +572,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
// don't pin anything, otherwise pin the language.
m.pinLanguage = pin
}
if have.tag.equalsRest(tag) {
if equalsRest(have.tag, tag) {
} else if have.maxScript != maxScript {
// There is usually very little comprehension between different scripts.
// In a few cases there may still be Low comprehension. This possibility
@ -786,7 +602,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
// Tie-breaker rules:
// We prefer if the pre-maximized language was specified and identical.
origLang := have.tag.lang == tag.lang && tag.lang != 0
origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
if !beaten && m.origLang != origLang {
if m.origLang {
return
@ -795,7 +611,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
// We prefer if the pre-maximized region was specified and identical.
origReg := have.tag.region == tag.region && tag.region != 0
origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
if !beaten && m.origReg != origReg {
if m.origReg {
return
@ -803,7 +619,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
@ -811,7 +627,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
beaten = true
}
paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
if !beaten && m.paradigmReg != paradigmReg {
if !paradigmReg {
return
@ -820,7 +636,7 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
// Next we prefer if the pre-maximized script was specified and identical.
origScript := have.tag.script == tag.script && tag.script != 0
origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
if !beaten && m.origScript != origScript {
if m.origScript {
return
@ -843,9 +659,9 @@ func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion
}
}
func isParadigmLocale(lang langID, r regionID) bool {
func isParadigmLocale(lang language.Language, r language.Region) bool {
for _, e := range paradigmLocales {
if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
return true
}
}
@ -854,13 +670,13 @@ func isParadigmLocale(lang langID, r regionID) bool {
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
const defaultDistance = 4
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) {
if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
@ -876,31 +692,16 @@ func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, s
return defaultDistance, true
}
// variants returns the slice of the tag's string form between pVariant and
// pExt, or the empty string when pVariant is zero.
func (t Tag) variants() string {
	if t.pVariant != 0 {
		return t.str[t.pVariant:t.pExt]
	}
	return ""
}
// variantOrPrivateTagStr returns variants or private use tags: the tag string
// from pVariant up to pExt when pExt is set, and from pVariant through the end
// of the string otherwise.
func (t Tag) variantOrPrivateTagStr() string {
	rest := t.str[t.pVariant:]
	if t.pExt > 0 {
		rest = t.str[t.pVariant:t.pExt]
	}
	return rest
}
// equalsRest compares everything except the language.
func (a Tag) equalsRest(b Tag) bool {
func equalsRest(a, b language.Tag) bool {
// TODO: don't include extensions in this comparison. To do this efficiently,
// though, we should handle private tags separately.
return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
}
// isExactEquivalent returns true if canonicalizing the language will not alter
// the script or region of a tag.
func isExactEquivalent(l langID) bool {
func isExactEquivalent(l language.Language) bool {
for _, o := range notEquivalent {
if o == l {
return false
@ -909,25 +710,26 @@ func isExactEquivalent(l langID) bool {
return true
}
var notEquivalent []langID
var notEquivalent []language.Language
func init() {
// Create a list of all languages for which canonicalization may alter the
// script or region.
for _, lm := range langAliasMap {
tag := Tag{lang: langID(lm.from)}
if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
notEquivalent = append(notEquivalent, langID(lm.from))
for _, lm := range language.AliasMap {
tag := language.Tag{LangID: language.Language(lm.From)}
if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
notEquivalent = append(notEquivalent, language.Language(lm.From))
}
}
// Maximize undefined regions of paradigm locales.
for i, v := range paradigmLocales {
max, _ := addTags(Tag{lang: langID(v[0])})
t := language.Tag{LangID: language.Language(v[0])}
max, _ := t.Maximize()
if v[1] == 0 {
paradigmLocales[i][1] = uint16(max.region)
paradigmLocales[i][1] = uint16(max.RegionID)
}
if v[2] == 0 {
paradigmLocales[i][2] = uint16(max.region)
paradigmLocales[i][2] = uint16(max.RegionID)
}
}
}

View File

@ -5,216 +5,21 @@
package language
import (
"bytes"
"errors"
"fmt"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/tag"
"golang.org/x/text/internal/language"
)
// isAlpha reports whether b is a letter rather than a digit.
// b must be an ASCII letter or digit; under that precondition every byte
// that sorts after '9' is a letter.
func isAlpha(b byte) bool {
	const lastDigit = '9'
	return lastDigit < b
}
// isAlphaNum reports whether every byte of s is an ASCII letter or digit.
// An empty s yields true.
func isAlphaNum(s []byte) bool {
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
// errSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var errSyntax = errors.New("language: tag is not well-formed")
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
v [8]byte
}
type ValueError interface {
error
// mkErrInvalid returns a ValueError carrying a copy of s. At most len(e.v)
// bytes of s are stored; any excess is dropped by copy.
func mkErrInvalid(s []byte) error {
	e := ValueError{}
	copy(e.v[:], s)
	return e
}
// tag returns the bytes stored in e.v up to, but not including, the first
// zero byte, or all of e.v when it holds no zero byte.
func (e ValueError) tag() []byte {
	if n := bytes.IndexByte(e.v[:], 0); n != -1 {
		return e.v[:n]
	}
	return e.v[:8]
}
// Error implements the error interface. The message quotes the subtag held
// by e.
func (e ValueError) Error() string {
	subtag := e.tag()
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", subtag)
}
// Subtag returns, as a string, the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	raw := e.tag()
	return string(raw)
}
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
// It walks the buffer b one token at a time (see scan) and rewrites b in
// place as tokens are replaced or removed.
type scanner struct {
b []byte // buffer being scanned; modified by the scanner routines
bytes [max99thPercentileSize]byte // inline storage used to back b for short string inputs (see makeScannerString)
token []byte // current token, a sub-slice of b; nil when scan found no further token
start int // start position of the current token
end int // end position of the current token
next int // next point for scan
err error // error recorded through setError
done bool // set by scan once there is nothing left to scan
}
// makeScannerString returns an initialized scanner over a copy of s. Inputs
// that fit in the scanner's inline array are copied there; longer inputs are
// converted to a newly allocated byte slice.
func makeScannerString(s string) scanner {
	var scan scanner
	if len(s) > len(scan.bytes) {
		scan.b = []byte(s)
	} else {
		n := copy(scan.bytes[:], s)
		scan.b = scan.bytes[:n]
	}
	scan.init()
	return scan
}
// makeScanner returns an initialized scanner that uses b directly as its
// input buffer. b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
	var scan scanner
	scan.b = b
	scan.init()
	return scan
}
// init normalizes every '_' separator in the buffer to '-' and then scans
// the first token.
func (s *scanner) init() {
	for i := 0; i < len(s.b); i++ {
		if s.b[i] == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}
// toLower converts, in place, the ASCII upper-case letters of s.b in the
// range [start, end) to lower case. Other bytes are left untouched.
func (s *scanner) toLower(start, end int) {
	for i := start; i < end; i++ {
		if c := s.b[i]; 'A' <= c && c <= 'Z' {
			s.b[i] = c + ('a' - 'A')
		}
	}
}
// setError records e as the scanner's error if no error has been recorded
// yet. A recorded error is only replaced when e is errSyntax and the recorded
// error is not.
func (s *scanner) setError(e error) {
	switch {
	case s.err == nil:
		s.err = e
	case e == errSyntax && s.err != errSyntax:
		s.err = e
	}
}
// resizeRange shrinks or grows the buffer so that a string of newSize bytes
// fits where the range [oldStart, oldEnd) used to be. The bytes before
// oldStart are kept and the bytes from oldEnd on are moved so that they follow
// the resized range; the content of the resized range itself is left for the
// caller to fill in.
//
// It sets s.start to oldStart, s.end to the end of the resized range, and
// shifts s.next so that it keeps its distance from s.end (the scan point ends
// up after the resized range).
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	end := oldStart + newSize
	if end == oldEnd {
		// Same size: nothing has to move.
		return
	}
	// Always assemble the result in a fresh buffer of the exact new length.
	// This keeps the prefix intact and never slices beyond len(s.b),
	// whatever the capacity of the old buffer is.
	b := make([]byte, len(s.b)+end-oldEnd)
	copy(b, s.b[:oldStart])
	copy(b[end:], s.b[oldEnd:])
	s.b = b
	s.next = end + (s.next - s.end)
	s.end = end
}
// replace substitutes repl for the current token: the token's range is
// resized to len(repl) and repl is then copied over it.
func (s *scanner) replace(repl string) {
	tokStart, tokEnd := s.start, s.end
	s.resizeRange(tokStart, tokEnd, len(repl))
	// resizeRange leaves s.start at the start of the resized range.
	copy(s.b[s.start:], repl)
}
// gobble records e through setError and removes the current token from the
// input. For a token at the start of the buffer everything before s.next is
// dropped; otherwise the bytes from one before s.start up to s.end are cut
// out. s.next is reset to s.start.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		n := copy(s.b, s.b[s.next:])
		s.b = s.b[:n]
		s.end = 0
	} else {
		cut := s.start - 1
		n := copy(s.b[cut:], s.b[s.end:])
		s.b = s.b[:cut+n]
		s.end = cut
	}
	s.next = s.start
}
// deleteRange removes the bytes in [start, end) from s.b, a range that lies
// before the current token, and shifts s.start, s.end and s.next back by the
// number of bytes removed. It always records errSyntax.
func (s *scanner) deleteRange(start, end int) {
	s.setError(errSyntax)
	kept := copy(s.b[start:], s.b[end:])
	s.b = s.b[:start+kept]
	removed := end - start
	s.next -= removed
	s.start -= removed
	s.end -= removed
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
//
// On success s.token holds the new token and s.start and s.end delimit it in
// s.b. When no further token is found, s.token is nil and s.done is set.
func (s *scanner) scan() (end int) {
// The result is the end of the token that was current on entry.
end = s.end
s.token = nil
for s.start = s.next; s.next < len(s.b); {
i := bytes.IndexByte(s.b[s.next:], '-')
if i == -1 {
// No further separator: the token runs to the end of the buffer.
s.end = len(s.b)
s.next = len(s.b)
i = s.end - s.start
} else {
s.end = s.next + i
s.next = s.end + 1
}
token := s.b[s.start:s.end]
if i < 1 || i > 8 || !isAlphaNum(token) {
// Empty, over-long or non-alphanumeric token: record a syntax error,
// remove it from the buffer and try again (gobble resets s.next to
// s.start, so the loop resumes at the same position).
s.gobble(errSyntax)
continue
}
s.token = token
return end
}
// Nothing left to scan. A trailing separator is a syntax error and is
// stripped from the buffer.
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
s.setError(errSyntax)
s.b = s.b[:len(s.b)-1]
}
s.done = true
return end
}
// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
end = s.end
s.scan()
for ; len(s.token) >= min; s.scan() {
end = s.end
}
return end
// Subtag returns the subtag for which the error occurred.
Subtag() string
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
@ -223,7 +28,7 @@ func (s *scanner) acceptMinSize(min int) (end int) {
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the default canonicalization type.
func Parse(s string) (t Tag, err error) {
return Default.Parse(s)
@ -235,327 +40,18 @@ func Parse(s string) (t Tag, err error) {
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the the canonicalization type c.
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the canonicalization type c.
func (c CanonType) Parse(s string) (t Tag, err error) {
// TODO: consider supporting old-style locale key-value pairs.
if s == "" {
return und, errSyntax
tt, err := language.Parse(s)
if err != nil {
return makeTag(tt), err
}
if len(s) <= maxAltTaglen {
b := [maxAltTaglen]byte{}
for i, c := range s {
// Generating invalid UTF-8 is okay as it won't match.
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
} else if c == '_' {
c = '-'
}
b[i] = byte(c)
}
if t, ok := grandfathered(b); ok {
return t, nil
}
}
scan := makeScannerString(s)
t, err = parse(&scan, s)
t, changed := t.canonicalize(c)
tt, changed := canonicalize(c, tt)
if changed {
t.remakeString()
tt.RemakeString()
}
return t, err
}
// parse builds a Tag from the tokens of scan, which must already be positioned
// on its first token. s is the original input string; it is reused as the
// tag's string form when tag.Compare reports it equal to the normalized
// buffer. The returned error is the one recorded in scan, or errSyntax when
// the first token cannot start a tag.
func parse(scan *scanner, s string) (t Tag, err error) {
t = und
var end int
if n := len(scan.token); n <= 1 {
// A first token of at most one byte is only accepted when it is "x"
// (after lower-casing the buffer); the input is then parsed as
// extensions only.
scan.toLower(0, len(scan.b))
if n == 0 || scan.token[0] != 'x' {
return t, errSyntax
}
end = parseExtensions(scan)
} else if n >= 4 {
// A first token of four or more bytes is rejected.
return und, errSyntax
} else { // the usual case
t, end = parseTag(scan)
if n := len(scan.token); n == 1 {
// A one-byte token after the core tag starts the extensions.
t.pExt = uint16(end)
end = parseExtensions(scan)
} else if end < len(scan.b) {
// Input remains that is neither part of the core tag nor an
// extension: flag it and truncate the buffer.
scan.setError(errSyntax)
scan.b = scan.b[:end]
}
}
if int(t.pVariant) < len(scan.b) {
// The buffer extends past pVariant, so a string form is stored.
if end < len(s) {
s = s[:end]
}
// NOTE(review): presumably reusing s avoids allocating a new string
// when the input was already in normalized form — confirm.
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
t.str = s
} else {
t.str = string(scan.b)
}
} else {
// Nothing beyond the core tag: no string form is kept.
t.pVariant, t.pExt = 0, 0
}
return t, scan.err
}
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
//
// The scanner must be positioned on the language token. Lookup errors are
// recorded on the scanner (via setError or gobble) rather than returned.
func parseTag(scan *scanner) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
// Language: look up the current token and overwrite it in the buffer with
// the string form of the language that was found.
t.lang, e = getLangID(scan.token)
scan.setError(e)
scan.replace(t.lang.String())
langStart := scan.start
end = scan.scan()
// Extended language subtags: three-byte tokens starting with a letter.
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
lang, e := getLangID(scan.token)
if lang != 0 {
// A recognized extlang replaces the language, both in t and in the
// buffer at langStart.
t.lang = lang
copy(scan.b[langStart:], lang.String())
scan.b[langStart+3] = '-'
scan.start = langStart + 4
}
// The token is gobbled in either case; e is recorded by gobble.
scan.gobble(e)
end = scan.scan()
}
// Script: a four-byte token starting with a letter. It is removed from the
// buffer when the lookup yields no script.
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
t.script, e = getScriptID(script, scan.token)
if t.script == 0 {
scan.gobble(e)
}
end = scan.scan()
}
// Region: a token of two or three bytes. It is removed when the lookup
// yields no region and otherwise replaced by the region's string form.
if n := len(scan.token); n >= 2 && n <= 3 {
t.region, e = getRegionID(scan.token)
if t.region == 0 {
scan.gobble(e)
} else {
scan.replace(t.region.String())
}
end = scan.scan()
}
// Everything from the current token on is lower-cased before the variants
// are parsed.
scan.toLower(scan.start, len(scan.b))
t.pVariant = byte(end)
end = parseVariants(scan, end, t)
t.pExt = uint16(end)
return t, end
}
var separator = []byte{'-'}
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
//
// end is the end position of the input parsed so far; the returned value is
// the end position after the accepted variants. The parameter t is not used
// by this function.
func parseVariants(scan *scanner, end int, t Tag) int {
start := scan.start
// Fixed-size arrays back the slices below so that up to four variants can
// be collected without allocating.
varIDBuf := [4]uint8{}
variantBuf := [4][]byte{}
varID := varIDBuf[:0]
variant := variantBuf[:0]
last := -1
needSort := false
for ; len(scan.token) >= 4; scan.scan() {
// TODO: measure the impact of needing this conversion and redesign
// the data structure if there is an issue.
v, ok := variantIndex[string(scan.token)]
if !ok {
// unknown variant
// TODO: allow user-defined variants?
scan.gobble(mkErrInvalid(scan.token))
continue
}
varID = append(varID, v)
variant = append(variant, scan.token)
if !needSort {
// Track whether the variant indices are strictly increasing; the first
// violation means the variants must be sorted and de-duplicated.
if last < int(v) {
last = int(v)
} else {
needSort = true
// There are no legal combinations of more than 7 variants
// (and this is by no means a useful sequence).
const maxVariants = 8
if len(varID) > maxVariants {
break
}
}
}
end = scan.end
}
if needSort {
sort.Sort(variantsSort{varID, variant})
// Compact the sorted variants in place, keeping the first of each run of
// equal indices.
k, l := 0, -1
for i, v := range varID {
w := int(v)
if l == w {
// Remove duplicates.
continue
}
varID[k] = varID[i]
variant[k] = variant[i]
k++
l = w
}
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
// No variants are left: end one position before where they started.
end = start - 1
} else {
// Write the reordered variants back over the range they occupied.
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
}
}
return end
}
// variantsSort orders the variant tokens in v by the parallel values in i,
// keeping both slices aligned. It provides the Len, Swap and Less methods
// that sort.Sort requires.
type variantsSort struct {
	i []uint8
	v [][]byte
}

// Len returns the number of entries.
func (s variantsSort) Len() int { return len(s.i) }

// Swap exchanges entries i and j in both slices.
func (s variantsSort) Swap(i, j int) {
	ids, toks := s.i, s.v
	ids[i], ids[j] = ids[j], ids[i]
	toks[i], toks[j] = toks[j], toks[i]
}

// Less reports whether entry i has a smaller value than entry j.
func (s variantsSort) Less(i, j int) bool { return s.i[j] > s.i[i] }
// bytesSort orders byte slices lexicographically, as defined by
// bytes.Compare. It provides the Len, Swap and Less methods that sort.Sort
// requires.
type bytesSort [][]byte

// Len returns the number of byte slices.
func (b bytesSort) Len() int { return len(b) }

// Swap exchanges elements i and j.
func (b bytesSort) Swap(i, j int) {
	bi, bj := b[i], b[j]
	b[i], b[j] = bj, bi
}

// Less reports whether element i sorts before element j.
func (b bytesSort) Less(i, j int) bool {
	return bytes.Compare(b[i], b[j]) < 0
}
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
//
// The scanner must be positioned on the one-byte token that introduces the
// first extension. Extensions other than the private use ("x") extension are
// sorted; the private use extension, if any, is placed last.
func parseExtensions(scan *scanner) int {
start := scan.start
exts := [][]byte{}
private := []byte{}
end := scan.end
for len(scan.token) == 1 {
extStart := scan.start
ext := scan.token[0]
end = parseExtension(scan)
extension := scan.b[extStart:end]
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
// Too short to hold any content: record a syntax error and leave this
// extension out.
scan.setError(errSyntax)
end = extStart
continue
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
// The first extension is either private use or the only one: nothing
// needs reordering, so just cut the buffer after it.
scan.b = scan.b[:end]
return end
} else if ext == 'x' {
// A private use extension ends the list of extensions.
private = extension
break
}
exts = append(exts, extension)
}
sort.Sort(bytesSort(exts))
if len(private) > 0 {
exts = append(exts, private)
}
// Rebuild the extension part of the buffer from the collected extensions.
scan.b = scan.b[:start]
if len(exts) > 0 {
scan.b = append(scan.b, bytes.Join(exts, separator)...)
} else if start > 0 {
// Strip trailing '-'.
scan.b = scan.b[:start-1]
}
return end
}
// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
start, end := scan.start, scan.end
switch scan.token[0] {
case 'u':
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
if bytes.Compare(scan.token, last) != -1 {
// Attributes are unsorted. Start over from scratch.
p := attrStart + 1
scan.next = p
attrs := [][]byte{}
for scan.scan(); len(scan.token) > 2; scan.scan() {
attrs = append(attrs, scan.token)
end = scan.end
}
sort.Sort(bytesSort(attrs))
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
last = scan.token
end = scan.end
}
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
// TODO: check key value validity
if keyEnd == end || bytes.Compare(key, last) != 1 {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(errSyntax)
end = keyStart
}
}
sort.Sort(bytesSort(keys))
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
copy(scan.b[p:], bytes.Join(keys, separator))
break
}
}
case 't':
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
end = scan.acceptMinSize(3)
}
case 'x':
end = scan.acceptMinSize(1)
default:
end = scan.acceptMinSize(2)
}
return end
return makeTag(tt), err
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
@ -563,10 +59,11 @@ func parseExtension(scan *scanner) int {
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using the Default CanonType. If one or more errors are
// encountered, one of the errors is returned.
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using the Default CanonType. If one or
// more errors are encountered, one of the errors is returned.
func Compose(part ...interface{}) (t Tag, err error) {
return Default.Compose(part...)
}
@ -576,191 +73,63 @@ func Compose(part ...interface{}) (t Tag, err error) {
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using CanonType c. If one or more errors are encountered,
// one of the errors is returned.
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using CanonType c. If one or more errors
// are encountered, one of the errors is returned.
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
var b builder
if err = b.update(part...); err != nil {
var b language.Builder
if err = update(&b, part...); err != nil {
return und, err
}
t, _ = b.tag.canonicalize(c)
if len(b.ext) > 0 || len(b.variant) > 0 {
sort.Sort(sortVariant(b.variant))
sort.Strings(b.ext)
if b.private != "" {
b.ext = append(b.ext, b.private)
}
n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
buf := make([]byte, n)
p := t.genCoreBytes(buf)
t.pVariant = byte(p)
p += appendTokens(buf[p:], b.variant...)
t.pExt = uint16(p)
p += appendTokens(buf[p:], b.ext...)
t.str = string(buf[:p])
} else if b.private != "" {
t.str = b.private
t.remakeString()
}
return
}
// builder accumulates the parts from which a Tag is composed.
type builder struct {
tag Tag // NOTE(review): appears to carry the language, script and region set through update — confirm
private string // the x extension
ext []string // extensions other than the x extension (see addExt)
variant []string // variant subtags
err error // NOTE(review): looks like update sets this to errInvalidArgument for an empty Extension or Variant — confirm
}
func (b *builder) addExt(e string) {
if e == "" {
} else if e[0] == 'x' {
b.private = e
} else {
b.ext = append(b.ext, e)
}
b.Tag, _ = canonicalize(c, b.Tag)
return makeTag(b.Make()), err
}
var errInvalidArgument = errors.New("invalid Extension or Variant")
func (b *builder) update(part ...interface{}) (err error) {
replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
if s == "" {
b.err = errInvalidArgument
return true
}
for i, v := range *l {
if eq(v, s) {
(*l)[i] = s
return true
}
}
return false
}
func update(b *language.Builder, part ...interface{}) (err error) {
for _, x := range part {
switch v := x.(type) {
case Tag:
b.tag.lang = v.lang
b.tag.region = v.region
b.tag.script = v.script
if v.str != "" {
b.variant = nil
for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
x, s = nextToken(s)
b.variant = append(b.variant, x)
}
b.ext, b.private = nil, ""
for i, e := int(v.pExt), ""; i < len(v.str); {
i, e = getExtension(v.str, i)
b.addExt(e)
}
}
b.SetTag(v.tag())
case Base:
b.tag.lang = v.langID
b.Tag.LangID = v.langID
case Script:
b.tag.script = v.scriptID
b.Tag.ScriptID = v.scriptID
case Region:
b.tag.region = v.regionID
b.Tag.RegionID = v.regionID
case Variant:
if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
b.variant = append(b.variant, v.variant)
if v.variant == "" {
err = errInvalidArgument
break
}
b.AddVariant(v.variant)
case Extension:
if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
b.addExt(v.s)
if v.s == "" {
err = errInvalidArgument
break
}
b.SetExt(v.s)
case []Variant:
b.variant = nil
for _, x := range v {
b.update(x)
b.ClearVariants()
for _, v := range v {
b.AddVariant(v.variant)
}
case []Extension:
b.ext, b.private = nil, ""
b.ClearExtensions()
for _, e := range v {
b.update(e)
b.SetExt(e.s)
}
// TODO: support parsing of raw strings based on morphology or just extensions?
case error:
err = v
}
}
return
}
// tokenLen returns the number of bytes needed to write the given tokens when
// each one is preceded by a one-byte separator.
func tokenLen(token ...string) (n int) {
	for i := range token {
		n += 1 + len(token[i])
	}
	return n
}
// appendTokens writes each token into b, preceded by a '-', starting at the
// beginning of b. It returns the number of bytes accounted for, which is
// the sum of 1+len(token) over all tokens. b must be large enough to hold
// them.
func appendTokens(b []byte, token ...string) int {
	pos := 0
	for i := range token {
		tok := token[i]
		b[pos] = '-'
		copy(b[pos+1:], tok)
		pos += len(tok) + 1
	}
	return pos
}
// sortVariant orders variant strings by their value in variantIndex. It
// provides the Len, Swap and Less methods that sort.Sort requires.
type sortVariant []string

// Len returns the number of variants.
func (s sortVariant) Len() int { return len(s) }

// Swap exchanges elements i and j.
func (s sortVariant) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

// Less reports whether variant i has a smaller index than variant j.
func (s sortVariant) Less(i, j int) bool {
	left, right := variantIndex[s[i]], variantIndex[s[j]]
	return left < right
}
// findExt returns the index of the first string in list whose first byte is
// x, or -1 if there is none. Every string in list must be non-empty.
func findExt(list []string, x byte) int {
	for i := 0; i < len(list); i++ {
		if list[i][0] == x {
			return i
		}
	}
	return -1
}
// getExtension returns the end position and the text of the extension that
// starts at position p of s; a '-' at p is skipped first. An extension that
// starts with 'x' runs through the end of s. Any other extension ends where
// nextExtension finds the start of the following one.
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	if s[p] != 'x' {
		end = nextExtension(s, p)
		return end, s[p:end]
	}
	return len(s), s[p:]
}
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
for n := len(s) - 3; p < n; {
if s[p] == '-' {
if s[p+2] == '-' {
return p
if v != nil {
err = v
}
p += 3
} else {
p++
}
}
return len(s)
return
}
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
@ -788,7 +157,7 @@ func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
if !ok {
return nil, nil, err
}
t = Tag{lang: id}
t = makeTag(language.Tag{LangID: id})
}
// Scan the optional weight.
@ -830,9 +199,9 @@ func split(s string, c byte) (head, tail string) {
return strings.TrimSpace(s), ""
}
// Add hack mapping to deal with a small number of cases that that occur
// Add hack mapping to deal with a small number of cases that occur
// in Accept-Language (with reasonable frequency).
var acceptFallback = map[string]langID{
var acceptFallback = map[string]language.Language{
"english": _en,
"deutsch": _de,
"italian": _it,

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,8 @@
package language
import "golang.org/x/text/internal/language/compact"
// TODO: Various sets of commonly used tags and regions.
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
@ -61,83 +63,83 @@ var (
Und Tag = Tag{}
Afrikaans Tag = Tag{lang: _af} // af
Amharic Tag = Tag{lang: _am} // am
Arabic Tag = Tag{lang: _ar} // ar
ModernStandardArabic Tag = Tag{lang: _ar, region: _001} // ar-001
Azerbaijani Tag = Tag{lang: _az} // az
Bulgarian Tag = Tag{lang: _bg} // bg
Bengali Tag = Tag{lang: _bn} // bn
Catalan Tag = Tag{lang: _ca} // ca
Czech Tag = Tag{lang: _cs} // cs
Danish Tag = Tag{lang: _da} // da
German Tag = Tag{lang: _de} // de
Greek Tag = Tag{lang: _el} // el
English Tag = Tag{lang: _en} // en
AmericanEnglish Tag = Tag{lang: _en, region: _US} // en-US
BritishEnglish Tag = Tag{lang: _en, region: _GB} // en-GB
Spanish Tag = Tag{lang: _es} // es
EuropeanSpanish Tag = Tag{lang: _es, region: _ES} // es-ES
LatinAmericanSpanish Tag = Tag{lang: _es, region: _419} // es-419
Estonian Tag = Tag{lang: _et} // et
Persian Tag = Tag{lang: _fa} // fa
Finnish Tag = Tag{lang: _fi} // fi
Filipino Tag = Tag{lang: _fil} // fil
French Tag = Tag{lang: _fr} // fr
CanadianFrench Tag = Tag{lang: _fr, region: _CA} // fr-CA
Gujarati Tag = Tag{lang: _gu} // gu
Hebrew Tag = Tag{lang: _he} // he
Hindi Tag = Tag{lang: _hi} // hi
Croatian Tag = Tag{lang: _hr} // hr
Hungarian Tag = Tag{lang: _hu} // hu
Armenian Tag = Tag{lang: _hy} // hy
Indonesian Tag = Tag{lang: _id} // id
Icelandic Tag = Tag{lang: _is} // is
Italian Tag = Tag{lang: _it} // it
Japanese Tag = Tag{lang: _ja} // ja
Georgian Tag = Tag{lang: _ka} // ka
Kazakh Tag = Tag{lang: _kk} // kk
Khmer Tag = Tag{lang: _km} // km
Kannada Tag = Tag{lang: _kn} // kn
Korean Tag = Tag{lang: _ko} // ko
Kirghiz Tag = Tag{lang: _ky} // ky
Lao Tag = Tag{lang: _lo} // lo
Lithuanian Tag = Tag{lang: _lt} // lt
Latvian Tag = Tag{lang: _lv} // lv
Macedonian Tag = Tag{lang: _mk} // mk
Malayalam Tag = Tag{lang: _ml} // ml
Mongolian Tag = Tag{lang: _mn} // mn
Marathi Tag = Tag{lang: _mr} // mr
Malay Tag = Tag{lang: _ms} // ms
Burmese Tag = Tag{lang: _my} // my
Nepali Tag = Tag{lang: _ne} // ne
Dutch Tag = Tag{lang: _nl} // nl
Norwegian Tag = Tag{lang: _no} // no
Punjabi Tag = Tag{lang: _pa} // pa
Polish Tag = Tag{lang: _pl} // pl
Portuguese Tag = Tag{lang: _pt} // pt
BrazilianPortuguese Tag = Tag{lang: _pt, region: _BR} // pt-BR
EuropeanPortuguese Tag = Tag{lang: _pt, region: _PT} // pt-PT
Romanian Tag = Tag{lang: _ro} // ro
Russian Tag = Tag{lang: _ru} // ru
Sinhala Tag = Tag{lang: _si} // si
Slovak Tag = Tag{lang: _sk} // sk
Slovenian Tag = Tag{lang: _sl} // sl
Albanian Tag = Tag{lang: _sq} // sq
Serbian Tag = Tag{lang: _sr} // sr
SerbianLatin Tag = Tag{lang: _sr, script: _Latn} // sr-Latn
Swedish Tag = Tag{lang: _sv} // sv
Swahili Tag = Tag{lang: _sw} // sw
Tamil Tag = Tag{lang: _ta} // ta
Telugu Tag = Tag{lang: _te} // te
Thai Tag = Tag{lang: _th} // th
Turkish Tag = Tag{lang: _tr} // tr
Ukrainian Tag = Tag{lang: _uk} // uk
Urdu Tag = Tag{lang: _ur} // ur
Uzbek Tag = Tag{lang: _uz} // uz
Vietnamese Tag = Tag{lang: _vi} // vi
Chinese Tag = Tag{lang: _zh} // zh
SimplifiedChinese Tag = Tag{lang: _zh, script: _Hans} // zh-Hans
TraditionalChinese Tag = Tag{lang: _zh, script: _Hant} // zh-Hant
Zulu Tag = Tag{lang: _zu} // zu
Afrikaans Tag = Tag(compact.Afrikaans)
Amharic Tag = Tag(compact.Amharic)
Arabic Tag = Tag(compact.Arabic)
ModernStandardArabic Tag = Tag(compact.ModernStandardArabic)
Azerbaijani Tag = Tag(compact.Azerbaijani)
Bulgarian Tag = Tag(compact.Bulgarian)
Bengali Tag = Tag(compact.Bengali)
Catalan Tag = Tag(compact.Catalan)
Czech Tag = Tag(compact.Czech)
Danish Tag = Tag(compact.Danish)
German Tag = Tag(compact.German)
Greek Tag = Tag(compact.Greek)
English Tag = Tag(compact.English)
AmericanEnglish Tag = Tag(compact.AmericanEnglish)
BritishEnglish Tag = Tag(compact.BritishEnglish)
Spanish Tag = Tag(compact.Spanish)
EuropeanSpanish Tag = Tag(compact.EuropeanSpanish)
LatinAmericanSpanish Tag = Tag(compact.LatinAmericanSpanish)
Estonian Tag = Tag(compact.Estonian)
Persian Tag = Tag(compact.Persian)
Finnish Tag = Tag(compact.Finnish)
Filipino Tag = Tag(compact.Filipino)
French Tag = Tag(compact.French)
CanadianFrench Tag = Tag(compact.CanadianFrench)
Gujarati Tag = Tag(compact.Gujarati)
Hebrew Tag = Tag(compact.Hebrew)
Hindi Tag = Tag(compact.Hindi)
Croatian Tag = Tag(compact.Croatian)
Hungarian Tag = Tag(compact.Hungarian)
Armenian Tag = Tag(compact.Armenian)
Indonesian Tag = Tag(compact.Indonesian)
Icelandic Tag = Tag(compact.Icelandic)
Italian Tag = Tag(compact.Italian)
Japanese Tag = Tag(compact.Japanese)
Georgian Tag = Tag(compact.Georgian)
Kazakh Tag = Tag(compact.Kazakh)
Khmer Tag = Tag(compact.Khmer)
Kannada Tag = Tag(compact.Kannada)
Korean Tag = Tag(compact.Korean)
Kirghiz Tag = Tag(compact.Kirghiz)
Lao Tag = Tag(compact.Lao)
Lithuanian Tag = Tag(compact.Lithuanian)
Latvian Tag = Tag(compact.Latvian)
Macedonian Tag = Tag(compact.Macedonian)
Malayalam Tag = Tag(compact.Malayalam)
Mongolian Tag = Tag(compact.Mongolian)
Marathi Tag = Tag(compact.Marathi)
Malay Tag = Tag(compact.Malay)
Burmese Tag = Tag(compact.Burmese)
Nepali Tag = Tag(compact.Nepali)
Dutch Tag = Tag(compact.Dutch)
Norwegian Tag = Tag(compact.Norwegian)
Punjabi Tag = Tag(compact.Punjabi)
Polish Tag = Tag(compact.Polish)
Portuguese Tag = Tag(compact.Portuguese)
BrazilianPortuguese Tag = Tag(compact.BrazilianPortuguese)
EuropeanPortuguese Tag = Tag(compact.EuropeanPortuguese)
Romanian Tag = Tag(compact.Romanian)
Russian Tag = Tag(compact.Russian)
Sinhala Tag = Tag(compact.Sinhala)
Slovak Tag = Tag(compact.Slovak)
Slovenian Tag = Tag(compact.Slovenian)
Albanian Tag = Tag(compact.Albanian)
Serbian Tag = Tag(compact.Serbian)
SerbianLatin Tag = Tag(compact.SerbianLatin)
Swedish Tag = Tag(compact.Swedish)
Swahili Tag = Tag(compact.Swahili)
Tamil Tag = Tag(compact.Tamil)
Telugu Tag = Tag(compact.Telugu)
Thai Tag = Tag(compact.Thai)
Turkish Tag = Tag(compact.Turkish)
Ukrainian Tag = Tag(compact.Ukrainian)
Urdu Tag = Tag(compact.Urdu)
Uzbek Tag = Tag(compact.Uzbek)
Vietnamese Tag = Tag(compact.Vietnamese)
Chinese Tag = Tag(compact.Chinese)
SimplifiedChinese Tag = Tag(compact.SimplifiedChinese)
TraditionalChinese Tag = Tag(compact.TraditionalChinese)
Zulu Tag = Tag(compact.Zulu)
)

187
vendor/golang.org/x/text/runes/cond.go generated vendored Normal file
View File

@ -0,0 +1,187 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runes
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
// This is done for various reasons:
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
// one would expect it to be unchanged.
// - It would be very expensive to pass a converted RuneError to a transformer:
// a transformer might need more source bytes after RuneError, meaning that
// the only way to pass it safely is to create a new buffer and manage the
// intermingling of RuneErrors and normal input.
// - Many transformers leave ill-formed UTF-8 as is, so this is not
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
// logical consequence of the operation (as for Map) or if it otherwise would
// pose security concerns (as for Remove).
// - An alternative would be to return an error on ill-formed UTF-8, but this
// would be inconsistent with other operations.
// If returns a transformer that applies tIn to consecutive runes for which
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
// to RuneError to determine which transformer to apply, but is passed as is to
// the respective transformer.
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
	if tIn == nil && tNotIn == nil {
		// Nothing to apply on either side: the result is a plain Nop.
		return Transformer{transform.Nop}
	}

	// spanning substitutes Nop for nil and wraps transformers that lack a
	// Span method in a dummySpan.
	spanning := func(tr transform.Transformer) transform.SpanningTransformer {
		if tr == nil {
			return transform.Nop
		}
		if st, ok := tr.(transform.SpanningTransformer); ok {
			return st
		}
		return dummySpan{tr}
	}

	c := &cond{
		tIn:    spanning(tIn),
		tNotIn: spanning(tNotIn),
		f:      s.Contains,
	}
	c.Reset()
	return Transformer{c}
}
// dummySpan wraps a transform.Transformer that has no Span method so that it
// can be used where a transform.SpanningTransformer is required.
type dummySpan struct{ transform.Transformer }

// Span always returns 0 and transform.ErrEndOfSpan, regardless of src and
// atEOF: it never reports any prefix of src as spanned.
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
	return 0, transform.ErrEndOfSpan
}
// cond is the transformer created by If. It alternates between tIn and tNotIn
// depending on the result of f for each rune of the input.
type cond struct {
	tIn, tNotIn transform.SpanningTransformer // applied to runs where f is true / false
	f           func(rune) bool               // membership test (s.Contains of the Set passed to If)
	check       func(rune) bool               // current check to perform
	t           transform.SpanningTransformer // current transformer to use
}
// Reset implements transform.Transformer. It selects tIn as the current
// transformer, t.is as the current check, and resets tIn.
func (t *cond) Reset() {
	t.t = t.tIn
	t.check = t.is
	t.tIn.Reset() // tNotIn will be reset on first usage.
}
// is is the check used while tIn is the current transformer. It reports
// whether r satisfies f. When it does not, is switches t over to tNotIn,
// installs isNot as the check and resets tNotIn before returning false.
func (t *cond) is(r rune) bool {
	if !t.f(r) {
		t.check, t.t = t.isNot, t.tNotIn
		t.tNotIn.Reset()
		return false
	}
	return true
}
// isNot is the check used while tNotIn is the current transformer. It reports
// whether r fails f. When r satisfies f instead, isNot switches t over to
// tIn, installs is as the check and resets tIn before returning false.
func (t *cond) isNot(r rune) bool {
	if t.f(r) {
		t.check, t.t = t.is, t.tIn
		t.tIn.Reset()
		return false
	}
	return true
}
// Span scans src in chunks of at most maxChunk bytes, splits each chunk into
// runs of runes that pass the current check, and forwards every run to the
// Span method of the transformer that was current when the run started. It
// returns the number of leading bytes of src accepted by those calls. The
// first error returned by a forwarded call is returned as is;
// transform.ErrShortSrc is returned if src ends in an incomplete rune and
// atEOF is false.
//
// This implementation of Span doesn't help all too much, but it needs to be
// there to satisfy this package's Transformer interface.
// TODO: there is certainly room for improvement, though. For example, if
// t.t == transform.Nop (which will be a common occurrence) it will save a
// bundle to special-case that loop.
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
	p := 0 // scan position; src[n:p] is the run collected so far
	for n < len(src) && err == nil {
		// Don't process too much at a time as the Spanner that will be
		// called on this block may terminate early.
		const maxChunk = 4096
		limit := len(src)
		if v := n + maxChunk; v < limit {
			limit = v
		}
		atEnd := false
		size := 0
		current := t.t
		for ; p < limit; p += size {
			r := rune(src[p])
			if r < utf8.RuneSelf {
				size = 1
			} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
				if !atEOF && !utf8.FullRune(src[p:]) {
					err = transform.ErrShortSrc
					break
				}
			}
			if !t.check(r) {
				// The next rune will be the start of a new run.
				atEnd = true
				break
			}
		}
		n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
		n += n2
		if err2 != nil {
			return n, err2
		}
		// A nil error from Span means the whole run was accepted, so n now
		// equals the old p. Resume scanning there.
		p = n
		if atEnd {
			// The rune at p was already classified: it failed the previous
			// check, which switched t over to the transformer for the new
			// run. Skip it so that it is not checked a second time. When the
			// scan merely stopped at the chunk limit, the rune at p has not
			// been looked at yet and must not be skipped.
			p += size
		}
	}
	return n, err
}
// Transform implements transform.Transformer. It scans src in chunks of at
// most maxChunk bytes, splits each chunk into runs of runes that pass the
// current check, and forwards every run to the Transform method of the
// transformer that was current when the run started, writing its output to
// the unused part of dst. The first error returned by a forwarded call is
// returned as is; transform.ErrShortSrc is returned if src ends in an
// incomplete rune and atEOF is false.
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	p := 0 // scan position; src[nSrc:p] is the run collected so far
	for nSrc < len(src) && err == nil {
		// Don't process too much at a time, as the work might be wasted if the
		// destination buffer isn't large enough to hold the result or a
		// transform returns an error early.
		const maxChunk = 4096
		limit := len(src)
		if n := nSrc + maxChunk; n < len(src) {
			limit = n
		}
		atEnd := false
		size := 0
		current := t.t
		for ; p < limit; p += size {
			r := rune(src[p])
			if r < utf8.RuneSelf {
				size = 1
			} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
				if !atEOF && !utf8.FullRune(src[p:]) {
					err = transform.ErrShortSrc
					break
				}
			}
			if !t.check(r) {
				// The next rune will be the start of a new run.
				atEnd = true
				break
			}
		}
		nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
		nDst += nDst2
		nSrc += nSrc2
		if err2 != nil {
			return nDst, nSrc, err2
		}
		// A nil error from Transform means the whole run was consumed, so
		// nSrc now equals the old p. Resume scanning there.
		p = nSrc
		if atEnd {
			// The rune at p was already classified: it failed the previous
			// check, which switched t over to the transformer for the new
			// run. Skip it so that it is not checked a second time. When the
			// scan merely stopped at the chunk limit, the rune at p has not
			// been looked at yet and must not be skipped.
			p += size
		}
	}
	return nDst, nSrc, err
}

355
vendor/golang.org/x/text/runes/runes.go generated vendored Normal file
View File

@ -0,0 +1,355 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package runes provides transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/transform"
)
// A Set is a collection of runes. Membership is the only operation a Set has
// to support.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}
// setFunc adapts a plain func(rune) bool so that it has a Contains method.
type setFunc func(rune) bool

// Contains returns the result of calling s with r.
func (s setFunc) Contains(r rune) bool {
	return s(r)
}
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
// In creates a Set whose Contains method reports, via unicode.Is, whether a
// rune is in the given RangeTable.
func In(rt *unicode.RangeTable) Set {
	member := func(r rune) bool {
		return unicode.Is(rt, r)
	}
	return setFunc(member)
}
// NotIn creates a Set whose Contains method reports, via unicode.Is, whether
// a rune is absent from the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
	absent := func(r rune) bool {
		return !unicode.Is(rt, r)
	}
	return setFunc(absent)
}
// Predicate creates a Set with a Contains method that returns f(r).
// The returned Set is f itself converted to setFunc, so Contains calls f
// directly.
func Predicate(f func(rune) bool) Set {
	return setFunc(f)
}
// Transformer implements the transform.Transformer interface.
// It wraps a transform.SpanningTransformer and forwards Transform, Span and
// Reset to it.
type Transformer struct {
	t transform.SpanningTransformer
}

// Transform forwards to the Transform method of the wrapped transformer.
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	return t.t.Transform(dst, src, atEOF)
}

// Span forwards to the Span method of the wrapped transformer.
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
	return t.t.Span(b, atEOF)
}

// Reset forwards to the Reset method of the wrapped transformer.
func (t Transformer) Reset() { t.t.Reset() }
// Bytes returns a new byte slice with the result of converting b using t. It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
	out, _, err := transform.Bytes(t, b)
	if err == nil {
		return out
	}
	return nil
}
// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
	out, _, err := transform.String(t, s)
	if err == nil {
		return out
	}
	return ""
}
// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)
const runeErrorString = string(utf8.RuneError)
// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to
// s.Contains.
func Remove(s Set) Transformer {
	switch f := s.(type) {
	case setFunc:
		// This little trick cuts the running time of BenchmarkRemove for sets
		// created by Predicate roughly in half.
		// TODO: special-case RangeTables as well.
		return Transformer{remove(f)}
	default:
		return Transformer{remove(s.Contains)}
	}
}
// TODO: remove transform.RemoveFunc.

// remove is the transformer behind Remove. The function reports whether a
// rune is to be dropped from the output.
type remove func(r rune) bool

// Reset does nothing.
func (remove) Reset() {}
// Span implements transform.Spanner. It returns the length of the longest
// prefix of src that consists of well-formed runes for which t is false. It
// stops with transform.ErrEndOfSpan at the first rune t would remove or at
// the first ill-formed byte, and with transform.ErrShortSrc if src ends in an
// incomplete rune and atEOF is false.
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
	for n < len(src) {
		r, size := rune(src[n]), 1
		if r >= utf8.RuneSelf {
			if r, size = utf8.DecodeRune(src[n:]); size == 1 {
				// Invalid rune.
				if !atEOF && !utf8.FullRune(src[n:]) {
					return n, transform.ErrShortSrc
				}
				return n, transform.ErrEndOfSpan
			}
		}
		if t(r) {
			return n, transform.ErrEndOfSpan
		}
		n += size
	}
	return n, nil
}
// Transform implements transform.Transformer. It copies src to dst, leaving
// out every rune for which t is true. Each ill-formed byte is treated as
// utf8.RuneError: it is dropped if t(utf8.RuneError) is true and written as
// the three-byte encoding of RuneError otherwise. It returns
// transform.ErrShortDst when dst has no room for the next rune and
// transform.ErrShortSrc when src ends in an incomplete rune and atEOF is
// false.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		r, size := rune(src[nSrc]), 1
		if r >= utf8.RuneSelf {
			if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
				// Invalid rune.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					return nDst, nSrc, transform.ErrShortSrc
				}
				// We replace illegal bytes with RuneError. Not doing so might
				// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
				// The resulting byte sequence may subsequently contain runes
				// for which t(r) is true that were passed unnoticed.
				if !t(utf8.RuneError) {
					if nDst+3 > len(dst) {
						return nDst, nSrc, transform.ErrShortDst
					}
					nDst += copy(dst[nDst:], runeErrorString)
				}
				nSrc++
				continue
			}
		}
		if t(r) {
			// Drop the rune: consume it without producing output.
			nSrc += size
			continue
		}
		if nDst+size > len(dst) {
			return nDst, nSrc, transform.ErrShortDst
		}
		nDst += copy(dst[nDst:], src[nSrc:nSrc+size])
		nSrc += size
	}
	return nDst, nSrc, nil
}
// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
// The returned Transformer wraps mapping converted to the mapper type.
func Map(mapping func(rune) rune) Transformer {
	return Transformer{mapper(mapping)}
}
// mapper is the transformer behind Map. The function returns the replacement
// for a rune.
type mapper func(rune) rune

// Reset does nothing.
func (mapper) Reset() {}
// Span implements transform.Spanner. It returns the length of the longest
// prefix of src that consists of well-formed runes that t maps to themselves.
// It stops with transform.ErrEndOfSpan at the first rune t would change or at
// the first ill-formed byte, and with transform.ErrShortSrc if src ends in an
// incomplete rune and atEOF is false.
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
	for n < len(src) {
		r, size := rune(src[n]), 1
		if r >= utf8.RuneSelf {
			if r, size = utf8.DecodeRune(src[n:]); size == 1 {
				// Invalid rune.
				if !atEOF && !utf8.FullRune(src[n:]) {
					return n, transform.ErrShortSrc
				}
				return n, transform.ErrEndOfSpan
			}
		}
		if t(r) != r {
			return n, transform.ErrEndOfSpan
		}
		n += size
	}
	return n, nil
}
// Transform implements transform.Transformer. It writes to dst the result of
// applying t to every rune of src. Each ill-formed byte of src is passed to t
// as utf8.RuneError and consumes a single source byte. It returns
// transform.ErrShortDst when dst has no room for the next output rune and
// transform.ErrShortSrc when src ends in an incomplete rune and atEOF is
// false.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	var replacement rune
	var b [utf8.UTFMax]byte // scratch space for encoding replacement

	for r, size := rune(0), 0; nSrc < len(src); {
		if r = rune(src[nSrc]); r < utf8.RuneSelf {
			// Single-byte input. If the replacement also fits in one byte it
			// is stored directly.
			// NOTE(review): a negative replacement also satisfies this test
			// and is truncated by the byte conversion below.
			if replacement = t(r); replacement < utf8.RuneSelf {
				if nDst == len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst] = byte(replacement)
				nDst++
				nSrc++
				continue
			}
			// Otherwise fall through to the generic encoding step below.
			size = 1
		} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
			// Invalid rune.
			if !atEOF && !utf8.FullRune(src[nSrc:]) {
				err = transform.ErrShortSrc
				break
			}

			// The mapping sees RuneError. If it leaves RuneError unchanged,
			// its three-byte encoding is written out here; any other
			// replacement falls through to the generic encoding step below
			// with size == 1.
			if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				dst[nDst+0] = runeErrorString[0]
				dst[nDst+1] = runeErrorString[1]
				dst[nDst+2] = runeErrorString[2]
				nDst += 3
				nSrc++
				continue
			}
		} else if replacement = t(r); replacement == r {
			// Well-formed multi-byte rune that maps to itself: copy the
			// source bytes as is.
			if nDst+size > len(dst) {
				err = transform.ErrShortDst
				break
			}
			for i := 0; i < size; i++ {
				dst[nDst] = src[nSrc]
				nDst++
				nSrc++
			}
			continue
		}

		// Generic step: encode the replacement and consume size source bytes.
		n := utf8.EncodeRune(b[:], replacement)

		if nDst+n > len(dst) {
			err = transform.ErrShortDst
			break
		}
		for i := 0; i < n; i++ {
			dst[nDst] = b[i]
			nDst++
		}
		nSrc += size
	}
	return
}
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
// Each call returns a Transformer wrapping a newly allocated replaceIllFormed.
func ReplaceIllFormed() Transformer {
	return Transformer{&replaceIllFormed{}}
}
// replaceIllFormed is the transformer behind ReplaceIllFormed.
type replaceIllFormed struct{ transform.NopResetter }

// Span returns the length of the longest well-formed UTF-8 prefix of src. It
// stops with transform.ErrEndOfSpan at the first ill-formed byte, and with
// transform.ErrShortSrc if src ends in an incomplete rune and atEOF is false.
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
	for n < len(src) {
		// ASCII fast path.
		if src[n] < utf8.RuneSelf {
			n++
			continue
		}

		r, size := utf8.DecodeRune(src[n:])
		switch {
		case r != utf8.RuneError || size != 1:
			// A valid non-ASCII rune.
			n += size
		case !atEOF && !utf8.FullRune(src[n:]):
			// Short source data.
			return n, transform.ErrShortSrc
		default:
			// We have an invalid rune.
			return n, transform.ErrEndOfSpan
		}
	}
	return n, nil
}
// Transform copies src to dst, writing the three-byte encoding of
// utf8.RuneError in place of every byte that is not part of a well-formed
// UTF-8 sequence. It returns transform.ErrShortDst when dst has no room for
// the next rune and transform.ErrShortSrc when src ends in an incomplete rune
// and atEOF is false.
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	for nSrc < len(src) {
		c := src[nSrc]

		// ASCII fast path.
		if c < utf8.RuneSelf {
			if nDst == len(dst) {
				return nDst, nSrc, transform.ErrShortDst
			}
			dst[nDst] = c
			nDst++
			nSrc++
			continue
		}

		_, size := utf8.DecodeRune(src[nSrc:])
		switch {
		case size != 1:
			// A valid non-ASCII rune: copy it verbatim.
			if copy(dst[nDst:], src[nSrc:nSrc+size]) != size {
				return nDst, nSrc, transform.ErrShortDst
			}
			nDst += size
			nSrc += size

		case !atEOF && !utf8.FullRune(src[nSrc:]):
			// Short source data.
			return nDst, nSrc, transform.ErrShortSrc

		case nDst+3 > len(dst):
			// No room for the replacement of the invalid byte.
			return nDst, nSrc, transform.ErrShortDst

		default:
			// We have an invalid rune.
			nDst += copy(dst[nDst:], runeErrorString)
			nSrc++
		}
	}
	return nDst, nSrc, nil
}

View File

@ -78,8 +78,8 @@ type SpanningTransformer interface {
// considering the error err.
//
// A nil error means that all input bytes are known to be identical to the
// output produced by the Transformer. A nil error can be be returned
// regardless of whether atEOF is true. If err is nil, then then n must
// output produced by the Transformer. A nil error can be returned
// regardless of whether atEOF is true. If err is nil, then n must
// equal len(src); the converse is not necessarily true.
//
// ErrEndOfSpan means that the Transformer output may differ from the
@ -493,7 +493,7 @@ func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err erro
return dstL.n, srcL.p, err
}
// Deprecated: use runes.Remove instead.
// Deprecated: Use runes.Remove instead.
func RemoveFunc(f func(r rune) bool) Transformer {
return removeF(f)
}

View File

@ -6,7 +6,7 @@
// Package bidi contains functionality for bidirectional text support.
//
// See http://www.unicode.org/reports/tr9.
// See https://www.unicode.org/reports/tr9.
//
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
// and without notice.

View File

@ -12,7 +12,7 @@ import (
// This file contains a port of the reference implementation of the
// Bidi Parentheses Algorithm:
// http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
//
// The implementation in this file covers definitions BD14-BD16 and rule N0
// of UAX#9.
@ -246,7 +246,7 @@ func (p *bracketPairer) getStrongTypeN0(index int) Class {
// assuming the given embedding direction.
//
// It returns ON if no strong type is found. If a single strong type is found,
// it returns this this type. Otherwise it returns the embedding direction.
// it returns this type. Otherwise it returns the embedding direction.
//
// TODO: use separate type for "strong" directionality.
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class {

View File

@ -7,7 +7,7 @@ package bidi
import "log"
// This implementation is a port based on the reference implementation found at:
// http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/
//
// described in Unicode Bidirectional Algorithm (UAX #9).
//

View File

@ -26,7 +26,7 @@ func main() {
}
// bidiClass names and codes taken from class "bc" in
// http://www.unicode.org/Public/8.0.0/ucd/PropertyValueAliases.txt
// https://www.unicode.org/Public/8.0.0/ucd/PropertyValueAliases.txt
var bidiClass = map[string]Class{
"AL": AL, // ArabicLetter
"AN": AN, // ArabicNumber

View File

@ -15,7 +15,7 @@ import (
)
// These tables are hand-extracted from:
// http://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt
// https://www.unicode.org/Public/8.0.0/ucd/extracted/DerivedBidiClass.txt
func visitDefaults(fn func(r rune, c Class)) {
// first write default values for ranges listed above.
visitRunes(fn, AL, []rune{

View File

@ -1,6 +1,6 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
// +build go1.10
// +build go1.10,!go1.13
package bidi

1887
vendor/golang.org/x/text/unicode/bidi/tables11.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -5,14 +5,15 @@
//go:generate go run makexml.go -output xml.go
// Package cldr provides a parser for LDML and related XML formats.
// This package is intended to be used by the table generation tools
// for the various internationalization-related packages.
// As the XML types are generated from the CLDR DTD, and as the CLDR standard
// is periodically amended, this package may change considerably over time.
// This mostly means that data may appear and disappear between versions.
// That is, old code should keep compiling for newer versions, but data
// may have moved or changed.
// CLDR version 22 is the first version supported by this package.
//
// This package is intended to be used by the table generation tools for the
// various packages in x/text and is not internal for historical reasons.
//
// As the XML types are generated from the CLDR DTD, and as the CLDR standard is
// periodically amended, this package may change considerably over time. This
// mostly means that data may appear and disappear between versions. That is,
// old code should keep compiling for newer versions, but data may have moved or
// changed. CLDR version 22 is the first version supported by this package.
// Older versions may not work.
package cldr // import "golang.org/x/text/unicode/cldr"
@ -94,6 +95,12 @@ func (cldr *CLDR) RawLDML(loc string) *LDML {
// LDML returns the fully resolved LDML XML for loc, which must be one of
// the strings returned by Locales.
//
// Deprecated: Use RawLDML and implement inheritance manually or using the
// internal cldrtree package.
// Inheritance has changed quite a bit since the onset of this package and in
// practice data is often represented in a way where knowledge of how it was
// inherited is relevant.
func (cldr *CLDR) LDML(loc string) (*LDML, error) {
	return cldr.resolve(loc)
}

View File

@ -27,7 +27,7 @@ const (
// cldrIndex is a Unicode-reserved sentinel value used to mark the start
// of a grouping within an index.
// We ignore any rule that starts with this rune.
// See http://unicode.org/reports/tr35/#Collation_Elements for details.
// See https://unicode.org/reports/tr35/#Collation_Elements for details.
cldrIndex = "\uFDD0"
// specialAnchor is the format in which to represent logical reset positions,
@ -51,7 +51,7 @@ func (c Collation) Process(p RuleProcessor) (err error) {
}
// processRules parses rules in the Collation Rule Syntax defined in
// http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
// https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
func processRules(p RuleProcessor, s string) (err error) {
chk := func(s string, e error) string {
if err == nil {

View File

@ -58,9 +58,10 @@ func (d *Decoder) Decode(l Loader) (cldr *CLDR, err error) {
if len(d.dirFilter) > 0 && !in(d.dirFilter, m[1]) {
continue
}
var r io.Reader
var r io.ReadCloser
if r, err = l.Reader(i); err == nil {
err = d.decode(m[1], m[2], r)
r.Close()
}
if err != nil {
return nil, err
@ -100,7 +101,7 @@ func (d *Decoder) decode(dir, id string, r io.Reader) error {
if l.Identity == nil {
return fmt.Errorf("%s/%s: missing identity element", dir, id)
}
// TODO: verify when CLDR bug http://unicode.org/cldr/trac/ticket/8970
// TODO: verify when CLDR bug https://unicode.org/cldr/trac/ticket/8970
// is resolved.
// path := strings.Split(id, "_")
// if lang := l.Identity.Language.Type; lang != path[0] {

View File

@ -153,7 +153,7 @@ var comments = map[string]string{
// Dates contains information regarding the format and parsing of dates and times.
`,
"localeDisplayNames": `
// LocaleDisplayNames specifies localized display names for for scripts, languages,
// LocaleDisplayNames specifies localized display names for scripts, languages,
// countries, currencies, and variants.
`,
"numbers": `

View File

@ -5,7 +5,7 @@
package cldr
// This file implements the various inheritance constructs defined by LDML.
// See http://www.unicode.org/reports/tr35/#Inheritance_and_Validity
// See https://www.unicode.org/reports/tr35/#Inheritance_and_Validity
// for more details.
import (
@ -309,7 +309,7 @@ func in(set []string, s string) bool {
}
// attrKey computes a key based on the distinguishable attributes of
// an element and it's values.
// an element and its values.
func attrKey(v reflect.Value, exclude ...string) string {
parts := []string{}
ename := v.Interface().(Elem).GetCommon().name

View File

@ -1237,7 +1237,7 @@ type TimeZoneNames struct {
} `xml:"metazone"`
}
// LocaleDisplayNames specifies localized display names for for scripts, languages,
// LocaleDisplayNames specifies localized display names for scripts, languages,
// countries, currencies, and variants.
type LocaleDisplayNames struct {
Common

View File

@ -407,7 +407,7 @@ func decomposeHangul(buf []byte, r rune) int {
// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components.
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r rune) {
r -= hangulBase
x := r % jamoTCount
@ -420,7 +420,7 @@ func (rb *reorderBuffer) decomposeHangul(r rune) {
}
// combineHangul algorithmically combines Jamo character components into Hangul.
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
func (rb *reorderBuffer) combineHangul(s, i, k int) {
b := rb.rune[:]
bn := rb.nrune
@ -461,6 +461,10 @@ func (rb *reorderBuffer) combineHangul(s, i, k int) {
// It should only be used to recompose a single segment, as it will not
// handle alternations between Hangul and non-Hangul characters correctly.
func (rb *reorderBuffer) compose() {
// Lazily load the map used by the combine func below, but do
// it outside of the loop.
recompMapOnce.Do(buildRecompMap)
// UAX #15, section X5 , including Corrigendum #5
// "In any character sequence beginning with starter S, a character C is
// blocked from S if and only if there is some character B between S

View File

@ -4,6 +4,8 @@
package norm
import "encoding/binary"
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
@ -178,6 +180,17 @@ func (p Properties) TrailCCC() uint8 {
return ccc[p.tccc]
}
// buildRecompMap fills recompMap from recompMapPacked. The packed data is
// read as a sequence of 8-byte entries, each holding a big-endian 32-bit key
// followed by a big-endian 32-bit value that is stored as a rune.
func buildRecompMap() {
	const entrySize = 8 // 4 bytes of key followed by 4 bytes of value

	recompMap = make(map[uint32]rune, len(recompMapPacked)/entrySize)

	var entry [entrySize]byte
	for off := 0; off < len(recompMapPacked); off += entrySize {
		copy(entry[:], recompMapPacked[off:off+entrySize])
		k := binary.BigEndian.Uint32(entry[:4])
		v := binary.BigEndian.Uint32(entry[4:])
		recompMap[k] = rune(v)
	}
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
@ -186,8 +199,14 @@ func (p Properties) TrailCCC() uint8 {
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist. The lookup key
// is formed from the low 16 bits of a (upper half) and the low 16 bits of b
// (lower half).
//
// The caller is responsible for calling
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
func combine(a, b rune) rune {
	if recompMap == nil {
		panic("caller error") // see func comment
	}
	hi, lo := uint32(uint16(a)), uint32(uint16(b))
	return recompMap[hi<<16+lo]
}

View File

@ -128,8 +128,9 @@ func (i *Iter) Next() []byte {
func nextASCIIBytes(i *Iter) []byte {
p := i.p + 1
if p >= i.rb.nsrc {
p0 := i.p
i.setDone()
return i.rb.src.bytes[i.p:p]
return i.rb.src.bytes[p0:p]
}
if i.rb.src.bytes[p] < utf8.RuneSelf {
p0 := i.p

View File

@ -12,6 +12,7 @@ package main
import (
"bytes"
"encoding/binary"
"flag"
"fmt"
"io"
@ -261,7 +262,7 @@ func compactCCC() {
// CompositionExclusions.txt has form:
// 0958 # ...
// See http://unicode.org/reports/tr44/ for full explanation
// See https://unicode.org/reports/tr44/ for full explanation
func loadCompositionExclusions() {
f := gen.OpenUCDFile("CompositionExclusions.txt")
defer f.Close()
@ -735,6 +736,8 @@ func makeTables() {
max = n
}
}
fmt.Fprintln(w, `import "sync"`)
fmt.Fprintln(w)
fmt.Fprintln(w, "const (")
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
@ -782,16 +785,23 @@ func makeTables() {
sz := nrentries * 8
size += sz
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
fmt.Fprintln(w, "var recompMap = map[uint32]rune{")
fmt.Fprintln(w, "var recompMap map[uint32]rune")
fmt.Fprintln(w, "var recompMapOnce sync.Once\n")
fmt.Fprintln(w, `const recompMapPacked = "" +`)
var buf [8]byte
for i, c := range chars {
f := c.forms[FCanonical]
d := f.decomp
if !f.isOneWay && len(d) > 0 {
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i)
binary.BigEndian.PutUint32(buf[:4], key)
binary.BigEndian.PutUint32(buf[4:], uint32(i))
fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i))
}
}
fmt.Fprintf(w, "}\n\n")
// hack so we don't have to special case the trailing plus sign
fmt.Fprintf(w, ` ""`)
fmt.Fprintln(w)
}
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
@ -857,7 +867,7 @@ func verifyComputed() {
// DerivedNormalizationProps.txt has form:
// 00C0..00C5 ; NFD_QC; N # ...
// 0374 ; NFD_QC; N # ...
// See http://unicode.org/reports/tr44/ for full explanation
// See https://unicode.org/reports/tr44/ for full explanation
func testDerived() {
f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
defer f.Close()

View File

@ -29,8 +29,8 @@ import (
// proceed independently on both sides:
// f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: http://unicode.org/reports/tr15/ and
// http://unicode.org/notes/tn5/.
// References: https://unicode.org/reports/tr15/ and
// https://unicode.org/notes/tn5/.
type Form int
const (

View File

@ -60,8 +60,8 @@ func (w *normWriter) Close() error {
}
// Writer returns a new writer that implements Write(b)
// by writing f(b) to w. The returned writer may use an
// an internal buffer to maintain state across Write calls.
// by writing f(b) to w. The returned writer may use an
// internal buffer to maintain state across Write calls.
// Calling its Close method writes any buffered data to w.
func (f Form) Writer(w io.Writer) io.WriteCloser {
wr := &normWriter{rb: reorderBuffer{}, w: w}

File diff suppressed because it is too large Load Diff

7693
vendor/golang.org/x/text/unicode/norm/tables11.0.0.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -18,7 +18,6 @@ func (Form) Reset() {}
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
// least of size MaxTransformChunkSize to be guaranteed of progress.
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := 0
// Cap the maximum number of src bytes to check.
b := src
eof := atEOF
@ -27,13 +26,14 @@ func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
eof = false
b = b[:ns]
}
i, ok := formTable[f].quickSpan(inputBytes(b), n, len(b), eof)
n += copy(dst[n:], b[n:i])
i, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), eof)
n := copy(dst, b[:i])
if !ok {
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF)
return nDst + n, nSrc + n, err
}
if n < len(src) && !atEOF {
if err == nil && n < len(src) && !atEOF {
err = transform.ErrShortSrc
}
return n, n, err
@ -79,7 +79,7 @@ func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
nSrc += n
nDst += n
if ok {
if n < rb.nsrc && !atEOF {
if err == nil && n < rb.nsrc && !atEOF {
err = transform.ErrShortSrc
}
return nDst, nSrc, err

Some files were not shown because too many files have changed in this diff Show More