rebase: ParseAcceptLanguage takes a long time to parse complex tags

A vulnerability was found in the golang.org/x/text/language package that
could cause a denial of service. An attacker can craft an
Accept-Language header that ParseAcceptLanguage takes a significant
amount of time to parse.
Version v0.3.8 of golang.org/x/text fixes this vulnerability.

See-also: https://go.dev/issue/56152
See-also: https://bugzilla.redhat.com/CVE-2022-32149
Signed-off-by: Niels de Vos <ndevos@redhat.com>
(cherry picked from commit e08005f402)
This commit is contained in:
Niels de Vos 2022-10-17 08:49:59 +02:00 committed by mergify[bot]
parent f9adcde538
commit 763aa3df03
27 changed files with 667 additions and 583 deletions

2
go.mod
View File

@ -150,7 +150,7 @@ require (
go.uber.org/zap v1.21.0 // indirect
golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8 // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/text v0.3.8 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
google.golang.org/appengine v1.6.7 // indirect

3
go.sum
View File

@ -1492,8 +1492,9 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY=
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=

3
vendor/golang.org/x/text/AUTHORS generated vendored
View File

@ -1,3 +0,0 @@
# This source code refers to The Go Authors for copyright purposes.
# The master list of authors is in the main Go distribution,
# visible at http://tip.golang.org/AUTHORS.

View File

@ -1,3 +0,0 @@
# This source code was written by the Go contributors.
# The master list of contributors is in the main Go distribution,
# visible at http://tip.golang.org/CONTRIBUTORS.

View File

@ -128,7 +128,8 @@ const (
// The entry is pointed to by the exception index in an entry. It has the
// following format:
//
// Header
// Header:
//
// byte 0:
// 7..6 unused
// 5..4 CCC type (same bits as entry)
@ -149,6 +150,7 @@ const (
// A length of 0 indicates a mapping to zero-length string.
//
// Body bytes:
//
// case folding bytes
// lowercase mapping bytes
// uppercase mapping bytes
@ -156,6 +158,7 @@ const (
// closure mapping bytes (for NFKC_Casefold). (TODO)
//
// Fallbacks:
//
// missing fold -> lower
// missing title -> upper
// all missing -> original rune

View File

@ -93,8 +93,11 @@ var canonical = [numEncodings]string{
var nameMap = map[string]htmlEncoding{
"unicode-1-1-utf-8": utf8,
"unicode11utf8": utf8,
"unicode20utf8": utf8,
"utf-8": utf8,
"utf8": utf8,
"x-unicode20utf8": utf8,
"866": ibm866,
"cp866": ibm866,
"csibm866": ibm866,
@ -307,7 +310,13 @@ var nameMap = map[string]htmlEncoding{
"iso-2022-cn-ext": replacement,
"iso-2022-kr": replacement,
"replacement": replacement,
"unicodefffe": utf16be,
"utf-16be": utf16be,
"csunicode": utf16le,
"iso-10646-ucs-2": utf16le,
"ucs-2": utf16le,
"unicode": utf16le,
"unicodefeff": utf16le,
"utf-16": utf16le,
"utf-16le": utf16le,
"x-user-defined": xUserDefined,

View File

@ -905,6 +905,14 @@ const (
// https://www.unicode.org/notes/tn6/
BOCU1 MIB = 1020
// UTF7IMAP is the MIB identifier with IANA name UTF-7-IMAP.
//
// Note: This charset is used to encode Unicode in IMAP mailbox names;
// see section 5.1.3 of rfc3501 . It should never be used
// outside this context. A name has been assigned so that charset processing
// implementations can refer to it in a consistent way.
UTF7IMAP MIB = 1021
// Windows30Latin1 is the MIB identifier with IANA name ISO-8859-1-Windows-3.0-Latin-1.
//
// Extended ISO 8859-1 Latin-1 for Windows 3.0.

View File

@ -55,6 +55,8 @@ loop:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
// GBKs decoder is gb18030s decoder. https://encoding.spec.whatwg.org/#gbk-decoder
// If byte is 0x80, return code point U+20AC. https://encoding.spec.whatwg.org/#gb18030-decoder
case c0 == 0x80:
r, size = '€', 1
@ -180,7 +182,9 @@ func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
if r == '€' {
// GBKs encoder is gb18030s encoder with its _is GBK_ set to true. https://encoding.spec.whatwg.org/#gbk-encoder
// If _is GBK_ is true and code point is U+20AC, return byte 0x80. https://encoding.spec.whatwg.org/#gb18030-encoder
if !e.gb18030 && r == '€' {
r = 0x80
goto write1
}

View File

@ -966,7 +966,7 @@ var coreTags = []language.CompactCoreInfo{ // 773 elements
0x3fd00000, 0x3fd00072, 0x3fd000da, 0x3fd0010c,
0x3ff00000, 0x3ff000d1, 0x40100000, 0x401000c3,
0x40200000, 0x4020004c, 0x40700000, 0x40800000,
0x4085a000, 0x4085a0ba, 0x408e3000, 0x408e30ba,
0x4085a000, 0x4085a0ba, 0x408e8000, 0x408e80ba,
0x40c00000, 0x40c000b3, 0x41200000, 0x41200111,
0x41600000, 0x4160010f, 0x41c00000, 0x41d00000,
// Entry 280 - 29F
@ -994,7 +994,7 @@ var coreTags = []language.CompactCoreInfo{ // 773 elements
0x4ae00130, 0x4b400000, 0x4b400099, 0x4b4000e8,
0x4bc00000, 0x4bc05000, 0x4bc05024, 0x4bc20000,
0x4bc20137, 0x4bc5a000, 0x4bc5a137, 0x4be00000,
0x4be5a000, 0x4be5a0b4, 0x4beeb000, 0x4beeb0b4,
0x4be5a000, 0x4be5a0b4, 0x4bef1000, 0x4bef10b4,
0x4c000000, 0x4c300000, 0x4c30013e, 0x4c900000,
// Entry 2E0 - 2FF
0x4c900001, 0x4cc00000, 0x4cc0012f, 0x4ce00000,
@ -1012,4 +1012,4 @@ var coreTags = []language.CompactCoreInfo{ // 773 elements
const specialTagsStr string = "ca-ES-valencia en-US-u-va-posix"
// Total table size 3147 bytes (3KiB); checksum: BE816D44
// Total table size 3147 bytes (3KiB); checksum: 6772C83C

View File

@ -328,7 +328,7 @@ func (r Region) IsPrivateUse() bool {
return r.typ()&iso3166UserAssigned != 0
}
type Script uint8
type Script uint16
// getScriptID returns the script id for string s. It assumes that s
// is of the format [A-Z][a-z]{3}.

View File

@ -270,7 +270,7 @@ func parse(scan *scanner, s string) (t Tag, err error) {
} else if n >= 4 {
return Und, ErrSyntax
} else { // the usual case
t, end = parseTag(scan)
t, end = parseTag(scan, true)
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
@ -296,7 +296,8 @@ func parse(scan *scanner, s string) (t Tag, err error) {
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
t.LangID, e = getLangID(scan.token)
@ -307,14 +308,17 @@ func parseTag(scan *scanner) (t Tag, end int) {
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
if doNorm {
lang, e := getLangID(scan.token)
if lang != 0 {
t.LangID = lang
copy(scan.b[langStart:], lang.String())
scan.b[langStart+3] = '-'
scan.start = langStart + 4
langStr := lang.String()
copy(scan.b[langStart:], langStr)
scan.b[langStart+len(langStr)] = '-'
scan.start = langStart + len(langStr) + 1
}
scan.gobble(e)
}
end = scan.scan()
}
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
@ -559,7 +563,7 @@ func parseExtension(scan *scanner) int {
case 't': // https://www.ietf.org/rfc/rfc6497.txt
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
_, end = parseTag(scan, false)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {

File diff suppressed because it is too large Load Diff

View File

@ -10,8 +10,7 @@
// and provides the user with the best experience
// (see https://blog.golang.org/matchlang).
//
//
// Matching preferred against supported languages
// # Matching preferred against supported languages
//
// A Matcher for an application that supports English, Australian English,
// Danish, and standard Mandarin can be created as follows:
@ -48,8 +47,7 @@
// For instance, it will know that a reader of Bokmål Danish can read Norwegian
// and will know that Cantonese ("yue") is a good match for "zh-HK".
//
//
// Using match results
// # Using match results
//
// To guarantee a consistent user experience to the user it is important to
// use the same language tag for the selection of any locale-specific services.
@ -70,8 +68,7 @@
// Match and MatchString both return the index of the matched supported tag
// to simplify associating such data with the matched tag.
//
//
// Canonicalization
// # Canonicalization
//
// If one uses the Matcher to compare languages one does not need to
// worry about canonicalization.
@ -92,10 +89,9 @@
// equivalence relations. The CanonType type can be used to alter the
// canonicalization form.
//
// References
// # References
//
// BCP 47 - Tags for Identifying Languages http://tools.ietf.org/html/bcp47
//
package language // import "golang.org/x/text/language"
// TODO: explanation on how to match languages for your own locale-specific

View File

@ -545,7 +545,7 @@ type bestMatch struct {
// match as the preferred match.
//
// If pin is true and have and tag are a strong match, it will henceforth only
// consider matches for this language. This corresponds to the nothing that most
// consider matches for this language. This corresponds to the idea that most
// users have a strong preference for the first defined language. A user can
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should

View File

@ -147,6 +147,7 @@ func update(b *language.Builder, part ...interface{}) (err error) {
}
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
var errTagListTooLarge = errors.New("tag list exceeds max length")
// ParseAcceptLanguage parses the contents of an Accept-Language header as
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
@ -164,6 +165,10 @@ func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
}
}()
if strings.Count(s, "-") > 1000 {
return nil, nil, errTagListTooLarge
}
var entry string
for s != "" {
if entry, s = split(s, ','); entry == "" {

View File

@ -39,12 +39,12 @@ const (
_Hani = 57
_Hans = 59
_Hant = 60
_Qaaa = 143
_Qaai = 151
_Qabx = 192
_Zinh = 245
_Zyyy = 250
_Zzzz = 251
_Qaaa = 147
_Qaai = 155
_Qabx = 196
_Zinh = 252
_Zyyy = 257
_Zzzz = 258
)
var regionToGroups = []uint8{ // 358 elements
@ -265,9 +265,9 @@ var matchScript = []scriptIntelligibility{ // 26 elements
13: {wantLang: 0x39d, haveLang: 0x139, wantScript: 0x36, haveScript: 0x5a, distance: 0xa},
14: {wantLang: 0x3be, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5a, distance: 0xa},
15: {wantLang: 0x3fa, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5a, distance: 0xa},
16: {wantLang: 0x40c, haveLang: 0x139, wantScript: 0xcf, haveScript: 0x5a, distance: 0xa},
17: {wantLang: 0x450, haveLang: 0x139, wantScript: 0xde, haveScript: 0x5a, distance: 0xa},
18: {wantLang: 0x461, haveLang: 0x139, wantScript: 0xe1, haveScript: 0x5a, distance: 0xa},
16: {wantLang: 0x40c, haveLang: 0x139, wantScript: 0xd4, haveScript: 0x5a, distance: 0xa},
17: {wantLang: 0x450, haveLang: 0x139, wantScript: 0xe3, haveScript: 0x5a, distance: 0xa},
18: {wantLang: 0x461, haveLang: 0x139, wantScript: 0xe6, haveScript: 0x5a, distance: 0xa},
19: {wantLang: 0x46f, haveLang: 0x139, wantScript: 0x2c, haveScript: 0x5a, distance: 0xa},
20: {wantLang: 0x476, haveLang: 0x3e2, wantScript: 0x5a, haveScript: 0x20, distance: 0xa},
21: {wantLang: 0x4b4, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5a, distance: 0xa},

View File

@ -495,9 +495,9 @@ func (s *isolatingRunSequence) resolveWeakTypes() {
if t == NSM {
s.types[i] = precedingCharacterType
} else {
if t.in(LRI, RLI, FSI, PDI) {
precedingCharacterType = ON
}
// if t.in(LRI, RLI, FSI, PDI) {
// precedingCharacterType = ON
// }
precedingCharacterType = t
}
}

View File

@ -110,6 +110,7 @@ func (p Properties) BoundaryAfter() bool {
}
// We pack quick check data in 4 bits:
//
// 5: Combines forward (0 == false, 1 == true)
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.

View File

@ -27,6 +27,7 @@ import (
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//
// f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: https://unicode.org/reports/tr15/ and

View File

@ -7315,7 +7315,7 @@ const recompMapPacked = "" +
"\x00V\x03\x03\x00\x00\x1e|" + // 0x00560303: 0x00001E7C
"\x00v\x03\x03\x00\x00\x1e}" + // 0x00760303: 0x00001E7D
"\x00V\x03#\x00\x00\x1e~" + // 0x00560323: 0x00001E7E
"\x00v\x03#\x00\x00\x1e\u007f" + // 0x00760323: 0x00001E7F
"\x00v\x03#\x00\x00\x1e\x7f" + // 0x00760323: 0x00001E7F
"\x00W\x03\x00\x00\x00\x1e\x80" + // 0x00570300: 0x00001E80
"\x00w\x03\x00\x00\x00\x1e\x81" + // 0x00770300: 0x00001E81
"\x00W\x03\x01\x00\x00\x1e\x82" + // 0x00570301: 0x00001E82
@ -7342,7 +7342,7 @@ const recompMapPacked = "" +
"\x00t\x03\b\x00\x00\x1e\x97" + // 0x00740308: 0x00001E97
"\x00w\x03\n\x00\x00\x1e\x98" + // 0x0077030A: 0x00001E98
"\x00y\x03\n\x00\x00\x1e\x99" + // 0x0079030A: 0x00001E99
"\x01\u007f\x03\a\x00\x00\x1e\x9b" + // 0x017F0307: 0x00001E9B
"\x01\x7f\x03\a\x00\x00\x1e\x9b" + // 0x017F0307: 0x00001E9B
"\x00A\x03#\x00\x00\x1e\xa0" + // 0x00410323: 0x00001EA0
"\x00a\x03#\x00\x00\x1e\xa1" + // 0x00610323: 0x00001EA1
"\x00A\x03\t\x00\x00\x1e\xa2" + // 0x00410309: 0x00001EA2

View File

@ -1146,21 +1146,31 @@ var widthIndex = [1408]uint8{
}
// inverseData contains 4-byte entries of the following format:
//
// <length> <modified UTF-8-encoded rune> <0 padding>
//
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//
// -> A (U+FF21 -> U+0041)
// -> B (U+FF22 -> U+0042)
// ...
//
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//
// { 0x01, 0xE0, 0x00, 0x00 }
//
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//
// E0 ^ A1 = 41.
//
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//
// E0 ^ A2 = 42.
//
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.
var inverseData = [150][4]byte{

View File

@ -1158,21 +1158,31 @@ var widthIndex = [1408]uint8{
}
// inverseData contains 4-byte entries of the following format:
//
// <length> <modified UTF-8-encoded rune> <0 padding>
//
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//
// -> A (U+FF21 -> U+0041)
// -> B (U+FF22 -> U+0042)
// ...
//
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//
// { 0x01, 0xE0, 0x00, 0x00 }
//
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//
// E0 ^ A1 = 41.
//
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//
// E0 ^ A2 = 42.
//
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.
var inverseData = [150][4]byte{

View File

@ -1178,21 +1178,31 @@ var widthIndex = [1408]uint8{
}
// inverseData contains 4-byte entries of the following format:
//
// <length> <modified UTF-8-encoded rune> <0 padding>
//
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//
// -> A (U+FF21 -> U+0041)
// -> B (U+FF22 -> U+0042)
// ...
//
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//
// { 0x01, 0xE0, 0x00, 0x00 }
//
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//
// E0 ^ A1 = 41.
//
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//
// E0 ^ A2 = 42.
//
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.
var inverseData = [150][4]byte{

View File

@ -1179,21 +1179,31 @@ var widthIndex = [1408]uint8{
}
// inverseData contains 4-byte entries of the following format:
//
// <length> <modified UTF-8-encoded rune> <0 padding>
//
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//
// -> A (U+FF21 -> U+0041)
// -> B (U+FF22 -> U+0042)
// ...
//
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//
// { 0x01, 0xE0, 0x00, 0x00 }
//
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//
// E0 ^ A1 = 41.
//
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//
// E0 ^ A2 = 42.
//
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.
var inverseData = [150][4]byte{

View File

@ -1114,21 +1114,31 @@ var widthIndex = [1408]uint8{
}
// inverseData contains 4-byte entries of the following format:
//
// <length> <modified UTF-8-encoded rune> <0 padding>
//
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//
// -> A (U+FF21 -> U+0041)
// -> B (U+FF22 -> U+0042)
// ...
//
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//
// { 0x01, 0xE0, 0x00, 0x00 }
//
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//
// E0 ^ A1 = 41.
//
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//
// E0 ^ A2 = 42.
//
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.
var inverseData = [150][4]byte{

2
vendor/modules.txt vendored
View File

@ -651,7 +651,7 @@ golang.org/x/sys/windows
# golang.org/x/term v0.0.0-20210927222741-03fcf44c2211
## explicit; go 1.17
golang.org/x/term
# golang.org/x/text v0.3.7
# golang.org/x/text v0.3.8
## explicit; go 1.17
golang.org/x/text/cases
golang.org/x/text/encoding