This commit is contained in:
Mikaël Cluseau
2018-07-06 19:13:18 +11:00
parent d76b49b3c5
commit 21d3f45969
442 changed files with 75578 additions and 0 deletions

17590
vendor/github.com/ulikunitz/xz/internal/randtxt/englm3.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,82 @@
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package randtxt
import (
"bufio"
"io"
"unicode"
)
// GroupReader groups the incoming text in groups of 5, whereby the
// number of groups per line can be controlled.
type GroupReader struct {
R io.ByteReader
GroupsPerLine int
off int64
eof bool
}
// NewGroupReader creates a new group reader.
func NewGroupReader(r io.Reader) *GroupReader {
return &GroupReader{R: bufio.NewReader(r)}
}
// Read formats the data provided by the internal reader in groups of 5
// characters. If GroupsPerLine hasn't been initialized 8 groups per
// line will be produced.
func (r *GroupReader) Read(p []byte) (n int, err error) {
if r.eof {
return 0, io.EOF
}
groupsPerLine := r.GroupsPerLine
if groupsPerLine < 1 {
groupsPerLine = 8
}
lineLen := int64(groupsPerLine * 6)
var c byte
for i := range p {
switch {
case r.off%lineLen == lineLen-1:
if i+1 == len(p) && len(p) > 1 {
return i, nil
}
c = '\n'
case r.off%6 == 5:
if i+1 == len(p) && len(p) > 1 {
return i, nil
}
c = ' '
default:
c, err = r.R.ReadByte()
if err == io.EOF {
r.eof = true
if i > 0 {
switch p[i-1] {
case ' ':
p[i-1] = '\n'
fallthrough
case '\n':
return i, io.EOF
}
}
p[i] = '\n'
return i + 1, io.EOF
}
if err != nil {
return i, err
}
switch {
case c == ' ':
c = '_'
case !unicode.IsPrint(rune(c)):
c = '-'
}
}
p[i] = c
r.off++
}
return len(p), nil
}

View File

@ -0,0 +1,185 @@
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package randtxt supports the generation of random text using a
// trigram model for the English language.
package randtxt
import (
"math"
"math/rand"
"sort"
)
// ngram stores an entry from the language model.
type ngram struct {
s string
lgP float64
lgQ float64
}
// ngrams represents a slice of ngram values and is used to represent a
// language model.
type ngrams []ngram
func (s ngrams) Len() int { return len(s) }
func (s ngrams) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s ngrams) Less(i, j int) bool { return s[i].s < s[j].s }
// Sorts the language model in the sequence of their ngrams.
func (s ngrams) Sort() { sort.Sort(s) }
// Search is looking for an ngram or the position where it would be
// inserted.
func (s ngrams) Search(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// prob represents a string, usually an ngram, and a probability value.
type prob struct {
s string
p float64
}
// probs is a slice of prob values that can be sorted and searched.
type probs []prob
func (s probs) Len() int { return len(s) }
func (s probs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s probs) Less(i, j int) bool { return s[i].s < s[j].s }
// SortByNgram sorts the probs slice by ngram, field s.
func (s probs) SortByNgram() { sort.Sort(s) }
// SortsByProb sorts the probs slice by probability, field p.
func (s probs) SortByProb() { sort.Sort(byProb{s}) }
// SearchNgram searches for an ngram or the position where it would be
// inserted.
func (s probs) SearchNgram(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// SearchProb searches ngrams for a specific probability or where it
// would be inserted.
func (s probs) SearchProb(p float64) int {
return sort.Search(len(s), func(k int) bool { return s[k].p >= p })
}
// byProb is used to sort probs slice by probability, field p.
type byProb struct {
probs
}
func (s byProb) Less(i, j int) bool {
return s.probs[i].p < s.probs[j].p
}
// cdf can be used to setup a cumulative distribution function
// represented by a probs slice. We should have returned an actual
// function.
func cdf(n int, p func(i int) prob) probs {
prs := make(probs, n)
sum := 0.0
for i := range prs {
pr := p(i)
sum += pr.p
prs[i] = pr
}
q := 1.0 / sum
x := 0.0
for i, pr := range prs {
x += pr.p * q
if x > 1.0 {
x = 1.0
}
prs[i].p = x
}
if !sort.IsSorted(byProb{prs}) {
panic("cdf not sorted")
}
return prs
}
// pCDFOfLM converts a language model into a cumulative distribution
// function represented by probs.
func pCDFOfLM(lm ngrams) probs {
return cdf(len(lm), func(i int) prob {
return prob{lm[i].s, math.Exp2(lm[i].lgP)}
})
}
// cCDF converts a ngrams slice into a cumulative distribution function
// using the conditional probability lgQ.
func cCDF(s ngrams) probs {
return cdf(len(s), func(i int) prob {
return prob{s[i].s, math.Exp2(s[i].lgQ)}
})
}
// comap contains a map of conditional distribution function for the
// last character.
type comap map[string]probs
// comapOfLM converts a language model in a map of conditional
// distribution functions.
func comapOfLM(lm ngrams) comap {
if !sort.IsSorted(lm) {
panic("lm is not sorted")
}
m := make(comap, 26*26)
for i := 0; i < len(lm); {
j := i
g := lm[i].s
g2 := g[:2]
z := g2 + "Z"
i = lm.Search(z)
if i >= len(lm) || lm[i].s != z {
panic("unexpected search result")
}
i++
m[g2] = cCDF(lm[j:i])
}
return m
}
// trigram returns the trigram with prefix g2 using a probability value
// in the range [0.0,1.0).
func (c comap) trigram(g2 string, p float64) string {
prs := c[g2]
i := prs.SearchProb(p)
return prs[i].s
}
var (
// CDF for normal probabilities
pcdf = pCDFOfLM(englm3)
// map of two letter conditionals
cmap = comapOfLM(englm3)
)
// Reader generates a stream of text of uppercase letters with trigrams
// distributed according to a language model of the English language.
type Reader struct {
rnd *rand.Rand
g3 string
}
// NewReader creates a new reader. The argument src must create a uniformly
// distributed stream of random values.
func NewReader(src rand.Source) *Reader {
rnd := rand.New(src)
i := pcdf.SearchProb(rnd.Float64())
return &Reader{rnd, pcdf[i].s}
}
// Read reads random text. The Read function will always return len(p)
// bytes and will never return an error.
func (r *Reader) Read(p []byte) (n int, err error) {
for i := range p {
r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
p[i] = r.g3[2]
}
return len(p), nil
}

View File

@ -0,0 +1,37 @@
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package randtxt
import (
"bufio"
"io"
"math/rand"
"testing"
)
func TestReader(t *testing.T) {
lr := io.LimitReader(NewReader(rand.NewSource(13)), 195)
pretty := NewGroupReader(lr)
scanner := bufio.NewScanner(pretty)
for scanner.Scan() {
t.Log(scanner.Text())
}
if err := scanner.Err(); err != nil {
t.Fatalf("scanner error %s", err)
}
}
func TestComap(t *testing.T) {
prs := cmap["TH"]
for _, p := range prs[3:6] {
t.Logf("%v", p)
}
p := 0.2
x := cmap.trigram("TH", p)
if x != "THE" {
t.Fatalf("cmap.trigram(%q, %.1f) returned %q; want %q",
"TH", p, x, "THE")
}
}