vendor
This commit is contained in:
17590
vendor/github.com/ulikunitz/xz/internal/randtxt/englm3.go
generated
vendored
Normal file
17590
vendor/github.com/ulikunitz/xz/internal/randtxt/englm3.go
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
82
vendor/github.com/ulikunitz/xz/internal/randtxt/groupreader.go
generated
vendored
Normal file
82
vendor/github.com/ulikunitz/xz/internal/randtxt/groupreader.go
generated
vendored
Normal file
@ -0,0 +1,82 @@
|
||||
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package randtxt
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// GroupReader groups the incoming text in groups of 5, whereby the
|
||||
// number of groups per line can be controlled.
|
||||
type GroupReader struct {
|
||||
R io.ByteReader
|
||||
GroupsPerLine int
|
||||
off int64
|
||||
eof bool
|
||||
}
|
||||
|
||||
// NewGroupReader creates a new group reader.
|
||||
func NewGroupReader(r io.Reader) *GroupReader {
|
||||
return &GroupReader{R: bufio.NewReader(r)}
|
||||
}
|
||||
|
||||
// Read formats the data provided by the internal reader in groups of 5
|
||||
// characters. If GroupsPerLine hasn't been initialized 8 groups per
|
||||
// line will be produced.
|
||||
func (r *GroupReader) Read(p []byte) (n int, err error) {
|
||||
if r.eof {
|
||||
return 0, io.EOF
|
||||
}
|
||||
groupsPerLine := r.GroupsPerLine
|
||||
if groupsPerLine < 1 {
|
||||
groupsPerLine = 8
|
||||
}
|
||||
lineLen := int64(groupsPerLine * 6)
|
||||
var c byte
|
||||
for i := range p {
|
||||
switch {
|
||||
case r.off%lineLen == lineLen-1:
|
||||
if i+1 == len(p) && len(p) > 1 {
|
||||
return i, nil
|
||||
}
|
||||
c = '\n'
|
||||
case r.off%6 == 5:
|
||||
if i+1 == len(p) && len(p) > 1 {
|
||||
return i, nil
|
||||
}
|
||||
c = ' '
|
||||
default:
|
||||
c, err = r.R.ReadByte()
|
||||
if err == io.EOF {
|
||||
r.eof = true
|
||||
if i > 0 {
|
||||
switch p[i-1] {
|
||||
case ' ':
|
||||
p[i-1] = '\n'
|
||||
fallthrough
|
||||
case '\n':
|
||||
return i, io.EOF
|
||||
}
|
||||
}
|
||||
p[i] = '\n'
|
||||
return i + 1, io.EOF
|
||||
}
|
||||
if err != nil {
|
||||
return i, err
|
||||
}
|
||||
switch {
|
||||
case c == ' ':
|
||||
c = '_'
|
||||
case !unicode.IsPrint(rune(c)):
|
||||
c = '-'
|
||||
}
|
||||
}
|
||||
p[i] = c
|
||||
r.off++
|
||||
}
|
||||
return len(p), nil
|
||||
}
|
185
vendor/github.com/ulikunitz/xz/internal/randtxt/probs.go
generated
vendored
Normal file
185
vendor/github.com/ulikunitz/xz/internal/randtxt/probs.go
generated
vendored
Normal file
@ -0,0 +1,185 @@
|
||||
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package randtxt supports the generation of random text using a
|
||||
// trigram model for the English language.
|
||||
package randtxt
|
||||
|
||||
import (
|
||||
"math"
|
||||
"math/rand"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// ngram stores an entry from the language model.
|
||||
type ngram struct {
|
||||
s string
|
||||
lgP float64
|
||||
lgQ float64
|
||||
}
|
||||
|
||||
// ngrams represents a slice of ngram values and is used to represent a
|
||||
// language model.
|
||||
type ngrams []ngram
|
||||
|
||||
func (s ngrams) Len() int { return len(s) }
|
||||
func (s ngrams) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
||||
func (s ngrams) Less(i, j int) bool { return s[i].s < s[j].s }
|
||||
|
||||
// Sorts the language model in the sequence of their ngrams.
|
||||
func (s ngrams) Sort() { sort.Sort(s) }
|
||||
|
||||
// Search is looking for an ngram or the position where it would be
|
||||
// inserted.
|
||||
func (s ngrams) Search(g string) int {
|
||||
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
|
||||
}
|
||||
|
||||
// prob represents a string, usually an ngram, and a probability value.
|
||||
type prob struct {
|
||||
s string
|
||||
p float64
|
||||
}
|
||||
|
||||
// probs is a slice of prob values that can be sorted and searched.
|
||||
type probs []prob
|
||||
|
||||
func (s probs) Len() int { return len(s) }
|
||||
func (s probs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
||||
func (s probs) Less(i, j int) bool { return s[i].s < s[j].s }
|
||||
|
||||
// SortByNgram sorts the probs slice by ngram, field s.
|
||||
func (s probs) SortByNgram() { sort.Sort(s) }
|
||||
|
||||
// SortsByProb sorts the probs slice by probability, field p.
|
||||
func (s probs) SortByProb() { sort.Sort(byProb{s}) }
|
||||
|
||||
// SearchNgram searches for an ngram or the position where it would be
|
||||
// inserted.
|
||||
func (s probs) SearchNgram(g string) int {
|
||||
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
|
||||
}
|
||||
|
||||
// SearchProb searches ngrams for a specific probability or where it
|
||||
// would be inserted.
|
||||
func (s probs) SearchProb(p float64) int {
|
||||
return sort.Search(len(s), func(k int) bool { return s[k].p >= p })
|
||||
}
|
||||
|
||||
// byProb is used to sort probs slice by probability, field p.
|
||||
type byProb struct {
|
||||
probs
|
||||
}
|
||||
|
||||
func (s byProb) Less(i, j int) bool {
|
||||
return s.probs[i].p < s.probs[j].p
|
||||
}
|
||||
|
||||
// cdf can be used to setup a cumulative distribution function
|
||||
// represented by a probs slice. We should have returned an actual
|
||||
// function.
|
||||
func cdf(n int, p func(i int) prob) probs {
|
||||
prs := make(probs, n)
|
||||
sum := 0.0
|
||||
for i := range prs {
|
||||
pr := p(i)
|
||||
sum += pr.p
|
||||
prs[i] = pr
|
||||
}
|
||||
q := 1.0 / sum
|
||||
x := 0.0
|
||||
for i, pr := range prs {
|
||||
x += pr.p * q
|
||||
if x > 1.0 {
|
||||
x = 1.0
|
||||
}
|
||||
prs[i].p = x
|
||||
}
|
||||
if !sort.IsSorted(byProb{prs}) {
|
||||
panic("cdf not sorted")
|
||||
}
|
||||
return prs
|
||||
}
|
||||
|
||||
// pCDFOfLM converts a language model into a cumulative distribution
|
||||
// function represented by probs.
|
||||
func pCDFOfLM(lm ngrams) probs {
|
||||
return cdf(len(lm), func(i int) prob {
|
||||
return prob{lm[i].s, math.Exp2(lm[i].lgP)}
|
||||
})
|
||||
}
|
||||
|
||||
// cCDF converts a ngrams slice into a cumulative distribution function
|
||||
// using the conditional probability lgQ.
|
||||
func cCDF(s ngrams) probs {
|
||||
return cdf(len(s), func(i int) prob {
|
||||
return prob{s[i].s, math.Exp2(s[i].lgQ)}
|
||||
})
|
||||
}
|
||||
|
||||
// comap contains a map of conditional distribution function for the
|
||||
// last character.
|
||||
type comap map[string]probs
|
||||
|
||||
// comapOfLM converts a language model in a map of conditional
|
||||
// distribution functions.
|
||||
func comapOfLM(lm ngrams) comap {
|
||||
if !sort.IsSorted(lm) {
|
||||
panic("lm is not sorted")
|
||||
}
|
||||
m := make(comap, 26*26)
|
||||
for i := 0; i < len(lm); {
|
||||
j := i
|
||||
g := lm[i].s
|
||||
g2 := g[:2]
|
||||
z := g2 + "Z"
|
||||
i = lm.Search(z)
|
||||
if i >= len(lm) || lm[i].s != z {
|
||||
panic("unexpected search result")
|
||||
}
|
||||
i++
|
||||
m[g2] = cCDF(lm[j:i])
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// trigram returns the trigram with prefix g2 using a probability value
|
||||
// in the range [0.0,1.0).
|
||||
func (c comap) trigram(g2 string, p float64) string {
|
||||
prs := c[g2]
|
||||
i := prs.SearchProb(p)
|
||||
return prs[i].s
|
||||
}
|
||||
|
||||
var (
|
||||
// CDF for normal probabilities
|
||||
pcdf = pCDFOfLM(englm3)
|
||||
// map of two letter conditionals
|
||||
cmap = comapOfLM(englm3)
|
||||
)
|
||||
|
||||
// Reader generates a stream of text of uppercase letters with trigrams
|
||||
// distributed according to a language model of the English language.
|
||||
type Reader struct {
|
||||
rnd *rand.Rand
|
||||
g3 string
|
||||
}
|
||||
|
||||
// NewReader creates a new reader. The argument src must create a uniformly
|
||||
// distributed stream of random values.
|
||||
func NewReader(src rand.Source) *Reader {
|
||||
rnd := rand.New(src)
|
||||
i := pcdf.SearchProb(rnd.Float64())
|
||||
return &Reader{rnd, pcdf[i].s}
|
||||
}
|
||||
|
||||
// Read reads random text. The Read function will always return len(p)
|
||||
// bytes and will never return an error.
|
||||
func (r *Reader) Read(p []byte) (n int, err error) {
|
||||
for i := range p {
|
||||
r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
|
||||
p[i] = r.g3[2]
|
||||
}
|
||||
return len(p), nil
|
||||
}
|
37
vendor/github.com/ulikunitz/xz/internal/randtxt/probs_test.go
generated
vendored
Normal file
37
vendor/github.com/ulikunitz/xz/internal/randtxt/probs_test.go
generated
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package randtxt
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"math/rand"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReader(t *testing.T) {
|
||||
lr := io.LimitReader(NewReader(rand.NewSource(13)), 195)
|
||||
pretty := NewGroupReader(lr)
|
||||
scanner := bufio.NewScanner(pretty)
|
||||
for scanner.Scan() {
|
||||
t.Log(scanner.Text())
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
t.Fatalf("scanner error %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestComap(t *testing.T) {
|
||||
prs := cmap["TH"]
|
||||
for _, p := range prs[3:6] {
|
||||
t.Logf("%v", p)
|
||||
}
|
||||
p := 0.2
|
||||
x := cmap.trigram("TH", p)
|
||||
if x != "THE" {
|
||||
t.Fatalf("cmap.trigram(%q, %.1f) returned %q; want %q",
|
||||
"TH", p, x, "THE")
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user