Changes made by: - running "gvt fetch" on each of the packages mentioned in Godeps/Godeps.json - `rm -rf Godeps` - tweaking the build scripts to not mention Godeps - tweaking the build scripts to test `./lib/...`, `./cmd/...` explicitly (to avoid testing vendor) - tweaking the build scripts to not juggle GOPATH for Godeps and instead set GO15VENDOREXPERIMENT. This probably also results in some packages being updated at the same time. Building with Go 1.3 and 1.4 still *works* but won't use our vendored dependencies - the user needs to have the actual packages in their GOPATH then, which they'll get with a normal "go get". Building with Go 1.6+ will get our vendored dependencies by default even when not using our build script, which is nice. By doing this we gain some freedom in that we can pick and choose manually what to include in vendor, as it's not based on just dependency analysis of our own code. This is also a risk as we might pick up dependencies we are unaware of, as the build may work locally with those packages present in GOPATH. On the other hand the build server will detect this as it has no packages in its GOPATH beyond what is included in the repo. The recommended tool to manage dependencies is github.com/FiloSottile/gvt.
451 lines
11 KiB
Go
451 lines
11 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package norm
|
|
|
|
import (
|
|
"fmt"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
// It exports the value of the unexported constant maxByteBufferSize, which
// is also the size of the scratch buffer embedded in Iter.
const MaxSegmentSize = maxByteBufferSize
|
|
|
|
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
	rb     reorderBuffer           // reorder buffer; also carries the input (rb.src), its end position (rb.nsrc) and the form functions (rb.f)
	buf    [maxByteBufferSize]byte // scratch space; many segments returned by Next are slices of it
	info   Properties              // first character saved from previous iteration
	next   iterFunc                // implementation of next depends on form
	asciiF iterFunc                // single-byte ASCII step: nextASCIIBytes or nextASCIIString, chosen by Init/InitString

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
|
|
|
|
// iterFunc is the signature of the step functions stored in Iter.next and
// Iter.asciiF. A call advances the iterator and returns the next segment.
type iterFunc func(*Iter) []byte
|
|
|
|
// Init initializes i to iterate over src after normalizing it to Form f.
|
|
func (i *Iter) Init(f Form, src []byte) {
|
|
i.p = 0
|
|
if len(src) == 0 {
|
|
i.setDone()
|
|
i.rb.nsrc = 0
|
|
return
|
|
}
|
|
i.multiSeg = nil
|
|
i.rb.init(f, src)
|
|
i.next = i.rb.f.nextMain
|
|
i.asciiF = nextASCIIBytes
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
|
|
// InitString initializes i to iterate over src after normalizing it to Form f.
|
|
func (i *Iter) InitString(f Form, src string) {
|
|
i.p = 0
|
|
if len(src) == 0 {
|
|
i.setDone()
|
|
i.rb.nsrc = 0
|
|
return
|
|
}
|
|
i.multiSeg = nil
|
|
i.rb.initString(f, src)
|
|
i.next = i.rb.f.nextMain
|
|
i.asciiF = nextASCIIString
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
|
|
// Seek sets the segment to be returned by the next call to Next to start
|
|
// at position p. It is the responsibility of the caller to set p to the
|
|
// start of a UTF8 rune.
|
|
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
|
|
var abs int64
|
|
switch whence {
|
|
case 0:
|
|
abs = offset
|
|
case 1:
|
|
abs = int64(i.p) + offset
|
|
case 2:
|
|
abs = int64(i.rb.nsrc) + offset
|
|
default:
|
|
return 0, fmt.Errorf("norm: invalid whence")
|
|
}
|
|
if abs < 0 {
|
|
return 0, fmt.Errorf("norm: negative position")
|
|
}
|
|
if int(abs) >= i.rb.nsrc {
|
|
i.setDone()
|
|
return int64(i.p), nil
|
|
}
|
|
i.p = int(abs)
|
|
i.multiSeg = nil
|
|
i.next = i.rb.f.nextMain
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
return abs, nil
|
|
}
|
|
|
|
// returnSlice returns a slice of the underlying input type as a byte slice.
|
|
// If the underlying is of type []byte, it will simply return a slice.
|
|
// If the underlying is of type string, it will copy the slice to the buffer
|
|
// and return that.
|
|
func (i *Iter) returnSlice(a, b int) []byte {
|
|
if i.rb.src.bytes == nil {
|
|
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
|
|
}
|
|
return i.rb.src.bytes[a:b]
|
|
}
|
|
|
|
// Pos returns the byte position at which the next call to Next will commence processing.
// The value is the iterator's current offset into the input source.
func (i *Iter) Pos() int {
	return i.p
}
|
|
|
|
func (i *Iter) setDone() {
|
|
i.next = nextDone
|
|
i.p = i.rb.nsrc
|
|
}
|
|
|
|
// Done returns true if there is no more input to process.
|
|
func (i *Iter) Done() bool {
|
|
return i.p >= i.rb.nsrc
|
|
}
|
|
|
|
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
//
// The work is delegated to the step function currently stored in i.next.
// That function may replace itself with another one before returning.
func (i *Iter) Next() []byte {
	return i.next(i)
}
|
|
|
|
func nextASCIIBytes(i *Iter) []byte {
|
|
p := i.p + 1
|
|
if p >= i.rb.nsrc {
|
|
i.setDone()
|
|
return i.rb.src.bytes[i.p:p]
|
|
}
|
|
if i.rb.src.bytes[p] < utf8.RuneSelf {
|
|
p0 := i.p
|
|
i.p = p
|
|
return i.rb.src.bytes[p0:p]
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
func nextASCIIString(i *Iter) []byte {
|
|
p := i.p + 1
|
|
if p >= i.rb.nsrc {
|
|
i.buf[0] = i.rb.src.str[i.p]
|
|
i.setDone()
|
|
return i.buf[:1]
|
|
}
|
|
if i.rb.src.str[p] < utf8.RuneSelf {
|
|
i.buf[0] = i.rb.src.str[i.p]
|
|
i.p = p
|
|
return i.buf[:1]
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
func nextHangul(i *Iter) []byte {
|
|
p := i.p
|
|
next := p + hangulUTF8Size
|
|
if next >= i.rb.nsrc {
|
|
i.setDone()
|
|
} else if i.rb.src.hangul(next) == 0 {
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
i.p = next
|
|
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
|
|
}
|
|
|
|
func nextDone(i *Iter) []byte {
|
|
return nil
|
|
}
|
|
|
|
// nextMulti is used for iterating over multi-segment decompositions
|
|
// for decomposing normal forms.
|
|
func nextMulti(i *Iter) []byte {
|
|
j := 0
|
|
d := i.multiSeg
|
|
// skip first rune
|
|
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
|
|
}
|
|
for j < len(d) {
|
|
info := i.rb.f.info(input{bytes: d}, j)
|
|
if info.BoundaryBefore() {
|
|
i.multiSeg = d[j:]
|
|
return d[:j]
|
|
}
|
|
j += int(info.size)
|
|
}
|
|
// treat last segment as normal decomposition
|
|
i.next = i.rb.f.nextMain
|
|
return i.next(i)
|
|
}
|
|
|
|
// nextMultiNorm is used for iterating over multi-segment decompositions
|
|
// for composing normal forms.
|
|
func nextMultiNorm(i *Iter) []byte {
|
|
j := 0
|
|
d := i.multiSeg
|
|
for j < len(d) {
|
|
info := i.rb.f.info(input{bytes: d}, j)
|
|
if info.BoundaryBefore() {
|
|
i.rb.compose()
|
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
i.rb.ss.first(info)
|
|
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
|
i.multiSeg = d[j+int(info.size):]
|
|
return seg
|
|
}
|
|
i.rb.ss.next(info)
|
|
i.rb.insertUnsafe(input{bytes: d}, j, info)
|
|
j += int(info.size)
|
|
}
|
|
i.multiSeg = nil
|
|
i.next = nextComposed
|
|
return doNormComposed(i)
|
|
}
|
|
|
|
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
|
func nextDecomposed(i *Iter) (next []byte) {
|
|
outp := 0
|
|
inCopyStart, outCopyStart := i.p, 0
|
|
ss := mkStreamSafe(i.info)
|
|
for {
|
|
if sz := int(i.info.size); sz <= 1 {
|
|
p := i.p
|
|
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
return i.returnSlice(p, i.p)
|
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
|
i.next = i.asciiF
|
|
return i.returnSlice(p, i.p)
|
|
}
|
|
outp++
|
|
} else if d := i.info.Decomposition(); d != nil {
|
|
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
|
// Case 1: there is a leftover to copy. In this case the decomposition
|
|
// must begin with a modifier and should always be appended.
|
|
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
|
p := outp + len(d)
|
|
if outp > 0 {
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
if p > len(i.buf) {
|
|
return i.buf[:outp]
|
|
}
|
|
} else if i.info.multiSegment() {
|
|
// outp must be 0 as multi-segment decompositions always
|
|
// start a new segment.
|
|
if i.multiSeg == nil {
|
|
i.multiSeg = d
|
|
i.next = nextMulti
|
|
return nextMulti(i)
|
|
}
|
|
// We are in the last segment. Treat as normal decomposition.
|
|
d = i.multiSeg
|
|
i.multiSeg = nil
|
|
p = len(d)
|
|
}
|
|
prevCC := i.info.tccc
|
|
if i.p += sz; i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
i.info = Properties{} // Force BoundaryBefore to succeed.
|
|
} else {
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
}
|
|
switch ss.next(i.info) {
|
|
case ssOverflow:
|
|
i.next = nextCGJDecompose
|
|
fallthrough
|
|
case ssStarter:
|
|
if outp > 0 {
|
|
copy(i.buf[outp:], d)
|
|
return i.buf[:p]
|
|
}
|
|
return d
|
|
}
|
|
copy(i.buf[outp:], d)
|
|
outp = p
|
|
inCopyStart, outCopyStart = i.p, outp
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
continue
|
|
} else if r := i.rb.src.hangul(i.p); r != 0 {
|
|
outp = decomposeHangul(i.buf[:], r)
|
|
i.p += hangulUTF8Size
|
|
inCopyStart, outCopyStart = i.p, outp
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
} else if i.rb.src.hangul(i.p) != 0 {
|
|
i.next = nextHangul
|
|
return i.buf[:outp]
|
|
}
|
|
} else {
|
|
p := outp + sz
|
|
if p > len(i.buf) {
|
|
break
|
|
}
|
|
outp = p
|
|
i.p += sz
|
|
}
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
prevCC := i.info.tccc
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if v := ss.next(i.info); v == ssStarter {
|
|
break
|
|
} else if v == ssOverflow {
|
|
i.next = nextCGJDecompose
|
|
break
|
|
}
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
}
|
|
if outCopyStart == 0 {
|
|
return i.returnSlice(inCopyStart, i.p)
|
|
} else if inCopyStart < i.p {
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
}
|
|
return i.buf[:outp]
|
|
doNorm:
|
|
// Insert what we have decomposed so far in the reorderBuffer.
|
|
// As we will only reorder, there will always be enough room.
|
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
|
|
i.rb.insertDecomposed(i.buf[0:outp])
|
|
return doNormDecomposed(i)
|
|
}
|
|
|
|
func doNormDecomposed(i *Iter) []byte {
|
|
for {
|
|
if s := i.rb.ss.next(i.info); s == ssOverflow {
|
|
i.next = nextCGJDecompose
|
|
break
|
|
}
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if i.info.ccc == 0 {
|
|
break
|
|
}
|
|
}
|
|
// new segment or too many combining characters: exit normalization
|
|
return i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
}
|
|
|
|
func nextCGJDecompose(i *Iter) []byte {
|
|
i.rb.ss = 0
|
|
i.rb.insertCGJ()
|
|
i.next = nextDecomposed
|
|
buf := doNormDecomposed(i)
|
|
return buf
|
|
}
|
|
|
|
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
|
func nextComposed(i *Iter) []byte {
|
|
outp, startp := 0, i.p
|
|
var prevCC uint8
|
|
ss := mkStreamSafe(i.info)
|
|
for {
|
|
if !i.info.isYesC() {
|
|
goto doNorm
|
|
}
|
|
prevCC = i.info.tccc
|
|
sz := int(i.info.size)
|
|
if sz == 0 {
|
|
sz = 1 // illegal rune: copy byte-by-byte
|
|
}
|
|
p := outp + sz
|
|
if p > len(i.buf) {
|
|
break
|
|
}
|
|
outp = p
|
|
i.p += sz
|
|
if i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
|
|
i.next = i.asciiF
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if v := ss.next(i.info); v == ssStarter {
|
|
break
|
|
} else if v == ssOverflow {
|
|
i.next = nextCGJCompose
|
|
break
|
|
}
|
|
if i.info.ccc < prevCC {
|
|
goto doNorm
|
|
}
|
|
}
|
|
return i.returnSlice(startp, i.p)
|
|
doNorm:
|
|
i.p = startp
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if i.info.multiSegment() {
|
|
d := i.info.Decomposition()
|
|
info := i.rb.f.info(input{bytes: d}, 0)
|
|
i.rb.insertUnsafe(input{bytes: d}, 0, info)
|
|
i.multiSeg = d[int(info.size):]
|
|
i.next = nextMultiNorm
|
|
return nextMultiNorm(i)
|
|
}
|
|
i.rb.ss.first(i.info)
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
return doNormComposed(i)
|
|
}
|
|
|
|
func doNormComposed(i *Iter) []byte {
|
|
// First rune should already be inserted.
|
|
for {
|
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
|
|
i.setDone()
|
|
break
|
|
}
|
|
i.info = i.rb.f.info(i.rb.src, i.p)
|
|
if s := i.rb.ss.next(i.info); s == ssStarter {
|
|
break
|
|
} else if s == ssOverflow {
|
|
i.next = nextCGJCompose
|
|
break
|
|
}
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
}
|
|
i.rb.compose()
|
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
|
|
return seg
|
|
}
|
|
|
|
func nextCGJCompose(i *Iter) []byte {
|
|
i.rb.ss = 0 // instead of first
|
|
i.rb.insertCGJ()
|
|
i.next = nextComposed
|
|
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
|
|
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
|
|
// If we ever change that, insert a check here.
|
|
i.rb.ss.first(i.info)
|
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
|
|
return doNormComposed(i)
|
|
}
|