mirror of https://github.com/ethereum/go-ethereum
parent
5b30aa59d6
commit
8ee5bb2289
@ -0,0 +1,512 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import "unicode/utf8" |
||||||
|
|
||||||
|
// Sizing limits for the reorder buffer.
const (
	// maxNonStarters is the largest run of non-starters allowed in one
	// segment before a CGJ has to be inserted.
	maxNonStarters = 30

	// maxBufferSize is the number of rune slots a buffer needs:
	// maxNonStarters, plus one for the starter, plus one for the CGJ.
	maxBufferSize = maxNonStarters + 2

	// Largest expansion factors of a single rune under each composing form.
	maxNFCExpansion  = 3  // NFC(0x1D160)
	maxNFKCExpansion = 18 // NFKC(0xFDFA)

	// maxByteBufferSize reserves utf8.UTFMax bytes per rune slot (128 bytes).
	maxByteBufferSize = utf8.UTFMax * maxBufferSize
)
||||||
|
|
||||||
|
// ssState reports what happened to the current segment when a rune was
// offered to a streamSafe. It is the result type of streamSafe.next and
// streamSafe.backwards.
type ssState int

const (
	// ssSuccess: the rune was added to the segment.
	ssSuccess ssState = 0
	// ssStarter: the rune begins a new segment and must not be added to the
	// current one.
	ssStarter ssState = 1
	// ssOverflow: adding the rune would overflow the segment, so a CGJ
	// should be inserted.
	ssOverflow ssState = 2
)
||||||
|
|
||||||
|
// streamSafe implements the policy of when a CGJ should be inserted.
// Its value is the running count of non-starters seen in the current
// segment; the methods panic if it is found above maxNonStarters on entry.
type streamSafe uint8
||||||
|
|
||||||
|
// first inserts the first rune of a segment. It is a faster version of next if
|
||||||
|
// it is known p represents the first rune in a segment.
|
||||||
|
func (ss *streamSafe) first(p Properties) { |
||||||
|
*ss = streamSafe(p.nTrailingNonStarters()) |
||||||
|
} |
||||||
|
|
||||||
|
// insert returns a ssState value to indicate whether a rune represented by p
|
||||||
|
// can be inserted.
|
||||||
|
func (ss *streamSafe) next(p Properties) ssState { |
||||||
|
if *ss > maxNonStarters { |
||||||
|
panic("streamSafe was not reset") |
||||||
|
} |
||||||
|
n := p.nLeadingNonStarters() |
||||||
|
if *ss += streamSafe(n); *ss > maxNonStarters { |
||||||
|
*ss = 0 |
||||||
|
return ssOverflow |
||||||
|
} |
||||||
|
// The Stream-Safe Text Processing prescribes that the counting can stop
|
||||||
|
// as soon as a starter is encountered. However, there are some starters,
|
||||||
|
// like Jamo V and T, that can combine with other runes, leaving their
|
||||||
|
// successive non-starters appended to the previous, possibly causing an
|
||||||
|
// overflow. We will therefore consider any rune with a non-zero nLead to
|
||||||
|
// be a non-starter. Note that it always hold that if nLead > 0 then
|
||||||
|
// nLead == nTrail.
|
||||||
|
if n == 0 { |
||||||
|
*ss = streamSafe(p.nTrailingNonStarters()) |
||||||
|
return ssStarter |
||||||
|
} |
||||||
|
return ssSuccess |
||||||
|
} |
||||||
|
|
||||||
|
// backwards is used for checking for overflow and segment starts
|
||||||
|
// when traversing a string backwards. Users do not need to call first
|
||||||
|
// for the first rune. The state of the streamSafe retains the count of
|
||||||
|
// the non-starters loaded.
|
||||||
|
func (ss *streamSafe) backwards(p Properties) ssState { |
||||||
|
if *ss > maxNonStarters { |
||||||
|
panic("streamSafe was not reset") |
||||||
|
} |
||||||
|
c := *ss + streamSafe(p.nTrailingNonStarters()) |
||||||
|
if c > maxNonStarters { |
||||||
|
return ssOverflow |
||||||
|
} |
||||||
|
*ss = c |
||||||
|
if p.nLeadingNonStarters() == 0 { |
||||||
|
return ssStarter |
||||||
|
} |
||||||
|
return ssSuccess |
||||||
|
} |
||||||
|
|
||||||
|
func (ss streamSafe) isMax() bool { |
||||||
|
return ss == maxNonStarters |
||||||
|
} |
||||||
|
|
||||||
|
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
// It is the Combining Grapheme Joiner (CGJ), U+034F.
const GraphemeJoiner = "\u034F"
||||||
|
|
||||||
|
// reorderBuffer is used to normalize a single segment. Characters inserted with
// insert are decomposed and reordered based on CCC. The compose method can
// be used to recombine characters. Note that the byte buffer does not hold
// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
	rune  [maxBufferSize]Properties // Per character info.
	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by Properties.pos.
	nbyte uint8                     // Number of bytes.
	ss    streamSafe                // For limiting length of non-starter sequence.
	nrune int                       // Number of entries of rune in use.
	f     formInfo                  // Copy of the formTable entry; set by init/initString.

	src      input // Source text; set by init/initString.
	nsrc     int   // Length of the source in bytes.
	tmpBytes input // Scratch input wrapping the bytes given to insertDecomposed.

	out    []byte                    // Output slice; set by setFlusher.
	flushF func(*reorderBuffer) bool // Flush callback; set by setFlusher, called by doFlush.
}
||||||
|
|
||||||
|
func (rb *reorderBuffer) init(f Form, src []byte) { |
||||||
|
rb.f = *formTable[f] |
||||||
|
rb.src.setBytes(src) |
||||||
|
rb.nsrc = len(src) |
||||||
|
rb.ss = 0 |
||||||
|
} |
||||||
|
|
||||||
|
func (rb *reorderBuffer) initString(f Form, src string) { |
||||||
|
rb.f = *formTable[f] |
||||||
|
rb.src.setString(src) |
||||||
|
rb.nsrc = len(src) |
||||||
|
rb.ss = 0 |
||||||
|
} |
||||||
|
|
||||||
|
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) { |
||||||
|
rb.out = out |
||||||
|
rb.flushF = f |
||||||
|
} |
||||||
|
|
||||||
|
// reset discards all characters from the buffer.
|
||||||
|
func (rb *reorderBuffer) reset() { |
||||||
|
rb.nrune = 0 |
||||||
|
rb.nbyte = 0 |
||||||
|
} |
||||||
|
|
||||||
|
func (rb *reorderBuffer) doFlush() bool { |
||||||
|
if rb.f.composing { |
||||||
|
rb.compose() |
||||||
|
} |
||||||
|
res := rb.flushF(rb) |
||||||
|
rb.reset() |
||||||
|
return res |
||||||
|
} |
||||||
|
|
||||||
|
// appendFlush appends the normalized segment to rb.out.
|
||||||
|
func appendFlush(rb *reorderBuffer) bool { |
||||||
|
for i := 0; i < rb.nrune; i++ { |
||||||
|
start := rb.rune[i].pos |
||||||
|
end := start + rb.rune[i].size |
||||||
|
rb.out = append(rb.out, rb.byte[start:end]...) |
||||||
|
} |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
// flush appends the normalized segment to out and resets rb.
|
||||||
|
func (rb *reorderBuffer) flush(out []byte) []byte { |
||||||
|
for i := 0; i < rb.nrune; i++ { |
||||||
|
start := rb.rune[i].pos |
||||||
|
end := start + rb.rune[i].size |
||||||
|
out = append(out, rb.byte[start:end]...) |
||||||
|
} |
||||||
|
rb.reset() |
||||||
|
return out |
||||||
|
} |
||||||
|
|
||||||
|
// flushCopy copies the normalized segment to buf and resets rb.
|
||||||
|
// It returns the number of bytes written to buf.
|
||||||
|
func (rb *reorderBuffer) flushCopy(buf []byte) int { |
||||||
|
p := 0 |
||||||
|
for i := 0; i < rb.nrune; i++ { |
||||||
|
runep := rb.rune[i] |
||||||
|
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size]) |
||||||
|
} |
||||||
|
rb.reset() |
||||||
|
return p |
||||||
|
} |
||||||
|
|
||||||
|
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
|
||||||
|
// It returns false if the buffer is not large enough to hold the rune.
|
||||||
|
// It is used internally by insert and insertString only.
|
||||||
|
func (rb *reorderBuffer) insertOrdered(info Properties) { |
||||||
|
n := rb.nrune |
||||||
|
b := rb.rune[:] |
||||||
|
cc := info.ccc |
||||||
|
if cc > 0 { |
||||||
|
// Find insertion position + move elements to make room.
|
||||||
|
for ; n > 0; n-- { |
||||||
|
if b[n-1].ccc <= cc { |
||||||
|
break |
||||||
|
} |
||||||
|
b[n] = b[n-1] |
||||||
|
} |
||||||
|
} |
||||||
|
rb.nrune += 1 |
||||||
|
pos := uint8(rb.nbyte) |
||||||
|
rb.nbyte += utf8.UTFMax |
||||||
|
info.pos = pos |
||||||
|
b[n] = info |
||||||
|
} |
||||||
|
|
||||||
|
// insertErr is an error code returned by insert. Using this type instead
// of error improves performance up to 20% for many of the benchmarks.
type insertErr int

// Zero means success; failures are negative.
const (
	iSuccess  insertErr = 0
	iShortDst insertErr = -1
	iShortSrc insertErr = -2
)
||||||
|
|
||||||
|
// insertFlush inserts the given rune in the buffer ordered by CCC.
|
||||||
|
// If a decomposition with multiple segments are encountered, they leading
|
||||||
|
// ones are flushed.
|
||||||
|
// It returns a non-zero error code if the rune was not inserted.
|
||||||
|
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr { |
||||||
|
if rune := src.hangul(i); rune != 0 { |
||||||
|
rb.decomposeHangul(rune) |
||||||
|
return iSuccess |
||||||
|
} |
||||||
|
if info.hasDecomposition() { |
||||||
|
return rb.insertDecomposed(info.Decomposition()) |
||||||
|
} |
||||||
|
rb.insertSingle(src, i, info) |
||||||
|
return iSuccess |
||||||
|
} |
||||||
|
|
||||||
|
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
|
||||||
|
// It is assumed there is sufficient space to hold the runes. It is the
|
||||||
|
// responsibility of the caller to ensure this. This can be done by checking
|
||||||
|
// the state returned by the streamSafe type.
|
||||||
|
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) { |
||||||
|
if rune := src.hangul(i); rune != 0 { |
||||||
|
rb.decomposeHangul(rune) |
||||||
|
} |
||||||
|
if info.hasDecomposition() { |
||||||
|
// TODO: inline.
|
||||||
|
rb.insertDecomposed(info.Decomposition()) |
||||||
|
} else { |
||||||
|
rb.insertSingle(src, i, info) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
|
||||||
|
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
|
||||||
|
// It flushes the buffer on each new segment start.
|
||||||
|
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr { |
||||||
|
rb.tmpBytes.setBytes(dcomp) |
||||||
|
// As the streamSafe accounting already handles the counting for modifiers,
|
||||||
|
// we don't have to call next. However, we do need to keep the accounting
|
||||||
|
// intact when flushing the buffer.
|
||||||
|
for i := 0; i < len(dcomp); { |
||||||
|
info := rb.f.info(rb.tmpBytes, i) |
||||||
|
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() { |
||||||
|
return iShortDst |
||||||
|
} |
||||||
|
i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)]) |
||||||
|
rb.insertOrdered(info) |
||||||
|
} |
||||||
|
return iSuccess |
||||||
|
} |
||||||
|
|
||||||
|
// insertSingle inserts an entry in the reorderBuffer for the rune at
|
||||||
|
// position i. info is the runeInfo for the rune at position i.
|
||||||
|
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) { |
||||||
|
src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size)) |
||||||
|
rb.insertOrdered(info) |
||||||
|
} |
||||||
|
|
||||||
|
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
|
||||||
|
func (rb *reorderBuffer) insertCGJ() { |
||||||
|
rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))}) |
||||||
|
} |
||||||
|
|
||||||
|
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
|
||||||
|
func (rb *reorderBuffer) appendRune(r rune) { |
||||||
|
bn := rb.nbyte |
||||||
|
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||||
|
rb.nbyte += utf8.UTFMax |
||||||
|
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)} |
||||||
|
rb.nrune++ |
||||||
|
} |
||||||
|
|
||||||
|
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
|
||||||
|
func (rb *reorderBuffer) assignRune(pos int, r rune) { |
||||||
|
bn := rb.rune[pos].pos |
||||||
|
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||||
|
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)} |
||||||
|
} |
||||||
|
|
||||||
|
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
|
||||||
|
func (rb *reorderBuffer) runeAt(n int) rune { |
||||||
|
inf := rb.rune[n] |
||||||
|
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) |
||||||
|
return r |
||||||
|
} |
||||||
|
|
||||||
|
// bytesAt returns the UTF-8 encoding of the rune at position n.
|
||||||
|
// It is used for Hangul and recomposition.
|
||||||
|
func (rb *reorderBuffer) bytesAt(n int) []byte { |
||||||
|
inf := rb.rune[n] |
||||||
|
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)] |
||||||
|
} |
||||||
|
|
||||||
|
// For Hangul we combine algorithmically, instead of using tables.
const (
	// First precomposed Hangul syllable and the bytes of its UTF-8 form.
	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
	hangulBase0 = 0xEA
	hangulBase1 = 0xB0
	hangulBase2 = 0x80

	// One past the last precomposed Hangul syllable, with its UTF-8 bytes.
	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
	hangulEnd0 = 0xED
	hangulEnd1 = 0x9E
	hangulEnd2 = 0xA4

	// Jamo ranges: [base, end) for leading consonants, vowels and trailing
	// consonants.
	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 80
	jamoLBase0 = 0xE1
	jamoLBase1 = 0x84
	jamoLEnd   = 0x1113
	jamoVBase  = 0x1161
	jamoVEnd   = 0x1176
	jamoTBase  = 0x11A7
	jamoTEnd   = 0x11C3

	// Sizes of the syllable index space.
	jamoTCount   = 28
	jamoVCount   = 21
	jamoVTCount  = jamoVCount * jamoTCount
	jamoLVTCount = 19 * jamoVTCount
)
||||||
|
|
||||||
|
// hangulUTF8Size is the minimum input length isHangul and isHangulString
// require before inspecting any bytes.
const hangulUTF8Size = 3
||||||
|
|
||||||
|
func isHangul(b []byte) bool { |
||||||
|
if len(b) < hangulUTF8Size { |
||||||
|
return false |
||||||
|
} |
||||||
|
b0 := b[0] |
||||||
|
if b0 < hangulBase0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
b1 := b[1] |
||||||
|
switch { |
||||||
|
case b0 == hangulBase0: |
||||||
|
return b1 >= hangulBase1 |
||||||
|
case b0 < hangulEnd0: |
||||||
|
return true |
||||||
|
case b0 > hangulEnd0: |
||||||
|
return false |
||||||
|
case b1 < hangulEnd1: |
||||||
|
return true |
||||||
|
} |
||||||
|
return b1 == hangulEnd1 && b[2] < hangulEnd2 |
||||||
|
} |
||||||
|
|
||||||
|
func isHangulString(b string) bool { |
||||||
|
if len(b) < hangulUTF8Size { |
||||||
|
return false |
||||||
|
} |
||||||
|
b0 := b[0] |
||||||
|
if b0 < hangulBase0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
b1 := b[1] |
||||||
|
switch { |
||||||
|
case b0 == hangulBase0: |
||||||
|
return b1 >= hangulBase1 |
||||||
|
case b0 < hangulEnd0: |
||||||
|
return true |
||||||
|
case b0 > hangulEnd0: |
||||||
|
return false |
||||||
|
case b1 < hangulEnd1: |
||||||
|
return true |
||||||
|
} |
||||||
|
return b1 == hangulEnd1 && b[2] < hangulEnd2 |
||||||
|
} |
||||||
|
|
||||||
|
// Caller must ensure len(b) >= 2.
|
||||||
|
func isJamoVT(b []byte) bool { |
||||||
|
// True if (rune & 0xff00) == jamoLBase
|
||||||
|
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1 |
||||||
|
} |
||||||
|
|
||||||
|
func isHangulWithoutJamoT(b []byte) bool { |
||||||
|
c, _ := utf8.DecodeRune(b) |
||||||
|
c -= hangulBase |
||||||
|
return c < jamoLVTCount && c%jamoTCount == 0 |
||||||
|
} |
||||||
|
|
||||||
|
// decomposeHangul writes the decomposed Hangul to buf and returns the number
|
||||||
|
// of bytes written. len(buf) should be at least 9.
|
||||||
|
func decomposeHangul(buf []byte, r rune) int { |
||||||
|
const JamoUTF8Len = 3 |
||||||
|
r -= hangulBase |
||||||
|
x := r % jamoTCount |
||||||
|
r /= jamoTCount |
||||||
|
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount) |
||||||
|
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount) |
||||||
|
if x != 0 { |
||||||
|
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x) |
||||||
|
return 3 * JamoUTF8Len |
||||||
|
} |
||||||
|
return 2 * JamoUTF8Len |
||||||
|
} |
||||||
|
|
||||||
|
// decomposeHangul algorithmically decomposes a Hangul rune into
|
||||||
|
// its Jamo components.
|
||||||
|
// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
|
||||||
|
func (rb *reorderBuffer) decomposeHangul(r rune) { |
||||||
|
r -= hangulBase |
||||||
|
x := r % jamoTCount |
||||||
|
r /= jamoTCount |
||||||
|
rb.appendRune(jamoLBase + r/jamoVCount) |
||||||
|
rb.appendRune(jamoVBase + r%jamoVCount) |
||||||
|
if x != 0 { |
||||||
|
rb.appendRune(jamoTBase + x) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// combineHangul algorithmically combines Jamo character components into Hangul.
|
||||||
|
// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
|
||||||
|
func (rb *reorderBuffer) combineHangul(s, i, k int) { |
||||||
|
b := rb.rune[:] |
||||||
|
bn := rb.nrune |
||||||
|
for ; i < bn; i++ { |
||||||
|
cccB := b[k-1].ccc |
||||||
|
cccC := b[i].ccc |
||||||
|
if cccB == 0 { |
||||||
|
s = k - 1 |
||||||
|
} |
||||||
|
if s != k-1 && cccB >= cccC { |
||||||
|
// b[i] is blocked by greater-equal cccX below it
|
||||||
|
b[k] = b[i] |
||||||
|
k++ |
||||||
|
} else { |
||||||
|
l := rb.runeAt(s) // also used to compare to hangulBase
|
||||||
|
v := rb.runeAt(i) // also used to compare to jamoT
|
||||||
|
switch { |
||||||
|
case jamoLBase <= l && l < jamoLEnd && |
||||||
|
jamoVBase <= v && v < jamoVEnd: |
||||||
|
// 11xx plus 116x to LV
|
||||||
|
rb.assignRune(s, hangulBase+ |
||||||
|
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount) |
||||||
|
case hangulBase <= l && l < hangulEnd && |
||||||
|
jamoTBase < v && v < jamoTEnd && |
||||||
|
((l-hangulBase)%jamoTCount) == 0: |
||||||
|
// ACxx plus 11Ax to LVT
|
||||||
|
rb.assignRune(s, l+v-jamoTBase) |
||||||
|
default: |
||||||
|
b[k] = b[i] |
||||||
|
k++ |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
rb.nrune = k |
||||||
|
} |
||||||
|
|
||||||
|
// compose recombines the runes in the buffer.
|
||||||
|
// It should only be used to recompose a single segment, as it will not
|
||||||
|
// handle alternations between Hangul and non-Hangul characters correctly.
|
||||||
|
func (rb *reorderBuffer) compose() { |
||||||
|
// Lazily load the map used by the combine func below, but do
|
||||||
|
// it outside of the loop.
|
||||||
|
recompMapOnce.Do(buildRecompMap) |
||||||
|
|
||||||
|
// UAX #15, section X5 , including Corrigendum #5
|
||||||
|
// "In any character sequence beginning with starter S, a character C is
|
||||||
|
// blocked from S if and only if there is some character B between S
|
||||||
|
// and C, and either B is a starter or it has the same or higher
|
||||||
|
// combining class as C."
|
||||||
|
bn := rb.nrune |
||||||
|
if bn == 0 { |
||||||
|
return |
||||||
|
} |
||||||
|
k := 1 |
||||||
|
b := rb.rune[:] |
||||||
|
for s, i := 0, 1; i < bn; i++ { |
||||||
|
if isJamoVT(rb.bytesAt(i)) { |
||||||
|
// Redo from start in Hangul mode. Necessary to support
|
||||||
|
// U+320E..U+321E in NFKC mode.
|
||||||
|
rb.combineHangul(s, i, k) |
||||||
|
return |
||||||
|
} |
||||||
|
ii := b[i] |
||||||
|
// We can only use combineForward as a filter if we later
|
||||||
|
// get the info for the combined character. This is more
|
||||||
|
// expensive than using the filter. Using combinesBackward()
|
||||||
|
// is safe.
|
||||||
|
if ii.combinesBackward() { |
||||||
|
cccB := b[k-1].ccc |
||||||
|
cccC := ii.ccc |
||||||
|
blocked := false // b[i] blocked by starter or greater or equal CCC?
|
||||||
|
if cccB == 0 { |
||||||
|
s = k - 1 |
||||||
|
} else { |
||||||
|
blocked = s != k-1 && cccB >= cccC |
||||||
|
} |
||||||
|
if !blocked { |
||||||
|
combined := combine(rb.runeAt(s), rb.runeAt(i)) |
||||||
|
if combined != 0 { |
||||||
|
rb.assignRune(s, combined) |
||||||
|
continue |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
b[k] = b[i] |
||||||
|
k++ |
||||||
|
} |
||||||
|
rb.nrune = k |
||||||
|
} |
@ -0,0 +1,278 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import "encoding/binary" |
||||||
|
|
||||||
|
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||||
|
|
||||||
|
// Rune info is stored in a separate trie per composing form. A composing form
|
||||||
|
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||||
|
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||||
|
// bits
|
||||||
|
// 15: 1 (inverse of NFD_QC bit of qcInfo)
|
||||||
|
// 13..7: qcInfo (see below). isYesD is always true (no decomposition).
|
||||||
|
// 6..0: ccc (compressed CCC value).
|
||||||
|
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||||
|
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||||
|
// has the form:
|
||||||
|
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||||
|
// The header contains the number of bytes in the decomposition (excluding this
|
||||||
|
// length byte). The two most significant bits of this length byte correspond
|
||||||
|
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
|
||||||
|
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||||
|
// for these are not zero. The value of v determines which ccc are appended
|
||||||
|
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||||
|
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
|
||||||
|
// there is an additional leading ccc. The value of tccc itself is the
|
||||||
|
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
|
||||||
|
// are the number of trailing non-starters.
|
||||||
|
|
||||||
|
// Bit masks for qcInfo values and for the header byte that precedes each
// decomposition in decomps.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
||||||
|
|
||||||
|
// Properties provides access to normalization properties of a rune.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // index of the decomposition header in decomps; 0 if none
}
||||||
|
|
||||||
|
// functions dispatchable per form

// lookupFunc returns the Properties of the rune starting at position i of b.
type lookupFunc func(b input, i int) Properties
||||||
|
|
||||||
|
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form
	composing, compatibility bool       // form type
	info                     lookupFunc // per-rune property lookup for this form
	nextMain                 iterFunc   // form-specific iterator step function
}
||||||
|
|
||||||
|
// formTable holds one formInfo per normalization form and is indexed by the
// Form value (see reorderBuffer.init). A composing form and its decomposing
// counterpart share the same lookup function and differ only in the
// composing flag and the iterator step.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
||||||
|
|
||||||
|
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
||||||
|
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
||||||
|
// after 'a'. However, 'a' might combine with modifiers, so from the application's
|
||||||
|
// perspective it is not a good boundary. We will therefore always use the
|
||||||
|
// boundaries for the combining variants.
|
||||||
|
|
||||||
|
// BoundaryBefore returns true if this rune starts a new segment and
|
||||||
|
// cannot combine with any rune on the left.
|
||||||
|
func (p Properties) BoundaryBefore() bool { |
||||||
|
if p.ccc == 0 && !p.combinesBackward() { |
||||||
|
return true |
||||||
|
} |
||||||
|
// We assume that the CCC of the first character in a decomposition
|
||||||
|
// is always non-zero if different from info.ccc and that we can return
|
||||||
|
// false at this point. This is verified by maketables.
|
||||||
|
return false |
||||||
|
} |
||||||
|
|
||||||
|
// BoundaryAfter returns true if runes cannot combine with or otherwise
|
||||||
|
// interact with this or previous runes.
|
||||||
|
func (p Properties) BoundaryAfter() bool { |
||||||
|
// TODO: loosen these conditions.
|
||||||
|
return p.isInert() |
||||||
|
} |
||||||
|
|
||||||
|
// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
||||||
|
|
||||||
|
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } |
||||||
|
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } |
||||||
|
|
||||||
|
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } |
||||||
|
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
|
||||||
|
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
|
||||||
|
|
||||||
|
func (p Properties) isInert() bool { |
||||||
|
return p.flags&qcInfoMask == 0 && p.ccc == 0 |
||||||
|
} |
||||||
|
|
||||||
|
func (p Properties) multiSegment() bool { |
||||||
|
return p.index >= firstMulti && p.index < endMulti |
||||||
|
} |
||||||
|
|
||||||
|
func (p Properties) nLeadingNonStarters() uint8 { |
||||||
|
return p.nLead |
||||||
|
} |
||||||
|
|
||||||
|
func (p Properties) nTrailingNonStarters() uint8 { |
||||||
|
return uint8(p.flags & 0x03) |
||||||
|
} |
||||||
|
|
||||||
|
// Decomposition returns the decomposition for the underlying rune
|
||||||
|
// or nil if there is none.
|
||||||
|
func (p Properties) Decomposition() []byte { |
||||||
|
// TODO: create the decomposition for Hangul?
|
||||||
|
if p.index == 0 { |
||||||
|
return nil |
||||||
|
} |
||||||
|
i := p.index |
||||||
|
n := decomps[i] & headerLenMask |
||||||
|
i++ |
||||||
|
return decomps[i : i+uint16(n)] |
||||||
|
} |
||||||
|
|
||||||
|
// Size returns the length of UTF-8 encoding of the rune.
|
||||||
|
func (p Properties) Size() int { |
||||||
|
return int(p.size) |
||||||
|
} |
||||||
|
|
||||||
|
// CCC returns the canonical combining class of the underlying rune.
|
||||||
|
func (p Properties) CCC() uint8 { |
||||||
|
if p.index >= firstCCCZeroExcept { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
return ccc[p.ccc] |
||||||
|
} |
||||||
|
|
||||||
|
// LeadCCC returns the CCC of the first rune in the decomposition.
|
||||||
|
// If there is no decomposition, LeadCCC equals CCC.
|
||||||
|
func (p Properties) LeadCCC() uint8 { |
||||||
|
return ccc[p.ccc] |
||||||
|
} |
||||||
|
|
||||||
|
// TrailCCC returns the CCC of the last rune in the decomposition.
|
||||||
|
// If there is no decomposition, TrailCCC equals CCC.
|
||||||
|
func (p Properties) TrailCCC() uint8 { |
||||||
|
return ccc[p.tccc] |
||||||
|
} |
||||||
|
|
||||||
|
func buildRecompMap() { |
||||||
|
recompMap = make(map[uint32]rune, len(recompMapPacked)/8) |
||||||
|
var buf [8]byte |
||||||
|
for i := 0; i < len(recompMapPacked); i += 8 { |
||||||
|
copy(buf[:], recompMapPacked[i:i+8]) |
||||||
|
key := binary.BigEndian.Uint32(buf[:4]) |
||||||
|
val := binary.BigEndian.Uint32(buf[4:]) |
||||||
|
recompMap[key] = rune(val) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Recomposition
|
||||||
|
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
||||||
|
// This clips off the bits of three entries, but we know this will not
|
||||||
|
// result in a collision. In the unlikely event that changes to
|
||||||
|
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
||||||
|
// Note that the recomposition map for NFC and NFKC are identical.
|
||||||
|
|
||||||
|
// combine returns the combined rune or 0 if it doesn't exist.
|
||||||
|
//
|
||||||
|
// The caller is responsible for calling
|
||||||
|
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
|
||||||
|
func combine(a, b rune) rune { |
||||||
|
key := uint32(uint16(a))<<16 + uint32(uint16(b)) |
||||||
|
if recompMap == nil { |
||||||
|
panic("caller error") // see func comment
|
||||||
|
} |
||||||
|
return recompMap[key] |
||||||
|
} |
||||||
|
|
||||||
|
func lookupInfoNFC(b input, i int) Properties { |
||||||
|
v, sz := b.charinfoNFC(i) |
||||||
|
return compInfo(v, sz) |
||||||
|
} |
||||||
|
|
||||||
|
func lookupInfoNFKC(b input, i int) Properties { |
||||||
|
v, sz := b.charinfoNFKC(i) |
||||||
|
return compInfo(v, sz) |
||||||
|
} |
||||||
|
|
||||||
|
// Properties returns properties for the first rune in s.
|
||||||
|
func (f Form) Properties(s []byte) Properties { |
||||||
|
if f == NFC || f == NFD { |
||||||
|
return compInfo(nfcData.lookup(s)) |
||||||
|
} |
||||||
|
return compInfo(nfkcData.lookup(s)) |
||||||
|
} |
||||||
|
|
||||||
|
// PropertiesString returns properties for the first rune in s.
|
||||||
|
func (f Form) PropertiesString(s string) Properties { |
||||||
|
if f == NFC || f == NFD { |
||||||
|
return compInfo(nfcData.lookupString(s)) |
||||||
|
} |
||||||
|
return compInfo(nfkcData.lookupString(s)) |
||||||
|
} |
||||||
|
|
||||||
|
// compInfo converts the information contained in v and sz
|
||||||
|
// to a Properties. See the comment at the top of the file
|
||||||
|
// for more information on the format.
|
||||||
|
func compInfo(v uint16, sz int) Properties { |
||||||
|
if v == 0 { |
||||||
|
return Properties{size: uint8(sz)} |
||||||
|
} else if v >= 0x8000 { |
||||||
|
p := Properties{ |
||||||
|
size: uint8(sz), |
||||||
|
ccc: uint8(v), |
||||||
|
tccc: uint8(v), |
||||||
|
flags: qcInfo(v >> 8), |
||||||
|
} |
||||||
|
if p.ccc > 0 || p.combinesBackward() { |
||||||
|
p.nLead = uint8(p.flags & 0x3) |
||||||
|
} |
||||||
|
return p |
||||||
|
} |
||||||
|
// has decomposition
|
||||||
|
h := decomps[v] |
||||||
|
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 |
||||||
|
p := Properties{size: uint8(sz), flags: f, index: v} |
||||||
|
if v >= firstCCC { |
||||||
|
v += uint16(h&headerLenMask) + 1 |
||||||
|
c := decomps[v] |
||||||
|
p.tccc = c >> 2 |
||||||
|
p.flags |= qcInfo(c & 0x3) |
||||||
|
if v >= firstLeadingCCC { |
||||||
|
p.nLead = c & 0x3 |
||||||
|
if v >= firstStarterWithNLead { |
||||||
|
// We were tricked. Remove the decomposition.
|
||||||
|
p.flags &= 0x03 |
||||||
|
p.index = 0 |
||||||
|
return p |
||||||
|
} |
||||||
|
p.ccc = decomps[v+1] |
||||||
|
} |
||||||
|
} |
||||||
|
return p |
||||||
|
} |
@ -0,0 +1,109 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import "unicode/utf8" |
||||||
|
|
||||||
|
// input wraps either a string or a byte slice so the same code can read
// from both. The methods read from bytes when it is non-nil and from str
// otherwise.
type input struct {
	str   string // source when bytes is nil
	bytes []byte // source when non-nil
}
||||||
|
|
||||||
|
func inputBytes(str []byte) input { |
||||||
|
return input{bytes: str} |
||||||
|
} |
||||||
|
|
||||||
|
func inputString(str string) input { |
||||||
|
return input{str: str} |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) setBytes(str []byte) { |
||||||
|
in.str = "" |
||||||
|
in.bytes = str |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) setString(str string) { |
||||||
|
in.str = str |
||||||
|
in.bytes = nil |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) _byte(p int) byte { |
||||||
|
if in.bytes == nil { |
||||||
|
return in.str[p] |
||||||
|
} |
||||||
|
return in.bytes[p] |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) skipASCII(p, max int) int { |
||||||
|
if in.bytes == nil { |
||||||
|
for ; p < max && in.str[p] < utf8.RuneSelf; p++ { |
||||||
|
} |
||||||
|
} else { |
||||||
|
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ { |
||||||
|
} |
||||||
|
} |
||||||
|
return p |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) skipContinuationBytes(p int) int { |
||||||
|
if in.bytes == nil { |
||||||
|
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ { |
||||||
|
} |
||||||
|
} else { |
||||||
|
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ { |
||||||
|
} |
||||||
|
} |
||||||
|
return p |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) appendSlice(buf []byte, b, e int) []byte { |
||||||
|
if in.bytes != nil { |
||||||
|
return append(buf, in.bytes[b:e]...) |
||||||
|
} |
||||||
|
for i := b; i < e; i++ { |
||||||
|
buf = append(buf, in.str[i]) |
||||||
|
} |
||||||
|
return buf |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) copySlice(buf []byte, b, e int) int { |
||||||
|
if in.bytes == nil { |
||||||
|
return copy(buf, in.str[b:e]) |
||||||
|
} |
||||||
|
return copy(buf, in.bytes[b:e]) |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) charinfoNFC(p int) (uint16, int) { |
||||||
|
if in.bytes == nil { |
||||||
|
return nfcData.lookupString(in.str[p:]) |
||||||
|
} |
||||||
|
return nfcData.lookup(in.bytes[p:]) |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) charinfoNFKC(p int) (uint16, int) { |
||||||
|
if in.bytes == nil { |
||||||
|
return nfkcData.lookupString(in.str[p:]) |
||||||
|
} |
||||||
|
return nfkcData.lookup(in.bytes[p:]) |
||||||
|
} |
||||||
|
|
||||||
|
func (in *input) hangul(p int) (r rune) { |
||||||
|
var size int |
||||||
|
if in.bytes == nil { |
||||||
|
if !isHangulString(in.str[p:]) { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
r, size = utf8.DecodeRuneInString(in.str[p:]) |
||||||
|
} else { |
||||||
|
if !isHangul(in.bytes[p:]) { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
r, size = utf8.DecodeRune(in.bytes[p:]) |
||||||
|
} |
||||||
|
if size != hangulUTF8Size { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
return r |
||||||
|
} |
@ -0,0 +1,458 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import ( |
||||||
|
"fmt" |
||||||
|
"unicode/utf8" |
||||||
|
) |
||||||
|
|
||||||
|
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
|
||||||
|
// sequence of starter and non-starter runes for the purpose of normalization.
|
||||||
|
const MaxSegmentSize = maxByteBufferSize |
||||||
|
|
||||||
|
// An Iter iterates over a string or byte slice, while normalizing it
|
||||||
|
// to a given Form.
|
||||||
|
type Iter struct { |
||||||
|
rb reorderBuffer |
||||||
|
buf [maxByteBufferSize]byte |
||||||
|
info Properties // first character saved from previous iteration
|
||||||
|
next iterFunc // implementation of next depends on form
|
||||||
|
asciiF iterFunc |
||||||
|
|
||||||
|
p int // current position in input source
|
||||||
|
multiSeg []byte // remainder of multi-segment decomposition
|
||||||
|
} |
||||||
|
|
||||||
|
type iterFunc func(*Iter) []byte |
||||||
|
|
||||||
|
// Init initializes i to iterate over src after normalizing it to Form f.
|
||||||
|
func (i *Iter) Init(f Form, src []byte) { |
||||||
|
i.p = 0 |
||||||
|
if len(src) == 0 { |
||||||
|
i.setDone() |
||||||
|
i.rb.nsrc = 0 |
||||||
|
return |
||||||
|
} |
||||||
|
i.multiSeg = nil |
||||||
|
i.rb.init(f, src) |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
i.asciiF = nextASCIIBytes |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
} |
||||||
|
|
||||||
|
// InitString initializes i to iterate over src after normalizing it to Form f.
|
||||||
|
func (i *Iter) InitString(f Form, src string) { |
||||||
|
i.p = 0 |
||||||
|
if len(src) == 0 { |
||||||
|
i.setDone() |
||||||
|
i.rb.nsrc = 0 |
||||||
|
return |
||||||
|
} |
||||||
|
i.multiSeg = nil |
||||||
|
i.rb.initString(f, src) |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
i.asciiF = nextASCIIString |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
} |
||||||
|
|
||||||
|
// Seek sets the segment to be returned by the next call to Next to start
|
||||||
|
// at position p. It is the responsibility of the caller to set p to the
|
||||||
|
// start of a segment.
|
||||||
|
func (i *Iter) Seek(offset int64, whence int) (int64, error) { |
||||||
|
var abs int64 |
||||||
|
switch whence { |
||||||
|
case 0: |
||||||
|
abs = offset |
||||||
|
case 1: |
||||||
|
abs = int64(i.p) + offset |
||||||
|
case 2: |
||||||
|
abs = int64(i.rb.nsrc) + offset |
||||||
|
default: |
||||||
|
return 0, fmt.Errorf("norm: invalid whence") |
||||||
|
} |
||||||
|
if abs < 0 { |
||||||
|
return 0, fmt.Errorf("norm: negative position") |
||||||
|
} |
||||||
|
if int(abs) >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
return int64(i.p), nil |
||||||
|
} |
||||||
|
i.p = int(abs) |
||||||
|
i.multiSeg = nil |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
return abs, nil |
||||||
|
} |
||||||
|
|
||||||
|
// returnSlice returns a slice of the underlying input type as a byte slice.
|
||||||
|
// If the underlying is of type []byte, it will simply return a slice.
|
||||||
|
// If the underlying is of type string, it will copy the slice to the buffer
|
||||||
|
// and return that.
|
||||||
|
func (i *Iter) returnSlice(a, b int) []byte { |
||||||
|
if i.rb.src.bytes == nil { |
||||||
|
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] |
||||||
|
} |
||||||
|
return i.rb.src.bytes[a:b] |
||||||
|
} |
||||||
|
|
||||||
|
// Pos returns the byte position at which the next call to Next will commence processing.
|
||||||
|
func (i *Iter) Pos() int { |
||||||
|
return i.p |
||||||
|
} |
||||||
|
|
||||||
|
func (i *Iter) setDone() { |
||||||
|
i.next = nextDone |
||||||
|
i.p = i.rb.nsrc |
||||||
|
} |
||||||
|
|
||||||
|
// Done returns true if there is no more input to process.
|
||||||
|
func (i *Iter) Done() bool { |
||||||
|
return i.p >= i.rb.nsrc |
||||||
|
} |
||||||
|
|
||||||
|
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
|
||||||
|
// For any input a and b for which f(a) == f(b), subsequent calls
|
||||||
|
// to Next will return the same segments.
|
||||||
|
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
|
||||||
|
// Although not guaranteed, n will typically be the smallest possible n.
|
||||||
|
func (i *Iter) Next() []byte { |
||||||
|
return i.next(i) |
||||||
|
} |
||||||
|
|
||||||
|
func nextASCIIBytes(i *Iter) []byte { |
||||||
|
p := i.p + 1 |
||||||
|
if p >= i.rb.nsrc { |
||||||
|
p0 := i.p |
||||||
|
i.setDone() |
||||||
|
return i.rb.src.bytes[p0:p] |
||||||
|
} |
||||||
|
if i.rb.src.bytes[p] < utf8.RuneSelf { |
||||||
|
p0 := i.p |
||||||
|
i.p = p |
||||||
|
return i.rb.src.bytes[p0:p] |
||||||
|
} |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
return i.next(i) |
||||||
|
} |
||||||
|
|
||||||
|
func nextASCIIString(i *Iter) []byte { |
||||||
|
p := i.p + 1 |
||||||
|
if p >= i.rb.nsrc { |
||||||
|
i.buf[0] = i.rb.src.str[i.p] |
||||||
|
i.setDone() |
||||||
|
return i.buf[:1] |
||||||
|
} |
||||||
|
if i.rb.src.str[p] < utf8.RuneSelf { |
||||||
|
i.buf[0] = i.rb.src.str[i.p] |
||||||
|
i.p = p |
||||||
|
return i.buf[:1] |
||||||
|
} |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
return i.next(i) |
||||||
|
} |
||||||
|
|
||||||
|
func nextHangul(i *Iter) []byte { |
||||||
|
p := i.p |
||||||
|
next := p + hangulUTF8Size |
||||||
|
if next >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
} else if i.rb.src.hangul(next) == 0 { |
||||||
|
i.rb.ss.next(i.info) |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
return i.next(i) |
||||||
|
} |
||||||
|
i.p = next |
||||||
|
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] |
||||||
|
} |
||||||
|
|
||||||
|
func nextDone(i *Iter) []byte { |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
// nextMulti is used for iterating over multi-segment decompositions
|
||||||
|
// for decomposing normal forms.
|
||||||
|
func nextMulti(i *Iter) []byte { |
||||||
|
j := 0 |
||||||
|
d := i.multiSeg |
||||||
|
// skip first rune
|
||||||
|
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { |
||||||
|
} |
||||||
|
for j < len(d) { |
||||||
|
info := i.rb.f.info(input{bytes: d}, j) |
||||||
|
if info.BoundaryBefore() { |
||||||
|
i.multiSeg = d[j:] |
||||||
|
return d[:j] |
||||||
|
} |
||||||
|
j += int(info.size) |
||||||
|
} |
||||||
|
// treat last segment as normal decomposition
|
||||||
|
i.next = i.rb.f.nextMain |
||||||
|
return i.next(i) |
||||||
|
} |
||||||
|
|
||||||
|
// nextMultiNorm is used for iterating over multi-segment decompositions
|
||||||
|
// for composing normal forms.
|
||||||
|
func nextMultiNorm(i *Iter) []byte { |
||||||
|
j := 0 |
||||||
|
d := i.multiSeg |
||||||
|
for j < len(d) { |
||||||
|
info := i.rb.f.info(input{bytes: d}, j) |
||||||
|
if info.BoundaryBefore() { |
||||||
|
i.rb.compose() |
||||||
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
||||||
|
i.rb.insertUnsafe(input{bytes: d}, j, info) |
||||||
|
i.multiSeg = d[j+int(info.size):] |
||||||
|
return seg |
||||||
|
} |
||||||
|
i.rb.insertUnsafe(input{bytes: d}, j, info) |
||||||
|
j += int(info.size) |
||||||
|
} |
||||||
|
i.multiSeg = nil |
||||||
|
i.next = nextComposed |
||||||
|
return doNormComposed(i) |
||||||
|
} |
||||||
|
|
||||||
|
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
||||||
|
func nextDecomposed(i *Iter) (next []byte) { |
||||||
|
outp := 0 |
||||||
|
inCopyStart, outCopyStart := i.p, 0 |
||||||
|
for { |
||||||
|
if sz := int(i.info.size); sz <= 1 { |
||||||
|
i.rb.ss = 0 |
||||||
|
p := i.p |
||||||
|
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
||||||
|
if i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
return i.returnSlice(p, i.p) |
||||||
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
||||||
|
i.next = i.asciiF |
||||||
|
return i.returnSlice(p, i.p) |
||||||
|
} |
||||||
|
outp++ |
||||||
|
} else if d := i.info.Decomposition(); d != nil { |
||||||
|
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
||||||
|
// Case 1: there is a leftover to copy. In this case the decomposition
|
||||||
|
// must begin with a modifier and should always be appended.
|
||||||
|
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
||||||
|
p := outp + len(d) |
||||||
|
if outp > 0 { |
||||||
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||||
|
// TODO: this condition should not be possible, but we leave it
|
||||||
|
// in for defensive purposes.
|
||||||
|
if p > len(i.buf) { |
||||||
|
return i.buf[:outp] |
||||||
|
} |
||||||
|
} else if i.info.multiSegment() { |
||||||
|
// outp must be 0 as multi-segment decompositions always
|
||||||
|
// start a new segment.
|
||||||
|
if i.multiSeg == nil { |
||||||
|
i.multiSeg = d |
||||||
|
i.next = nextMulti |
||||||
|
return nextMulti(i) |
||||||
|
} |
||||||
|
// We are in the last segment. Treat as normal decomposition.
|
||||||
|
d = i.multiSeg |
||||||
|
i.multiSeg = nil |
||||||
|
p = len(d) |
||||||
|
} |
||||||
|
prevCC := i.info.tccc |
||||||
|
if i.p += sz; i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
i.info = Properties{} // Force BoundaryBefore to succeed.
|
||||||
|
} else { |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
} |
||||||
|
switch i.rb.ss.next(i.info) { |
||||||
|
case ssOverflow: |
||||||
|
i.next = nextCGJDecompose |
||||||
|
fallthrough |
||||||
|
case ssStarter: |
||||||
|
if outp > 0 { |
||||||
|
copy(i.buf[outp:], d) |
||||||
|
return i.buf[:p] |
||||||
|
} |
||||||
|
return d |
||||||
|
} |
||||||
|
copy(i.buf[outp:], d) |
||||||
|
outp = p |
||||||
|
inCopyStart, outCopyStart = i.p, outp |
||||||
|
if i.info.ccc < prevCC { |
||||||
|
goto doNorm |
||||||
|
} |
||||||
|
continue |
||||||
|
} else if r := i.rb.src.hangul(i.p); r != 0 { |
||||||
|
outp = decomposeHangul(i.buf[:], r) |
||||||
|
i.p += hangulUTF8Size |
||||||
|
inCopyStart, outCopyStart = i.p, outp |
||||||
|
if i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
break |
||||||
|
} else if i.rb.src.hangul(i.p) != 0 { |
||||||
|
i.next = nextHangul |
||||||
|
return i.buf[:outp] |
||||||
|
} |
||||||
|
} else { |
||||||
|
p := outp + sz |
||||||
|
if p > len(i.buf) { |
||||||
|
break |
||||||
|
} |
||||||
|
outp = p |
||||||
|
i.p += sz |
||||||
|
} |
||||||
|
if i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
break |
||||||
|
} |
||||||
|
prevCC := i.info.tccc |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
if v := i.rb.ss.next(i.info); v == ssStarter { |
||||||
|
break |
||||||
|
} else if v == ssOverflow { |
||||||
|
i.next = nextCGJDecompose |
||||||
|
break |
||||||
|
} |
||||||
|
if i.info.ccc < prevCC { |
||||||
|
goto doNorm |
||||||
|
} |
||||||
|
} |
||||||
|
if outCopyStart == 0 { |
||||||
|
return i.returnSlice(inCopyStart, i.p) |
||||||
|
} else if inCopyStart < i.p { |
||||||
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||||
|
} |
||||||
|
return i.buf[:outp] |
||||||
|
doNorm: |
||||||
|
// Insert what we have decomposed so far in the reorderBuffer.
|
||||||
|
// As we will only reorder, there will always be enough room.
|
||||||
|
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||||
|
i.rb.insertDecomposed(i.buf[0:outp]) |
||||||
|
return doNormDecomposed(i) |
||||||
|
} |
||||||
|
|
||||||
|
func doNormDecomposed(i *Iter) []byte { |
||||||
|
for { |
||||||
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||||
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
break |
||||||
|
} |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
if i.info.ccc == 0 { |
||||||
|
break |
||||||
|
} |
||||||
|
if s := i.rb.ss.next(i.info); s == ssOverflow { |
||||||
|
i.next = nextCGJDecompose |
||||||
|
break |
||||||
|
} |
||||||
|
} |
||||||
|
// new segment or too many combining characters: exit normalization
|
||||||
|
return i.buf[:i.rb.flushCopy(i.buf[:])] |
||||||
|
} |
||||||
|
|
||||||
|
func nextCGJDecompose(i *Iter) []byte { |
||||||
|
i.rb.ss = 0 |
||||||
|
i.rb.insertCGJ() |
||||||
|
i.next = nextDecomposed |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
buf := doNormDecomposed(i) |
||||||
|
return buf |
||||||
|
} |
||||||
|
|
||||||
|
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
||||||
|
func nextComposed(i *Iter) []byte { |
||||||
|
outp, startp := 0, i.p |
||||||
|
var prevCC uint8 |
||||||
|
for { |
||||||
|
if !i.info.isYesC() { |
||||||
|
goto doNorm |
||||||
|
} |
||||||
|
prevCC = i.info.tccc |
||||||
|
sz := int(i.info.size) |
||||||
|
if sz == 0 { |
||||||
|
sz = 1 // illegal rune: copy byte-by-byte
|
||||||
|
} |
||||||
|
p := outp + sz |
||||||
|
if p > len(i.buf) { |
||||||
|
break |
||||||
|
} |
||||||
|
outp = p |
||||||
|
i.p += sz |
||||||
|
if i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
break |
||||||
|
} else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
||||||
|
i.rb.ss = 0 |
||||||
|
i.next = i.asciiF |
||||||
|
break |
||||||
|
} |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
if v := i.rb.ss.next(i.info); v == ssStarter { |
||||||
|
break |
||||||
|
} else if v == ssOverflow { |
||||||
|
i.next = nextCGJCompose |
||||||
|
break |
||||||
|
} |
||||||
|
if i.info.ccc < prevCC { |
||||||
|
goto doNorm |
||||||
|
} |
||||||
|
} |
||||||
|
return i.returnSlice(startp, i.p) |
||||||
|
doNorm: |
||||||
|
// reset to start position
|
||||||
|
i.p = startp |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
if i.info.multiSegment() { |
||||||
|
d := i.info.Decomposition() |
||||||
|
info := i.rb.f.info(input{bytes: d}, 0) |
||||||
|
i.rb.insertUnsafe(input{bytes: d}, 0, info) |
||||||
|
i.multiSeg = d[int(info.size):] |
||||||
|
i.next = nextMultiNorm |
||||||
|
return nextMultiNorm(i) |
||||||
|
} |
||||||
|
i.rb.ss.first(i.info) |
||||||
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||||
|
return doNormComposed(i) |
||||||
|
} |
||||||
|
|
||||||
|
func doNormComposed(i *Iter) []byte { |
||||||
|
// First rune should already be inserted.
|
||||||
|
for { |
||||||
|
if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
||||||
|
i.setDone() |
||||||
|
break |
||||||
|
} |
||||||
|
i.info = i.rb.f.info(i.rb.src, i.p) |
||||||
|
if s := i.rb.ss.next(i.info); s == ssStarter { |
||||||
|
break |
||||||
|
} else if s == ssOverflow { |
||||||
|
i.next = nextCGJCompose |
||||||
|
break |
||||||
|
} |
||||||
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||||
|
} |
||||||
|
i.rb.compose() |
||||||
|
seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
||||||
|
return seg |
||||||
|
} |
||||||
|
|
||||||
|
func nextCGJCompose(i *Iter) []byte { |
||||||
|
i.rb.ss = 0 // instead of first
|
||||||
|
i.rb.insertCGJ() |
||||||
|
i.next = nextComposed |
||||||
|
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
|
||||||
|
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
|
||||||
|
// If we ever change that, insert a check here.
|
||||||
|
i.rb.ss.first(i.info) |
||||||
|
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||||
|
return doNormComposed(i) |
||||||
|
} |
@ -0,0 +1,986 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// +build ignore
|
||||||
|
|
||||||
|
// Normalization table generator.
|
||||||
|
// Data read from the web.
|
||||||
|
// See forminfo.go for a description of the trie values associated with each rune.
|
||||||
|
|
||||||
|
package main |
||||||
|
|
||||||
|
import ( |
||||||
|
"bytes" |
||||||
|
"encoding/binary" |
||||||
|
"flag" |
||||||
|
"fmt" |
||||||
|
"io" |
||||||
|
"log" |
||||||
|
"sort" |
||||||
|
"strconv" |
||||||
|
"strings" |
||||||
|
|
||||||
|
"golang.org/x/text/internal/gen" |
||||||
|
"golang.org/x/text/internal/triegen" |
||||||
|
"golang.org/x/text/internal/ucd" |
||||||
|
) |
||||||
|
|
||||||
|
func main() { |
||||||
|
gen.Init() |
||||||
|
loadUnicodeData() |
||||||
|
compactCCC() |
||||||
|
loadCompositionExclusions() |
||||||
|
completeCharFields(FCanonical) |
||||||
|
completeCharFields(FCompatibility) |
||||||
|
computeNonStarterCounts() |
||||||
|
verifyComputed() |
||||||
|
printChars() |
||||||
|
testDerived() |
||||||
|
printTestdata() |
||||||
|
makeTables() |
||||||
|
} |
||||||
|
|
||||||
|
var ( |
||||||
|
tablelist = flag.String("tables", |
||||||
|
"all", |
||||||
|
"comma-separated list of which tables to generate; "+ |
||||||
|
"can be 'decomp', 'recomp', 'info' and 'all'") |
||||||
|
test = flag.Bool("test", |
||||||
|
false, |
||||||
|
"test existing tables against DerivedNormalizationProps and generate test data for regression testing") |
||||||
|
verbose = flag.Bool("verbose", |
||||||
|
false, |
||||||
|
"write data to stdout as it is parsed") |
||||||
|
) |
||||||
|
|
||||||
|
// MaxChar is the highest code point handled by the generator; anything above
// this shouldn't exist.
const MaxChar = 0x10FFFF
||||||
|
|
||||||
|
// QCResult is a Quick Check property value. Quick Check properties of runes
// allow us to quickly determine whether a rune may occur in a normal form.
// For a given normal form, a rune may be guaranteed to occur verbatim
// (QC=Yes), may or may not combine with another rune (QC=Maybe), or may not
// occur (QC=No).
type QCResult int

const (
	QCUnknown QCResult = iota
	QCYes
	QCNo
	QCMaybe
)

// String returns "Yes", "No" or "Maybe" for the corresponding constants and
// "***UNKNOWN***" for every other value, including QCUnknown.
func (r QCResult) String() string {
	names := [...]string{QCYes: "Yes", QCNo: "No", QCMaybe: "Maybe"}
	if r > QCUnknown && int(r) < len(names) {
		return names[r]
	}
	return "***UNKNOWN***"
}
||||||
|
|
||||||
|
// Form types. They index Char.forms.
const (
	FCanonical         = iota // NFC or NFD
	FCompatibility            // NFKC or NFKD
	FNumberOfFormTypes        // number of form types; used as an array length
)

// Modes. They index FormInfo.quickCheck and FormInfo.verified.
const (
	MComposed      = iota // NFC or NFKC
	MDecomposed           // NFD or NFKD
	MNumberOfModes        // number of modes; used as an array length
)
||||||
|
|
||||||
|
// This contains only the properties we're interested in.
|
||||||
|
type Char struct { |
||||||
|
name string |
||||||
|
codePoint rune // if zero, this index is not a valid code point.
|
||||||
|
ccc uint8 // canonical combining class
|
||||||
|
origCCC uint8 |
||||||
|
excludeInComp bool // from CompositionExclusions.txt
|
||||||
|
compatDecomp bool // it has a compatibility expansion
|
||||||
|
|
||||||
|
nTrailingNonStarters uint8 |
||||||
|
nLeadingNonStarters uint8 // must be equal to trailing if non-zero
|
||||||
|
|
||||||
|
forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
|
||||||
|
|
||||||
|
state State |
||||||
|
} |
||||||
|
|
||||||
|
var chars = make([]Char, MaxChar+1) |
||||||
|
var cccMap = make(map[uint8]uint8) |
||||||
|
|
||||||
|
func (c Char) String() string { |
||||||
|
buf := new(bytes.Buffer) |
||||||
|
|
||||||
|
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) |
||||||
|
fmt.Fprintf(buf, " ccc: %v\n", c.ccc) |
||||||
|
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) |
||||||
|
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) |
||||||
|
fmt.Fprintf(buf, " state: %v\n", c.state) |
||||||
|
fmt.Fprintf(buf, " NFC:\n") |
||||||
|
fmt.Fprint(buf, c.forms[FCanonical]) |
||||||
|
fmt.Fprintf(buf, " NFKC:\n") |
||||||
|
fmt.Fprint(buf, c.forms[FCompatibility]) |
||||||
|
|
||||||
|
return buf.String() |
||||||
|
} |
||||||
|
|
||||||
|
// State records how a character entry relates to the range markers of
// UnicodeData.txt, in which some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCharacter keeps a state variable indicating the weirdness.
type State int

const (
	SNormal State = iota // known to be zero for the type
	SFirst
	SLast
	SMissing
)
||||||
|
|
||||||
|
// lastChar starts out as U+0000.
var lastChar = rune('\u0000')
||||||
|
|
||||||
|
func (c Char) isValid() bool { |
||||||
|
return c.codePoint != 0 && c.state != SMissing |
||||||
|
} |
||||||
|
|
||||||
|
type FormInfo struct { |
||||||
|
quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
|
||||||
|
verified [MNumberOfModes]bool // index: MComposed or MDecomposed
|
||||||
|
|
||||||
|
combinesForward bool // May combine with rune on the right
|
||||||
|
combinesBackward bool // May combine with rune on the left
|
||||||
|
isOneWay bool // Never appears in result
|
||||||
|
inDecomp bool // Some decompositions result in this char.
|
||||||
|
decomp Decomposition |
||||||
|
expandedDecomp Decomposition |
||||||
|
} |
||||||
|
|
||||||
|
func (f FormInfo) String() string { |
||||||
|
buf := bytes.NewBuffer(make([]byte, 0)) |
||||||
|
|
||||||
|
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) |
||||||
|
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) |
||||||
|
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) |
||||||
|
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) |
||||||
|
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) |
||||||
|
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) |
||||||
|
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) |
||||||
|
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) |
||||||
|
|
||||||
|
return buf.String() |
||||||
|
} |
||||||
|
|
||||||
|
// Decomposition is the sequence of runes a character decomposes into.
type Decomposition []rune
||||||
|
|
||||||
|
// parseDecomposition parses s, a list of hexadecimal code points separated by
// single spaces, into a slice of runes. If skipfirst is set, the first field
// is dropped before parsing. On a parse failure it returns the runes parsed
// up to that point together with the error.
func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
	fields := strings.Split(s, " ")
	// strings.Split with a non-empty separator always yields at least one
	// field, so dropping the first one is always in range.
	if skipfirst {
		fields = fields[1:]
	}
	for _, field := range fields {
		codePoint, perr := strconv.ParseUint(field, 16, 64)
		if perr != nil {
			return a, perr
		}
		a = append(a, rune(codePoint))
	}
	return a, nil
}
||||||
|
|
||||||
|
func loadUnicodeData() { |
||||||
|
f := gen.OpenUCDFile("UnicodeData.txt") |
||||||
|
defer f.Close() |
||||||
|
p := ucd.New(f) |
||||||
|
for p.Next() { |
||||||
|
r := p.Rune(ucd.CodePoint) |
||||||
|
char := &chars[r] |
||||||
|
|
||||||
|
char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) |
||||||
|
decmap := p.String(ucd.DecompMapping) |
||||||
|
|
||||||
|
exp, err := parseDecomposition(decmap, false) |
||||||
|
isCompat := false |
||||||
|
if err != nil { |
||||||
|
if len(decmap) > 0 { |
||||||
|
exp, err = parseDecomposition(decmap, true) |
||||||
|
if err != nil { |
||||||
|
log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) |
||||||
|
} |
||||||
|
isCompat = true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
char.name = p.String(ucd.Name) |
||||||
|
char.codePoint = r |
||||||
|
char.forms[FCompatibility].decomp = exp |
||||||
|
if !isCompat { |
||||||
|
char.forms[FCanonical].decomp = exp |
||||||
|
} else { |
||||||
|
char.compatDecomp = true |
||||||
|
} |
||||||
|
if len(decmap) > 0 { |
||||||
|
char.forms[FCompatibility].decomp = exp |
||||||
|
} |
||||||
|
} |
||||||
|
if err := p.Err(); err != nil { |
||||||
|
log.Fatal(err) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// compactCCC converts the sparse set of CCC values to a continguous one,
|
||||||
|
// reducing the number of bits needed from 8 to 6.
|
||||||
|
func compactCCC() { |
||||||
|
m := make(map[uint8]uint8) |
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
m[c.ccc] = 0 |
||||||
|
} |
||||||
|
cccs := []int{} |
||||||
|
for v, _ := range m { |
||||||
|
cccs = append(cccs, int(v)) |
||||||
|
} |
||||||
|
sort.Ints(cccs) |
||||||
|
for i, c := range cccs { |
||||||
|
cccMap[uint8(i)] = uint8(c) |
||||||
|
m[uint8(c)] = uint8(i) |
||||||
|
} |
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
c.origCCC = c.ccc |
||||||
|
c.ccc = m[c.ccc] |
||||||
|
} |
||||||
|
if len(m) >= 1<<6 { |
||||||
|
log.Fatalf("too many difference CCC values: %d >= 64", len(m)) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// CompositionExclusions.txt has form:
|
||||||
|
// 0958 # ...
|
||||||
|
// See https://unicode.org/reports/tr44/ for full explanation
|
||||||
|
func loadCompositionExclusions() { |
||||||
|
f := gen.OpenUCDFile("CompositionExclusions.txt") |
||||||
|
defer f.Close() |
||||||
|
p := ucd.New(f) |
||||||
|
for p.Next() { |
||||||
|
c := &chars[p.Rune(0)] |
||||||
|
if c.excludeInComp { |
||||||
|
log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) |
||||||
|
} |
||||||
|
c.excludeInComp = true |
||||||
|
} |
||||||
|
if e := p.Err(); e != nil { |
||||||
|
log.Fatal(e) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// hasCompatDecomp returns true if any of the recursive
|
||||||
|
// decompositions contains a compatibility expansion.
|
||||||
|
// In this case, the character may not occur in NFK*.
|
||||||
|
func hasCompatDecomp(r rune) bool { |
||||||
|
c := &chars[r] |
||||||
|
if c.compatDecomp { |
||||||
|
return true |
||||||
|
} |
||||||
|
for _, d := range c.forms[FCompatibility].decomp { |
||||||
|
if hasCompatDecomp(d) { |
||||||
|
return true |
||||||
|
} |
||||||
|
} |
||||||
|
return false |
||||||
|
} |
||||||
|
|
||||||
|
// Hangul related constants.
const (
	HangulBase = 0xAC00
	HangulEnd  = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)

	JamoLBase = 0x1100
	JamoLEnd  = 0x1113
	JamoVBase = 0x1161
	JamoVEnd  = 0x1176
	JamoTBase = 0x11A8
	JamoTEnd  = 0x11C3

	JamoLVTCount = 19 * 21 * 28
	JamoTCount   = 28
)

// isHangul reports whether r lies in the half-open range
// [HangulBase, HangulEnd).
func isHangul(r rune) bool {
	return r >= HangulBase && r < HangulEnd
}

// isHangulWithoutJamoT reports whether r is in the Hangul range and its offset
// from HangulBase is a multiple of JamoTCount.
func isHangulWithoutJamoT(r rune) bool {
	offset := r - HangulBase
	return isHangul(r) && offset < JamoLVTCount && offset%JamoTCount == 0
}
||||||
|
|
||||||
|
func ccc(r rune) uint8 { |
||||||
|
return chars[r].ccc |
||||||
|
} |
||||||
|
|
||||||
|
// Insert a rune in a buffer, ordered by Canonical Combining Class.
|
||||||
|
func insertOrdered(b Decomposition, r rune) Decomposition { |
||||||
|
n := len(b) |
||||||
|
b = append(b, 0) |
||||||
|
cc := ccc(r) |
||||||
|
if cc > 0 { |
||||||
|
// Use bubble sort.
|
||||||
|
for ; n > 0; n-- { |
||||||
|
if ccc(b[n-1]) <= cc { |
||||||
|
break |
||||||
|
} |
||||||
|
b[n] = b[n-1] |
||||||
|
} |
||||||
|
} |
||||||
|
b[n] = r |
||||||
|
return b |
||||||
|
} |
||||||
|
|
||||||
|
// Recursively decompose.
|
||||||
|
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { |
||||||
|
dcomp := chars[r].forms[form].decomp |
||||||
|
if len(dcomp) == 0 { |
||||||
|
return insertOrdered(d, r) |
||||||
|
} |
||||||
|
for _, c := range dcomp { |
||||||
|
d = decomposeRecursive(form, c, d) |
||||||
|
} |
||||||
|
return d |
||||||
|
} |
||||||
|
|
||||||
|
func completeCharFields(form int) { |
||||||
|
// Phase 0: pre-expand decomposition.
|
||||||
|
for i := range chars { |
||||||
|
f := &chars[i].forms[form] |
||||||
|
if len(f.decomp) == 0 { |
||||||
|
continue |
||||||
|
} |
||||||
|
exp := make(Decomposition, 0) |
||||||
|
for _, c := range f.decomp { |
||||||
|
exp = decomposeRecursive(form, c, exp) |
||||||
|
} |
||||||
|
f.expandedDecomp = exp |
||||||
|
} |
||||||
|
|
||||||
|
// Phase 1: composition exclusion, mark decomposition.
|
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
f := &c.forms[form] |
||||||
|
|
||||||
|
// Marks script-specific exclusions and version restricted.
|
||||||
|
f.isOneWay = c.excludeInComp |
||||||
|
|
||||||
|
// Singletons
|
||||||
|
f.isOneWay = f.isOneWay || len(f.decomp) == 1 |
||||||
|
|
||||||
|
// Non-starter decompositions
|
||||||
|
if len(f.decomp) > 1 { |
||||||
|
chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 |
||||||
|
f.isOneWay = f.isOneWay || chk |
||||||
|
} |
||||||
|
|
||||||
|
// Runes that decompose into more than two runes.
|
||||||
|
f.isOneWay = f.isOneWay || len(f.decomp) > 2 |
||||||
|
|
||||||
|
if form == FCompatibility { |
||||||
|
f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) |
||||||
|
} |
||||||
|
|
||||||
|
for _, r := range f.decomp { |
||||||
|
chars[r].forms[form].inDecomp = true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Phase 2: forward and backward combining.
|
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
f := &c.forms[form] |
||||||
|
|
||||||
|
if !f.isOneWay && len(f.decomp) == 2 { |
||||||
|
f0 := &chars[f.decomp[0]].forms[form] |
||||||
|
f1 := &chars[f.decomp[1]].forms[form] |
||||||
|
if !f0.isOneWay { |
||||||
|
f0.combinesForward = true |
||||||
|
} |
||||||
|
if !f1.isOneWay { |
||||||
|
f1.combinesBackward = true |
||||||
|
} |
||||||
|
} |
||||||
|
if isHangulWithoutJamoT(rune(i)) { |
||||||
|
f.combinesForward = true |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Phase 3: quick check values.
|
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
f := &c.forms[form] |
||||||
|
|
||||||
|
switch { |
||||||
|
case len(f.decomp) > 0: |
||||||
|
f.quickCheck[MDecomposed] = QCNo |
||||||
|
case isHangul(rune(i)): |
||||||
|
f.quickCheck[MDecomposed] = QCNo |
||||||
|
default: |
||||||
|
f.quickCheck[MDecomposed] = QCYes |
||||||
|
} |
||||||
|
switch { |
||||||
|
case f.isOneWay: |
||||||
|
f.quickCheck[MComposed] = QCNo |
||||||
|
case (i & 0xffff00) == JamoLBase: |
||||||
|
f.quickCheck[MComposed] = QCYes |
||||||
|
if JamoLBase <= i && i < JamoLEnd { |
||||||
|
f.combinesForward = true |
||||||
|
} |
||||||
|
if JamoVBase <= i && i < JamoVEnd { |
||||||
|
f.quickCheck[MComposed] = QCMaybe |
||||||
|
f.combinesBackward = true |
||||||
|
f.combinesForward = true |
||||||
|
} |
||||||
|
if JamoTBase <= i && i < JamoTEnd { |
||||||
|
f.quickCheck[MComposed] = QCMaybe |
||||||
|
f.combinesBackward = true |
||||||
|
} |
||||||
|
case !f.combinesBackward: |
||||||
|
f.quickCheck[MComposed] = QCYes |
||||||
|
default: |
||||||
|
f.quickCheck[MComposed] = QCMaybe |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
func computeNonStarterCounts() { |
||||||
|
// Phase 4: leading and trailing non-starter count
|
||||||
|
for i := range chars { |
||||||
|
c := &chars[i] |
||||||
|
|
||||||
|
runes := []rune{rune(i)} |
||||||
|
// We always use FCompatibility so that the CGJ insertion points do not
|
||||||
|
// change for repeated normalizations with different forms.
|
||||||
|
if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { |
||||||
|
runes = exp |
||||||
|
} |
||||||
|
// We consider runes that combine backwards to be non-starters for the
|
||||||
|
// purpose of Stream-Safe Text Processing.
|
||||||
|
for _, r := range runes { |
||||||
|
if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
||||||
|
break |
||||||
|
} |
||||||
|
c.nLeadingNonStarters++ |
||||||
|
} |
||||||
|
for i := len(runes) - 1; i >= 0; i-- { |
||||||
|
if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
||||||
|
break |
||||||
|
} |
||||||
|
c.nTrailingNonStarters++ |
||||||
|
} |
||||||
|
if c.nTrailingNonStarters > 3 { |
||||||
|
log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) |
||||||
|
} |
||||||
|
|
||||||
|
if isHangul(rune(i)) { |
||||||
|
c.nTrailingNonStarters = 2 |
||||||
|
if isHangulWithoutJamoT(rune(i)) { |
||||||
|
c.nTrailingNonStarters = 1 |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { |
||||||
|
log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) |
||||||
|
} |
||||||
|
if t := c.nTrailingNonStarters; t > 3 { |
||||||
|
log.Fatalf("%U: number of trailing non-starters is %d > 3", t) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// printBytes writes b to w as the Go source of a byte array variable called
// name. A size comment precedes the declaration, a "// Bytes x - y" marker
// starts every group of 64 bytes, and a line break follows every 8 bytes.
func printBytes(w io.Writer, b []byte, name string) {
	fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
	fmt.Fprintf(w, "var %s = [...]byte {", name)
	for pos, val := range b {
		if pos%64 == 0 {
			fmt.Fprintf(w, "\n// Bytes %x - %x\n", pos, pos+63)
		} else if pos%8 == 0 {
			fmt.Fprintf(w, "\n")
		}
		fmt.Fprintf(w, "0x%.2X, ", val)
	}
	fmt.Fprint(w, "\n}\n\n")
}
||||||
|
|
||||||
|
// See forminfo.go for format.
|
||||||
|
func makeEntry(f *FormInfo, c *Char) uint16 { |
||||||
|
e := uint16(0) |
||||||
|
if r := c.codePoint; HangulBase <= r && r < HangulEnd { |
||||||
|
e |= 0x40 |
||||||
|
} |
||||||
|
if f.combinesForward { |
||||||
|
e |= 0x20 |
||||||
|
} |
||||||
|
if f.quickCheck[MDecomposed] == QCNo { |
||||||
|
e |= 0x4 |
||||||
|
} |
||||||
|
switch f.quickCheck[MComposed] { |
||||||
|
case QCYes: |
||||||
|
case QCNo: |
||||||
|
e |= 0x10 |
||||||
|
case QCMaybe: |
||||||
|
e |= 0x18 |
||||||
|
default: |
||||||
|
log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) |
||||||
|
} |
||||||
|
e |= uint16(c.nTrailingNonStarters) |
||||||
|
return e |
||||||
|
} |
||||||
|
|
||||||
|
// decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC.
// Each of the seven buckets is a set of encoded decomposition strings.
type decompSet [7]map[string]bool

// Bucket indices into a decompSet. lastDecomp is one past the final bucket.
const (
	normalDecomp = iota
	firstMulti
	firstCCC
	endMulti
	firstLeadingCCC
	firstCCCZeroExcept
	firstStarterWithNLead
	lastDecomp
)

// cname[i] is the name of the constant printed once bucket i has been
// written out, i.e. the offset at which the following bucket begins.
var cname = []string{
	"firstMulti",
	"firstCCC",
	"endMulti",
	"firstLeadingCCC",
	"firstCCCZeroExcept",
	"firstStarterWithNLead",
	"lastDecomp",
}

// makeDecompSet returns a decompSet in which every bucket is an empty,
// writable map.
func makeDecompSet() decompSet {
	var set decompSet
	for bucket := range set {
		set[bucket] = map[string]bool{}
	}
	return set
}

// insert records s in the bucket selected by key.
func (m *decompSet) insert(key int, s string) {
	(*m)[key][s] = true
}
||||||
|
|
||||||
|
func printCharInfoTables(w io.Writer) int { |
||||||
|
mkstr := func(r rune, f *FormInfo) (int, string) { |
||||||
|
d := f.expandedDecomp |
||||||
|
s := string([]rune(d)) |
||||||
|
if max := 1 << 6; len(s) >= max { |
||||||
|
const msg = "%U: too many bytes in decomposition: %d >= %d" |
||||||
|
log.Fatalf(msg, r, len(s), max) |
||||||
|
} |
||||||
|
head := uint8(len(s)) |
||||||
|
if f.quickCheck[MComposed] != QCYes { |
||||||
|
head |= 0x40 |
||||||
|
} |
||||||
|
if f.combinesForward { |
||||||
|
head |= 0x80 |
||||||
|
} |
||||||
|
s = string([]byte{head}) + s |
||||||
|
|
||||||
|
lccc := ccc(d[0]) |
||||||
|
tccc := ccc(d[len(d)-1]) |
||||||
|
cc := ccc(r) |
||||||
|
if cc != 0 && lccc == 0 && tccc == 0 { |
||||||
|
log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc) |
||||||
|
} |
||||||
|
if tccc < lccc && lccc != 0 { |
||||||
|
const msg = "%U: lccc (%d) must be <= tcc (%d)" |
||||||
|
log.Fatalf(msg, r, lccc, tccc) |
||||||
|
} |
||||||
|
index := normalDecomp |
||||||
|
nTrail := chars[r].nTrailingNonStarters |
||||||
|
nLead := chars[r].nLeadingNonStarters |
||||||
|
if tccc > 0 || lccc > 0 || nTrail > 0 { |
||||||
|
tccc <<= 2 |
||||||
|
tccc |= nTrail |
||||||
|
s += string([]byte{tccc}) |
||||||
|
index = endMulti |
||||||
|
for _, r := range d[1:] { |
||||||
|
if ccc(r) == 0 { |
||||||
|
index = firstCCC |
||||||
|
} |
||||||
|
} |
||||||
|
if lccc > 0 || nLead > 0 { |
||||||
|
s += string([]byte{lccc}) |
||||||
|
if index == firstCCC { |
||||||
|
log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r) |
||||||
|
} |
||||||
|
index = firstLeadingCCC |
||||||
|
} |
||||||
|
if cc != lccc { |
||||||
|
if cc != 0 { |
||||||
|
log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) |
||||||
|
} |
||||||
|
index = firstCCCZeroExcept |
||||||
|
} |
||||||
|
} else if len(d) > 1 { |
||||||
|
index = firstMulti |
||||||
|
} |
||||||
|
return index, s |
||||||
|
} |
||||||
|
|
||||||
|
decompSet := makeDecompSet() |
||||||
|
const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
|
||||||
|
decompSet.insert(firstStarterWithNLead, nLeadStr) |
||||||
|
|
||||||
|
// Store the uniqued decompositions in a byte buffer,
|
||||||
|
// preceded by their byte length.
|
||||||
|
for _, c := range chars { |
||||||
|
for _, f := range c.forms { |
||||||
|
if len(f.expandedDecomp) == 0 { |
||||||
|
continue |
||||||
|
} |
||||||
|
if f.combinesBackward { |
||||||
|
log.Fatalf("%U: combinesBackward and decompose", c.codePoint) |
||||||
|
} |
||||||
|
index, s := mkstr(c.codePoint, &f) |
||||||
|
decompSet.insert(index, s) |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) |
||||||
|
size := 0 |
||||||
|
positionMap := make(map[string]uint16) |
||||||
|
decompositions.WriteString("\000") |
||||||
|
fmt.Fprintln(w, "const (") |
||||||
|
for i, m := range decompSet { |
||||||
|
sa := []string{} |
||||||
|
for s := range m { |
||||||
|
sa = append(sa, s) |
||||||
|
} |
||||||
|
sort.Strings(sa) |
||||||
|
for _, s := range sa { |
||||||
|
p := decompositions.Len() |
||||||
|
decompositions.WriteString(s) |
||||||
|
positionMap[s] = uint16(p) |
||||||
|
} |
||||||
|
if cname[i] != "" { |
||||||
|
fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len()) |
||||||
|
} |
||||||
|
} |
||||||
|
fmt.Fprintln(w, "maxDecomp = 0x8000") |
||||||
|
fmt.Fprintln(w, ")") |
||||||
|
b := decompositions.Bytes() |
||||||
|
printBytes(w, b, "decomps") |
||||||
|
size += len(b) |
||||||
|
|
||||||
|
varnames := []string{"nfc", "nfkc"} |
||||||
|
for i := 0; i < FNumberOfFormTypes; i++ { |
||||||
|
trie := triegen.NewTrie(varnames[i]) |
||||||
|
|
||||||
|
for r, c := range chars { |
||||||
|
f := c.forms[i] |
||||||
|
d := f.expandedDecomp |
||||||
|
if len(d) != 0 { |
||||||
|
_, key := mkstr(c.codePoint, &f) |
||||||
|
trie.Insert(rune(r), uint64(positionMap[key])) |
||||||
|
if c.ccc != ccc(d[0]) { |
||||||
|
// We assume the lead ccc of a decomposition !=0 in this case.
|
||||||
|
if ccc(d[0]) == 0 { |
||||||
|
log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc) |
||||||
|
} |
||||||
|
} |
||||||
|
} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward { |
||||||
|
// Handle cases where it can't be detected that the nLead should be equal
|
||||||
|
// to nTrail.
|
||||||
|
trie.Insert(c.codePoint, uint64(positionMap[nLeadStr])) |
||||||
|
} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 { |
||||||
|
trie.Insert(c.codePoint, uint64(0x8000|v)) |
||||||
|
} |
||||||
|
} |
||||||
|
sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]})) |
||||||
|
if err != nil { |
||||||
|
log.Fatal(err) |
||||||
|
} |
||||||
|
size += sz |
||||||
|
} |
||||||
|
return size |
||||||
|
} |
||||||
|
|
||||||
|
// contains reports whether s is an element of sa.
func contains(sa []string, s string) bool {
	for i := 0; i < len(sa); i++ {
		if sa[i] == s {
			return true
		}
	}
	return false
}
||||||
|
|
||||||
|
func makeTables() { |
||||||
|
w := &bytes.Buffer{} |
||||||
|
|
||||||
|
size := 0 |
||||||
|
if *tablelist == "" { |
||||||
|
return |
||||||
|
} |
||||||
|
list := strings.Split(*tablelist, ",") |
||||||
|
if *tablelist == "all" { |
||||||
|
list = []string{"recomp", "info"} |
||||||
|
} |
||||||
|
|
||||||
|
// Compute maximum decomposition size.
|
||||||
|
max := 0 |
||||||
|
for _, c := range chars { |
||||||
|
if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { |
||||||
|
max = n |
||||||
|
} |
||||||
|
} |
||||||
|
fmt.Fprintln(w, `import "sync"`) |
||||||
|
fmt.Fprintln(w) |
||||||
|
|
||||||
|
fmt.Fprintln(w, "const (") |
||||||
|
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") |
||||||
|
fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) |
||||||
|
fmt.Fprintln(w) |
||||||
|
fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") |
||||||
|
fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") |
||||||
|
fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") |
||||||
|
fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") |
||||||
|
fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) |
||||||
|
fmt.Fprintln(w, ")\n") |
||||||
|
|
||||||
|
// Print the CCC remap table.
|
||||||
|
size += len(cccMap) |
||||||
|
fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) |
||||||
|
for i := 0; i < len(cccMap); i++ { |
||||||
|
if i%8 == 0 { |
||||||
|
fmt.Fprintln(w) |
||||||
|
} |
||||||
|
fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) |
||||||
|
} |
||||||
|
fmt.Fprintln(w, "\n}\n") |
||||||
|
|
||||||
|
if contains(list, "info") { |
||||||
|
size += printCharInfoTables(w) |
||||||
|
} |
||||||
|
|
||||||
|
if contains(list, "recomp") { |
||||||
|
// Note that we use 32 bit keys, instead of 64 bit.
|
||||||
|
// This clips the bits of three entries, but we know
|
||||||
|
// this won't cause a collision. The compiler will catch
|
||||||
|
// any changes made to UnicodeData.txt that introduces
|
||||||
|
// a collision.
|
||||||
|
// Note that the recomposition map for NFC and NFKC
|
||||||
|
// are identical.
|
||||||
|
|
||||||
|
// Recomposition map
|
||||||
|
nrentries := 0 |
||||||
|
for _, c := range chars { |
||||||
|
f := c.forms[FCanonical] |
||||||
|
if !f.isOneWay && len(f.decomp) > 0 { |
||||||
|
nrentries++ |
||||||
|
} |
||||||
|
} |
||||||
|
sz := nrentries * 8 |
||||||
|
size += sz |
||||||
|
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) |
||||||
|
fmt.Fprintln(w, "var recompMap map[uint32]rune") |
||||||
|
fmt.Fprintln(w, "var recompMapOnce sync.Once\n") |
||||||
|
fmt.Fprintln(w, `const recompMapPacked = "" +`) |
||||||
|
var buf [8]byte |
||||||
|
for i, c := range chars { |
||||||
|
f := c.forms[FCanonical] |
||||||
|
d := f.decomp |
||||||
|
if !f.isOneWay && len(d) > 0 { |
||||||
|
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) |
||||||
|
binary.BigEndian.PutUint32(buf[:4], key) |
||||||
|
binary.BigEndian.PutUint32(buf[4:], uint32(i)) |
||||||
|
fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i)) |
||||||
|
} |
||||||
|
} |
||||||
|
// hack so we don't have to special case the trailing plus sign
|
||||||
|
fmt.Fprintf(w, ` ""`) |
||||||
|
fmt.Fprintln(w) |
||||||
|
} |
||||||
|
|
||||||
|
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) |
||||||
|
gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes()) |
||||||
|
} |
||||||
|
|
||||||
|
func printChars() { |
||||||
|
if *verbose { |
||||||
|
for _, c := range chars { |
||||||
|
if !c.isValid() || c.state == SMissing { |
||||||
|
continue |
||||||
|
} |
||||||
|
fmt.Println(c) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// verifyComputed does various consistency tests.
|
||||||
|
func verifyComputed() { |
||||||
|
for i, c := range chars { |
||||||
|
for _, f := range c.forms { |
||||||
|
isNo := (f.quickCheck[MDecomposed] == QCNo) |
||||||
|
if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { |
||||||
|
log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) |
||||||
|
} |
||||||
|
|
||||||
|
isMaybe := f.quickCheck[MComposed] == QCMaybe |
||||||
|
if f.combinesBackward != isMaybe { |
||||||
|
log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) |
||||||
|
} |
||||||
|
if len(f.decomp) > 0 && f.combinesForward && isMaybe { |
||||||
|
log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) |
||||||
|
} |
||||||
|
|
||||||
|
if len(f.expandedDecomp) != 0 { |
||||||
|
continue |
||||||
|
} |
||||||
|
if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { |
||||||
|
// We accept these runes to be treated differently (it only affects
|
||||||
|
// segment breaking in iteration, most likely on improper use), but
|
||||||
|
// reconsider if more characters are added.
|
||||||
|
// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
|
||||||
|
// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
|
||||||
|
// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
|
||||||
|
// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
|
||||||
|
// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
|
||||||
|
// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
|
||||||
|
if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { |
||||||
|
log.Fatalf("%U: nLead was %v; want %v", i, a, b) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
nfc := c.forms[FCanonical] |
||||||
|
nfkc := c.forms[FCompatibility] |
||||||
|
if nfc.combinesBackward != nfkc.combinesBackward { |
||||||
|
log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Use values in DerivedNormalizationProps.txt to compare against the
|
||||||
|
// values we computed.
|
||||||
|
// DerivedNormalizationProps.txt has form:
|
||||||
|
// 00C0..00C5 ; NFD_QC; N # ...
|
||||||
|
// 0374 ; NFD_QC; N # ...
|
||||||
|
// See https://unicode.org/reports/tr44/ for full explanation
|
||||||
|
func testDerived() { |
||||||
|
f := gen.OpenUCDFile("DerivedNormalizationProps.txt") |
||||||
|
defer f.Close() |
||||||
|
p := ucd.New(f) |
||||||
|
for p.Next() { |
||||||
|
r := p.Rune(0) |
||||||
|
c := &chars[r] |
||||||
|
|
||||||
|
var ftype, mode int |
||||||
|
qt := p.String(1) |
||||||
|
switch qt { |
||||||
|
case "NFC_QC": |
||||||
|
ftype, mode = FCanonical, MComposed |
||||||
|
case "NFD_QC": |
||||||
|
ftype, mode = FCanonical, MDecomposed |
||||||
|
case "NFKC_QC": |
||||||
|
ftype, mode = FCompatibility, MComposed |
||||||
|
case "NFKD_QC": |
||||||
|
ftype, mode = FCompatibility, MDecomposed |
||||||
|
default: |
||||||
|
continue |
||||||
|
} |
||||||
|
var qr QCResult |
||||||
|
switch p.String(2) { |
||||||
|
case "Y": |
||||||
|
qr = QCYes |
||||||
|
case "N": |
||||||
|
qr = QCNo |
||||||
|
case "M": |
||||||
|
qr = QCMaybe |
||||||
|
default: |
||||||
|
log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) |
||||||
|
} |
||||||
|
if got := c.forms[ftype].quickCheck[mode]; got != qr { |
||||||
|
log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) |
||||||
|
} |
||||||
|
c.forms[ftype].verified[mode] = true |
||||||
|
} |
||||||
|
if err := p.Err(); err != nil { |
||||||
|
log.Fatal(err) |
||||||
|
} |
||||||
|
// Any unspecified value must be QCYes. Verify this.
|
||||||
|
for i, c := range chars { |
||||||
|
for j, fd := range c.forms { |
||||||
|
for k, qr := range fd.quickCheck { |
||||||
|
if !fd.verified[k] && qr != QCYes { |
||||||
|
m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" |
||||||
|
log.Printf(m, i, j, k, qr, c.name) |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// testHeader is the fixed preamble that printTestdata writes at the top of the
// generated data_test.go. It declares the Yes/No/Maybe constants, the formData
// and runeData types and the f and g constructor helpers, and it opens the
// testData slice literal that printTestdata then fills with one row per run of
// identical rune data before closing it.
var testHeader = `const (
Yes = iota
No
Maybe
)

type formData struct {
qc uint8
combinesForward bool
decomposition string
}

type runeData struct {
r rune
ccc uint8
nLead uint8
nTrail uint8
f [2]formData // 0: canonical; 1: compatibility
}

func f(qc uint8, cf bool, dec string) [2]formData {
return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
}

func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
}

var testData = []runeData{
`
||||||
|
|
||||||
|
func printTestdata() { |
||||||
|
type lastInfo struct { |
||||||
|
ccc uint8 |
||||||
|
nLead uint8 |
||||||
|
nTrail uint8 |
||||||
|
f string |
||||||
|
} |
||||||
|
|
||||||
|
last := lastInfo{} |
||||||
|
w := &bytes.Buffer{} |
||||||
|
fmt.Fprintf(w, testHeader) |
||||||
|
for r, c := range chars { |
||||||
|
f := c.forms[FCanonical] |
||||||
|
qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
||||||
|
f = c.forms[FCompatibility] |
||||||
|
qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
||||||
|
s := "" |
||||||
|
if d == dk && qc == qck && cf == cfk { |
||||||
|
s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) |
||||||
|
} else { |
||||||
|
s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) |
||||||
|
} |
||||||
|
current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} |
||||||
|
if last != current { |
||||||
|
fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) |
||||||
|
last = current |
||||||
|
} |
||||||
|
} |
||||||
|
fmt.Fprintln(w, "}") |
||||||
|
gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes()) |
||||||
|
} |
@ -0,0 +1,609 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
// Note: the file data_test.go that is generated should not be checked in.
|
||||||
|
//go:generate go run maketables.go triegen.go
|
||||||
|
//go:generate go test -tags test
|
||||||
|
|
||||||
|
// Package norm contains types and functions for normalizing Unicode strings.
|
||||||
|
package norm // import "golang.org/x/text/unicode/norm"
|
||||||
|
|
||||||
|
import ( |
||||||
|
"unicode/utf8" |
||||||
|
|
||||||
|
"golang.org/x/text/transform" |
||||||
|
) |
||||||
|
|
||||||
|
// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
//	NFC   Unicode Normalization Form C
//	NFD   Unicode Normalization Form D
//	NFKC  Unicode Normalization Form KC
//	NFKD  Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//
//	f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: https://unicode.org/reports/tr15/ and
// https://unicode.org/notes/tn5/.
type Form int

// The normalization forms. Their values are used as indices into formTable
// by the methods of Form.
const (
	NFC Form = iota
	NFD
	NFKC
	NFKD
)
||||||
|
|
||||||
|
// Bytes returns f(b). May return b if f(b) = b.
|
||||||
|
func (f Form) Bytes(b []byte) []byte { |
||||||
|
src := inputBytes(b) |
||||||
|
ft := formTable[f] |
||||||
|
n, ok := ft.quickSpan(src, 0, len(b), true) |
||||||
|
if ok { |
||||||
|
return b |
||||||
|
} |
||||||
|
out := make([]byte, n, len(b)) |
||||||
|
copy(out, b[0:n]) |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} |
||||||
|
return doAppendInner(&rb, n) |
||||||
|
} |
||||||
|
|
||||||
|
// String returns f(s).
|
||||||
|
func (f Form) String(s string) string { |
||||||
|
src := inputString(s) |
||||||
|
ft := formTable[f] |
||||||
|
n, ok := ft.quickSpan(src, 0, len(s), true) |
||||||
|
if ok { |
||||||
|
return s |
||||||
|
} |
||||||
|
out := make([]byte, n, len(s)) |
||||||
|
copy(out, s[0:n]) |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} |
||||||
|
return string(doAppendInner(&rb, n)) |
||||||
|
} |
||||||
|
|
||||||
|
// IsNormal returns true if b == f(b).
|
||||||
|
func (f Form) IsNormal(b []byte) bool { |
||||||
|
src := inputBytes(b) |
||||||
|
ft := formTable[f] |
||||||
|
bp, ok := ft.quickSpan(src, 0, len(b), true) |
||||||
|
if ok { |
||||||
|
return true |
||||||
|
} |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} |
||||||
|
rb.setFlusher(nil, cmpNormalBytes) |
||||||
|
for bp < len(b) { |
||||||
|
rb.out = b[bp:] |
||||||
|
if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) |
||||||
|
} |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
func cmpNormalBytes(rb *reorderBuffer) bool { |
||||||
|
b := rb.out |
||||||
|
for i := 0; i < rb.nrune; i++ { |
||||||
|
info := rb.rune[i] |
||||||
|
if int(info.size) > len(b) { |
||||||
|
return false |
||||||
|
} |
||||||
|
p := info.pos |
||||||
|
pe := p + info.size |
||||||
|
for ; p < pe; p++ { |
||||||
|
if b[0] != rb.byte[p] { |
||||||
|
return false |
||||||
|
} |
||||||
|
b = b[1:] |
||||||
|
} |
||||||
|
} |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
// IsNormalString returns true if s == f(s).
|
||||||
|
func (f Form) IsNormalString(s string) bool { |
||||||
|
src := inputString(s) |
||||||
|
ft := formTable[f] |
||||||
|
bp, ok := ft.quickSpan(src, 0, len(s), true) |
||||||
|
if ok { |
||||||
|
return true |
||||||
|
} |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} |
||||||
|
rb.setFlusher(nil, func(rb *reorderBuffer) bool { |
||||||
|
for i := 0; i < rb.nrune; i++ { |
||||||
|
info := rb.rune[i] |
||||||
|
if bp+int(info.size) > len(s) { |
||||||
|
return false |
||||||
|
} |
||||||
|
p := info.pos |
||||||
|
pe := p + info.size |
||||||
|
for ; p < pe; p++ { |
||||||
|
if s[bp] != rb.byte[p] { |
||||||
|
return false |
||||||
|
} |
||||||
|
bp++ |
||||||
|
} |
||||||
|
} |
||||||
|
return true |
||||||
|
}) |
||||||
|
for bp < len(s) { |
||||||
|
if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
||||||
|
return false |
||||||
|
} |
||||||
|
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) |
||||||
|
} |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
// patchTail fixes a case where a rune may be incorrectly normalized
|
||||||
|
// if it is followed by illegal continuation bytes. It returns the
|
||||||
|
// patched buffer and whether the decomposition is still in progress.
|
||||||
|
func patchTail(rb *reorderBuffer) bool { |
||||||
|
info, p := lastRuneStart(&rb.f, rb.out) |
||||||
|
if p == -1 || info.size == 0 { |
||||||
|
return true |
||||||
|
} |
||||||
|
end := p + int(info.size) |
||||||
|
extra := len(rb.out) - end |
||||||
|
if extra > 0 { |
||||||
|
// Potentially allocating memory. However, this only
|
||||||
|
// happens with ill-formed UTF-8.
|
||||||
|
x := make([]byte, 0) |
||||||
|
x = append(x, rb.out[len(rb.out)-extra:]...) |
||||||
|
rb.out = rb.out[:end] |
||||||
|
decomposeToLastBoundary(rb) |
||||||
|
rb.doFlush() |
||||||
|
rb.out = append(rb.out, x...) |
||||||
|
return false |
||||||
|
} |
||||||
|
buf := rb.out[p:] |
||||||
|
rb.out = rb.out[:p] |
||||||
|
decomposeToLastBoundary(rb) |
||||||
|
if s := rb.ss.next(info); s == ssStarter { |
||||||
|
rb.doFlush() |
||||||
|
rb.ss.first(info) |
||||||
|
} else if s == ssOverflow { |
||||||
|
rb.doFlush() |
||||||
|
rb.insertCGJ() |
||||||
|
rb.ss = 0 |
||||||
|
} |
||||||
|
rb.insertUnsafe(inputBytes(buf), 0, info) |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
func appendQuick(rb *reorderBuffer, i int) int { |
||||||
|
if rb.nsrc == i { |
||||||
|
return i |
||||||
|
} |
||||||
|
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) |
||||||
|
rb.out = rb.src.appendSlice(rb.out, i, end) |
||||||
|
return end |
||||||
|
} |
||||||
|
|
||||||
|
// Append returns f(append(out, b...)).
|
||||||
|
// The buffer out must be nil, empty, or equal to f(out).
|
||||||
|
func (f Form) Append(out []byte, src ...byte) []byte { |
||||||
|
return f.doAppend(out, inputBytes(src), len(src)) |
||||||
|
} |
||||||
|
|
||||||
|
func (f Form) doAppend(out []byte, src input, n int) []byte { |
||||||
|
if n == 0 { |
||||||
|
return out |
||||||
|
} |
||||||
|
ft := formTable[f] |
||||||
|
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
|
||||||
|
if len(out) == 0 { |
||||||
|
p, _ := ft.quickSpan(src, 0, n, true) |
||||||
|
out = src.appendSlice(out, 0, p) |
||||||
|
if p == n { |
||||||
|
return out |
||||||
|
} |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} |
||||||
|
return doAppendInner(&rb, p) |
||||||
|
} |
||||||
|
rb := reorderBuffer{f: *ft, src: src, nsrc: n} |
||||||
|
return doAppend(&rb, out, 0) |
||||||
|
} |
||||||
|
|
||||||
|
func doAppend(rb *reorderBuffer, out []byte, p int) []byte { |
||||||
|
rb.setFlusher(out, appendFlush) |
||||||
|
src, n := rb.src, rb.nsrc |
||||||
|
doMerge := len(out) > 0 |
||||||
|
if q := src.skipContinuationBytes(p); q > p { |
||||||
|
// Move leading non-starters to destination.
|
||||||
|
rb.out = src.appendSlice(rb.out, p, q) |
||||||
|
p = q |
||||||
|
doMerge = patchTail(rb) |
||||||
|
} |
||||||
|
fd := &rb.f |
||||||
|
if doMerge { |
||||||
|
var info Properties |
||||||
|
if p < n { |
||||||
|
info = fd.info(src, p) |
||||||
|
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { |
||||||
|
if p == 0 { |
||||||
|
decomposeToLastBoundary(rb) |
||||||
|
} |
||||||
|
p = decomposeSegment(rb, p, true) |
||||||
|
} |
||||||
|
} |
||||||
|
if info.size == 0 { |
||||||
|
rb.doFlush() |
||||||
|
// Append incomplete UTF-8 encoding.
|
||||||
|
return src.appendSlice(rb.out, p, n) |
||||||
|
} |
||||||
|
if rb.nrune > 0 { |
||||||
|
return doAppendInner(rb, p) |
||||||
|
} |
||||||
|
} |
||||||
|
p = appendQuick(rb, p) |
||||||
|
return doAppendInner(rb, p) |
||||||
|
} |
||||||
|
|
||||||
|
func doAppendInner(rb *reorderBuffer, p int) []byte { |
||||||
|
for n := rb.nsrc; p < n; { |
||||||
|
p = decomposeSegment(rb, p, true) |
||||||
|
p = appendQuick(rb, p) |
||||||
|
} |
||||||
|
return rb.out |
||||||
|
} |
||||||
|
|
||||||
|
// AppendString returns f(append(out, []byte(s))).
|
||||||
|
// The buffer out must be nil, empty, or equal to f(out).
|
||||||
|
func (f Form) AppendString(out []byte, src string) []byte { |
||||||
|
return f.doAppend(out, inputString(src), len(src)) |
||||||
|
} |
||||||
|
|
||||||
|
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
||||||
|
// It is not guaranteed to return the largest such n.
|
||||||
|
func (f Form) QuickSpan(b []byte) int { |
||||||
|
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) |
||||||
|
return n |
||||||
|
} |
||||||
|
|
||||||
|
// Span implements transform.SpanningTransformer. It returns a boundary n such
|
||||||
|
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
|
||||||
|
func (f Form) Span(b []byte, atEOF bool) (n int, err error) { |
||||||
|
n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) |
||||||
|
if n < len(b) { |
||||||
|
if !ok { |
||||||
|
err = transform.ErrEndOfSpan |
||||||
|
} else { |
||||||
|
err = transform.ErrShortSrc |
||||||
|
} |
||||||
|
} |
||||||
|
return n, err |
||||||
|
} |
||||||
|
|
||||||
|
// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
||||||
|
// It is not guaranteed to return the largest such n.
|
||||||
|
func (f Form) SpanString(s string, atEOF bool) (n int, err error) { |
||||||
|
n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) |
||||||
|
if n < len(s) { |
||||||
|
if !ok { |
||||||
|
err = transform.ErrEndOfSpan |
||||||
|
} else { |
||||||
|
err = transform.ErrShortSrc |
||||||
|
} |
||||||
|
} |
||||||
|
return n, err |
||||||
|
} |
||||||
|
|
||||||
|
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
|
||||||
|
// whether any non-normalized parts were found. If atEOF is false, n will
|
||||||
|
// not point past the last segment if this segment might be become
|
||||||
|
// non-normalized by appending other runes.
|
||||||
|
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { |
||||||
|
var lastCC uint8 |
||||||
|
ss := streamSafe(0) |
||||||
|
lastSegStart := i |
||||||
|
for n = end; i < n; { |
||||||
|
if j := src.skipASCII(i, n); i != j { |
||||||
|
i = j |
||||||
|
lastSegStart = i - 1 |
||||||
|
lastCC = 0 |
||||||
|
ss = 0 |
||||||
|
continue |
||||||
|
} |
||||||
|
info := f.info(src, i) |
||||||
|
if info.size == 0 { |
||||||
|
if atEOF { |
||||||
|
// include incomplete runes
|
||||||
|
return n, true |
||||||
|
} |
||||||
|
return lastSegStart, true |
||||||
|
} |
||||||
|
// This block needs to be before the next, because it is possible to
|
||||||
|
// have an overflow for runes that are starters (e.g. with U+FF9E).
|
||||||
|
switch ss.next(info) { |
||||||
|
case ssStarter: |
||||||
|
lastSegStart = i |
||||||
|
case ssOverflow: |
||||||
|
return lastSegStart, false |
||||||
|
case ssSuccess: |
||||||
|
if lastCC > info.ccc { |
||||||
|
return lastSegStart, false |
||||||
|
} |
||||||
|
} |
||||||
|
if f.composing { |
||||||
|
if !info.isYesC() { |
||||||
|
break |
||||||
|
} |
||||||
|
} else { |
||||||
|
if !info.isYesD() { |
||||||
|
break |
||||||
|
} |
||||||
|
} |
||||||
|
lastCC = info.ccc |
||||||
|
i += int(info.size) |
||||||
|
} |
||||||
|
if i == n { |
||||||
|
if !atEOF { |
||||||
|
n = lastSegStart |
||||||
|
} |
||||||
|
return n, true |
||||||
|
} |
||||||
|
return lastSegStart, false |
||||||
|
} |
||||||
|
|
||||||
|
// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
||||||
|
// It is not guaranteed to return the largest such n.
|
||||||
|
func (f Form) QuickSpanString(s string) int { |
||||||
|
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) |
||||||
|
return n |
||||||
|
} |
||||||
|
|
||||||
|
// FirstBoundary returns the position i of the first boundary in b
|
||||||
|
// or -1 if b contains no boundary.
|
||||||
|
func (f Form) FirstBoundary(b []byte) int { |
||||||
|
return f.firstBoundary(inputBytes(b), len(b)) |
||||||
|
} |
||||||
|
|
||||||
|
func (f Form) firstBoundary(src input, nsrc int) int { |
||||||
|
i := src.skipContinuationBytes(0) |
||||||
|
if i >= nsrc { |
||||||
|
return -1 |
||||||
|
} |
||||||
|
fd := formTable[f] |
||||||
|
ss := streamSafe(0) |
||||||
|
// We should call ss.first here, but we can't as the first rune is
|
||||||
|
// skipped already. This means FirstBoundary can't really determine
|
||||||
|
// CGJ insertion points correctly. Luckily it doesn't have to.
|
||||||
|
for { |
||||||
|
info := fd.info(src, i) |
||||||
|
if info.size == 0 { |
||||||
|
return -1 |
||||||
|
} |
||||||
|
if s := ss.next(info); s != ssSuccess { |
||||||
|
return i |
||||||
|
} |
||||||
|
i += int(info.size) |
||||||
|
if i >= nsrc { |
||||||
|
if !info.BoundaryAfter() && !ss.isMax() { |
||||||
|
return -1 |
||||||
|
} |
||||||
|
return nsrc |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// FirstBoundaryInString returns the position i of the first boundary in s
|
||||||
|
// or -1 if s contains no boundary.
|
||||||
|
func (f Form) FirstBoundaryInString(s string) int { |
||||||
|
return f.firstBoundary(inputString(s), len(s)) |
||||||
|
} |
||||||
|
|
||||||
|
// NextBoundary reports the index of the boundary between the first and next
|
||||||
|
// segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||||
|
// determine this boundary.
|
||||||
|
func (f Form) NextBoundary(b []byte, atEOF bool) int { |
||||||
|
return f.nextBoundary(inputBytes(b), len(b), atEOF) |
||||||
|
} |
||||||
|
|
||||||
|
// NextBoundaryInString reports the index of the boundary between the first and
|
||||||
|
// next segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||||
|
// determine this boundary.
|
||||||
|
func (f Form) NextBoundaryInString(s string, atEOF bool) int { |
||||||
|
return f.nextBoundary(inputString(s), len(s), atEOF) |
||||||
|
} |
||||||
|
|
||||||
|
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { |
||||||
|
if nsrc == 0 { |
||||||
|
if atEOF { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
return -1 |
||||||
|
} |
||||||
|
fd := formTable[f] |
||||||
|
info := fd.info(src, 0) |
||||||
|
if info.size == 0 { |
||||||
|
if atEOF { |
||||||
|
return 1 |
||||||
|
} |
||||||
|
return -1 |
||||||
|
} |
||||||
|
ss := streamSafe(0) |
||||||
|
ss.first(info) |
||||||
|
|
||||||
|
for i := int(info.size); i < nsrc; i += int(info.size) { |
||||||
|
info = fd.info(src, i) |
||||||
|
if info.size == 0 { |
||||||
|
if atEOF { |
||||||
|
return i |
||||||
|
} |
||||||
|
return -1 |
||||||
|
} |
||||||
|
// TODO: Using streamSafe to determine the boundary isn't the same as
|
||||||
|
// using BoundaryBefore. Determine which should be used.
|
||||||
|
if s := ss.next(info); s != ssSuccess { |
||||||
|
return i |
||||||
|
} |
||||||
|
} |
||||||
|
if !atEOF && !info.BoundaryAfter() && !ss.isMax() { |
||||||
|
return -1 |
||||||
|
} |
||||||
|
return nsrc |
||||||
|
} |
||||||
|
|
||||||
|
// LastBoundary returns the position i of the last boundary in b
|
||||||
|
// or -1 if b contains no boundary.
|
||||||
|
func (f Form) LastBoundary(b []byte) int { |
||||||
|
return lastBoundary(formTable[f], b) |
||||||
|
} |
||||||
|
|
||||||
|
func lastBoundary(fd *formInfo, b []byte) int { |
||||||
|
i := len(b) |
||||||
|
info, p := lastRuneStart(fd, b) |
||||||
|
if p == -1 { |
||||||
|
return -1 |
||||||
|
} |
||||||
|
if info.size == 0 { // ends with incomplete rune
|
||||||
|
if p == 0 { // starts with incomplete rune
|
||||||
|
return -1 |
||||||
|
} |
||||||
|
i = p |
||||||
|
info, p = lastRuneStart(fd, b[:i]) |
||||||
|
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
||||||
|
return i |
||||||
|
} |
||||||
|
} |
||||||
|
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
||||||
|
return i |
||||||
|
} |
||||||
|
if info.BoundaryAfter() { |
||||||
|
return i |
||||||
|
} |
||||||
|
ss := streamSafe(0) |
||||||
|
v := ss.backwards(info) |
||||||
|
for i = p; i >= 0 && v != ssStarter; i = p { |
||||||
|
info, p = lastRuneStart(fd, b[:i]) |
||||||
|
if v = ss.backwards(info); v == ssOverflow { |
||||||
|
break |
||||||
|
} |
||||||
|
if p+int(info.size) != i { |
||||||
|
if p == -1 { // no boundary found
|
||||||
|
return -1 |
||||||
|
} |
||||||
|
return i // boundary after an illegal UTF-8 encoding
|
||||||
|
} |
||||||
|
} |
||||||
|
return i |
||||||
|
} |
||||||
|
|
||||||
|
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
|
||||||
|
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
|
||||||
|
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
|
||||||
|
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { |
||||||
|
// Force one character to be consumed.
|
||||||
|
info := rb.f.info(rb.src, sp) |
||||||
|
if info.size == 0 { |
||||||
|
return 0 |
||||||
|
} |
||||||
|
if s := rb.ss.next(info); s == ssStarter { |
||||||
|
// TODO: this could be removed if we don't support merging.
|
||||||
|
if rb.nrune > 0 { |
||||||
|
goto end |
||||||
|
} |
||||||
|
} else if s == ssOverflow { |
||||||
|
rb.insertCGJ() |
||||||
|
goto end |
||||||
|
} |
||||||
|
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
||||||
|
return int(err) |
||||||
|
} |
||||||
|
for { |
||||||
|
sp += int(info.size) |
||||||
|
if sp >= rb.nsrc { |
||||||
|
if !atEOF && !info.BoundaryAfter() { |
||||||
|
return int(iShortSrc) |
||||||
|
} |
||||||
|
break |
||||||
|
} |
||||||
|
info = rb.f.info(rb.src, sp) |
||||||
|
if info.size == 0 { |
||||||
|
if !atEOF { |
||||||
|
return int(iShortSrc) |
||||||
|
} |
||||||
|
break |
||||||
|
} |
||||||
|
if s := rb.ss.next(info); s == ssStarter { |
||||||
|
break |
||||||
|
} else if s == ssOverflow { |
||||||
|
rb.insertCGJ() |
||||||
|
break |
||||||
|
} |
||||||
|
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
||||||
|
return int(err) |
||||||
|
} |
||||||
|
} |
||||||
|
end: |
||||||
|
if !rb.doFlush() { |
||||||
|
return int(iShortDst) |
||||||
|
} |
||||||
|
return sp |
||||||
|
} |
||||||
|
|
||||||
|
// lastRuneStart returns the runeInfo and position of the last
|
||||||
|
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
||||||
|
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { |
||||||
|
p := len(buf) - 1 |
||||||
|
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { |
||||||
|
} |
||||||
|
if p < 0 { |
||||||
|
return Properties{}, -1 |
||||||
|
} |
||||||
|
return fd.info(inputBytes(buf), p), p |
||||||
|
} |
||||||
|
|
||||||
|
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
||||||
|
// and scans it into rb. Returns the buffer minus the last segment.
|
||||||
|
func decomposeToLastBoundary(rb *reorderBuffer) { |
||||||
|
fd := &rb.f |
||||||
|
info, i := lastRuneStart(fd, rb.out) |
||||||
|
if int(info.size) != len(rb.out)-i { |
||||||
|
// illegal trailing continuation bytes
|
||||||
|
return |
||||||
|
} |
||||||
|
if info.BoundaryAfter() { |
||||||
|
return |
||||||
|
} |
||||||
|
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
|
||||||
|
padd := 0 |
||||||
|
ss := streamSafe(0) |
||||||
|
p := len(rb.out) |
||||||
|
for { |
||||||
|
add[padd] = info |
||||||
|
v := ss.backwards(info) |
||||||
|
if v == ssOverflow { |
||||||
|
// Note that if we have an overflow, it the string we are appending to
|
||||||
|
// is not correctly normalized. In this case the behavior is undefined.
|
||||||
|
break |
||||||
|
} |
||||||
|
padd++ |
||||||
|
p -= int(info.size) |
||||||
|
if v == ssStarter || p < 0 { |
||||||
|
break |
||||||
|
} |
||||||
|
info, i = lastRuneStart(fd, rb.out[:p]) |
||||||
|
if int(info.size) != p-i { |
||||||
|
break |
||||||
|
} |
||||||
|
} |
||||||
|
rb.ss = ss |
||||||
|
// Copy bytes for insertion as we may need to overwrite rb.out.
|
||||||
|
var buf [maxBufferSize * utf8.UTFMax]byte |
||||||
|
cp := buf[:copy(buf[:], rb.out[p:])] |
||||||
|
rb.out = rb.out[:p] |
||||||
|
for padd--; padd >= 0; padd-- { |
||||||
|
info = add[padd] |
||||||
|
rb.insertUnsafe(inputBytes(cp), 0, info) |
||||||
|
cp = cp[info.size:] |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,125 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import "io" |
||||||
|
|
||||||
|
type normWriter struct { |
||||||
|
rb reorderBuffer |
||||||
|
w io.Writer |
||||||
|
buf []byte |
||||||
|
} |
||||||
|
|
||||||
|
// Write implements the standard write interface. If the last characters are
|
||||||
|
// not at a normalization boundary, the bytes will be buffered for the next
|
||||||
|
// write. The remaining bytes will be written on close.
|
||||||
|
func (w *normWriter) Write(data []byte) (n int, err error) { |
||||||
|
// Process data in pieces to keep w.buf size bounded.
|
||||||
|
const chunk = 4000 |
||||||
|
|
||||||
|
for len(data) > 0 { |
||||||
|
// Normalize into w.buf.
|
||||||
|
m := len(data) |
||||||
|
if m > chunk { |
||||||
|
m = chunk |
||||||
|
} |
||||||
|
w.rb.src = inputBytes(data[:m]) |
||||||
|
w.rb.nsrc = m |
||||||
|
w.buf = doAppend(&w.rb, w.buf, 0) |
||||||
|
data = data[m:] |
||||||
|
n += m |
||||||
|
|
||||||
|
// Write out complete prefix, save remainder.
|
||||||
|
// Note that lastBoundary looks back at most 31 runes.
|
||||||
|
i := lastBoundary(&w.rb.f, w.buf) |
||||||
|
if i == -1 { |
||||||
|
i = 0 |
||||||
|
} |
||||||
|
if i > 0 { |
||||||
|
if _, err = w.w.Write(w.buf[:i]); err != nil { |
||||||
|
break |
||||||
|
} |
||||||
|
bn := copy(w.buf, w.buf[i:]) |
||||||
|
w.buf = w.buf[:bn] |
||||||
|
} |
||||||
|
} |
||||||
|
return n, err |
||||||
|
} |
||||||
|
|
||||||
|
// Close forces data that remains in the buffer to be written.
|
||||||
|
func (w *normWriter) Close() error { |
||||||
|
if len(w.buf) > 0 { |
||||||
|
_, err := w.w.Write(w.buf) |
||||||
|
if err != nil { |
||||||
|
return err |
||||||
|
} |
||||||
|
} |
||||||
|
return nil |
||||||
|
} |
||||||
|
|
||||||
|
// Writer returns a new writer that implements Write(b)
|
||||||
|
// by writing f(b) to w. The returned writer may use an
|
||||||
|
// internal buffer to maintain state across Write calls.
|
||||||
|
// Calling its Close method writes any buffered data to w.
|
||||||
|
func (f Form) Writer(w io.Writer) io.WriteCloser { |
||||||
|
wr := &normWriter{rb: reorderBuffer{}, w: w} |
||||||
|
wr.rb.init(f, nil) |
||||||
|
return wr |
||||||
|
} |
||||||
|
|
||||||
|
type normReader struct { |
||||||
|
rb reorderBuffer |
||||||
|
r io.Reader |
||||||
|
inbuf []byte |
||||||
|
outbuf []byte |
||||||
|
bufStart int |
||||||
|
lastBoundary int |
||||||
|
err error |
||||||
|
} |
||||||
|
|
||||||
|
// Read implements the standard read interface.
|
||||||
|
func (r *normReader) Read(p []byte) (int, error) { |
||||||
|
for { |
||||||
|
if r.lastBoundary-r.bufStart > 0 { |
||||||
|
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary]) |
||||||
|
r.bufStart += n |
||||||
|
if r.lastBoundary-r.bufStart > 0 { |
||||||
|
return n, nil |
||||||
|
} |
||||||
|
return n, r.err |
||||||
|
} |
||||||
|
if r.err != nil { |
||||||
|
return 0, r.err |
||||||
|
} |
||||||
|
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:]) |
||||||
|
r.outbuf = r.outbuf[0:outn] |
||||||
|
r.bufStart = 0 |
||||||
|
|
||||||
|
n, err := r.r.Read(r.inbuf) |
||||||
|
r.rb.src = inputBytes(r.inbuf[0:n]) |
||||||
|
r.rb.nsrc, r.err = n, err |
||||||
|
if n > 0 { |
||||||
|
r.outbuf = doAppend(&r.rb, r.outbuf, 0) |
||||||
|
} |
||||||
|
if err == io.EOF { |
||||||
|
r.lastBoundary = len(r.outbuf) |
||||||
|
} else { |
||||||
|
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf) |
||||||
|
if r.lastBoundary == -1 { |
||||||
|
r.lastBoundary = 0 |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Reader returns a new reader that implements Read
|
||||||
|
// by reading data from r and returning f(data).
|
||||||
|
func (f Form) Reader(r io.Reader) io.Reader { |
||||||
|
const chunk = 4000 |
||||||
|
buf := make([]byte, chunk) |
||||||
|
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf} |
||||||
|
rr.rb.init(f, buf) |
||||||
|
return rr |
||||||
|
} |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,88 @@ |
|||||||
|
// Copyright 2013 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
import ( |
||||||
|
"unicode/utf8" |
||||||
|
|
||||||
|
"golang.org/x/text/transform" |
||||||
|
) |
||||||
|
|
||||||
|
// Reset implements the Reset method of the transform.Transformer interface.
|
||||||
|
func (Form) Reset() {} |
||||||
|
|
||||||
|
// Transform implements the Transform method of the transform.Transformer
|
||||||
|
// interface. It may need to write segments of up to MaxSegmentSize at once.
|
||||||
|
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
|
||||||
|
// least of size MaxTransformChunkSize to be guaranteed of progress.
|
||||||
|
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||||
|
// Cap the maximum number of src bytes to check.
|
||||||
|
b := src |
||||||
|
eof := atEOF |
||||||
|
if ns := len(dst); ns < len(b) { |
||||||
|
err = transform.ErrShortDst |
||||||
|
eof = false |
||||||
|
b = b[:ns] |
||||||
|
} |
||||||
|
i, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), eof) |
||||||
|
n := copy(dst, b[:i]) |
||||||
|
if !ok { |
||||||
|
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF) |
||||||
|
return nDst + n, nSrc + n, err |
||||||
|
} |
||||||
|
|
||||||
|
if err == nil && n < len(src) && !atEOF { |
||||||
|
err = transform.ErrShortSrc |
||||||
|
} |
||||||
|
return n, n, err |
||||||
|
} |
||||||
|
|
||||||
|
func flushTransform(rb *reorderBuffer) bool { |
||||||
|
// Write out (must fully fit in dst, or else it is an ErrShortDst).
|
||||||
|
if len(rb.out) < rb.nrune*utf8.UTFMax { |
||||||
|
return false |
||||||
|
} |
||||||
|
rb.out = rb.out[rb.flushCopy(rb.out):] |
||||||
|
return true |
||||||
|
} |
||||||
|
|
||||||
|
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc} |
||||||
|
|
||||||
|
// transform implements the transform.Transformer interface. It is only called
|
||||||
|
// when quickSpan does not pass for a given string.
|
||||||
|
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||||
|
// TODO: get rid of reorderBuffer. See CL 23460044.
|
||||||
|
rb := reorderBuffer{} |
||||||
|
rb.init(f, src) |
||||||
|
for { |
||||||
|
// Load segment into reorder buffer.
|
||||||
|
rb.setFlusher(dst[nDst:], flushTransform) |
||||||
|
end := decomposeSegment(&rb, nSrc, atEOF) |
||||||
|
if end < 0 { |
||||||
|
return nDst, nSrc, errs[-end] |
||||||
|
} |
||||||
|
nDst = len(dst) - len(rb.out) |
||||||
|
nSrc = end |
||||||
|
|
||||||
|
// Next quickSpan.
|
||||||
|
end = rb.nsrc |
||||||
|
eof := atEOF |
||||||
|
if n := nSrc + len(dst) - nDst; n < end { |
||||||
|
err = transform.ErrShortDst |
||||||
|
end = n |
||||||
|
eof = false |
||||||
|
} |
||||||
|
end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof) |
||||||
|
n := copy(dst[nDst:], rb.src.bytes[nSrc:end]) |
||||||
|
nSrc += n |
||||||
|
nDst += n |
||||||
|
if ok { |
||||||
|
if err == nil && n < rb.nsrc && !atEOF { |
||||||
|
err = transform.ErrShortSrc |
||||||
|
} |
||||||
|
return nDst, nSrc, err |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,54 @@ |
|||||||
|
// Copyright 2011 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
package norm |
||||||
|
|
||||||
|
// valueRange is one entry of a sparse block. The first entry of a block is a
// header in which the fields are reused: value holds the stride and lo holds
// the number of ranges that follow.
type valueRange struct {
	value uint16 // header: value:stride
	lo    byte   // header: lo:n
	hi    byte
}
||||||
|
|
||||||
|
type sparseBlocks struct { |
||||||
|
values []valueRange |
||||||
|
offset []uint16 |
||||||
|
} |
||||||
|
|
||||||
|
var nfcSparse = sparseBlocks{ |
||||||
|
values: nfcSparseValues[:], |
||||||
|
offset: nfcSparseOffset[:], |
||||||
|
} |
||||||
|
|
||||||
|
var nfkcSparse = sparseBlocks{ |
||||||
|
values: nfkcSparseValues[:], |
||||||
|
offset: nfkcSparseOffset[:], |
||||||
|
} |
||||||
|
|
||||||
|
var ( |
||||||
|
nfcData = newNfcTrie(0) |
||||||
|
nfkcData = newNfkcTrie(0) |
||||||
|
) |
||||||
|
|
||||||
|
// lookupValue determines the type of block n and looks up the value for b.
|
||||||
|
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||||
|
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||||
|
// the value for b is by r.value + (b - r.lo) * stride.
|
||||||
|
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 { |
||||||
|
offset := t.offset[n] |
||||||
|
header := t.values[offset] |
||||||
|
lo := offset + 1 |
||||||
|
hi := lo + uint16(header.lo) |
||||||
|
for lo < hi { |
||||||
|
m := lo + (hi-lo)/2 |
||||||
|
r := t.values[m] |
||||||
|
if r.lo <= b && b <= r.hi { |
||||||
|
return r.value + uint16(b-r.lo)*header.value |
||||||
|
} |
||||||
|
if b < r.lo { |
||||||
|
hi = m |
||||||
|
} else { |
||||||
|
lo = m + 1 |
||||||
|
} |
||||||
|
} |
||||||
|
return 0 |
||||||
|
} |
Loading…
Reference in new issue