mirror of https://github.com/ethereum/go-ethereum
parent
5b30aa59d6
commit
8ee5bb2289
@ -0,0 +1,512 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "unicode/utf8" |
||||
|
||||
const (
	// maxNonStarters is the limit on consecutive non-starter runes prescribed
	// by the Unicode Stream-Safe Text Format before a CGJ must be inserted.
	maxNonStarters = 30
	// The maximum number of characters needed for a buffer is
	// maxNonStarters + 1 for the starter + 1 for the GCJ
	maxBufferSize    = maxNonStarters + 2
	maxNFCExpansion  = 3  // NFC(0x1D160)
	maxNFKCExpansion = 18 // NFKC(0xFDFA)

	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
)
||||
|
||||
// ssState is used for reporting the segment state after inserting a rune.
// It is returned by streamSafe.next.
type ssState int

const (
	// Indicates a rune was successfully added to the segment.
	ssSuccess ssState = iota
	// Indicates a rune starts a new segment and should not be added.
	ssStarter
	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
	ssOverflow
)
||||
|
||||
// streamSafe implements the policy of when a CGJ should be inserted.
// Its value is the running count of non-starters in the current segment.
type streamSafe uint8
||||
|
||||
// first inserts the first rune of a segment. It is a faster version of next if
// it is known p represents the first rune in a segment.
func (ss *streamSafe) first(p Properties) {
	// Only the trailing non-starters of the first rune count toward the limit.
	*ss = streamSafe(p.nTrailingNonStarters())
}
||||
|
||||
// next returns a ssState value to indicate whether a rune represented by p
// can be inserted. (The comment previously said "insert"; the method is next.)
func (ss *streamSafe) next(p Properties) ssState {
	if *ss > maxNonStarters {
		panic("streamSafe was not reset")
	}
	n := p.nLeadingNonStarters()
	if *ss += streamSafe(n); *ss > maxNonStarters {
		// Overflow: reset the count; the caller inserts a CGJ.
		*ss = 0
		return ssOverflow
	}
	// The Stream-Safe Text Processing prescribes that the counting can stop
	// as soon as a starter is encountered. However, there are some starters,
	// like Jamo V and T, that can combine with other runes, leaving their
	// successive non-starters appended to the previous, possibly causing an
	// overflow. We will therefore consider any rune with a non-zero nLead to
	// be a non-starter. Note that it always holds that if nLead > 0 then
	// nLead == nTrail.
	if n == 0 {
		*ss = streamSafe(p.nTrailingNonStarters())
		return ssStarter
	}
	return ssSuccess
}
||||
|
||||
// backwards is used for checking for overflow and segment starts
// when traversing a string backwards. Users do not need to call first
// for the first rune. The state of the streamSafe retains the count of
// the non-starters loaded.
func (ss *streamSafe) backwards(p Properties) ssState {
	if *ss > maxNonStarters {
		panic("streamSafe was not reset")
	}
	c := *ss + streamSafe(p.nTrailingNonStarters())
	if c > maxNonStarters {
		// Do not commit the count on overflow; the caller handles the CGJ.
		return ssOverflow
	}
	*ss = c
	if p.nLeadingNonStarters() == 0 {
		return ssStarter
	}
	return ssSuccess
}
||||
|
||||
// isMax reports whether the segment already holds the maximum number of
// non-starters, i.e. any further non-starter would overflow it.
func (ss streamSafe) isMax() bool {
	return ss == maxNonStarters
}
||||
|
||||
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
// It is U+034F COMBINING GRAPHEME JOINER (CGJ).
const GraphemeJoiner = "\u034F"
||||
|
||||
// reorderBuffer is used to normalize a single segment. Characters inserted with
// insert are decomposed and reordered based on CCC. The compose method can
// be used to recombine characters. Note that the byte buffer does not hold
// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
	rune  [maxBufferSize]Properties // Per character info.
	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
	nbyte uint8                     // Number of bytes.
	ss    streamSafe                // For limiting length of non-starter sequence.
	nrune int                       // Number of runeInfos.
	f     formInfo

	src      input // input being normalized
	nsrc     int   // length of src in bytes
	tmpBytes input // scratch input used for decomposition sequences

	out    []byte                    // destination for the default flusher
	flushF func(*reorderBuffer) bool // called by doFlush to emit a segment
}
||||
|
||||
func (rb *reorderBuffer) init(f Form, src []byte) { |
||||
rb.f = *formTable[f] |
||||
rb.src.setBytes(src) |
||||
rb.nsrc = len(src) |
||||
rb.ss = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) initString(f Form, src string) { |
||||
rb.f = *formTable[f] |
||||
rb.src.setString(src) |
||||
rb.nsrc = len(src) |
||||
rb.ss = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) { |
||||
rb.out = out |
||||
rb.flushF = f |
||||
} |
||||
|
||||
// reset discards all characters from the buffer.
|
||||
func (rb *reorderBuffer) reset() { |
||||
rb.nrune = 0 |
||||
rb.nbyte = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) doFlush() bool { |
||||
if rb.f.composing { |
||||
rb.compose() |
||||
} |
||||
res := rb.flushF(rb) |
||||
rb.reset() |
||||
return res |
||||
} |
||||
|
||||
// appendFlush appends the normalized segment to rb.out.
|
||||
func appendFlush(rb *reorderBuffer) bool { |
||||
for i := 0; i < rb.nrune; i++ { |
||||
start := rb.rune[i].pos |
||||
end := start + rb.rune[i].size |
||||
rb.out = append(rb.out, rb.byte[start:end]...) |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// flush appends the normalized segment to out and resets rb.
|
||||
func (rb *reorderBuffer) flush(out []byte) []byte { |
||||
for i := 0; i < rb.nrune; i++ { |
||||
start := rb.rune[i].pos |
||||
end := start + rb.rune[i].size |
||||
out = append(out, rb.byte[start:end]...) |
||||
} |
||||
rb.reset() |
||||
return out |
||||
} |
||||
|
||||
// flushCopy copies the normalized segment to buf and resets rb.
|
||||
// It returns the number of bytes written to buf.
|
||||
func (rb *reorderBuffer) flushCopy(buf []byte) int { |
||||
p := 0 |
||||
for i := 0; i < rb.nrune; i++ { |
||||
runep := rb.rune[i] |
||||
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size]) |
||||
} |
||||
rb.reset() |
||||
return p |
||||
} |
||||
|
||||
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
|
||||
// It returns false if the buffer is not large enough to hold the rune.
|
||||
// It is used internally by insert and insertString only.
|
||||
func (rb *reorderBuffer) insertOrdered(info Properties) { |
||||
n := rb.nrune |
||||
b := rb.rune[:] |
||||
cc := info.ccc |
||||
if cc > 0 { |
||||
// Find insertion position + move elements to make room.
|
||||
for ; n > 0; n-- { |
||||
if b[n-1].ccc <= cc { |
||||
break |
||||
} |
||||
b[n] = b[n-1] |
||||
} |
||||
} |
||||
rb.nrune += 1 |
||||
pos := uint8(rb.nbyte) |
||||
rb.nbyte += utf8.UTFMax |
||||
info.pos = pos |
||||
b[n] = info |
||||
} |
||||
|
||||
// insertErr is an error code returned by insert. Using this type instead
// of error improves performance up to 20% for many of the benchmarks.
type insertErr int

const (
	// iSuccess indicates the rune was inserted.
	iSuccess insertErr = -iota
	// iShortDst indicates the destination buffer could not be flushed.
	iShortDst
	// iShortSrc indicates the source ended mid-segment.
	iShortSrc
)
||||
|
||||
// insertFlush inserts the given rune in the buffer ordered by CCC.
// If a decomposition with multiple segments is encountered, the leading
// ones are flushed.
// It returns a non-zero error code if the rune was not inserted.
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr {
	if rune := src.hangul(i); rune != 0 {
		// Hangul syllables are decomposed algorithmically, not via tables.
		rb.decomposeHangul(rune)
		return iSuccess
	}
	if info.hasDecomposition() {
		return rb.insertDecomposed(info.Decomposition())
	}
	rb.insertSingle(src, i, info)
	return iSuccess
}
||||
|
||||
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
// It is assumed there is sufficient space to hold the runes. It is the
// responsibility of the caller to ensure this. This can be done by checking
// the state returned by the streamSafe type.
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
	if rune := src.hangul(i); rune != 0 {
		rb.decomposeHangul(rune)
	}
	// NOTE(review): unlike insertFlush, there is no return after the Hangul
	// branch above; presumably callers never reach here with a Hangul rune
	// that would also satisfy one of the branches below — confirm against
	// the call sites before changing.
	if info.hasDecomposition() {
		// TODO: inline.
		rb.insertDecomposed(info.Decomposition())
	} else {
		rb.insertSingle(src, i, info)
	}
}
||||
|
||||
// insertDecomposed inserts an entry into the reorderBuffer for each rune
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
// It flushes the buffer on each new segment start.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
	rb.tmpBytes.setBytes(dcomp)
	// As the streamSafe accounting already handles the counting for modifiers,
	// we don't have to call next. However, we do need to keep the accounting
	// intact when flushing the buffer.
	for i := 0; i < len(dcomp); {
		info := rb.f.info(rb.tmpBytes, i)
		if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
			return iShortDst
		}
		// Copy the rune's bytes into the buffer and advance by its size.
		i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)])
		rb.insertOrdered(info)
	}
	return iSuccess
}
||||
|
||||
// insertSingle inserts an entry in the reorderBuffer for the rune at
// position i. info is the runeInfo for the rune at position i.
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) {
	src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size))
	rb.insertOrdered(info)
}
||||
|
||||
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
// It is used to break up overly long sequences of non-starters.
func (rb *reorderBuffer) insertCGJ() {
	rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})
}
||||
|
||||
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
|
||||
func (rb *reorderBuffer) appendRune(r rune) { |
||||
bn := rb.nbyte |
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||
rb.nbyte += utf8.UTFMax |
||||
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)} |
||||
rb.nrune++ |
||||
} |
||||
|
||||
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) assignRune(pos int, r rune) { |
||||
bn := rb.rune[pos].pos |
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)} |
||||
} |
||||
|
||||
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) runeAt(n int) rune { |
||||
inf := rb.rune[n] |
||||
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) |
||||
return r |
||||
} |
||||
|
||||
// bytesAt returns the UTF-8 encoding of the rune at position n.
|
||||
// It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) bytesAt(n int) []byte { |
||||
inf := rb.rune[n] |
||||
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)] |
||||
} |
||||
|
||||
// For Hangul we combine algorithmically, instead of using tables.
const (
	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
	hangulBase0 = 0xEA   // individual UTF-8 bytes of hangulBase,
	hangulBase1 = 0xB0   // used for fast byte-wise range checks
	hangulBase2 = 0x80

	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
	hangulEnd0 = 0xED
	hangulEnd1 = 0x9E
	hangulEnd2 = 0xA4

	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 00
	jamoLBase0 = 0xE1
	jamoLBase1 = 0x84
	jamoLEnd   = 0x1113
	jamoVBase  = 0x1161
	jamoVEnd   = 0x1176
	jamoTBase  = 0x11A7
	jamoTEnd   = 0x11C3

	jamoTCount   = 28
	jamoVCount   = 21
	jamoVTCount  = 21 * 28
	jamoLVTCount = 19 * 21 * 28
)

// hangulUTF8Size is the byte length of the UTF-8 encoding of any precomposed
// Hangul syllable (the whole block encodes in three bytes).
const hangulUTF8Size = 3
||||
|
||||
// isHangul reports whether b starts with the UTF-8 encoding of a precomposed
// Hangul syllable. It compares raw UTF-8 bytes against the encoded range
// boundaries (hangulBase../hangulEnd..) to avoid a full rune decode.
func isHangul(b []byte) bool {
	if len(b) < hangulUTF8Size {
		return false
	}
	b0 := b[0]
	if b0 < hangulBase0 {
		return false
	}
	b1 := b[1]
	switch {
	case b0 == hangulBase0:
		// On the lower boundary byte: only the second byte decides.
		return b1 >= hangulBase1
	case b0 < hangulEnd0:
		// Strictly between the boundary lead bytes: always in range.
		return true
	case b0 > hangulEnd0:
		return false
	case b1 < hangulEnd1:
		return true
	}
	// On the upper boundary: compare the final byte (end is exclusive).
	return b1 == hangulEnd1 && b[2] < hangulEnd2
}
||||
|
||||
// isHangulString is the string variant of isHangul: it reports whether b
// starts with the UTF-8 encoding of a precomposed Hangul syllable.
func isHangulString(b string) bool {
	if len(b) < hangulUTF8Size {
		return false
	}
	b0 := b[0]
	if b0 < hangulBase0 {
		return false
	}
	b1 := b[1]
	switch {
	case b0 == hangulBase0:
		// On the lower boundary byte: only the second byte decides.
		return b1 >= hangulBase1
	case b0 < hangulEnd0:
		return true
	case b0 > hangulEnd0:
		return false
	case b1 < hangulEnd1:
		return true
	}
	// On the upper boundary: compare the final byte (end is exclusive).
	return b1 == hangulEnd1 && b[2] < hangulEnd2
}
||||
|
||||
// isJamoVT reports whether b starts with a rune in the modern Jamo block
// (U+1100..U+11FF), checked byte-wise. Caller must ensure len(b) >= 2.
func isJamoVT(b []byte) bool {
	// True if (rune & 0xff00) == jamoLBase
	return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1
}
||||
|
||||
// isHangulWithoutJamoT reports whether b starts with an LV Hangul syllable,
// i.e. one whose trailing-consonant (Jamo T) index is zero.
func isHangulWithoutJamoT(b []byte) bool {
	c, _ := utf8.DecodeRune(b)
	c -= hangulBase
	return c < jamoLVTCount && c%jamoTCount == 0
}
||||
|
||||
// decomposeHangul writes the decomposed Hangul to buf and returns the number
// of bytes written. len(buf) should be at least 9.
func decomposeHangul(buf []byte, r rune) int {
	const JamoUTF8Len = 3
	r -= hangulBase
	x := r % jamoTCount // trailing consonant index; 0 means an LV syllable
	r /= jamoTCount
	utf8.EncodeRune(buf, jamoLBase+r/jamoVCount)
	utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount)
	if x != 0 {
		utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x)
		return 3 * JamoUTF8Len
	}
	return 2 * JamoUTF8Len
}
||||
|
||||
// decomposeHangul algorithmically decomposes a Hangul rune into
// its Jamo components, appending them to the buffer.
// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
func (rb *reorderBuffer) decomposeHangul(r rune) {
	r -= hangulBase
	x := r % jamoTCount // trailing consonant index; 0 means an LV syllable
	r /= jamoTCount
	rb.appendRune(jamoLBase + r/jamoVCount)
	rb.appendRune(jamoVBase + r%jamoVCount)
	if x != 0 {
		rb.appendRune(jamoTBase + x)
	}
}
||||
|
||||
// combineHangul algorithmically combines Jamo character components into Hangul.
// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
// It continues the compaction begun by compose: s is the position of the last
// starter, i the current read index, and k the next write index in rb.rune.
func (rb *reorderBuffer) combineHangul(s, i, k int) {
	b := rb.rune[:]
	bn := rb.nrune
	for ; i < bn; i++ {
		cccB := b[k-1].ccc
		cccC := b[i].ccc
		if cccB == 0 {
			// The previous kept rune is a starter; it becomes the new base.
			s = k - 1
		}
		if s != k-1 && cccB >= cccC {
			// b[i] is blocked by greater-equal cccX below it
			b[k] = b[i]
			k++
		} else {
			l := rb.runeAt(s) // also used to compare to hangulBase
			v := rb.runeAt(i) // also used to compare to jamoT
			switch {
			case jamoLBase <= l && l < jamoLEnd &&
				jamoVBase <= v && v < jamoVEnd:
				// 11xx plus 116x to LV
				rb.assignRune(s, hangulBase+
					(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount)
			case hangulBase <= l && l < hangulEnd &&
				jamoTBase < v && v < jamoTEnd &&
				((l-hangulBase)%jamoTCount) == 0:
				// ACxx plus 11Ax to LVT
				rb.assignRune(s, l+v-jamoTBase)
			default:
				// No combination possible; keep the rune.
				b[k] = b[i]
				k++
			}
		}
	}
	rb.nrune = k
}
||||
|
||||
// compose recombines the runes in the buffer.
// It should only be used to recompose a single segment, as it will not
// handle alternations between Hangul and non-Hangul characters correctly.
func (rb *reorderBuffer) compose() {
	// Lazily load the map used by the combine func below, but do
	// it outside of the loop.
	recompMapOnce.Do(buildRecompMap)

	// UAX #15, section X5, including Corrigendum #5
	// "In any character sequence beginning with starter S, a character C is
	// blocked from S if and only if there is some character B between S
	// and C, and either B is a starter or it has the same or higher
	// combining class as C."
	bn := rb.nrune
	if bn == 0 {
		return
	}
	// s: index of the last starter; i: read index; k: write index.
	k := 1
	b := rb.rune[:]
	for s, i := 0, 1; i < bn; i++ {
		if isJamoVT(rb.bytesAt(i)) {
			// Redo from start in Hangul mode. Necessary to support
			// U+320E..U+321E in NFKC mode.
			rb.combineHangul(s, i, k)
			return
		}
		ii := b[i]
		// We can only use combineForward as a filter if we later
		// get the info for the combined character. This is more
		// expensive than using the filter. Using combinesBackward()
		// is safe.
		if ii.combinesBackward() {
			cccB := b[k-1].ccc
			cccC := ii.ccc
			blocked := false // b[i] blocked by starter or greater or equal CCC?
			if cccB == 0 {
				s = k - 1
			} else {
				blocked = s != k-1 && cccB >= cccC
			}
			if !blocked {
				combined := combine(rb.runeAt(s), rb.runeAt(i))
				if combined != 0 {
					// Replace the starter in place; b[i] is dropped.
					rb.assignRune(s, combined)
					continue
				}
			}
		}
		b[k] = b[i]
		k++
	}
	rb.nrune = k
}
@ -0,0 +1,278 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "encoding/binary" |
||||
|
||||
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||
|
||||
// Rune info is stored in a separate trie per composing form. A composing form
|
||||
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||
// bits
|
||||
// 15: 1 (inverse of NFD_QC bit of qcInfo)
|
||||
// 13..7: qcInfo (see below). isYesD is always true (no decomposition).
|
||||
// 6..0: ccc (compressed CCC value).
|
||||
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||
// has the form:
|
||||
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||
// The header contains the number of bytes in the decomposition (excluding this
|
||||
// length byte). The two most significant bits of this length byte correspond
|
||||
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
|
||||
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||
// for these are not zero. The value of v determines which ccc are appended
|
||||
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCC
|
||||
// there is an additional leading ccc. The value of tccc itself is the
|
||||
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
|
||||
// are the number of trailing non-starters.
|
||||
|
||||
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
||||
|
||||
// Properties provides access to normalization properties of a rune.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // index into the decomposition table; 0 if none
}
||||
|
||||
// lookupFunc is the signature of the per-form trie lookup functions;
// functions dispatchable per form.
type lookupFunc func(b input, i int) Properties
||||
|
||||
// formInfo holds Form-specific functions and tables.
type formInfo struct {
	form                     Form
	composing, compatibility bool       // form type
	info                     lookupFunc // trie lookup for this form
	nextMain                 iterFunc   // segment iterator for this form
}
||||
|
||||
// formTable is indexed directly by a Form value (see reorderBuffer.init),
// so the entry order must match the Form constants: NFC, NFD, NFKC, NFKD.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
||||
|
||||
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
||||
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
||||
// after 'a'. However, 'a' might combine with modifiers, so from the application's
|
||||
// perspective it is not a good boundary. We will therefore always use the
|
||||
// boundaries for the combining variants.
|
||||
|
||||
// BoundaryBefore returns true if this rune starts a new segment and
// cannot combine with any rune on the left.
func (p Properties) BoundaryBefore() bool {
	if p.ccc == 0 && !p.combinesBackward() {
		return true
	}
	// We assume that the CCC of the first character in a decomposition
	// is always non-zero if different from info.ccc and that we can return
	// false at this point. This is verified by maketables.
	return false
}
||||
|
||||
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
||||
|
||||
// We pack quick check data in 6 bits:
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
||||
|
||||
// isYesC reports whether the composed-form quick-check value is Yes.
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }

// isYesD reports whether the decomposed-form quick-check value is Yes.
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }

// combinesForward reports whether the rune may combine with a following rune.
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }

// combinesBackward reports whether the rune may combine with a preceding rune.
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe

// hasDecomposition reports whether the rune has a table decomposition.
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
||||
|
||||
// isInert reports whether the rune is never influenced by normalization:
// no quick-check flags set and a zero combining class.
func (p Properties) isInert() bool {
	return p.flags&qcInfoMask == 0 && p.ccc == 0
}
||||
|
||||
// multiSegment reports whether the rune's decomposition spans multiple
// segments (its index lies in the multi-segment range of the table).
func (p Properties) multiSegment() bool {
	return p.index >= firstMulti && p.index < endMulti
}
||||
|
||||
// nLeadingNonStarters returns the number of leading non-starters.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}

// nTrailingNonStarters returns the number of trailing non-starters,
// stored in the two low bits of the quick-check flags.
func (p Properties) nTrailingNonStarters() uint8 {
	return uint8(p.flags & 0x03)
}
||||
|
||||
// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
func (p Properties) Decomposition() []byte {
	// TODO: create the decomposition for Hangul?
	if p.index == 0 {
		return nil
	}
	i := p.index
	// The low bits of the header byte hold the sequence length;
	// the bytes themselves start right after the header.
	n := decomps[i] & headerLenMask
	i++
	return decomps[i : i+uint16(n)]
}
||||
|
||||
// Size returns the length of UTF-8 encoding of the rune.
func (p Properties) Size() int {
	return int(p.size)
}
||||
|
||||
// CCC returns the canonical combining class of the underlying rune.
func (p Properties) CCC() uint8 {
	if p.index >= firstCCCZeroExcept {
		// Runes in this index range report CCC 0 regardless of the stored value.
		return 0
	}
	// p.ccc is a compressed index into the ccc value table.
	return ccc[p.ccc]
}
||||
|
||||
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	return ccc[p.ccc]
}
||||
|
||||
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	return ccc[p.tccc]
}
||||
|
||||
// buildRecompMap unpacks recompMapPacked into recompMap. Each entry is 8
// bytes: a big-endian uint32 key (two packed code points) followed by a
// big-endian uint32 value (the composed rune). Invoked via recompMapOnce.
func buildRecompMap() {
	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
	var buf [8]byte
	for i := 0; i < len(recompMapPacked); i += 8 {
		// Copy through buf because binary.BigEndian needs a []byte,
		// and recompMapPacked is a string.
		copy(buf[:], recompMapPacked[i:i+8])
		key := binary.BigEndian.Uint32(buf[:4])
		val := binary.BigEndian.Uint32(buf[4:])
		recompMap[key] = rune(val)
	}
}
||||
|
||||
// Recomposition
|
||||
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
||||
// This clips off the bits of three entries, but we know this will not
|
||||
// result in a collision. In the unlikely event that changes to
|
||||
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
||||
// Note that the recomposition map for NFC and NFKC are identical.
|
||||
|
||||
// combine returns the combined rune or 0 if it doesn't exist.
//
// The caller is responsible for calling
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
func combine(a, b rune) rune {
	// Pack both code points, truncated to 16 bits, into one 32-bit key.
	key := uint32(uint16(a))<<16 + uint32(uint16(b))
	if recompMap == nil {
		panic("caller error") // see func comment
	}
	return recompMap[key]
}
||||
|
||||
// lookupInfoNFC looks up rune properties in the NFC/NFD trie.
func lookupInfoNFC(b input, i int) Properties {
	v, sz := b.charinfoNFC(i)
	return compInfo(v, sz)
}
||||
|
||||
// lookupInfoNFKC looks up rune properties in the NFKC/NFKD trie.
func lookupInfoNFKC(b input, i int) Properties {
	v, sz := b.charinfoNFKC(i)
	return compInfo(v, sz)
}
||||
|
||||
// Properties returns properties for the first rune in s.
|
||||
func (f Form) Properties(s []byte) Properties { |
||||
if f == NFC || f == NFD { |
||||
return compInfo(nfcData.lookup(s)) |
||||
} |
||||
return compInfo(nfkcData.lookup(s)) |
||||
} |
||||
|
||||
// PropertiesString returns properties for the first rune in s.
|
||||
func (f Form) PropertiesString(s string) Properties { |
||||
if f == NFC || f == NFD { |
||||
return compInfo(nfcData.lookupString(s)) |
||||
} |
||||
return compInfo(nfkcData.lookupString(s)) |
||||
} |
||||
|
||||
// compInfo converts the information contained in v and sz
// to a Properties. See the comment at the top of the file
// for more information on the format.
func compInfo(v uint16, sz int) Properties {
	if v == 0 {
		// Inert rune: no flags, no combining class, no decomposition.
		return Properties{size: uint8(sz)}
	} else if v >= 0x8000 {
		// No decomposition: ccc and qcInfo are packed directly in v.
		p := Properties{
			size:  uint8(sz),
			ccc:   uint8(v),
			tccc:  uint8(v),
			flags: qcInfo(v >> 8),
		}
		if p.ccc > 0 || p.combinesBackward() {
			// Non-starters carry their trailing count in the low flag bits.
			p.nLead = uint8(p.flags & 0x3)
		}
		return p
	}
	// has decomposition
	h := decomps[v]
	// Fold the two header flag bits into the qcInfo and mark isNoD (0x4).
	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
	p := Properties{size: uint8(sz), flags: f, index: v}
	if v >= firstCCC {
		// A trailing CCC byte follows the decomposition bytes.
		v += uint16(h&headerLenMask) + 1
		c := decomps[v]
		p.tccc = c >> 2
		p.flags |= qcInfo(c & 0x3)
		if v >= firstLeadingCCC {
			p.nLead = c & 0x3
			if v >= firstStarterWithNLead {
				// We were tricked. Remove the decomposition.
				p.flags &= 0x03
				p.index = 0
				return p
			}
			// A leading CCC byte follows the trailing one.
			p.ccc = decomps[v+1]
		}
	}
	return p
}
@ -0,0 +1,109 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "unicode/utf8" |
||||
|
||||
// input abstracts over a string or a byte slice so the normalization logic
// can be written once. Exactly one representation is active; bytes == nil
// means the string form is in use.
type input struct {
	str   string
	bytes []byte
}
||||
|
||||
// inputBytes returns an input wrapping the byte slice str.
func inputBytes(str []byte) input {
	return input{bytes: str}
}
||||
|
||||
// inputString returns an input wrapping the string str.
func inputString(str string) input {
	return input{str: str}
}
||||
|
||||
func (in *input) setBytes(str []byte) { |
||||
in.str = "" |
||||
in.bytes = str |
||||
} |
||||
|
||||
func (in *input) setString(str string) { |
||||
in.str = str |
||||
in.bytes = nil |
||||
} |
||||
|
||||
func (in *input) _byte(p int) byte { |
||||
if in.bytes == nil { |
||||
return in.str[p] |
||||
} |
||||
return in.bytes[p] |
||||
} |
||||
|
||||
func (in *input) skipASCII(p, max int) int { |
||||
if in.bytes == nil { |
||||
for ; p < max && in.str[p] < utf8.RuneSelf; p++ { |
||||
} |
||||
} else { |
||||
for ; p < max && in.bytes[p] < utf8.RuneSelf; p++ { |
||||
} |
||||
} |
||||
return p |
||||
} |
||||
|
||||
func (in *input) skipContinuationBytes(p int) int { |
||||
if in.bytes == nil { |
||||
for ; p < len(in.str) && !utf8.RuneStart(in.str[p]); p++ { |
||||
} |
||||
} else { |
||||
for ; p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]); p++ { |
||||
} |
||||
} |
||||
return p |
||||
} |
||||
|
||||
func (in *input) appendSlice(buf []byte, b, e int) []byte { |
||||
if in.bytes != nil { |
||||
return append(buf, in.bytes[b:e]...) |
||||
} |
||||
for i := b; i < e; i++ { |
||||
buf = append(buf, in.str[i]) |
||||
} |
||||
return buf |
||||
} |
||||
|
||||
func (in *input) copySlice(buf []byte, b, e int) int { |
||||
if in.bytes == nil { |
||||
return copy(buf, in.str[b:e]) |
||||
} |
||||
return copy(buf, in.bytes[b:e]) |
||||
} |
||||
|
||||
func (in *input) charinfoNFC(p int) (uint16, int) { |
||||
if in.bytes == nil { |
||||
return nfcData.lookupString(in.str[p:]) |
||||
} |
||||
return nfcData.lookup(in.bytes[p:]) |
||||
} |
||||
|
||||
func (in *input) charinfoNFKC(p int) (uint16, int) { |
||||
if in.bytes == nil { |
||||
return nfkcData.lookupString(in.str[p:]) |
||||
} |
||||
return nfkcData.lookup(in.bytes[p:]) |
||||
} |
||||
|
||||
// hangul returns the precomposed Hangul syllable starting at p, or 0 if the
// input at p is not such a syllable.
func (in *input) hangul(p int) (r rune) {
	var size int
	if in.bytes == nil {
		if !isHangulString(in.str[p:]) {
			return 0
		}
		r, size = utf8.DecodeRuneInString(in.str[p:])
	} else {
		if !isHangul(in.bytes[p:]) {
			return 0
		}
		r, size = utf8.DecodeRune(in.bytes[p:])
	}
	if size != hangulUTF8Size {
		// Defensive re-check: a valid syllable always decodes to 3 bytes.
		return 0
	}
	return r
}
@ -0,0 +1,458 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import (
	"fmt"
	"io"
	"unicode/utf8"
)
||||
|
||||
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
const MaxSegmentSize = maxByteBufferSize
||||
|
||||
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
	rb     reorderBuffer           // segment normalization buffer
	buf    [maxByteBufferSize]byte // scratch output for string-backed input
	info   Properties              // first character saved from previous iteration
	next   iterFunc                // implementation of next depends on form
	asciiF iterFunc                // fast path used while the input is ASCII

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
||||
|
||||
// iterFunc produces the next normalized segment for an Iter; which
// implementation is used depends on the form and the input state.
type iterFunc func(*Iter) []byte
||||
|
||||
// Init initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) Init(f Form, src []byte) {
	i.p = 0
	if len(src) == 0 {
		i.setDone()
		i.rb.nsrc = 0
		return
	}
	i.multiSeg = nil
	i.rb.init(f, src)
	i.next = i.rb.f.nextMain
	i.asciiF = nextASCIIBytes
	// Prime the first rune's properties and seed the non-starter count.
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
}
||||
|
||||
// InitString initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) InitString(f Form, src string) {
	i.p = 0
	if len(src) == 0 {
		// Note: setDone reads i.rb.nsrc (possibly stale from a previous use)
		// before nsrc is reset below; Done() still reports true either way.
		i.setDone()
		i.rb.nsrc = 0
		return
	}
	i.multiSeg = nil
	i.rb.initString(f, src)
	i.next = i.rb.f.nextMain
	// String variant of the ASCII fast path.
	i.asciiF = nextASCIIString
	// Prime the properties of the first rune and the stream-safe counter.
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
}
||||
|
||||
// Seek sets the segment to be returned by the next call to Next to start
|
||||
// at position p. It is the responsibility of the caller to set p to the
|
||||
// start of a segment.
|
||||
func (i *Iter) Seek(offset int64, whence int) (int64, error) { |
||||
var abs int64 |
||||
switch whence { |
||||
case 0: |
||||
abs = offset |
||||
case 1: |
||||
abs = int64(i.p) + offset |
||||
case 2: |
||||
abs = int64(i.rb.nsrc) + offset |
||||
default: |
||||
return 0, fmt.Errorf("norm: invalid whence") |
||||
} |
||||
if abs < 0 { |
||||
return 0, fmt.Errorf("norm: negative position") |
||||
} |
||||
if int(abs) >= i.rb.nsrc { |
||||
i.setDone() |
||||
return int64(i.p), nil |
||||
} |
||||
i.p = int(abs) |
||||
i.multiSeg = nil |
||||
i.next = i.rb.f.nextMain |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
return abs, nil |
||||
} |
||||
|
||||
// returnSlice returns a slice of the underlying input type as a byte slice.
|
||||
// If the underlying is of type []byte, it will simply return a slice.
|
||||
// If the underlying is of type string, it will copy the slice to the buffer
|
||||
// and return that.
|
||||
func (i *Iter) returnSlice(a, b int) []byte { |
||||
if i.rb.src.bytes == nil { |
||||
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] |
||||
} |
||||
return i.rb.src.bytes[a:b] |
||||
} |
||||
|
||||
// Pos returns the byte position at which the next call to Next will commence processing.
func (i *Iter) Pos() int {
	return i.p
}

// setDone marks the iterator as exhausted: further Next calls return nil and
// the position is pinned to the end of the input.
func (i *Iter) setDone() {
	i.next = nextDone
	i.p = i.rb.nsrc
}

// Done returns true if there is no more input to process.
func (i *Iter) Done() bool {
	return i.p >= i.rb.nsrc
}
||||
|
||||
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
func (i *Iter) Next() []byte {
	// Dispatch to the state-specific implementation (per-form, ASCII fast
	// path, multi-segment continuation, or nextDone once exhausted).
	return i.next(i)
}
||||
|
||||
func nextASCIIBytes(i *Iter) []byte { |
||||
p := i.p + 1 |
||||
if p >= i.rb.nsrc { |
||||
p0 := i.p |
||||
i.setDone() |
||||
return i.rb.src.bytes[p0:p] |
||||
} |
||||
if i.rb.src.bytes[p] < utf8.RuneSelf { |
||||
p0 := i.p |
||||
i.p = p |
||||
return i.rb.src.bytes[p0:p] |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
func nextASCIIString(i *Iter) []byte { |
||||
p := i.p + 1 |
||||
if p >= i.rb.nsrc { |
||||
i.buf[0] = i.rb.src.str[i.p] |
||||
i.setDone() |
||||
return i.buf[:1] |
||||
} |
||||
if i.rb.src.str[p] < utf8.RuneSelf { |
||||
i.buf[0] = i.rb.src.str[i.p] |
||||
i.p = p |
||||
return i.buf[:1] |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
// nextHangul streams one algorithmically-decomposed Hangul syllable per call
// while consecutive syllables follow each other in the input.
func nextHangul(i *Iter) []byte {
	p := i.p
	next := p + hangulUTF8Size
	if next >= i.rb.nsrc {
		i.setDone()
	} else if i.rb.src.hangul(next) == 0 {
		// The following rune is not Hangul: account for the current syllable
		// in the stream-safe counter and fall back to the main iterator.
		i.rb.ss.next(i.info)
		i.info = i.rb.f.info(i.rb.src, i.p)
		i.next = i.rb.f.nextMain
		return i.next(i)
	}
	i.p = next
	// Decompose the syllable at p into Jamo directly into the output buffer.
	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
}
||||
|
||||
// nextDone is the terminal state: the input is exhausted, so Next returns nil.
func nextDone(i *Iter) []byte {
	return nil
}
||||
|
||||
// nextMulti is used for iterating over multi-segment decompositions
|
||||
// for decomposing normal forms.
|
||||
func nextMulti(i *Iter) []byte { |
||||
j := 0 |
||||
d := i.multiSeg |
||||
// skip first rune
|
||||
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { |
||||
} |
||||
for j < len(d) { |
||||
info := i.rb.f.info(input{bytes: d}, j) |
||||
if info.BoundaryBefore() { |
||||
i.multiSeg = d[j:] |
||||
return d[:j] |
||||
} |
||||
j += int(info.size) |
||||
} |
||||
// treat last segment as normal decomposition
|
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			// Flush the composed segment accumulated so far, then seed the
			// reorder buffer with the rune that starts the next segment.
			i.rb.compose()
			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
			i.rb.insertUnsafe(input{bytes: d}, j, info)
			i.multiSeg = d[j+int(info.size):]
			return seg
		}
		i.rb.insertUnsafe(input{bytes: d}, j, info)
		j += int(info.size)
	}
	// Decomposition exhausted: resume normal composed iteration.
	i.multiSeg = nil
	i.next = nextComposed
	return doNormComposed(i)
}
||||
|
||||
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It copies already-decomposed input through verbatim where possible
// (tracked by inCopyStart/outCopyStart), expands decompositions and Hangul
// syllables into i.buf, and jumps to doNorm when a CCC ordering violation
// requires a real reorder via the reorderBuffer.
func nextDecomposed(i *Iter) (next []byte) {
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			// ASCII or illegal byte: always a segment boundary.
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte. Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII too: switch to the fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy. In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment. Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// Segment ends here: append d (if buffered) and return.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// CCC order violated: needs reordering.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// Algorithmic Hangul decomposition straight into the buffer.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Consecutive syllables: switch to the Hangul streamer.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// Rune is already in decomposed form; copy it through.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was expanded: the input slice itself is the segment.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
||||
|
||||
// doNormDecomposed feeds runes into the reorder buffer until a segment
// boundary (the next starter, a ccc == 0 rune, or stream-safe overflow),
// then flushes the reordered result into i.buf.
func doNormDecomposed(i *Iter) []byte {
	for {
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
		if s := i.rb.ss.next(i.info); s == ssOverflow {
			// Too many non-starters: next call inserts a CGJ.
			i.next = nextCGJDecompose
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	return i.buf[:i.rb.flushCopy(i.buf[:])]
}
||||
|
||||
func nextCGJDecompose(i *Iter) []byte { |
||||
i.rb.ss = 0 |
||||
i.rb.insertCGJ() |
||||
i.next = nextDecomposed |
||||
i.rb.ss.first(i.info) |
||||
buf := doNormDecomposed(i) |
||||
return buf |
||||
} |
||||
|
||||
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
||||
func nextComposed(i *Iter) []byte { |
||||
outp, startp := 0, i.p |
||||
var prevCC uint8 |
||||
for { |
||||
if !i.info.isYesC() { |
||||
goto doNorm |
||||
} |
||||
prevCC = i.info.tccc |
||||
sz := int(i.info.size) |
||||
if sz == 0 { |
||||
sz = 1 // illegal rune: copy byte-by-byte
|
||||
} |
||||
p := outp + sz |
||||
if p > len(i.buf) { |
||||
break |
||||
} |
||||
outp = p |
||||
i.p += sz |
||||
if i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
||||
i.rb.ss = 0 |
||||
i.next = i.asciiF |
||||
break |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
if v := i.rb.ss.next(i.info); v == ssStarter { |
||||
break |
||||
} else if v == ssOverflow { |
||||
i.next = nextCGJCompose |
||||
break |
||||
} |
||||
if i.info.ccc < prevCC { |
||||
goto doNorm |
||||
} |
||||
} |
||||
return i.returnSlice(startp, i.p) |
||||
doNorm: |
||||
// reset to start position
|
||||
i.p = startp |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
if i.info.multiSegment() { |
||||
d := i.info.Decomposition() |
||||
info := i.rb.f.info(input{bytes: d}, 0) |
||||
i.rb.insertUnsafe(input{bytes: d}, 0, info) |
||||
i.multiSeg = d[int(info.size):] |
||||
i.next = nextMultiNorm |
||||
return nextMultiNorm(i) |
||||
} |
||||
i.rb.ss.first(i.info) |
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||
return doNormComposed(i) |
||||
} |
||||
|
||||
// doNormComposed inserts runes into the reorder buffer until the next segment
// boundary (a starter or stream-safe overflow), composes the buffer, and
// flushes the result into i.buf.
func doNormComposed(i *Iter) []byte {
	// First rune should already be inserted.
	for {
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if s := i.rb.ss.next(i.info); s == ssStarter {
			break
		} else if s == ssOverflow {
			// Too many non-starters: next call inserts a CGJ.
			i.next = nextCGJCompose
			break
		}
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	}
	i.rb.compose()
	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
	return seg
}
||||
|
||||
// nextCGJCompose emits a segment that starts with a CGJ (inserted to keep the
// stream stream-safe), then resumes composed iteration from the current rune.
func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and U+FF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
@ -0,0 +1,986 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build ignore
|
||||
|
||||
// Normalization table generator.
|
||||
// Data read from the web.
|
||||
// See forminfo.go for a description of the trie values associated with each rune.
|
||||
|
||||
package main |
||||
|
||||
import ( |
||||
"bytes" |
||||
"encoding/binary" |
||||
"flag" |
||||
"fmt" |
||||
"io" |
||||
"log" |
||||
"sort" |
||||
"strconv" |
||||
"strings" |
||||
|
||||
"golang.org/x/text/internal/gen" |
||||
"golang.org/x/text/internal/triegen" |
||||
"golang.org/x/text/internal/ucd" |
||||
) |
||||
|
||||
func main() {
	gen.Init()
	// The phases below are order-dependent: parsing feeds the exclusion and
	// per-form passes, which in turn feed the non-starter counts used by the
	// verification and table-generation steps.
	loadUnicodeData()
	compactCCC()
	loadCompositionExclusions()
	completeCharFields(FCanonical)
	completeCharFields(FCompatibility)
	computeNonStarterCounts()
	verifyComputed()
	printChars()
	testDerived()
	printTestdata()
	makeTables()
}
||||
|
||||
// Command-line flags controlling which tables are generated and how much
// diagnostic output is produced.
var (
	tablelist = flag.String("tables",
		"all",
		"comma-separated list of which tables to generate; "+
			"can be 'decomp', 'recomp', 'info' and 'all'")
	test = flag.Bool("test",
		false,
		"test existing tables against DerivedNormalizationProps and generate test data for regression testing")
	verbose = flag.Bool("verbose",
		false,
		"write data to stdout as it is parsed")
)
||||
|
||||
const MaxChar = 0x10FFFF // anything above this shouldn't exist

// Quick Check properties of runes allow us to quickly
// determine whether a rune may occur in a normal form.
// For a given normal form, a rune may be guaranteed to occur
// verbatim (QC=Yes), may or may not combine with another
// rune (QC=Maybe), or may not occur (QC=No).
type QCResult int

const (
	QCUnknown QCResult = iota
	QCYes
	QCNo
	QCMaybe
)

// String returns the human-readable name of the quick-check result.
func (r QCResult) String() string {
	switch r {
	case QCNo:
		return "No"
	case QCMaybe:
		return "Maybe"
	case QCYes:
		return "Yes"
	default:
		return "***UNKNOWN***"
	}
}
||||
|
||||
// Form type indices into Char.forms.
const (
	FCanonical     = iota // NFC or NFD
	FCompatibility        // NFKC or NFKD
	FNumberOfFormTypes
)

// Mode indices into FormInfo.quickCheck and FormInfo.verified.
const (
	MComposed   = iota // NFC or NFKC
	MDecomposed        // NFD or NFKD
	MNumberOfModes
)
||||
|
||||
// This contains only the properties we're interested in.
type Char struct {
	name          string
	codePoint     rune  // if zero, this index is not a valid code point.
	ccc           uint8 // canonical combining class
	origCCC       uint8 // ccc value before compactCCC remapping
	excludeInComp bool  // from CompositionExclusions.txt
	compatDecomp  bool  // it has a compatibility expansion

	nTrailingNonStarters uint8
	nLeadingNonStarters  uint8 // must be equal to trailing if non-zero

	forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility

	state State
}

// chars is indexed by code point; entries with codePoint == 0 are unassigned.
var chars = make([]Char, MaxChar+1)

// cccMap maps the compacted CCC values back to the original ones
// (populated by compactCCC).
var cccMap = make(map[uint8]uint8)
||||
|
||||
func (c Char) String() string { |
||||
buf := new(bytes.Buffer) |
||||
|
||||
fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) |
||||
fmt.Fprintf(buf, " ccc: %v\n", c.ccc) |
||||
fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) |
||||
fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) |
||||
fmt.Fprintf(buf, " state: %v\n", c.state) |
||||
fmt.Fprintf(buf, " NFC:\n") |
||||
fmt.Fprint(buf, c.forms[FCanonical]) |
||||
fmt.Fprintf(buf, " NFKC:\n") |
||||
fmt.Fprint(buf, c.forms[FCompatibility]) |
||||
|
||||
return buf.String() |
||||
} |
||||
|
||||
// In UnicodeData.txt, some ranges are marked like this:
//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
// parseCharacter keeps a state variable indicating the weirdness.
type State int

const (
	SNormal State = iota // known to be zero for the type
	SFirst
	SLast
	SMissing
)

// lastChar tracks the most recently parsed code point.
var lastChar = rune('\u0000')

// isValid reports whether c corresponds to an assigned code point.
func (c Char) isValid() bool {
	return c.codePoint != 0 && c.state != SMissing
}
||||
|
||||
// FormInfo holds the per-normal-form (canonical or compatibility) properties
// computed for a single character.
type FormInfo struct {
	quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
	verified   [MNumberOfModes]bool     // index: MComposed or MDecomposed

	combinesForward  bool // May combine with rune on the right
	combinesBackward bool // May combine with rune on the left
	isOneWay         bool // Never appears in result
	inDecomp         bool // Some decompositions result in this char.
	decomp           Decomposition
	expandedDecomp   Decomposition // decomp recursively expanded (see completeCharFields)
}
||||
|
||||
func (f FormInfo) String() string { |
||||
buf := bytes.NewBuffer(make([]byte, 0)) |
||||
|
||||
fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) |
||||
fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) |
||||
fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) |
||||
fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) |
||||
fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) |
||||
fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) |
||||
fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) |
||||
fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) |
||||
|
||||
return buf.String() |
||||
} |
||||
|
||||
// Decomposition is a sequence of runes a character decomposes into.
type Decomposition []rune

// parseDecomposition parses a space-separated list of hexadecimal code
// points. If skipfirst is set, the first field (e.g. a "<compat>" tag) is
// discarded before parsing.
func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
	fields := strings.Split(s, " ")
	if skipfirst && len(fields) > 0 {
		fields = fields[1:]
	}
	for _, field := range fields {
		cp, err := strconv.ParseUint(field, 16, 64)
		if err != nil {
			return a, err
		}
		a = append(a, rune(cp))
	}
	return a, nil
}
||||
|
||||
// loadUnicodeData populates chars from UnicodeData.txt: name, CCC and the
// (canonical or compatibility) decomposition mapping of every code point.
func loadUnicodeData() {
	f := gen.OpenUCDFile("UnicodeData.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		r := p.Rune(ucd.CodePoint)
		char := &chars[r]

		char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
		decmap := p.String(ucd.DecompMapping)

		// A decomposition field that fails to parse as plain code points but
		// is non-empty starts with a "<tag>": a compatibility mapping.
		exp, err := parseDecomposition(decmap, false)
		isCompat := false
		if err != nil {
			if len(decmap) > 0 {
				exp, err = parseDecomposition(decmap, true)
				if err != nil {
					log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
				}
				isCompat = true
			}
		}

		char.name = p.String(ucd.Name)
		char.codePoint = r
		// A canonical decomposition is also a compatibility decomposition.
		char.forms[FCompatibility].decomp = exp
		if !isCompat {
			char.forms[FCanonical].decomp = exp
		} else {
			char.compatDecomp = true
		}
		// NOTE(review): this re-assigns the same value as the unconditional
		// assignment above; it looks redundant but is kept as-is.
		if len(decmap) > 0 {
			char.forms[FCompatibility].decomp = exp
		}
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
}
||||
|
||||
// compactCCC converts the sparse set of CCC values to a continguous one,
|
||||
// reducing the number of bits needed from 8 to 6.
|
||||
func compactCCC() { |
||||
m := make(map[uint8]uint8) |
||||
for i := range chars { |
||||
c := &chars[i] |
||||
m[c.ccc] = 0 |
||||
} |
||||
cccs := []int{} |
||||
for v, _ := range m { |
||||
cccs = append(cccs, int(v)) |
||||
} |
||||
sort.Ints(cccs) |
||||
for i, c := range cccs { |
||||
cccMap[uint8(i)] = uint8(c) |
||||
m[uint8(c)] = uint8(i) |
||||
} |
||||
for i := range chars { |
||||
c := &chars[i] |
||||
c.origCCC = c.ccc |
||||
c.ccc = m[c.ccc] |
||||
} |
||||
if len(m) >= 1<<6 { |
||||
log.Fatalf("too many difference CCC values: %d >= 64", len(m)) |
||||
} |
||||
} |
||||
|
||||
// CompositionExclusions.txt has form:
// 0958    # ...
// See https://unicode.org/reports/tr44/ for full explanation
//
// loadCompositionExclusions marks each listed code point in chars as
// excluded from composition.
func loadCompositionExclusions() {
	f := gen.OpenUCDFile("CompositionExclusions.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		c := &chars[p.Rune(0)]
		if c.excludeInComp {
			log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
		}
		c.excludeInComp = true
	}
	if e := p.Err(); e != nil {
		log.Fatal(e)
	}
}
||||
|
||||
// hasCompatDecomp returns true if any of the recursive
|
||||
// decompositions contains a compatibility expansion.
|
||||
// In this case, the character may not occur in NFK*.
|
||||
func hasCompatDecomp(r rune) bool { |
||||
c := &chars[r] |
||||
if c.compatDecomp { |
||||
return true |
||||
} |
||||
for _, d := range c.forms[FCompatibility].decomp { |
||||
if hasCompatDecomp(d) { |
||||
return true |
||||
} |
||||
} |
||||
return false |
||||
} |
||||
|
||||
// Hangul related constants.
const (
	HangulBase = 0xAC00
	HangulEnd  = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)

	JamoLBase = 0x1100
	JamoLEnd  = 0x1113
	JamoVBase = 0x1161
	JamoVEnd  = 0x1176
	JamoTBase = 0x11A8
	JamoTEnd  = 0x11C3

	JamoLVTCount = 19 * 21 * 28
	JamoTCount   = 28
)

// isHangul reports whether r is a precomposed Hangul syllable.
func isHangul(r rune) bool {
	return HangulBase <= r && r < HangulEnd
}

// isHangulWithoutJamoT reports whether r is a Hangul syllable with no
// trailing Jamo T (i.e. an LV syllable: its offset from HangulBase is a
// multiple of JamoTCount).
func isHangulWithoutJamoT(r rune) bool {
	if !isHangul(r) {
		return false
	}
	off := r - HangulBase
	return off < JamoLVTCount && off%JamoTCount == 0
}
||||
|
||||
func ccc(r rune) uint8 { |
||||
return chars[r].ccc |
||||
} |
||||
|
||||
// Insert a rune in a buffer, ordered by Canonical Combining Class.
|
||||
func insertOrdered(b Decomposition, r rune) Decomposition { |
||||
n := len(b) |
||||
b = append(b, 0) |
||||
cc := ccc(r) |
||||
if cc > 0 { |
||||
// Use bubble sort.
|
||||
for ; n > 0; n-- { |
||||
if ccc(b[n-1]) <= cc { |
||||
break |
||||
} |
||||
b[n] = b[n-1] |
||||
} |
||||
} |
||||
b[n] = r |
||||
return b |
||||
} |
||||
|
||||
// Recursively decompose.
//
// decomposeRecursive appends the full (recursively expanded) decomposition of
// r for the given form to d, inserting each terminal rune in CCC order.
func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
	dcomp := chars[r].forms[form].decomp
	if len(dcomp) == 0 {
		// r does not decompose further: it is a terminal rune.
		return insertOrdered(d, r)
	}
	for _, c := range dcomp {
		d = decomposeRecursive(form, c, d)
	}
	return d
}
||||
|
||||
// completeCharFields derives, for the given form (FCanonical or
// FCompatibility), the per-character composition properties: expanded
// decompositions, one-way (non-recomposable) status, forward/backward
// combining flags, and quick-check values.
func completeCharFields(form int) {
	// Phase 0: pre-expand decomposition.
	for i := range chars {
		f := &chars[i].forms[form]
		if len(f.decomp) == 0 {
			continue
		}
		exp := make(Decomposition, 0)
		for _, c := range f.decomp {
			exp = decomposeRecursive(form, c, exp)
		}
		f.expandedDecomp = exp
	}

	// Phase 1: composition exclusion, mark decomposition.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		// Marks script-specific exclusions and version restricted.
		f.isOneWay = c.excludeInComp

		// Singletons
		f.isOneWay = f.isOneWay || len(f.decomp) == 1

		// Non-starter decompositions
		if len(f.decomp) > 1 {
			chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
			f.isOneWay = f.isOneWay || chk
		}

		// Runes that decompose into more than two runes.
		f.isOneWay = f.isOneWay || len(f.decomp) > 2

		if form == FCompatibility {
			f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
		}

		// Mark every rune that appears in some decomposition.
		for _, r := range f.decomp {
			chars[r].forms[form].inDecomp = true
		}
	}

	// Phase 2: forward and backward combining.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		// Only two-rune, recomposable decompositions define a combining pair.
		if !f.isOneWay && len(f.decomp) == 2 {
			f0 := &chars[f.decomp[0]].forms[form]
			f1 := &chars[f.decomp[1]].forms[form]
			if !f0.isOneWay {
				f0.combinesForward = true
			}
			if !f1.isOneWay {
				f1.combinesBackward = true
			}
		}
		// LV Hangul syllables can still combine with a trailing Jamo T.
		if isHangulWithoutJamoT(rune(i)) {
			f.combinesForward = true
		}
	}

	// Phase 3: quick check values.
	for i := range chars {
		c := &chars[i]
		f := &c.forms[form]

		switch {
		case len(f.decomp) > 0:
			f.quickCheck[MDecomposed] = QCNo
		case isHangul(rune(i)):
			f.quickCheck[MDecomposed] = QCNo
		default:
			f.quickCheck[MDecomposed] = QCYes
		}
		switch {
		case f.isOneWay:
			f.quickCheck[MComposed] = QCNo
		case (i & 0xffff00) == JamoLBase:
			// i is in the Jamo block (0x1100-0x11FF): set the appropriate
			// algorithmic-composition flags for L, V and T Jamo.
			f.quickCheck[MComposed] = QCYes
			if JamoLBase <= i && i < JamoLEnd {
				f.combinesForward = true
			}
			if JamoVBase <= i && i < JamoVEnd {
				f.quickCheck[MComposed] = QCMaybe
				f.combinesBackward = true
				f.combinesForward = true
			}
			if JamoTBase <= i && i < JamoTEnd {
				f.quickCheck[MComposed] = QCMaybe
				f.combinesBackward = true
			}
		case !f.combinesBackward:
			f.quickCheck[MComposed] = QCYes
		default:
			f.quickCheck[MComposed] = QCMaybe
		}
	}
}
||||
|
||||
func computeNonStarterCounts() { |
||||
// Phase 4: leading and trailing non-starter count
|
||||
for i := range chars { |
||||
c := &chars[i] |
||||
|
||||
runes := []rune{rune(i)} |
||||
// We always use FCompatibility so that the CGJ insertion points do not
|
||||
// change for repeated normalizations with different forms.
|
||||
if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { |
||||
runes = exp |
||||
} |
||||
// We consider runes that combine backwards to be non-starters for the
|
||||
// purpose of Stream-Safe Text Processing.
|
||||
for _, r := range runes { |
||||
if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
||||
break |
||||
} |
||||
c.nLeadingNonStarters++ |
||||
} |
||||
for i := len(runes) - 1; i >= 0; i-- { |
||||
if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { |
||||
break |
||||
} |
||||
c.nTrailingNonStarters++ |
||||
} |
||||
if c.nTrailingNonStarters > 3 { |
||||
log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) |
||||
} |
||||
|
||||
if isHangul(rune(i)) { |
||||
c.nTrailingNonStarters = 2 |
||||
if isHangulWithoutJamoT(rune(i)) { |
||||
c.nTrailingNonStarters = 1 |
||||
} |
||||
} |
||||
|
||||
if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { |
||||
log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) |
||||
} |
||||
if t := c.nTrailingNonStarters; t > 3 { |
||||
log.Fatalf("%U: number of trailing non-starters is %d > 3", t) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func printBytes(w io.Writer, b []byte, name string) { |
||||
fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b)) |
||||
fmt.Fprintf(w, "var %s = [...]byte {", name) |
||||
for i, c := range b { |
||||
switch { |
||||
case i%64 == 0: |
||||
fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63) |
||||
case i%8 == 0: |
||||
fmt.Fprintf(w, "\n") |
||||
} |
||||
fmt.Fprintf(w, "0x%.2X, ", c) |
||||
} |
||||
fmt.Fprint(w, "\n}\n\n") |
||||
} |
||||
|
||||
// See forminfo.go for format.
//
// makeEntry packs the composition-related properties of c (for one form)
// into the 16-bit trie value:
//   0x40 - precomposed Hangul syllable
//   0x20 - combines forward
//   0x10/0x18 - quickCheck[MComposed] No / Maybe
//   0x04 - quickCheck[MDecomposed] No
//   low bits - number of trailing non-starters
func makeEntry(f *FormInfo, c *Char) uint16 {
	e := uint16(0)
	if r := c.codePoint; HangulBase <= r && r < HangulEnd {
		e |= 0x40
	}
	if f.combinesForward {
		e |= 0x20
	}
	if f.quickCheck[MDecomposed] == QCNo {
		e |= 0x4
	}
	switch f.quickCheck[MComposed] {
	case QCYes:
	case QCNo:
		e |= 0x10
	case QCMaybe:
		e |= 0x18
	default:
		log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
	}
	e |= uint16(c.nTrailingNonStarters)
	return e
}
||||
|
||||
// decompSet keeps track of unique decompositions, grouped by whether
// the decomposition is followed by a trailing and/or leading CCC.
type decompSet [7]map[string]bool

// Group indices into decompSet; decompositions are emitted in this order so
// that the generated table can classify an entry by its offset.
const (
	normalDecomp = iota
	firstMulti
	firstCCC
	endMulti
	firstLeadingCCC
	firstCCCZeroExcept
	firstStarterWithNLead
	lastDecomp
)

// cname[i] is the constant name emitted for the boundary AFTER group i
// (hence the names are shifted by one relative to the indices above).
var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}

// makeDecompSet returns a decompSet with all group maps initialized.
func makeDecompSet() decompSet {
	m := decompSet{}
	for i := range m {
		m[i] = make(map[string]bool)
	}
	return m
}

// insert records the encoded decomposition s under group key.
func (m *decompSet) insert(key int, s string) {
	m[key][s] = true
}
||||
|
||||
// printCharInfoTables emits the decomposition byte table ("decomps") and the
// per-form tries ("nfc", "nfkc") to w, returning the total size in bytes of
// the generated data.
func printCharInfoTables(w io.Writer) int {
	// mkstr encodes the expanded decomposition of r as the byte string stored
	// in the decomps table (header byte, UTF-8 body, optional tccc/lccc
	// bytes) and classifies it into one of the decompSet groups.
	mkstr := func(r rune, f *FormInfo) (int, string) {
		d := f.expandedDecomp
		s := string([]rune(d))
		if max := 1 << 6; len(s) >= max {
			const msg = "%U: too many bytes in decomposition: %d >= %d"
			log.Fatalf(msg, r, len(s), max)
		}
		// Header byte: length in the low 6 bits, 0x40 if not QC=Yes for
		// composition, 0x80 if the decomposition combines forward.
		head := uint8(len(s))
		if f.quickCheck[MComposed] != QCYes {
			head |= 0x40
		}
		if f.combinesForward {
			head |= 0x80
		}
		s = string([]byte{head}) + s

		lccc := ccc(d[0])
		tccc := ccc(d[len(d)-1])
		cc := ccc(r)
		if cc != 0 && lccc == 0 && tccc == 0 {
			log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
		}
		if tccc < lccc && lccc != 0 {
			const msg = "%U: lccc (%d) must be <= tcc (%d)"
			log.Fatalf(msg, r, lccc, tccc)
		}
		index := normalDecomp
		nTrail := chars[r].nTrailingNonStarters
		nLead := chars[r].nLeadingNonStarters
		if tccc > 0 || lccc > 0 || nTrail > 0 {
			// Append the trailing-CCC byte (CCC shifted left 2, nTrail in the
			// low bits) and, when needed, a leading-CCC byte.
			tccc <<= 2
			tccc |= nTrail
			s += string([]byte{tccc})
			index = endMulti
			for _, r := range d[1:] {
				if ccc(r) == 0 {
					index = firstCCC
				}
			}
			if lccc > 0 || nLead > 0 {
				s += string([]byte{lccc})
				if index == firstCCC {
					log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
				}
				index = firstLeadingCCC
			}
			if cc != lccc {
				if cc != 0 {
					log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
				}
				index = firstCCCZeroExcept
			}
		} else if len(d) > 1 {
			index = firstMulti
		}
		return index, s
	}

	decompSet := makeDecompSet()
	const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
	decompSet.insert(firstStarterWithNLead, nLeadStr)

	// Store the uniqued decompositions in a byte buffer,
	// preceded by their byte length.
	for _, c := range chars {
		for _, f := range c.forms {
			if len(f.expandedDecomp) == 0 {
				continue
			}
			if f.combinesBackward {
				log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
			}
			index, s := mkstr(c.codePoint, &f)
			decompSet.insert(index, s)
		}
	}

	// Concatenate the groups in order, recording each string's offset and
	// emitting the group-boundary constants.
	decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
	size := 0
	positionMap := make(map[string]uint16)
	decompositions.WriteString("\000")
	fmt.Fprintln(w, "const (")
	for i, m := range decompSet {
		sa := []string{}
		for s := range m {
			sa = append(sa, s)
		}
		sort.Strings(sa)
		for _, s := range sa {
			p := decompositions.Len()
			decompositions.WriteString(s)
			positionMap[s] = uint16(p)
		}
		if cname[i] != "" {
			fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
		}
	}
	fmt.Fprintln(w, "maxDecomp = 0x8000")
	fmt.Fprintln(w, ")")
	b := decompositions.Bytes()
	printBytes(w, b, "decomps")
	size += len(b)

	// Build one trie per form, mapping a rune either to the offset of its
	// decomposition or to an inline 0x8000|entry value.
	varnames := []string{"nfc", "nfkc"}
	for i := 0; i < FNumberOfFormTypes; i++ {
		trie := triegen.NewTrie(varnames[i])

		for r, c := range chars {
			f := c.forms[i]
			d := f.expandedDecomp
			if len(d) != 0 {
				_, key := mkstr(c.codePoint, &f)
				trie.Insert(rune(r), uint64(positionMap[key]))
				if c.ccc != ccc(d[0]) {
					// We assume the lead ccc of a decomposition !=0 in this case.
					if ccc(d[0]) == 0 {
						log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
					}
				}
			} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
				// Handle cases where it can't be detected that the nLead should be equal
				// to nTrail.
				trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
			} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
				trie.Insert(c.codePoint, uint64(0x8000|v))
			}
		}
		sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
		if err != nil {
			log.Fatal(err)
		}
		size += sz
	}
	return size
}
||||
|
||||
// contains reports whether s occurs in the slice sa.
func contains(sa []string, s string) bool {
	for i := range sa {
		if sa[i] == s {
			return true
		}
	}
	return false
}
||||
|
||||
func makeTables() { |
||||
w := &bytes.Buffer{} |
||||
|
||||
size := 0 |
||||
if *tablelist == "" { |
||||
return |
||||
} |
||||
list := strings.Split(*tablelist, ",") |
||||
if *tablelist == "all" { |
||||
list = []string{"recomp", "info"} |
||||
} |
||||
|
||||
// Compute maximum decomposition size.
|
||||
max := 0 |
||||
for _, c := range chars { |
||||
if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { |
||||
max = n |
||||
} |
||||
} |
||||
fmt.Fprintln(w, `import "sync"`) |
||||
fmt.Fprintln(w) |
||||
|
||||
fmt.Fprintln(w, "const (") |
||||
fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") |
||||
fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) |
||||
fmt.Fprintln(w) |
||||
fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") |
||||
fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") |
||||
fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") |
||||
fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") |
||||
fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) |
||||
fmt.Fprintln(w, ")\n") |
||||
|
||||
// Print the CCC remap table.
|
||||
size += len(cccMap) |
||||
fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) |
||||
for i := 0; i < len(cccMap); i++ { |
||||
if i%8 == 0 { |
||||
fmt.Fprintln(w) |
||||
} |
||||
fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) |
||||
} |
||||
fmt.Fprintln(w, "\n}\n") |
||||
|
||||
if contains(list, "info") { |
||||
size += printCharInfoTables(w) |
||||
} |
||||
|
||||
if contains(list, "recomp") { |
||||
// Note that we use 32 bit keys, instead of 64 bit.
|
||||
// This clips the bits of three entries, but we know
|
||||
// this won't cause a collision. The compiler will catch
|
||||
// any changes made to UnicodeData.txt that introduces
|
||||
// a collision.
|
||||
// Note that the recomposition map for NFC and NFKC
|
||||
// are identical.
|
||||
|
||||
// Recomposition map
|
||||
nrentries := 0 |
||||
for _, c := range chars { |
||||
f := c.forms[FCanonical] |
||||
if !f.isOneWay && len(f.decomp) > 0 { |
||||
nrentries++ |
||||
} |
||||
} |
||||
sz := nrentries * 8 |
||||
size += sz |
||||
fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) |
||||
fmt.Fprintln(w, "var recompMap map[uint32]rune") |
||||
fmt.Fprintln(w, "var recompMapOnce sync.Once\n") |
||||
fmt.Fprintln(w, `const recompMapPacked = "" +`) |
||||
var buf [8]byte |
||||
for i, c := range chars { |
||||
f := c.forms[FCanonical] |
||||
d := f.decomp |
||||
if !f.isOneWay && len(d) > 0 { |
||||
key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) |
||||
binary.BigEndian.PutUint32(buf[:4], key) |
||||
binary.BigEndian.PutUint32(buf[4:], uint32(i)) |
||||
fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i)) |
||||
} |
||||
} |
||||
// hack so we don't have to special case the trailing plus sign
|
||||
fmt.Fprintf(w, ` ""`) |
||||
fmt.Fprintln(w) |
||||
} |
||||
|
||||
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) |
||||
gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes()) |
||||
} |
||||
|
||||
func printChars() { |
||||
if *verbose { |
||||
for _, c := range chars { |
||||
if !c.isValid() || c.state == SMissing { |
||||
continue |
||||
} |
||||
fmt.Println(c) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// verifyComputed does various consistency tests, cross-checking the
// computed quick-check values, decompositions, and combining properties
// of every character. Any violation is fatal: the generated tables would
// be inconsistent.
func verifyComputed() {
	for i, c := range chars {
		for _, f := range c.forms {
			// A rune that decomposes must have NF*D quick check No, and
			// vice versa (Hangul is algorithmically decomposed, so exempt).
			isNo := (f.quickCheck[MDecomposed] == QCNo)
			if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
				log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
			}

			// NF*C quick check Maybe must coincide exactly with combinesBackward.
			isMaybe := f.quickCheck[MComposed] == QCMaybe
			if f.combinesBackward != isMaybe {
				log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
			}
			if len(f.decomp) > 0 && f.combinesForward && isMaybe {
				log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
			}

			if len(f.expandedDecomp) != 0 {
				continue
			}
			// For non-decomposing runes, having leading non-starters should
			// match having non-zero ccc or combining backward.
			if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
				// We accept these runes to be treated differently (it only affects
				// segment breaking in iteration, most likely on improper use), but
				// reconsider if more characters are added.
				// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
				// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
				// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
				// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
				// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
				// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
				if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
					log.Fatalf("%U: nLead was %v; want %v", i, a, b)
				}
			}
		}
		// NFC and NFKC must agree on backward combination.
		nfc := c.forms[FCanonical]
		nfkc := c.forms[FCompatibility]
		if nfc.combinesBackward != nfkc.combinesBackward {
			log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
		}
	}
}
||||
|
||||
// testDerived compares the quick-check values we computed against the
// values in DerivedNormalizationProps.txt. Mismatches are logged (not
// fatal) so all discrepancies are reported in one run.
//
// DerivedNormalizationProps.txt has form:
// 00C0..00C5    ; NFD_QC; N # ...
// 0374          ; NFD_QC; N # ...
// See https://unicode.org/reports/tr44/ for full explanation
func testDerived() {
	f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
	defer f.Close()
	p := ucd.New(f)
	for p.Next() {
		r := p.Rune(0)
		c := &chars[r]

		// Map the property name to our (form, mode) pair; skip any
		// property that is not a quick-check field.
		var ftype, mode int
		qt := p.String(1)
		switch qt {
		case "NFC_QC":
			ftype, mode = FCanonical, MComposed
		case "NFD_QC":
			ftype, mode = FCanonical, MDecomposed
		case "NFKC_QC":
			ftype, mode = FCompatibility, MComposed
		case "NFKD_QC":
			ftype, mode = FCompatibility, MDecomposed
		default:
			continue
		}
		var qr QCResult
		switch p.String(2) {
		case "Y":
			qr = QCYes
		case "N":
			qr = QCNo
		case "M":
			qr = QCMaybe
		default:
			log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
		}
		if got := c.forms[ftype].quickCheck[mode]; got != qr {
			log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
		}
		// Mark this value as covered by the UCD file.
		c.forms[ftype].verified[mode] = true
	}
	if err := p.Err(); err != nil {
		log.Fatal(err)
	}
	// Any unspecified value must be QCYes. Verify this.
	for i, c := range chars {
		for j, fd := range c.forms {
			for k, qr := range fd.quickCheck {
				if !fd.verified[k] && qr != QCYes {
					m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
					log.Printf(m, i, j, k, qr, c.name)
				}
			}
		}
	}
}
||||
|
||||
// testHeader is the fixed prelude written at the top of the generated
// data_test.go file; printTestdata appends testData entries after it.
// It declares the helper constructors f (NFC/NFKC identical) and
// g (NFC/NFKC differ) used by the generated entries.
var testHeader = `const (
	Yes = iota
	No
	Maybe
)

type formData struct {
	qc              uint8
	combinesForward bool
	decomposition   string
}

type runeData struct {
	r      rune
	ccc    uint8
	nLead  uint8
	nTrail uint8
	f      [2]formData // 0: canonical; 1: compatibility
}

func f(qc uint8, cf bool, dec string) [2]formData {
	return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
}

func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
	return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
}

var testData = []runeData{
`
||||
|
||||
func printTestdata() { |
||||
type lastInfo struct { |
||||
ccc uint8 |
||||
nLead uint8 |
||||
nTrail uint8 |
||||
f string |
||||
} |
||||
|
||||
last := lastInfo{} |
||||
w := &bytes.Buffer{} |
||||
fmt.Fprintf(w, testHeader) |
||||
for r, c := range chars { |
||||
f := c.forms[FCanonical] |
||||
qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
||||
f = c.forms[FCompatibility] |
||||
qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) |
||||
s := "" |
||||
if d == dk && qc == qck && cf == cfk { |
||||
s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) |
||||
} else { |
||||
s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) |
||||
} |
||||
current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} |
||||
if last != current { |
||||
fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) |
||||
last = current |
||||
} |
||||
} |
||||
fmt.Fprintln(w, "}") |
||||
gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes()) |
||||
} |
@ -0,0 +1,609 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Note: the file data_test.go that is generated should not be checked in.
|
||||
//go:generate go run maketables.go triegen.go
|
||||
//go:generate go test -tags test
|
||||
|
||||
// Package norm contains types and functions for normalizing Unicode strings.
|
||||
package norm // import "golang.org/x/text/unicode/norm"
|
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// A Form denotes a canonical representation of Unicode code points.
|
||||
// The Unicode-defined normalization and equivalence forms are:
|
||||
//
|
||||
// NFC Unicode Normalization Form C
|
||||
// NFD Unicode Normalization Form D
|
||||
// NFKC Unicode Normalization Form KC
|
||||
// NFKD Unicode Normalization Form KD
|
||||
//
|
||||
// For a Form f, this documentation uses the notation f(x) to mean
|
||||
// the bytes or string x converted to the given form.
|
||||
// A position n in x is called a boundary if conversion to the form can
|
||||
// proceed independently on both sides:
|
||||
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
||||
//
|
||||
// References: https://unicode.org/reports/tr15/ and
|
||||
// https://unicode.org/notes/tn5/.
|
||||
type Form int

const (
	NFC  Form = iota // Unicode Normalization Form C (canonical composition).
	NFD              // Unicode Normalization Form D (canonical decomposition).
	NFKC             // Unicode Normalization Form KC (compatibility composition).
	NFKD             // Unicode Normalization Form KD (compatibility decomposition).
)
||||
|
||||
// Bytes returns f(b). May return b if f(b) = b.
|
||||
func (f Form) Bytes(b []byte) []byte { |
||||
src := inputBytes(b) |
||||
ft := formTable[f] |
||||
n, ok := ft.quickSpan(src, 0, len(b), true) |
||||
if ok { |
||||
return b |
||||
} |
||||
out := make([]byte, n, len(b)) |
||||
copy(out, b[0:n]) |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} |
||||
return doAppendInner(&rb, n) |
||||
} |
||||
|
||||
// String returns f(s).
|
||||
func (f Form) String(s string) string { |
||||
src := inputString(s) |
||||
ft := formTable[f] |
||||
n, ok := ft.quickSpan(src, 0, len(s), true) |
||||
if ok { |
||||
return s |
||||
} |
||||
out := make([]byte, n, len(s)) |
||||
copy(out, s[0:n]) |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} |
||||
return string(doAppendInner(&rb, n)) |
||||
} |
||||
|
||||
// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
	src := inputBytes(b)
	ft := formTable[f]
	// quickSpan handles the common case without allocating a reorderBuffer.
	bp, ok := ft.quickSpan(src, 0, len(b), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
	// cmpNormalBytes compares each flushed segment against rb.out, which
	// below is aliased to the unprocessed tail of b itself.
	rb.setFlusher(nil, cmpNormalBytes)
	for bp < len(b) {
		rb.out = b[bp:]
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		// Skip over any following span that is already normalized.
		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
	}
	return true
}
||||
|
||||
func cmpNormalBytes(rb *reorderBuffer) bool { |
||||
b := rb.out |
||||
for i := 0; i < rb.nrune; i++ { |
||||
info := rb.rune[i] |
||||
if int(info.size) > len(b) { |
||||
return false |
||||
} |
||||
p := info.pos |
||||
pe := p + info.size |
||||
for ; p < pe; p++ { |
||||
if b[0] != rb.byte[p] { |
||||
return false |
||||
} |
||||
b = b[1:] |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
	src := inputString(s)
	ft := formTable[f]
	// quickSpan handles the common case without allocating a reorderBuffer.
	bp, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
	// The flusher compares each flushed segment against s starting at bp;
	// the closure deliberately captures and advances the outer bp.
	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			if bp+int(info.size) > len(s) {
				return false
			}
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if s[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		return true
	})
	for bp < len(s) {
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		// Skip over any following span that is already normalized.
		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
	}
	return true
}
||||
|
||||
// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns
// whether the decomposition is still in progress.
func patchTail(rb *reorderBuffer) bool {
	info, p := lastRuneStart(&rb.f, rb.out)
	if p == -1 || info.size == 0 {
		// No complete rune at the tail; nothing to patch.
		return true
	}
	end := p + int(info.size)
	extra := len(rb.out) - end
	if extra > 0 {
		// Potentially allocating memory. However, this only
		// happens with ill-formed UTF-8.
		x := make([]byte, 0)
		x = append(x, rb.out[len(rb.out)-extra:]...)
		rb.out = rb.out[:end]
		decomposeToLastBoundary(rb)
		rb.doFlush()
		// Re-append the illegal continuation bytes unchanged.
		rb.out = append(rb.out, x...)
		return false
	}
	// Pull the last rune off the output and reinsert it through the
	// reorder buffer so it is merged/reordered correctly.
	buf := rb.out[p:]
	rb.out = rb.out[:p]
	decomposeToLastBoundary(rb)
	if s := rb.ss.next(info); s == ssStarter {
		rb.doFlush()
		rb.ss.first(info)
	} else if s == ssOverflow {
		// Too many non-starters: flush and insert a CGJ per Stream-Safe.
		rb.doFlush()
		rb.insertCGJ()
		rb.ss = 0
	}
	rb.insertUnsafe(inputBytes(buf), 0, info)
	return true
}
||||
|
||||
func appendQuick(rb *reorderBuffer, i int) int { |
||||
if rb.nsrc == i { |
||||
return i |
||||
} |
||||
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) |
||||
rb.out = rb.src.appendSlice(rb.out, i, end) |
||||
return end |
||||
} |
||||
|
||||
// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
	return f.doAppend(out, inputBytes(src), len(src))
}
||||
|
||||
// doAppend normalizes the first n bytes of src and appends the result to
// out, which must be nil, empty, or already in form f.
func (f Form) doAppend(out []byte, src input, n int) []byte {
	if n == 0 {
		return out
	}
	ft := formTable[f]
	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
	if len(out) == 0 {
		p, _ := ft.quickSpan(src, 0, n, true)
		out = src.appendSlice(out, 0, p)
		if p == n {
			// Entire input was already normalized.
			return out
		}
		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
		return doAppendInner(&rb, p)
	}
	// out is non-empty: its tail may need to be merged with the new input,
	// which the package-level doAppend handles.
	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
	return doAppend(&rb, out, 0)
}
||||
|
||||
// doAppend normalizes rb.src starting at p and appends the result to out,
// merging the new input with the (possibly open) segment at the end of
// out when necessary.
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	rb.setFlusher(out, appendFlush)
	src, n := rb.src, rb.nsrc
	// A non-empty destination may end mid-segment and need merging.
	doMerge := len(out) > 0
	if q := src.skipContinuationBytes(p); q > p {
		// Move leading non-starters to destination.
		rb.out = src.appendSlice(rb.out, p, q)
		p = q
		doMerge = patchTail(rb)
	}
	fd := &rb.f
	if doMerge {
		var info Properties
		if p < n {
			info = fd.info(src, p)
			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
				// The next rune may combine with the tail of out: pull the
				// open segment back into the buffer and decompose across it.
				if p == 0 {
					decomposeToLastBoundary(rb)
				}
				p = decomposeSegment(rb, p, true)
			}
		}
		if info.size == 0 {
			rb.doFlush()
			// Append incomplete UTF-8 encoding.
			return src.appendSlice(rb.out, p, n)
		}
		if rb.nrune > 0 {
			return doAppendInner(rb, p)
		}
	}
	p = appendQuick(rb, p)
	return doAppendInner(rb, p)
}
||||
|
||||
func doAppendInner(rb *reorderBuffer, p int) []byte { |
||||
for n := rb.nsrc; p < n; { |
||||
p = decomposeSegment(rb, p, true) |
||||
p = appendQuick(rb, p) |
||||
} |
||||
return rb.out |
||||
} |
||||
|
||||
// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
	return f.doAppend(out, inputString(src), len(src))
}
||||
|
||||
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
	return n
}
||||
|
||||
// Span implements transform.SpanningTransformer. It returns a boundary n such
|
||||
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
|
||||
func (f Form) Span(b []byte, atEOF bool) (n int, err error) { |
||||
n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) |
||||
if n < len(b) { |
||||
if !ok { |
||||
err = transform.ErrEndOfSpan |
||||
} else { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) SpanString(s string, atEOF bool) (n int, err error) { |
||||
n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) |
||||
if n < len(s) { |
||||
if !ok { |
||||
err = transform.ErrEndOfSpan |
||||
} else { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether any non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might be become
// non-normalized by appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
	var lastCC uint8
	ss := streamSafe(0)
	lastSegStart := i
	for n = end; i < n; {
		// ASCII is always normalized and always a boundary; skip it in bulk.
		if j := src.skipASCII(i, n); i != j {
			i = j
			lastSegStart = i - 1
			lastCC = 0
			ss = 0
			continue
		}
		info := f.info(src, i)
		if info.size == 0 {
			if atEOF {
				// include incomplete runes
				return n, true
			}
			return lastSegStart, true
		}
		// This block needs to be before the next, because it is possible to
		// have an overflow for runes that are starters (e.g. with U+FF9E).
		switch ss.next(info) {
		case ssStarter:
			lastSegStart = i
		case ssOverflow:
			// Stream-Safe overflow: a CGJ would have to be inserted here.
			return lastSegStart, false
		case ssSuccess:
			if lastCC > info.ccc {
				// Combining classes out of order: needs reordering.
				return lastSegStart, false
			}
		}
		// Stop at the first rune whose quick-check value is not Yes for
		// the target form.
		if f.composing {
			if !info.isYesC() {
				break
			}
		} else {
			if !info.isYesD() {
				break
			}
		}
		lastCC = info.ccc
		i += int(info.size)
	}
	if i == n {
		if !atEOF {
			// The final segment may still change; exclude it.
			n = lastSegStart
		}
		return n, true
	}
	return lastSegStart, false
}
||||
|
||||
// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
	return n
}
||||
|
||||
// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
	return f.firstBoundary(inputBytes(b), len(b))
}
||||
|
||||
// firstBoundary returns the position of the first boundary in the first
// nsrc bytes of src, or -1 if there is none.
func (f Form) firstBoundary(src input, nsrc int) int {
	// Leading continuation bytes can never start a boundary.
	i := src.skipContinuationBytes(0)
	if i >= nsrc {
		return -1
	}
	fd := formTable[f]
	ss := streamSafe(0)
	// We should call ss.first here, but we can't as the first rune is
	// skipped already. This means FirstBoundary can't really determine
	// CGJ insertion points correctly. Luckily it doesn't have to.
	for {
		info := fd.info(src, i)
		if info.size == 0 {
			// Incomplete or ill-formed rune: no boundary found.
			return -1
		}
		if s := ss.next(info); s != ssSuccess {
			// A starter (or Stream-Safe overflow) marks the boundary.
			return i
		}
		i += int(info.size)
		if i >= nsrc {
			if !info.BoundaryAfter() && !ss.isMax() {
				return -1
			}
			return nsrc
		}
	}
}
||||
|
||||
// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
	return f.firstBoundary(inputString(s), len(s))
}
||||
|
||||
// NextBoundary reports the index of the boundary between the first and next
// segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundary(b []byte, atEOF bool) int {
	return f.nextBoundary(inputBytes(b), len(b), atEOF)
}
||||
|
||||
// NextBoundaryInString reports the index of the boundary between the first and
// next segment in s or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
	return f.nextBoundary(inputString(s), len(s), atEOF)
}
||||
|
||||
// nextBoundary returns the index of the boundary after the first segment
// in the first nsrc bytes of src, or -1 when atEOF is false and more
// input is needed to decide.
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
	if nsrc == 0 {
		if atEOF {
			return 0
		}
		return -1
	}
	fd := formTable[f]
	info := fd.info(src, 0)
	if info.size == 0 {
		if atEOF {
			// Treat the ill-formed leading byte as its own segment.
			return 1
		}
		return -1
	}
	ss := streamSafe(0)
	ss.first(info)

	for i := int(info.size); i < nsrc; i += int(info.size) {
		info = fd.info(src, i)
		if info.size == 0 {
			if atEOF {
				return i
			}
			return -1
		}
		// TODO: Using streamSafe to determine the boundary isn't the same as
		// using BoundaryBefore. Determine which should be used.
		if s := ss.next(info); s != ssSuccess {
			return i
		}
	}
	// Ran off the end: only a definite boundary if we are at EOF, the last
	// rune ends a segment, or the non-starter count is saturated.
	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
		return -1
	}
	return nsrc
}
||||
|
||||
// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
	return lastBoundary(formTable[f], b)
}
||||
|
||||
// lastBoundary returns the position of the last boundary in b, scanning
// backwards from the end, or -1 if b contains no boundary.
func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	if p == -1 {
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		// Re-scan without the trailing incomplete rune.
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	// Walk backwards over non-starters until a starter (segment start) is
	// found or the Stream-Safe non-starter budget overflows.
	ss := streamSafe(0)
	v := ss.backwards(info)
	for i = p; i >= 0 && v != ssStarter; i = p {
		info, p = lastRuneStart(fd, b[:i])
		if v = ss.backwards(info); v == ssOverflow {
			break
		}
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
	}
	return i
}
||||
|
||||
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	if s := rb.ss.next(info); s == ssStarter {
		// TODO: this could be removed if we don't support merging.
		if rb.nrune > 0 {
			// A starter while the buffer already holds runes ends the
			// current segment; flush without consuming it.
			goto end
		}
	} else if s == ssOverflow {
		// Too many non-starters: emit a CGJ per Stream-Safe Text Processing.
		rb.insertCGJ()
		goto end
	}
	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
		return int(err)
	}
	for {
		sp += int(info.size)
		if sp >= rb.nsrc {
			if !atEOF && !info.BoundaryAfter() {
				// Segment may continue into unseen input.
				return int(iShortSrc)
			}
			break
		}
		info = rb.f.info(rb.src, sp)
		if info.size == 0 {
			if !atEOF {
				return int(iShortSrc)
			}
			break
		}
		if s := rb.ss.next(info); s == ssStarter {
			break
		} else if s == ssOverflow {
			rb.insertCGJ()
			break
		}
		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
			return int(err)
		}
	}
end:
	if !rb.doFlush() {
		// Destination could not hold the flushed segment.
		return int(iShortDst)
	}
	return sp
}
||||
|
||||
// lastRuneStart returns the runeInfo and position of the last
|
||||
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
||||
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { |
||||
p := len(buf) - 1 |
||||
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { |
||||
} |
||||
if p < 0 { |
||||
return Properties{}, -1 |
||||
} |
||||
return fd.info(inputBytes(buf), p), p |
||||
} |
||||
|
||||
// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
func decomposeToLastBoundary(rb *reorderBuffer) {
	fd := &rb.f
	info, i := lastRuneStart(fd, rb.out)
	if int(info.size) != len(rb.out)-i {
		// illegal trailing continuation bytes
		return
	}
	if info.BoundaryAfter() {
		// The buffer already ends on a closed segment; nothing to pull back.
		return
	}
	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
	padd := 0
	ss := streamSafe(0)
	p := len(rb.out)
	// Collect runes backwards until a segment start (or overflow / illegal
	// encoding) is reached.
	for {
		add[padd] = info
		v := ss.backwards(info)
		if v == ssOverflow {
			// Note that if we have an overflow, the string we are appending to
			// is not correctly normalized. In this case the behavior is undefined.
			break
		}
		padd++
		p -= int(info.size)
		if v == ssStarter || p < 0 {
			break
		}
		info, i = lastRuneStart(fd, rb.out[:p])
		if int(info.size) != p-i {
			break
		}
	}
	rb.ss = ss
	// Copy bytes for insertion as we may need to overwrite rb.out.
	var buf [maxBufferSize * utf8.UTFMax]byte
	cp := buf[:copy(buf[:], rb.out[p:])]
	rb.out = rb.out[:p]
	// Reinsert the collected runes in forward order.
	for padd--; padd >= 0; padd-- {
		info = add[padd]
		rb.insertUnsafe(inputBytes(cp), 0, info)
		cp = cp[info.size:]
	}
}
@ -0,0 +1,125 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "io" |
||||
|
||||
// normWriter is an io.WriteCloser that normalizes data to a Form before
// forwarding it to an underlying writer.
type normWriter struct {
	rb  reorderBuffer // scratch buffer for normalization
	w   io.Writer     // destination for normalized output
	buf []byte        // normalized bytes not yet ending on a boundary
}
||||
|
||||
// Write implements the standard write interface. If the last characters are
// not at a normalization boundary, the bytes will be buffered for the next
// write. The remaining bytes will be written on close.
func (w *normWriter) Write(data []byte) (n int, err error) {
	// Process data in pieces to keep w.buf size bounded.
	const chunk = 4000

	for len(data) > 0 {
		// Normalize into w.buf.
		m := len(data)
		if m > chunk {
			m = chunk
		}
		w.rb.src = inputBytes(data[:m])
		w.rb.nsrc = m
		w.buf = doAppend(&w.rb, w.buf, 0)
		data = data[m:]
		n += m

		// Write out complete prefix, save remainder.
		// Note that lastBoundary looks back at most 31 runes.
		i := lastBoundary(&w.rb.f, w.buf)
		if i == -1 {
			i = 0
		}
		if i > 0 {
			if _, err = w.w.Write(w.buf[:i]); err != nil {
				break
			}
			// Shift the unwritten tail to the front of the buffer.
			bn := copy(w.buf, w.buf[i:])
			w.buf = w.buf[:bn]
		}
	}
	return n, err
}
||||
|
||||
// Close forces data that remains in the buffer to be written.
|
||||
func (w *normWriter) Close() error { |
||||
if len(w.buf) > 0 { |
||||
_, err := w.w.Write(w.buf) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
// Writer returns a new writer that implements Write(b)
// by writing f(b) to w. The returned writer may use an
// internal buffer to maintain state across Write calls.
// Calling its Close method writes any buffered data to w.
func (f Form) Writer(w io.Writer) io.WriteCloser {
	wr := &normWriter{rb: reorderBuffer{}, w: w}
	wr.rb.init(f, nil)
	return wr
}
||||
|
||||
// normReader is an io.Reader that normalizes the data read from an
// underlying reader to a Form.
type normReader struct {
	rb           reorderBuffer // scratch buffer for normalization
	r            io.Reader     // source of unnormalized input
	inbuf        []byte        // raw bytes read from r
	outbuf       []byte        // normalized bytes awaiting delivery
	bufStart     int           // start of undelivered bytes in outbuf
	lastBoundary int           // end of the complete (deliverable) prefix of outbuf
	err          error         // deferred error from the underlying reader
}
||||
|
||||
// Read implements the standard read interface.
func (r *normReader) Read(p []byte) (int, error) {
	for {
		// Deliver any normalized bytes already available.
		if r.lastBoundary-r.bufStart > 0 {
			n := copy(p, r.outbuf[r.bufStart:r.lastBoundary])
			r.bufStart += n
			if r.lastBoundary-r.bufStart > 0 {
				return n, nil
			}
			// Buffer drained: surface any deferred error with the last bytes.
			return n, r.err
		}
		if r.err != nil {
			return 0, r.err
		}
		// Keep the open (past-boundary) tail and refill.
		outn := copy(r.outbuf, r.outbuf[r.lastBoundary:])
		r.outbuf = r.outbuf[0:outn]
		r.bufStart = 0

		n, err := r.r.Read(r.inbuf)
		r.rb.src = inputBytes(r.inbuf[0:n])
		r.rb.nsrc, r.err = n, err
		if n > 0 {
			r.outbuf = doAppend(&r.rb, r.outbuf, 0)
		}
		if err == io.EOF {
			// No more input: everything buffered is final.
			r.lastBoundary = len(r.outbuf)
		} else {
			r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf)
			if r.lastBoundary == -1 {
				r.lastBoundary = 0
			}
		}
	}
}
||||
|
||||
// Reader returns a new reader that implements Read
|
||||
// by reading data from r and returning f(data).
|
||||
func (f Form) Reader(r io.Reader) io.Reader { |
||||
const chunk = 4000 |
||||
buf := make([]byte, chunk) |
||||
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf} |
||||
rr.rb.init(f, buf) |
||||
return rr |
||||
} |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,88 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// Reset implements the Reset method of the transform.Transformer interface.
// A Form value carries no state between calls, so this is a no-op.
func (Form) Reset() {}
||||
|
||||
// Transform implements the Transform method of the transform.Transformer
// interface. It may need to write segments of up to MaxSegmentSize at once.
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
// least of size MaxTransformChunkSize to be guaranteed of progress.
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// Cap the maximum number of src bytes to check.
	b := src
	eof := atEOF
	if ns := len(dst); ns < len(b) {
		// dst cannot hold all of src: pre-set ErrShortDst and do not treat
		// the truncated view as final input.
		err = transform.ErrShortDst
		eof = false
		b = b[:ns]
	}
	// quickSpan finds the prefix of b that is already normalized, which can
	// be copied through verbatim.
	i, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), eof)
	n := copy(dst, b[:i])
	if !ok {
		// A byte needing real normalization was hit: hand the remainder to
		// the slow path and add back the bytes already copied.
		nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF)
		return nDst + n, nSrc + n, err
	}

	if err == nil && n < len(src) && !atEOF {
		// Everything checked was already normalized, but not all of src was
		// consumed and more input may follow.
		err = transform.ErrShortSrc
	}
	return n, n, err
}
||||
|
||||
func flushTransform(rb *reorderBuffer) bool { |
||||
// Write out (must fully fit in dst, or else it is an ErrShortDst).
|
||||
if len(rb.out) < rb.nrune*utf8.UTFMax { |
||||
return false |
||||
} |
||||
rb.out = rb.out[rb.flushCopy(rb.out):] |
||||
return true |
||||
} |
||||
|
||||
// errs maps the negated error codes returned by decomposeSegment (indexed as
// errs[-end]) to the corresponding transform errors.
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc}
||||
|
||||
// transform implements the transform.Transformer interface. It is only called
// when quickSpan does not pass for a given string.
//
// It alternates between decomposing one segment through the reorder buffer
// (with flushTransform writing directly into dst) and copying the following
// already-normalized span verbatim, until the input is exhausted or an
// ErrShortDst/ErrShortSrc condition stops progress.
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// TODO: get rid of reorderBuffer. See CL 23460044.
	rb := reorderBuffer{}
	rb.init(f, src)
	for {
		// Load segment into reorder buffer.
		rb.setFlusher(dst[nDst:], flushTransform)
		end := decomposeSegment(&rb, nSrc, atEOF)
		if end < 0 {
			// Negative values index into errs (ErrShortDst / ErrShortSrc).
			return nDst, nSrc, errs[-end]
		}
		// flushTransform advanced rb.out within dst; recover nDst from what
		// remains of the flusher's window.
		nDst = len(dst) - len(rb.out)
		nSrc = end

		// Next quickSpan.
		end = rb.nsrc
		eof := atEOF
		if n := nSrc + len(dst) - nDst; n < end {
			// Remaining dst space cannot hold the rest of src; cap the span
			// and do not treat the capped input as final.
			err = transform.ErrShortDst
			end = n
			eof = false
		}
		end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof)
		n := copy(dst[nDst:], rb.src.bytes[nSrc:end])
		nSrc += n
		nDst += n
		if ok {
			if err == nil && n < rb.nsrc && !atEOF {
				err = transform.ErrShortSrc
			}
			return nDst, nSrc, err
		}
		// quickSpan stopped before end: loop to decompose the next segment.
	}
}
@ -0,0 +1,54 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
// valueRange is one entry of a sparse block: the lookup value for bytes in
// the range [lo, hi]. The first entry of each block is a header, in which
// value holds the stride and lo holds the number of ranges that follow
// (see sparseBlocks.lookup).
type valueRange struct {
	value uint16 // header: value:stride
	lo, hi byte  // header: lo:n
}
||||
|
||||
// sparseBlocks holds the sparse blocks of a trie: offset[n] is the index in
// values of block n's header entry, which is followed by that block's
// ranges.
type sparseBlocks struct {
	values []valueRange
	offset []uint16
}
||||
|
||||
// nfcSparse holds the sparse blocks of the NFC trie; the backing arrays are
// defined elsewhere in this package.
var nfcSparse = sparseBlocks{
	values: nfcSparseValues[:],
	offset: nfcSparseOffset[:],
}

// nfkcSparse holds the sparse blocks of the NFKC trie.
var nfkcSparse = sparseBlocks{
	values: nfkcSparseValues[:],
	offset: nfkcSparseOffset[:],
}
||||
|
||||
// nfcData and nfkcData are the tries used for NFC and NFKC lookups.
var (
	nfcData  = newNfcTrie(0)
	nfkcData = newNfkcTrie(0)
)
||||
|
||||
// lookupValue determines the type of block n and looks up the value for b.
|
||||
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||
// the value for b is by r.value + (b - r.lo) * stride.
|
||||
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 { |
||||
offset := t.offset[n] |
||||
header := t.values[offset] |
||||
lo := offset + 1 |
||||
hi := lo + uint16(header.lo) |
||||
for lo < hi { |
||||
m := lo + (hi-lo)/2 |
||||
r := t.values[m] |
||||
if r.lo <= b && b <= r.hi { |
||||
return r.value + uint16(b-r.lo)*header.value |
||||
} |
||||
if b < r.lo { |
||||
hi = m |
||||
} else { |
||||
lo = m + 1 |
||||
} |
||||
} |
||||
return 0 |
||||
} |
Loading…
Reference in new issue