mirror of https://github.com/ethereum/go-ethereum
vendor: added vendor packages necessary for the swarm-network-rewrite merge (#16792)
* vendor: added vendor packages necessary for the swarm-network-rewrite merge into ethereum master * vendor: removed multihash depspull/16760/merge
parent
cbfb40b0aa
commit
5bee5d69d7
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,732 @@ |
||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package idna implements IDNA2008 using the compatibility processing
|
||||
// defined by UTS (Unicode Technical Standard) #46, which defines a standard to
|
||||
// deal with the transition from IDNA2003.
|
||||
//
|
||||
// IDNA2008 (Internationalized Domain Names for Applications), is defined in RFC
|
||||
// 5890, RFC 5891, RFC 5892, RFC 5893 and RFC 5894.
|
||||
// UTS #46 is defined in http://www.unicode.org/reports/tr46.
|
||||
// See http://unicode.org/cldr/utility/idna.jsp for a visualization of the
|
||||
// differences between these two standards.
|
||||
package idna // import "golang.org/x/net/idna"
|
||||
|
||||
import ( |
||||
"fmt" |
||||
"strings" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/secure/bidirule" |
||||
"golang.org/x/text/unicode/bidi" |
||||
"golang.org/x/text/unicode/norm" |
||||
) |
||||
|
||||
// NOTE: Unlike common practice in Go APIs, the functions will return a
|
||||
// sanitized domain name in case of errors. Browsers sometimes use a partially
|
||||
// evaluated string as lookup.
|
||||
// TODO: the current error handling is, in my opinion, the least opinionated.
|
||||
// Other strategies are also viable, though:
|
||||
// Option 1) Return an empty string in case of error, but allow the user to
|
||||
// specify explicitly which errors to ignore.
|
||||
// Option 2) Return the partially evaluated string if it is itself a valid
|
||||
// string, otherwise return the empty string in case of error.
|
||||
// Option 3) Option 1 and 2.
|
||||
// Option 4) Always return an empty string for now and implement Option 1 as
|
||||
// needed, and document that the return string may not be empty in case of
|
||||
// error in the future.
|
||||
// I think Option 1 is best, but it is quite opinionated.
|
||||
|
||||
// ToASCII is a wrapper for Punycode.ToASCII.
|
||||
func ToASCII(s string) (string, error) { |
||||
return Punycode.process(s, true) |
||||
} |
||||
|
||||
// ToUnicode is a wrapper for Punycode.ToUnicode.
|
||||
func ToUnicode(s string) (string, error) { |
||||
return Punycode.process(s, false) |
||||
} |
||||
|
||||
// An Option configures a Profile at creation time.
|
||||
type Option func(*options) |
||||
|
||||
// Transitional sets a Profile to use the Transitional mapping as defined in UTS
|
||||
// #46. This will cause, for example, "ß" to be mapped to "ss". Using the
|
||||
// transitional mapping provides a compromise between IDNA2003 and IDNA2008
|
||||
// compatibility. It is used by most browsers when resolving domain names. This
|
||||
// option is only meaningful if combined with MapForLookup.
|
||||
func Transitional(transitional bool) Option { |
||||
return func(o *options) { o.transitional = true } |
||||
} |
||||
|
||||
// VerifyDNSLength sets whether a Profile should fail if any of the IDN parts
|
||||
// are longer than allowed by the RFC.
|
||||
func VerifyDNSLength(verify bool) Option { |
||||
return func(o *options) { o.verifyDNSLength = verify } |
||||
} |
||||
|
||||
// RemoveLeadingDots removes leading label separators. Leading runes that map to
|
||||
// dots, such as U+3002 IDEOGRAPHIC FULL STOP, are removed as well.
|
||||
//
|
||||
// This is the behavior suggested by the UTS #46 and is adopted by some
|
||||
// browsers.
|
||||
func RemoveLeadingDots(remove bool) Option { |
||||
return func(o *options) { o.removeLeadingDots = remove } |
||||
} |
||||
|
||||
// ValidateLabels sets whether to check the mandatory label validation criteria
|
||||
// as defined in Section 5.4 of RFC 5891. This includes testing for correct use
|
||||
// of hyphens ('-'), normalization, validity of runes, and the context rules.
|
||||
func ValidateLabels(enable bool) Option { |
||||
return func(o *options) { |
||||
// Don't override existing mappings, but set one that at least checks
|
||||
// normalization if it is not set.
|
||||
if o.mapping == nil && enable { |
||||
o.mapping = normalize |
||||
} |
||||
o.trie = trie |
||||
o.validateLabels = enable |
||||
o.fromPuny = validateFromPunycode |
||||
} |
||||
} |
||||
|
||||
// StrictDomainName limits the set of permissible ASCII characters to those
|
||||
// allowed in domain names as defined in RFC 1034 (A-Z, a-z, 0-9 and the
|
||||
// hyphen). This is set by default for MapForLookup and ValidateForRegistration.
|
||||
//
|
||||
// This option is useful, for instance, for browsers that allow characters
|
||||
// outside this range, for example a '_' (U+005F LOW LINE). See
|
||||
// http://www.rfc-editor.org/std/std3.txt for more details This option
|
||||
// corresponds to the UseSTD3ASCIIRules option in UTS #46.
|
||||
func StrictDomainName(use bool) Option { |
||||
return func(o *options) { |
||||
o.trie = trie |
||||
o.useSTD3Rules = use |
||||
o.fromPuny = validateFromPunycode |
||||
} |
||||
} |
||||
|
||||
// NOTE: the following options pull in tables. The tables should not be linked
|
||||
// in as long as the options are not used.
|
||||
|
||||
// BidiRule enables the Bidi rule as defined in RFC 5893. Any application
|
||||
// that relies on proper validation of labels should include this rule.
|
||||
func BidiRule() Option { |
||||
return func(o *options) { o.bidirule = bidirule.ValidString } |
||||
} |
||||
|
||||
// ValidateForRegistration sets validation options to verify that a given IDN is
|
||||
// properly formatted for registration as defined by Section 4 of RFC 5891.
|
||||
func ValidateForRegistration() Option { |
||||
return func(o *options) { |
||||
o.mapping = validateRegistration |
||||
StrictDomainName(true)(o) |
||||
ValidateLabels(true)(o) |
||||
VerifyDNSLength(true)(o) |
||||
BidiRule()(o) |
||||
} |
||||
} |
||||
|
||||
// MapForLookup sets validation and mapping options such that a given IDN is
|
||||
// transformed for domain name lookup according to the requirements set out in
|
||||
// Section 5 of RFC 5891. The mappings follow the recommendations of RFC 5894,
|
||||
// RFC 5895 and UTS 46. It does not add the Bidi Rule. Use the BidiRule option
|
||||
// to add this check.
|
||||
//
|
||||
// The mappings include normalization and mapping case, width and other
|
||||
// compatibility mappings.
|
||||
func MapForLookup() Option { |
||||
return func(o *options) { |
||||
o.mapping = validateAndMap |
||||
StrictDomainName(true)(o) |
||||
ValidateLabels(true)(o) |
||||
} |
||||
} |
||||
|
||||
type options struct { |
||||
transitional bool |
||||
useSTD3Rules bool |
||||
validateLabels bool |
||||
verifyDNSLength bool |
||||
removeLeadingDots bool |
||||
|
||||
trie *idnaTrie |
||||
|
||||
// fromPuny calls validation rules when converting A-labels to U-labels.
|
||||
fromPuny func(p *Profile, s string) error |
||||
|
||||
// mapping implements a validation and mapping step as defined in RFC 5895
|
||||
// or UTS 46, tailored to, for example, domain registration or lookup.
|
||||
mapping func(p *Profile, s string) (mapped string, isBidi bool, err error) |
||||
|
||||
// bidirule, if specified, checks whether s conforms to the Bidi Rule
|
||||
// defined in RFC 5893.
|
||||
bidirule func(s string) bool |
||||
} |
||||
|
||||
// A Profile defines the configuration of an IDNA mapper.
|
||||
type Profile struct { |
||||
options |
||||
} |
||||
|
||||
func apply(o *options, opts []Option) { |
||||
for _, f := range opts { |
||||
f(o) |
||||
} |
||||
} |
||||
|
||||
// New creates a new Profile.
|
||||
//
|
||||
// With no options, the returned Profile is the most permissive and equals the
|
||||
// Punycode Profile. Options can be passed to further restrict the Profile. The
|
||||
// MapForLookup and ValidateForRegistration options set a collection of options,
|
||||
// for lookup and registration purposes respectively, which can be tailored by
|
||||
// adding more fine-grained options, where later options override earlier
|
||||
// options.
|
||||
func New(o ...Option) *Profile { |
||||
p := &Profile{} |
||||
apply(&p.options, o) |
||||
return p |
||||
} |
||||
|
||||
// ToASCII converts a domain or domain label to its ASCII form. For example,
|
||||
// ToASCII("bücher.example.com") is "xn--bcher-kva.example.com", and
|
||||
// ToASCII("golang") is "golang". If an error is encountered it will return
|
||||
// an error and a (partially) processed result.
|
||||
func (p *Profile) ToASCII(s string) (string, error) { |
||||
return p.process(s, true) |
||||
} |
||||
|
||||
// ToUnicode converts a domain or domain label to its Unicode form. For example,
|
||||
// ToUnicode("xn--bcher-kva.example.com") is "bücher.example.com", and
|
||||
// ToUnicode("golang") is "golang". If an error is encountered it will return
|
||||
// an error and a (partially) processed result.
|
||||
func (p *Profile) ToUnicode(s string) (string, error) { |
||||
pp := *p |
||||
pp.transitional = false |
||||
return pp.process(s, false) |
||||
} |
||||
|
||||
// String reports a string with a description of the profile for debugging
|
||||
// purposes. The string format may change with different versions.
|
||||
func (p *Profile) String() string { |
||||
s := "" |
||||
if p.transitional { |
||||
s = "Transitional" |
||||
} else { |
||||
s = "NonTransitional" |
||||
} |
||||
if p.useSTD3Rules { |
||||
s += ":UseSTD3Rules" |
||||
} |
||||
if p.validateLabels { |
||||
s += ":ValidateLabels" |
||||
} |
||||
if p.verifyDNSLength { |
||||
s += ":VerifyDNSLength" |
||||
} |
||||
return s |
||||
} |
||||
|
||||
var ( |
||||
// Punycode is a Profile that does raw punycode processing with a minimum
|
||||
// of validation.
|
||||
Punycode *Profile = punycode |
||||
|
||||
// Lookup is the recommended profile for looking up domain names, according
|
||||
// to Section 5 of RFC 5891. The exact configuration of this profile may
|
||||
// change over time.
|
||||
Lookup *Profile = lookup |
||||
|
||||
// Display is the recommended profile for displaying domain names.
|
||||
// The configuration of this profile may change over time.
|
||||
Display *Profile = display |
||||
|
||||
// Registration is the recommended profile for checking whether a given
|
||||
// IDN is valid for registration, according to Section 4 of RFC 5891.
|
||||
Registration *Profile = registration |
||||
|
||||
punycode = &Profile{} |
||||
lookup = &Profile{options{ |
||||
transitional: true, |
||||
useSTD3Rules: true, |
||||
validateLabels: true, |
||||
trie: trie, |
||||
fromPuny: validateFromPunycode, |
||||
mapping: validateAndMap, |
||||
bidirule: bidirule.ValidString, |
||||
}} |
||||
display = &Profile{options{ |
||||
useSTD3Rules: true, |
||||
validateLabels: true, |
||||
trie: trie, |
||||
fromPuny: validateFromPunycode, |
||||
mapping: validateAndMap, |
||||
bidirule: bidirule.ValidString, |
||||
}} |
||||
registration = &Profile{options{ |
||||
useSTD3Rules: true, |
||||
validateLabels: true, |
||||
verifyDNSLength: true, |
||||
trie: trie, |
||||
fromPuny: validateFromPunycode, |
||||
mapping: validateRegistration, |
||||
bidirule: bidirule.ValidString, |
||||
}} |
||||
|
||||
// TODO: profiles
|
||||
// Register: recommended for approving domain names: don't do any mappings
|
||||
// but rather reject on invalid input. Bundle or block deviation characters.
|
||||
) |
||||
|
||||
// labelError is the error reported for an invalid label. It records the
// offending label together with a short code that identifies the failed check.
type labelError struct {
	label string
	code_ string
}

// code returns the short code of the check that failed.
func (e labelError) code() string {
	return e.code_
}

// Error implements the error interface. The message quotes the label.
func (e labelError) Error() string {
	const format = "idna: invalid label %q"
	return fmt.Sprintf(format, e.label)
}
||||
|
||||
// runeError is the error reported for a disallowed rune. Its value is the
// rune itself.
type runeError rune

// code returns the fixed code "P1" used for disallowed runes.
func (e runeError) code() string {
	return "P1"
}

// Error implements the error interface. The rune is printed in U+XXXX form.
func (e runeError) Error() string {
	const format = "idna: disallowed rune %U"
	return fmt.Sprintf(format, rune(e))
}
||||
|
||||
// process implements the algorithm described in section 4 of UTS #46,
|
||||
// see http://www.unicode.org/reports/tr46.
|
||||
func (p *Profile) process(s string, toASCII bool) (string, error) { |
||||
var err error |
||||
var isBidi bool |
||||
if p.mapping != nil { |
||||
s, isBidi, err = p.mapping(p, s) |
||||
} |
||||
// Remove leading empty labels.
|
||||
if p.removeLeadingDots { |
||||
for ; len(s) > 0 && s[0] == '.'; s = s[1:] { |
||||
} |
||||
} |
||||
// TODO: allow for a quick check of the tables data.
|
||||
// It seems like we should only create this error on ToASCII, but the
|
||||
// UTS 46 conformance tests suggests we should always check this.
|
||||
if err == nil && p.verifyDNSLength && s == "" { |
||||
err = &labelError{s, "A4"} |
||||
} |
||||
labels := labelIter{orig: s} |
||||
for ; !labels.done(); labels.next() { |
||||
label := labels.label() |
||||
if label == "" { |
||||
// Empty labels are not okay. The label iterator skips the last
|
||||
// label if it is empty.
|
||||
if err == nil && p.verifyDNSLength { |
||||
err = &labelError{s, "A4"} |
||||
} |
||||
continue |
||||
} |
||||
if strings.HasPrefix(label, acePrefix) { |
||||
u, err2 := decode(label[len(acePrefix):]) |
||||
if err2 != nil { |
||||
if err == nil { |
||||
err = err2 |
||||
} |
||||
// Spec says keep the old label.
|
||||
continue |
||||
} |
||||
isBidi = isBidi || bidirule.DirectionString(u) != bidi.LeftToRight |
||||
labels.set(u) |
||||
if err == nil && p.validateLabels { |
||||
err = p.fromPuny(p, u) |
||||
} |
||||
if err == nil { |
||||
// This should be called on NonTransitional, according to the
|
||||
// spec, but that currently does not have any effect. Use the
|
||||
// original profile to preserve options.
|
||||
err = p.validateLabel(u) |
||||
} |
||||
} else if err == nil { |
||||
err = p.validateLabel(label) |
||||
} |
||||
} |
||||
if isBidi && p.bidirule != nil && err == nil { |
||||
for labels.reset(); !labels.done(); labels.next() { |
||||
if !p.bidirule(labels.label()) { |
||||
err = &labelError{s, "B"} |
||||
break |
||||
} |
||||
} |
||||
} |
||||
if toASCII { |
||||
for labels.reset(); !labels.done(); labels.next() { |
||||
label := labels.label() |
||||
if !ascii(label) { |
||||
a, err2 := encode(acePrefix, label) |
||||
if err == nil { |
||||
err = err2 |
||||
} |
||||
label = a |
||||
labels.set(a) |
||||
} |
||||
n := len(label) |
||||
if p.verifyDNSLength && err == nil && (n == 0 || n > 63) { |
||||
err = &labelError{label, "A4"} |
||||
} |
||||
} |
||||
} |
||||
s = labels.result() |
||||
if toASCII && p.verifyDNSLength && err == nil { |
||||
// Compute the length of the domain name minus the root label and its dot.
|
||||
n := len(s) |
||||
if n > 0 && s[n-1] == '.' { |
||||
n-- |
||||
} |
||||
if len(s) < 1 || n > 253 { |
||||
err = &labelError{s, "A4"} |
||||
} |
||||
} |
||||
return s, err |
||||
} |
||||
|
||||
func normalize(p *Profile, s string) (mapped string, isBidi bool, err error) { |
||||
// TODO: consider first doing a quick check to see if any of these checks
|
||||
// need to be done. This will make it slower in the general case, but
|
||||
// faster in the common case.
|
||||
mapped = norm.NFC.String(s) |
||||
isBidi = bidirule.DirectionString(mapped) == bidi.RightToLeft |
||||
return mapped, isBidi, nil |
||||
} |
||||
|
||||
func validateRegistration(p *Profile, s string) (idem string, bidi bool, err error) { |
||||
// TODO: filter need for normalization in loop below.
|
||||
if !norm.NFC.IsNormalString(s) { |
||||
return s, false, &labelError{s, "V1"} |
||||
} |
||||
for i := 0; i < len(s); { |
||||
v, sz := trie.lookupString(s[i:]) |
||||
if sz == 0 { |
||||
return s, bidi, runeError(utf8.RuneError) |
||||
} |
||||
bidi = bidi || info(v).isBidi(s[i:]) |
||||
// Copy bytes not copied so far.
|
||||
switch p.simplify(info(v).category()) { |
||||
// TODO: handle the NV8 defined in the Unicode idna data set to allow
|
||||
// for strict conformance to IDNA2008.
|
||||
case valid, deviation: |
||||
case disallowed, mapped, unknown, ignored: |
||||
r, _ := utf8.DecodeRuneInString(s[i:]) |
||||
return s, bidi, runeError(r) |
||||
} |
||||
i += sz |
||||
} |
||||
return s, bidi, nil |
||||
} |
||||
|
||||
func (c info) isBidi(s string) bool { |
||||
if !c.isMapped() { |
||||
return c&attributesMask == rtl |
||||
} |
||||
// TODO: also store bidi info for mapped data. This is possible, but a bit
|
||||
// cumbersome and not for the common case.
|
||||
p, _ := bidi.LookupString(s) |
||||
switch p.Class() { |
||||
case bidi.R, bidi.AL, bidi.AN: |
||||
return true |
||||
} |
||||
return false |
||||
} |
||||
|
||||
func validateAndMap(p *Profile, s string) (vm string, bidi bool, err error) { |
||||
var ( |
||||
b []byte |
||||
k int |
||||
) |
||||
// combinedInfoBits contains the or-ed bits of all runes. We use this
|
||||
// to derive the mayNeedNorm bit later. This may trigger normalization
|
||||
// overeagerly, but it will not do so in the common case. The end result
|
||||
// is another 10% saving on BenchmarkProfile for the common case.
|
||||
var combinedInfoBits info |
||||
for i := 0; i < len(s); { |
||||
v, sz := trie.lookupString(s[i:]) |
||||
if sz == 0 { |
||||
b = append(b, s[k:i]...) |
||||
b = append(b, "\ufffd"...) |
||||
k = len(s) |
||||
if err == nil { |
||||
err = runeError(utf8.RuneError) |
||||
} |
||||
break |
||||
} |
||||
combinedInfoBits |= info(v) |
||||
bidi = bidi || info(v).isBidi(s[i:]) |
||||
start := i |
||||
i += sz |
||||
// Copy bytes not copied so far.
|
||||
switch p.simplify(info(v).category()) { |
||||
case valid: |
||||
continue |
||||
case disallowed: |
||||
if err == nil { |
||||
r, _ := utf8.DecodeRuneInString(s[start:]) |
||||
err = runeError(r) |
||||
} |
||||
continue |
||||
case mapped, deviation: |
||||
b = append(b, s[k:start]...) |
||||
b = info(v).appendMapping(b, s[start:i]) |
||||
case ignored: |
||||
b = append(b, s[k:start]...) |
||||
// drop the rune
|
||||
case unknown: |
||||
b = append(b, s[k:start]...) |
||||
b = append(b, "\ufffd"...) |
||||
} |
||||
k = i |
||||
} |
||||
if k == 0 { |
||||
// No changes so far.
|
||||
if combinedInfoBits&mayNeedNorm != 0 { |
||||
s = norm.NFC.String(s) |
||||
} |
||||
} else { |
||||
b = append(b, s[k:]...) |
||||
if norm.NFC.QuickSpan(b) != len(b) { |
||||
b = norm.NFC.Bytes(b) |
||||
} |
||||
// TODO: the punycode converters require strings as input.
|
||||
s = string(b) |
||||
} |
||||
return s, bidi, err |
||||
} |
||||
|
||||
// labelIter iterates over the '.'-separated labels of a domain name.
//
// As long as no label has been replaced, labels are sliced directly out of
// orig using curStart and curEnd. The first call to set splits orig into
// slice, after which labels are addressed by the index i.
type labelIter struct {
	orig     string
	slice    []string
	curStart int
	curEnd   int
	i        int
}

// reset rewinds the iterator to the first label.
func (l *labelIter) reset() {
	l.curStart, l.curEnd, l.i = 0, 0, 0
}

// done reports whether the iteration has moved past the last label.
func (l *labelIter) done() bool {
	return len(l.orig) <= l.curStart
}

// result returns the domain name with all replacements made by set applied.
func (l *labelIter) result() string {
	if l.slice == nil {
		return l.orig
	}
	return strings.Join(l.slice, ".")
}

// label returns the current label. In string mode it also records where the
// label ends, which next relies on.
func (l *labelIter) label() string {
	if l.slice != nil {
		return l.slice[l.i]
	}
	rest := l.orig[l.curStart:]
	if dot := strings.IndexByte(rest, '.'); dot >= 0 {
		l.curEnd = l.curStart + dot
	} else {
		l.curEnd = len(l.orig)
	}
	return l.orig[l.curStart:l.curEnd]
}

// next sets the value to the next label. It skips the last label if it is empty.
func (l *labelIter) next() {
	l.i++
	if l.slice == nil {
		l.curStart = l.curEnd + 1
		if l.curStart == len(l.orig)-1 && l.orig[l.curStart] == '.' {
			l.curStart = len(l.orig)
		}
		return
	}
	last := len(l.slice) - 1
	if l.i > last || (l.i == last && l.slice[l.i] == "") {
		// Moving curStart to the end is what makes done report true.
		l.curStart = len(l.orig)
	}
}

// set replaces the current label with s, switching to slice mode on first use.
func (l *labelIter) set(s string) {
	if l.slice == nil {
		l.slice = strings.Split(l.orig, ".")
	}
	l.slice[l.i] = s
}
||||
|
||||
// acePrefix is the ASCII Compatible Encoding prefix that marks a label as
// punycode-encoded.
const acePrefix string = "xn--"
||||
|
||||
func (p *Profile) simplify(cat category) category { |
||||
switch cat { |
||||
case disallowedSTD3Mapped: |
||||
if p.useSTD3Rules { |
||||
cat = disallowed |
||||
} else { |
||||
cat = mapped |
||||
} |
||||
case disallowedSTD3Valid: |
||||
if p.useSTD3Rules { |
||||
cat = disallowed |
||||
} else { |
||||
cat = valid |
||||
} |
||||
case deviation: |
||||
if !p.transitional { |
||||
cat = valid |
||||
} |
||||
case validNV8, validXV8: |
||||
// TODO: handle V2008
|
||||
cat = valid |
||||
} |
||||
return cat |
||||
} |
||||
|
||||
func validateFromPunycode(p *Profile, s string) error { |
||||
if !norm.NFC.IsNormalString(s) { |
||||
return &labelError{s, "V1"} |
||||
} |
||||
// TODO: detect whether string may have to be normalized in the following
|
||||
// loop.
|
||||
for i := 0; i < len(s); { |
||||
v, sz := trie.lookupString(s[i:]) |
||||
if sz == 0 { |
||||
return runeError(utf8.RuneError) |
||||
} |
||||
if c := p.simplify(info(v).category()); c != valid && c != deviation { |
||||
return &labelError{s, "V6"} |
||||
} |
||||
i += sz |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
// The zero-width joiner characters that get special treatment when labels are
// validated.
const (
	zwnj = "\u200c" // U+200C ZERO WIDTH NON-JOINER
	zwj  = "\u200d" // U+200D ZERO WIDTH JOINER
)
||||
|
||||
// joinState is a state of the state machine that validateLabel runs over a
// label containing zero-width (non) joiners.
type joinState int8

// The states of the joiner state machine. stateStart is the zero value.
// validateLabel rejects a label that ends in stateFAIL or stateAfter.
const (
	stateStart joinState = iota
	stateVirama
	stateBefore
	stateBeforeVirama
	stateAfter
	stateFAIL
)
||||
|
||||
var joinStates = [][numJoinTypes]joinState{ |
||||
stateStart: { |
||||
joiningL: stateBefore, |
||||
joiningD: stateBefore, |
||||
joinZWNJ: stateFAIL, |
||||
joinZWJ: stateFAIL, |
||||
joinVirama: stateVirama, |
||||
}, |
||||
stateVirama: { |
||||
joiningL: stateBefore, |
||||
joiningD: stateBefore, |
||||
}, |
||||
stateBefore: { |
||||
joiningL: stateBefore, |
||||
joiningD: stateBefore, |
||||
joiningT: stateBefore, |
||||
joinZWNJ: stateAfter, |
||||
joinZWJ: stateFAIL, |
||||
joinVirama: stateBeforeVirama, |
||||
}, |
||||
stateBeforeVirama: { |
||||
joiningL: stateBefore, |
||||
joiningD: stateBefore, |
||||
joiningT: stateBefore, |
||||
}, |
||||
stateAfter: { |
||||
joiningL: stateFAIL, |
||||
joiningD: stateBefore, |
||||
joiningT: stateAfter, |
||||
joiningR: stateStart, |
||||
joinZWNJ: stateFAIL, |
||||
joinZWJ: stateFAIL, |
||||
joinVirama: stateAfter, // no-op as we can't accept joiners here
|
||||
}, |
||||
stateFAIL: { |
||||
0: stateFAIL, |
||||
joiningL: stateFAIL, |
||||
joiningD: stateFAIL, |
||||
joiningT: stateFAIL, |
||||
joiningR: stateFAIL, |
||||
joinZWNJ: stateFAIL, |
||||
joinZWJ: stateFAIL, |
||||
joinVirama: stateFAIL, |
||||
}, |
||||
} |
||||
|
||||
// validateLabel validates the criteria from Section 4.1. Item 1, 4, and 6 are
|
||||
// already implicitly satisfied by the overall implementation.
|
||||
func (p *Profile) validateLabel(s string) (err error) { |
||||
if s == "" { |
||||
if p.verifyDNSLength { |
||||
return &labelError{s, "A4"} |
||||
} |
||||
return nil |
||||
} |
||||
if !p.validateLabels { |
||||
return nil |
||||
} |
||||
trie := p.trie // p.validateLabels is only set if trie is set.
|
||||
if len(s) > 4 && s[2] == '-' && s[3] == '-' { |
||||
return &labelError{s, "V2"} |
||||
} |
||||
if s[0] == '-' || s[len(s)-1] == '-' { |
||||
return &labelError{s, "V3"} |
||||
} |
||||
// TODO: merge the use of this in the trie.
|
||||
v, sz := trie.lookupString(s) |
||||
x := info(v) |
||||
if x.isModifier() { |
||||
return &labelError{s, "V5"} |
||||
} |
||||
// Quickly return in the absence of zero-width (non) joiners.
|
||||
if strings.Index(s, zwj) == -1 && strings.Index(s, zwnj) == -1 { |
||||
return nil |
||||
} |
||||
st := stateStart |
||||
for i := 0; ; { |
||||
jt := x.joinType() |
||||
if s[i:i+sz] == zwj { |
||||
jt = joinZWJ |
||||
} else if s[i:i+sz] == zwnj { |
||||
jt = joinZWNJ |
||||
} |
||||
st = joinStates[st][jt] |
||||
if x.isViramaModifier() { |
||||
st = joinStates[st][joinVirama] |
||||
} |
||||
if i += sz; i == len(s) { |
||||
break |
||||
} |
||||
v, sz = trie.lookupString(s[i:]) |
||||
x = info(v) |
||||
} |
||||
if st == stateFAIL || st == stateAfter { |
||||
return &labelError{s, "C"} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
// ascii reports whether s consists solely of ASCII bytes. The empty string is
// ASCII.
func ascii(s string) bool {
	for _, b := range []byte(s) {
		if b >= utf8.RuneSelf {
			return false
		}
	}
	return true
}
@ -0,0 +1,203 @@ |
||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package idna |
||||
|
||||
// This file implements the Punycode algorithm from RFC 3492.
|
||||
|
||||
import ( |
||||
"math" |
||||
"strings" |
||||
"unicode/utf8" |
||||
) |
||||
|
||||
// These parameter values are specified in section 5.
//
// All computation is done with int32s, so that overflow behavior is identical
// regardless of whether int is 32-bit or 64-bit.
const (
	base        int32 = 36
	tmin        int32 = 1
	tmax        int32 = 26
	skew        int32 = 38
	damp        int32 = 700
	initialBias int32 = 72
	initialN    int32 = 128
)
||||
|
||||
func punyError(s string) error { return &labelError{s, "A3"} } |
||||
|
||||
// decode decodes a string as specified in section 6.2.
|
||||
func decode(encoded string) (string, error) { |
||||
if encoded == "" { |
||||
return "", nil |
||||
} |
||||
pos := 1 + strings.LastIndex(encoded, "-") |
||||
if pos == 1 { |
||||
return "", punyError(encoded) |
||||
} |
||||
if pos == len(encoded) { |
||||
return encoded[:len(encoded)-1], nil |
||||
} |
||||
output := make([]rune, 0, len(encoded)) |
||||
if pos != 0 { |
||||
for _, r := range encoded[:pos-1] { |
||||
output = append(output, r) |
||||
} |
||||
} |
||||
i, n, bias := int32(0), initialN, initialBias |
||||
for pos < len(encoded) { |
||||
oldI, w := i, int32(1) |
||||
for k := base; ; k += base { |
||||
if pos == len(encoded) { |
||||
return "", punyError(encoded) |
||||
} |
||||
digit, ok := decodeDigit(encoded[pos]) |
||||
if !ok { |
||||
return "", punyError(encoded) |
||||
} |
||||
pos++ |
||||
i += digit * w |
||||
if i < 0 { |
||||
return "", punyError(encoded) |
||||
} |
||||
t := k - bias |
||||
if t < tmin { |
||||
t = tmin |
||||
} else if t > tmax { |
||||
t = tmax |
||||
} |
||||
if digit < t { |
||||
break |
||||
} |
||||
w *= base - t |
||||
if w >= math.MaxInt32/base { |
||||
return "", punyError(encoded) |
||||
} |
||||
} |
||||
x := int32(len(output) + 1) |
||||
bias = adapt(i-oldI, x, oldI == 0) |
||||
n += i / x |
||||
i %= x |
||||
if n > utf8.MaxRune || len(output) >= 1024 { |
||||
return "", punyError(encoded) |
||||
} |
||||
output = append(output, 0) |
||||
copy(output[i+1:], output[i:]) |
||||
output[i] = n |
||||
i++ |
||||
} |
||||
return string(output), nil |
||||
} |
||||
|
||||
// encode encodes a string as specified in section 6.3 and prepends prefix to
|
||||
// the result.
|
||||
//
|
||||
// The "while h < length(input)" line in the specification becomes "for
|
||||
// remaining != 0" in the Go code, because len(s) in Go is in bytes, not runes.
|
||||
func encode(prefix, s string) (string, error) { |
||||
output := make([]byte, len(prefix), len(prefix)+1+2*len(s)) |
||||
copy(output, prefix) |
||||
delta, n, bias := int32(0), initialN, initialBias |
||||
b, remaining := int32(0), int32(0) |
||||
for _, r := range s { |
||||
if r < 0x80 { |
||||
b++ |
||||
output = append(output, byte(r)) |
||||
} else { |
||||
remaining++ |
||||
} |
||||
} |
||||
h := b |
||||
if b > 0 { |
||||
output = append(output, '-') |
||||
} |
||||
for remaining != 0 { |
||||
m := int32(0x7fffffff) |
||||
for _, r := range s { |
||||
if m > r && r >= n { |
||||
m = r |
||||
} |
||||
} |
||||
delta += (m - n) * (h + 1) |
||||
if delta < 0 { |
||||
return "", punyError(s) |
||||
} |
||||
n = m |
||||
for _, r := range s { |
||||
if r < n { |
||||
delta++ |
||||
if delta < 0 { |
||||
return "", punyError(s) |
||||
} |
||||
continue |
||||
} |
||||
if r > n { |
||||
continue |
||||
} |
||||
q := delta |
||||
for k := base; ; k += base { |
||||
t := k - bias |
||||
if t < tmin { |
||||
t = tmin |
||||
} else if t > tmax { |
||||
t = tmax |
||||
} |
||||
if q < t { |
||||
break |
||||
} |
||||
output = append(output, encodeDigit(t+(q-t)%(base-t))) |
||||
q = (q - t) / (base - t) |
||||
} |
||||
output = append(output, encodeDigit(q)) |
||||
bias = adapt(delta, h+1, h == b) |
||||
delta = 0 |
||||
h++ |
||||
remaining-- |
||||
} |
||||
delta++ |
||||
n++ |
||||
} |
||||
return string(output), nil |
||||
} |
||||
|
||||
// decodeDigit returns the value of the punycode digit x: letters of either
// case map to 0..25 and the decimal digits map to 26..35. ok is false, and
// digit is 0, when x is not a punycode digit.
func decodeDigit(x byte) (digit int32, ok bool) {
	if 'a' <= x && x <= 'z' {
		return int32(x - 'a'), true
	}
	if 'A' <= x && x <= 'Z' {
		return int32(x - 'A'), true
	}
	if '0' <= x && x <= '9' {
		return int32(x-'0') + 26, true
	}
	return 0, false
}
||||
|
||||
// encodeDigit returns the punycode digit for a value in the range 0..35:
// 0..25 become 'a'..'z' and 26..35 become '0'..'9'. It panics for any other
// value, which indicates a bug in the encoder.
func encodeDigit(digit int32) byte {
	if digit >= 0 && digit < 26 {
		return byte('a' + digit)
	}
	if digit >= 26 && digit < 36 {
		return byte('0' + (digit - 26))
	}
	panic("idna: internal error in punycode encoding")
}
||||
|
||||
// adapt is the bias adaptation function specified in section 6.1.
|
||||
func adapt(delta, numPoints int32, firstTime bool) int32 { |
||||
if firstTime { |
||||
delta /= damp |
||||
} else { |
||||
delta /= 2 |
||||
} |
||||
delta += delta / numPoints |
||||
k := int32(0) |
||||
for delta > ((base-tmin)*tmax)/2 { |
||||
delta /= base - tmin |
||||
k += base |
||||
} |
||||
return k + (base-tmin+1)*delta/(delta+skew) |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,72 @@ |
||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package idna |
||||
|
||||
// appendMapping appends the mapping for the respective rune. isMapped must be
|
||||
// true. A mapping is a categorization of a rune as defined in UTS #46.
|
||||
func (c info) appendMapping(b []byte, s string) []byte { |
||||
index := int(c >> indexShift) |
||||
if c&xorBit == 0 { |
||||
s := mappings[index:] |
||||
return append(b, s[1:s[0]+1]...) |
||||
} |
||||
b = append(b, s...) |
||||
if c&inlineXOR == inlineXOR { |
||||
// TODO: support and handle two-byte inline masks
|
||||
b[len(b)-1] ^= byte(index) |
||||
} else { |
||||
for p := len(b) - int(xorData[index]); p < len(b); p++ { |
||||
index++ |
||||
b[p] ^= xorData[index] |
||||
} |
||||
} |
||||
return b |
||||
} |
||||
|
||||
// Sparse block handling code.
|
||||
|
||||
// valueRange is one entry of a sparse block. An entry is either a header or a
// range: for a range, value applies to the bytes lo..hi; for a header, the
// fields hold the stride and the number of ranges that follow.
type valueRange struct {
	value uint16 // header: value:stride
	lo    byte   // header: lo:n
	hi    byte
}
||||
|
||||
type sparseBlocks struct { |
||||
values []valueRange |
||||
offset []uint16 |
||||
} |
||||
|
||||
var idnaSparse = sparseBlocks{ |
||||
values: idnaSparseValues[:], |
||||
offset: idnaSparseOffset[:], |
||||
} |
||||
|
||||
// Don't use newIdnaTrie to avoid unconditional linking in of the table.
|
||||
var trie = &idnaTrie{} |
||||
|
||||
// lookup determines the type of block n and looks up the value for b.
|
||||
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||
// the value for b is by r.value + (b - r.lo) * stride.
|
||||
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 { |
||||
offset := t.offset[n] |
||||
header := t.values[offset] |
||||
lo := offset + 1 |
||||
hi := lo + uint16(header.lo) |
||||
for lo < hi { |
||||
m := lo + (hi-lo)/2 |
||||
r := t.values[m] |
||||
if r.lo <= b && b <= r.hi { |
||||
return r.value + uint16(b-r.lo)*header.value |
||||
} |
||||
if b < r.lo { |
||||
hi = m |
||||
} else { |
||||
lo = m + 1 |
||||
} |
||||
} |
||||
return 0 |
||||
} |
@ -0,0 +1,119 @@ |
||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package idna |
||||
|
||||
// This file contains definitions for interpreting the trie value of the idna
|
||||
// trie generated by "go run gen*.go". It is shared by both the generator
|
||||
// program and the resultant package. Sharing is achieved by the generator
|
||||
// copying gen_trieval.go to trieval.go and changing what's above this comment.
|
||||
|
||||
// info is the per-rune value stored in the IDNA trie and returned by a trie
// lookup, as derived from the IDNA mapping table. Usually everything about a
// rune fits in these 16 bits. For mapped runes the value may instead carry an
// index into a slice holding the mapped string. Such an entry is either the
// mapped text itself or an XOR pattern that is applied to the bytes of the
// UTF-8 encoding of the input rune. The same technique is used by the cases
// packages and shrinks the tables considerably.
//
// Bit layout of a value:
//
//	if mapped {
//	  if inlinedXOR {
//	    15..13 inline XOR marker
//	    12..11 unused
//	    10..3  inline XOR mask
//	  } else {
//	    15..3  index into xor or mapping table
//	  }
//	} else {
//	    15..14 unused
//	    13     mayNeedNorm
//	    12..11 attributes
//	    10..8  joining type
//	     7..3  category type
//	}
//	   2  use xor pattern
//	1..0  mapped category
//
// The constants declared below describe the individual bits in more detail.
type info uint16
||||
|
||||
// Masks, shifts and flags for decoding an info value.
const (
	// Category extraction.
	catSmallMask = 0x3  // bits 1..0: mapped category
	catBigMask   = 0xF8 // bits 7..3: category of a non-mapped rune

	// Mapping data.
	indexShift = 3      // shift that yields the mapping index or inline mask
	xorBit     = 0x4    // interpret the index as an xor pattern
	inlineXOR  = 0xE000 // all of these bits are set if the XOR pattern is inlined

	// Joining type of a non-mapped rune.
	joinShift = 8
	joinMask  = 0x07

	// Attributes of a non-mapped rune.
	attributesMask = 0x1800
	viramaModifier = 0x1800
	modifier       = 0x1000
	rtl            = 0x0800

	mayNeedNorm = 0x2000
)
||||
|
||||
// A category identifies one of the categories of the IDNA mapping table.
type category uint16

// Categories of mapped runes. They are stored in the two low bits of an info
// value (catSmallMask).
const (
	unknown              category = 0 // not currently defined in unicode.
	mapped               category = 1
	disallowedSTD3Mapped category = 2
	deviation            category = 3
)

// Categories of runes that are not mapped. They are stored in bits 7..3 of an
// info value (catBigMask).
const (
	valid               category = 0x08
	validNV8            category = 0x18
	validXV8            category = 0x28
	disallowed          category = 0x40
	disallowedSTD3Valid category = 0x80
	ignored             category = 0xC0
)
||||
|
||||
// Join types and additional rune information. Numbering starts at 1.
const (
	joiningL = iota + 1
	joiningD
	joiningT
	joiningR

	// The following types are derived during processing.
	joinZWJ
	joinZWNJ
	joinVirama
	numJoinTypes
)
||||
|
||||
func (c info) isMapped() bool { |
||||
return c&0x3 != 0 |
||||
} |
||||
|
||||
func (c info) category() category { |
||||
small := c & catSmallMask |
||||
if small != 0 { |
||||
return category(small) |
||||
} |
||||
return category(c & catBigMask) |
||||
} |
||||
|
||||
func (c info) joinType() info { |
||||
if c.isMapped() { |
||||
return 0 |
||||
} |
||||
return (c >> joinShift) & joinMask |
||||
} |
||||
|
||||
func (c info) isModifier() bool { |
||||
return c&(modifier|catSmallMask) == modifier |
||||
} |
||||
|
||||
func (c info) isViramaModifier() bool { |
||||
return c&(attributesMask|catSmallMask) == viramaModifier |
||||
} |
@ -0,0 +1,336 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package bidirule implements the Bidi Rule defined by RFC 5893.
|
||||
//
|
||||
// This package is under development. The API may change without notice and
|
||||
// without preserving backward compatibility.
|
||||
package bidirule |
||||
|
||||
import ( |
||||
"errors" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
"golang.org/x/text/unicode/bidi" |
||||
) |
||||
|
||||
// This file contains an implementation of RFC 5893: Right-to-Left Scripts for
|
||||
// Internationalized Domain Names for Applications (IDNA)
|
||||
//
|
||||
// A label is an individual component of a domain name. Labels are usually
|
||||
// shown separated by dots; for example, the domain name "www.example.com" is
|
||||
// composed of three labels: "www", "example", and "com".
|
||||
//
|
||||
// An RTL label is a label that contains at least one character of class R, AL,
|
||||
// or AN. An LTR label is any label that is not an RTL label.
|
||||
//
|
||||
// A "Bidi domain name" is a domain name that contains at least one RTL label.
|
||||
//
|
||||
// The following guarantees can be made based on the above:
|
||||
//
|
||||
// o In a domain name consisting of only labels that satisfy the rule,
|
||||
// the requirements of Section 3 are satisfied. Note that even LTR
|
||||
// labels and pure ASCII labels have to be tested.
|
||||
//
|
||||
// o In a domain name consisting of only LDH labels (as defined in the
|
||||
// Definitions document [RFC5890]) and labels that satisfy the rule,
|
||||
// the requirements of Section 3 are satisfied as long as a label
|
||||
// that starts with an ASCII digit does not come after a
|
||||
// right-to-left label.
|
||||
//
|
||||
// No guarantee is given for other combinations.
|
||||
|
||||
// ErrInvalid is the error reported for a label that does not satisfy the Bidi
// Rule.
var ErrInvalid = errors.New("bidirule: failed Bidi Rule")
||||
|
||||
// ruleState is a state of the machine that checks a label against the Bidi
// Rule. It is used as the index into the transitions table.
type ruleState uint8

const (
	ruleInitial  ruleState = iota // no character processed yet
	ruleLTR                       // LTR label, last character may not end the label
	ruleLTRFinal                  // LTR label, last character may end the label
	ruleRTL                       // RTL label, last character may not end the label
	ruleRTLFinal                  // RTL label, last character may end the label
	ruleInvalid                   // a character matched no transition
)
||||
|
||||
type ruleTransition struct { |
||||
next ruleState |
||||
mask uint16 |
||||
} |
||||
|
||||
var transitions = [...][2]ruleTransition{ |
||||
// [2.1] The first character must be a character with Bidi property L, R, or
|
||||
// AL. If it has the R or AL property, it is an RTL label; if it has the L
|
||||
// property, it is an LTR label.
|
||||
ruleInitial: { |
||||
{ruleLTRFinal, 1 << bidi.L}, |
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL}, |
||||
}, |
||||
ruleRTL: { |
||||
// [2.3] In an RTL label, the end of the label must be a character with
|
||||
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||
// with Bidi property NSM.
|
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN}, |
||||
|
||||
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.3]
|
||||
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, |
||||
}, |
||||
ruleRTLFinal: { |
||||
// [2.3] In an RTL label, the end of the label must be a character with
|
||||
// Bidi property R, AL, EN, or AN, followed by zero or more characters
|
||||
// with Bidi property NSM.
|
||||
{ruleRTLFinal, 1<<bidi.R | 1<<bidi.AL | 1<<bidi.EN | 1<<bidi.AN | 1<<bidi.NSM}, |
||||
|
||||
// [2.2] In an RTL label, only characters with the Bidi properties R,
|
||||
// AL, AN, EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.3] and NSM.
|
||||
{ruleRTL, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, |
||||
}, |
||||
ruleLTR: { |
||||
// [2.6] In an LTR label, the end of the label must be a character with
|
||||
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||
// property NSM.
|
||||
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN}, |
||||
|
||||
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.6].
|
||||
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN | 1<<bidi.NSM}, |
||||
}, |
||||
ruleLTRFinal: { |
||||
// [2.6] In an LTR label, the end of the label must be a character with
|
||||
// Bidi property L or EN, followed by zero or more characters with Bidi
|
||||
// property NSM.
|
||||
{ruleLTRFinal, 1<<bidi.L | 1<<bidi.EN | 1<<bidi.NSM}, |
||||
|
||||
// [2.5] In an LTR label, only characters with the Bidi properties L,
|
||||
// EN, ES, CS, ET, ON, BN, or NSM are allowed.
|
||||
// We exclude the entries from [2.6].
|
||||
{ruleLTR, 1<<bidi.ES | 1<<bidi.CS | 1<<bidi.ET | 1<<bidi.ON | 1<<bidi.BN}, |
||||
}, |
||||
ruleInvalid: { |
||||
{ruleInvalid, 0}, |
||||
{ruleInvalid, 0}, |
||||
}, |
||||
} |
||||
|
||||
// [2.4] In an RTL label, if an EN is present, no AN may be present, and
|
||||
// vice versa.
|
||||
const exclusiveRTL = uint16(1<<bidi.EN | 1<<bidi.AN) |
||||
|
||||
// From RFC 5893
|
||||
// An RTL label is a label that contains at least one character of type
|
||||
// R, AL, or AN.
|
||||
//
|
||||
// An LTR label is any label that is not an RTL label.
|
||||
|
||||
// Direction reports the direction of the given label as defined by RFC 5893.
|
||||
// The Bidi Rule does not have to be applied to labels of the category
|
||||
// LeftToRight.
|
||||
func Direction(b []byte) bidi.Direction { |
||||
for i := 0; i < len(b); { |
||||
e, sz := bidi.Lookup(b[i:]) |
||||
if sz == 0 { |
||||
i++ |
||||
} |
||||
c := e.Class() |
||||
if c == bidi.R || c == bidi.AL || c == bidi.AN { |
||||
return bidi.RightToLeft |
||||
} |
||||
i += sz |
||||
} |
||||
return bidi.LeftToRight |
||||
} |
||||
|
||||
// DirectionString reports the direction of the given label as defined by RFC
|
||||
// 5893. The Bidi Rule does not have to be applied to labels of the category
|
||||
// LeftToRight.
|
||||
func DirectionString(s string) bidi.Direction { |
||||
for i := 0; i < len(s); { |
||||
e, sz := bidi.LookupString(s[i:]) |
||||
if sz == 0 { |
||||
i++ |
||||
continue |
||||
} |
||||
c := e.Class() |
||||
if c == bidi.R || c == bidi.AL || c == bidi.AN { |
||||
return bidi.RightToLeft |
||||
} |
||||
i += sz |
||||
} |
||||
return bidi.LeftToRight |
||||
} |
||||
|
||||
// Valid reports whether b conforms to the BiDi rule.
|
||||
func Valid(b []byte) bool { |
||||
var t Transformer |
||||
if n, ok := t.advance(b); !ok || n < len(b) { |
||||
return false |
||||
} |
||||
return t.isFinal() |
||||
} |
||||
|
||||
// ValidString reports whether s conforms to the BiDi rule.
|
||||
func ValidString(s string) bool { |
||||
var t Transformer |
||||
if n, ok := t.advanceString(s); !ok || n < len(s) { |
||||
return false |
||||
} |
||||
return t.isFinal() |
||||
} |
||||
|
||||
// New returns a Transformer that verifies that input adheres to the Bidi Rule.
|
||||
func New() *Transformer { |
||||
return &Transformer{} |
||||
} |
||||
|
||||
// Transformer implements transform.Transform.
|
||||
type Transformer struct { |
||||
state ruleState |
||||
hasRTL bool |
||||
seen uint16 |
||||
} |
||||
|
||||
// A rule can only be violated for "Bidi Domain names", meaning if one of the
|
||||
// following categories has been observed.
|
||||
func (t *Transformer) isRTL() bool { |
||||
const isRTL = 1<<bidi.R | 1<<bidi.AL | 1<<bidi.AN |
||||
return t.seen&isRTL != 0 |
||||
} |
||||
|
||||
// Reset implements transform.Transformer.
|
||||
func (t *Transformer) Reset() { *t = Transformer{} } |
||||
|
||||
// Transform implements transform.Transformer. This Transformer has state and
|
||||
// needs to be reset between uses.
|
||||
func (t *Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if len(dst) < len(src) { |
||||
src = src[:len(dst)] |
||||
atEOF = false |
||||
err = transform.ErrShortDst |
||||
} |
||||
n, err1 := t.Span(src, atEOF) |
||||
copy(dst, src[:n]) |
||||
if err == nil || err1 != nil && err1 != transform.ErrShortSrc { |
||||
err = err1 |
||||
} |
||||
return n, n, err |
||||
} |
||||
|
||||
// Span returns the first n bytes of src that conform to the Bidi rule.
|
||||
func (t *Transformer) Span(src []byte, atEOF bool) (n int, err error) { |
||||
if t.state == ruleInvalid && t.isRTL() { |
||||
return 0, ErrInvalid |
||||
} |
||||
n, ok := t.advance(src) |
||||
switch { |
||||
case !ok: |
||||
err = ErrInvalid |
||||
case n < len(src): |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
err = ErrInvalid |
||||
case !t.isFinal(): |
||||
err = ErrInvalid |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// Precomputing the ASCII values decreases running time for the ASCII fast path
|
||||
// by about 30%.
|
||||
var asciiTable [128]bidi.Properties |
||||
|
||||
func init() { |
||||
for i := range asciiTable { |
||||
p, _ := bidi.LookupRune(rune(i)) |
||||
asciiTable[i] = p |
||||
} |
||||
} |
||||
|
||||
func (t *Transformer) advance(s []byte) (n int, ok bool) { |
||||
var e bidi.Properties |
||||
var sz int |
||||
for n < len(s) { |
||||
if s[n] < utf8.RuneSelf { |
||||
e, sz = asciiTable[s[n]], 1 |
||||
} else { |
||||
e, sz = bidi.Lookup(s[n:]) |
||||
if sz <= 1 { |
||||
if sz == 1 { |
||||
// We always consider invalid UTF-8 to be invalid, even if
|
||||
// the string has not yet been determined to be RTL.
|
||||
// TODO: is this correct?
|
||||
return n, false |
||||
} |
||||
return n, true // incomplete UTF-8 encoding
|
||||
} |
||||
} |
||||
// TODO: using CompactClass would result in noticeable speedup.
|
||||
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||
c := uint16(1 << e.Class()) |
||||
t.seen |= c |
||||
if t.seen&exclusiveRTL == exclusiveRTL { |
||||
t.state = ruleInvalid |
||||
return n, false |
||||
} |
||||
switch tr := transitions[t.state]; { |
||||
case tr[0].mask&c != 0: |
||||
t.state = tr[0].next |
||||
case tr[1].mask&c != 0: |
||||
t.state = tr[1].next |
||||
default: |
||||
t.state = ruleInvalid |
||||
if t.isRTL() { |
||||
return n, false |
||||
} |
||||
} |
||||
n += sz |
||||
} |
||||
return n, true |
||||
} |
||||
|
||||
func (t *Transformer) advanceString(s string) (n int, ok bool) { |
||||
var e bidi.Properties |
||||
var sz int |
||||
for n < len(s) { |
||||
if s[n] < utf8.RuneSelf { |
||||
e, sz = asciiTable[s[n]], 1 |
||||
} else { |
||||
e, sz = bidi.LookupString(s[n:]) |
||||
if sz <= 1 { |
||||
if sz == 1 { |
||||
return n, false // invalid UTF-8
|
||||
} |
||||
return n, true // incomplete UTF-8 encoding
|
||||
} |
||||
} |
||||
// TODO: using CompactClass results in noticeable speedup.
|
||||
// See unicode/bidi/prop.go:Properties.CompactClass.
|
||||
c := uint16(1 << e.Class()) |
||||
t.seen |= c |
||||
if t.seen&exclusiveRTL == exclusiveRTL { |
||||
t.state = ruleInvalid |
||||
return n, false |
||||
} |
||||
switch tr := transitions[t.state]; { |
||||
case tr[0].mask&c != 0: |
||||
t.state = tr[0].next |
||||
case tr[1].mask&c != 0: |
||||
t.state = tr[1].next |
||||
default: |
||||
t.state = ruleInvalid |
||||
if t.isRTL() { |
||||
return n, false |
||||
} |
||||
} |
||||
n += sz |
||||
} |
||||
return n, true |
||||
} |
@ -0,0 +1,11 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build go1.10
|
||||
|
||||
package bidirule |
||||
|
||||
func (t *Transformer) isFinal() bool { |
||||
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial |
||||
} |
@ -0,0 +1,14 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !go1.10
|
||||
|
||||
package bidirule |
||||
|
||||
func (t *Transformer) isFinal() bool { |
||||
if !t.isRTL() { |
||||
return true |
||||
} |
||||
return t.state == ruleLTRFinal || t.state == ruleRTLFinal || t.state == ruleInitial |
||||
} |
@ -0,0 +1,198 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go gen_trieval.go gen_ranges.go
|
||||
|
||||
// Package bidi contains functionality for bidirectional text support.
|
||||
//
|
||||
// See http://www.unicode.org/reports/tr9.
|
||||
//
|
||||
// NOTE: UNDER CONSTRUCTION. This API may change in backwards incompatible ways
|
||||
// and without notice.
|
||||
package bidi // import "golang.org/x/text/unicode/bidi"
|
||||
|
||||
// TODO:
|
||||
// The following functionality would not be hard to implement, but hinges on
|
||||
// the definition of a Segmenter interface. For now this is up to the user.
|
||||
// - Iterate over paragraphs
|
||||
// - Segmenter to iterate over runs directly from a given text.
|
||||
// Also:
|
||||
// - Transformer for reordering?
|
||||
// - Transformer (validator, really) for Bidi Rule.
|
||||
|
||||
// This API tries to avoid dealing with embedding levels for now. Under the hood
|
||||
// these will be computed, but the question is to which extent the user should
|
||||
// know they exist. We should at some point allow the user to specify an
|
||||
// embedding hierarchy, though.
|
||||
|
||||
// A Direction indicates the overall flow of text.
type Direction int

// The possible values of a Direction.
const (
	// LeftToRight means the text has no right-to-left characters and that
	// it either has some left-to-right characters or the option
	// DefaultDirection(LeftToRight) was passed.
	LeftToRight Direction = iota

	// RightToLeft means the text has no left-to-right characters and that
	// it either has some right-to-left characters or the option
	// DefaultDirection(RightToLeft) was passed.
	RightToLeft

	// Mixed means the text has both left-to-right and right-to-left
	// characters.
	Mixed

	// Neutral means the text has neither left-to-right nor right-to-left
	// characters and that no default direction has been set.
	Neutral
)
||||
|
||||
// options collects the settings configured through Option values. It has no
// fields yet.
type options struct{}

// An Option is an option for Bidi processing.
type Option func(*options)
||||
|
||||
// ICU allows the user to define embedding levels. This may be used, for example,
|
||||
// to use hierarchical structure of markup languages to define embeddings.
|
||||
// The following option may be a way to expose this functionality in this API.
|
||||
// // LevelFunc sets a function that associates nesting levels with the given text.
|
||||
// // The levels function will be called with monotonically increasing values for p.
|
||||
// func LevelFunc(levels func(p int) int) Option {
|
||||
// panic("unimplemented")
|
||||
// }
|
||||
|
||||
// DefaultDirection sets the default direction for a Paragraph. The direction is
|
||||
// overridden if the text contains directional characters.
|
||||
func DefaultDirection(d Direction) Option { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// A Paragraph holds a single Paragraph for Bidi processing.
|
||||
type Paragraph struct { |
||||
// buffers
|
||||
} |
||||
|
||||
// SetBytes configures p for the given paragraph text. It replaces text
|
||||
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||
// it will only process the first paragraph and report the number of bytes
|
||||
// consumed from b including this separator. Error may be non-nil if options are
|
||||
// given.
|
||||
func (p *Paragraph) SetBytes(b []byte, opts ...Option) (n int, err error) { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// SetString configures p for the given paragraph text. It replaces text
|
||||
// previously set by SetBytes or SetString. If b contains a paragraph separator
|
||||
// it will only process the first paragraph and report the number of bytes
|
||||
// consumed from b including this separator. Error may be non-nil if options are
|
||||
// given.
|
||||
func (p *Paragraph) SetString(s string, opts ...Option) (n int, err error) { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// IsLeftToRight reports whether the principle direction of rendering for this
|
||||
// paragraphs is left-to-right. If this returns false, the principle direction
|
||||
// of rendering is right-to-left.
|
||||
func (p *Paragraph) IsLeftToRight() bool { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Direction returns the direction of the text of this paragraph.
|
||||
//
|
||||
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||
func (p *Paragraph) Direction() Direction { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// RunAt reports the Run at the given position of the input text.
|
||||
//
|
||||
// This method can be used for computing line breaks on paragraphs.
|
||||
func (p *Paragraph) RunAt(pos int) Run { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Order computes the visual ordering of all the runs in a Paragraph.
|
||||
func (p *Paragraph) Order() (Ordering, error) { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Line computes the visual ordering of runs for a single line starting and
|
||||
// ending at the given positions in the original text.
|
||||
func (p *Paragraph) Line(start, end int) (Ordering, error) { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// An Ordering holds the computed visual order of runs of a Paragraph. Calling
|
||||
// SetBytes or SetString on the originating Paragraph invalidates an Ordering.
|
||||
// The methods of an Ordering should only be called by one goroutine at a time.
|
||||
type Ordering struct{} |
||||
|
||||
// Direction reports the directionality of the runs.
|
||||
//
|
||||
// The direction may be LeftToRight, RightToLeft, Mixed, or Neutral.
|
||||
func (o *Ordering) Direction() Direction { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// NumRuns returns the number of runs.
|
||||
func (o *Ordering) NumRuns() int { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Run returns the ith run within the ordering.
|
||||
func (o *Ordering) Run(i int) Run { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// TODO: perhaps with options.
|
||||
// // Reorder creates a reader that reads the runes in visual order per character.
|
||||
// // Modifiers remain after the runes they modify.
|
||||
// func (l *Runs) Reorder() io.Reader {
|
||||
// panic("unimplemented")
|
||||
// }
|
||||
|
||||
// A Run is a continuous sequence of characters of a single direction.
|
||||
type Run struct { |
||||
} |
||||
|
||||
// String returns the text of the run in its original order.
|
||||
func (r *Run) String() string { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Bytes returns the text of the run in its original order.
|
||||
func (r *Run) Bytes() []byte { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// TODO: methods for
|
||||
// - Display order
|
||||
// - headers and footers
|
||||
// - bracket replacement.
|
||||
|
||||
// Direction reports the direction of the run.
|
||||
func (r *Run) Direction() Direction { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// Position of the Run within the text passed to SetBytes or SetString of the
|
||||
// originating Paragraph value.
|
||||
func (r *Run) Pos() (start, end int) { |
||||
panic("unimplemented") |
||||
} |
||||
|
||||
// AppendReverse reverses the order of characters of in, appends them to out,
// and returns the result. Modifiers will still follow the runes they modify.
// Brackets are replaced with their counterparts.
//
// It is not implemented yet and always panics.
func AppendReverse(out, in []byte) []byte {
	panic("unimplemented")
}
||||
|
||||
// ReverseString reverses the order of characters in s and returns a new string.
// Modifiers will still follow the runes they modify. Brackets are replaced with
// their counterparts.
//
// It is not implemented yet and always panics.
func ReverseString(s string) string {
	panic("unimplemented")
}
@ -0,0 +1,335 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bidi |
||||
|
||||
import ( |
||||
"container/list" |
||||
"fmt" |
||||
"sort" |
||||
) |
||||
|
||||
// This file contains a port of the reference implementation of the
|
||||
// Bidi Parentheses Algorithm:
|
||||
// http://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java
|
||||
//
|
||||
// The implementation in this file covers definitions BD14-BD16 and rule N0
|
||||
// of UAX#9.
|
||||
//
|
||||
// Some preprocessing is done for each rune before data is passed to this
|
||||
// algorithm:
|
||||
// - opening and closing brackets are identified
|
||||
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that
|
||||
//   is identical for the opening and closing bracket. It is left to the caller to do these
|
||||
// mappings.
|
||||
// - The BPA algorithm requires that bracket characters that are canonical
|
||||
// equivalents of each other be able to be substituted for each other.
|
||||
// It is the responsibility of the caller to do this canonicalization.
|
||||
//
|
||||
// In implementing BD16, this implementation departs slightly from the "logical"
|
||||
// algorithm defined in UAX#9. In particular, the stack referenced there
|
||||
// supports operations that go beyond a "basic" stack. An equivalent
|
||||
// implementation based on a linked list is used here.
|
||||
|
||||
// bracketType is the Bidi_Paired_Bracket_Type property of a character.
//
// BD14. An opening paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Open.
//
// BD15. A closing paired bracket is a character whose
// Bidi_Paired_Bracket_Type property value is Close.
type bracketType byte

const (
	bpNone  bracketType = iota // not a paired bracket
	bpOpen                     // opening paired bracket (BD14)
	bpClose                    // closing paired bracket (BD15)
)
||||
|
||||
// bracketPair records where the two brackets of a bracket pair are located.
type bracketPair struct {
	opener int // index value of the opening bracket
	closer int // index value of the closing bracket
}

// String formats the pair as "(opener, closer)".
func (b *bracketPair) String() string {
	open, shut := b.opener, b.closer
	return fmt.Sprintf("(%v, %v)", open, shut)
}
||||
|
||||
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation.
|
||||
type bracketPairs []bracketPair |
||||
|
||||
func (b bracketPairs) Len() int { return len(b) } |
||||
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
||||
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener } |
||||
|
||||
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm.
|
||||
//
|
||||
// For each rune, it takes the indexes into the original string, the class the
|
||||
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also
|
||||
// takes the direction type for the start-of-sentence and the embedding level.
|
||||
//
|
||||
// The identifiers for bracket types are the rune of the canonicalized opening
|
||||
// bracket for brackets (open or close) or 0 for runes that are not brackets.
|
||||
func resolvePairedBrackets(s *isolatingRunSequence) { |
||||
p := bracketPairer{ |
||||
sos: s.sos, |
||||
openers: list.New(), |
||||
codesIsolatedRun: s.types, |
||||
indexes: s.indexes, |
||||
} |
||||
dirEmbed := L |
||||
if s.level&1 != 0 { |
||||
dirEmbed = R |
||||
} |
||||
p.locateBrackets(s.p.pairTypes, s.p.pairValues) |
||||
p.resolveBrackets(dirEmbed, s.p.initialTypes) |
||||
} |
||||
|
||||
type bracketPairer struct { |
||||
sos Class // direction corresponding to start of sequence
|
||||
|
||||
// The following is a restatement of BD 16 using non-algorithmic language.
|
||||
//
|
||||
// A bracket pair is a pair of characters consisting of an opening
|
||||
// paired bracket and a closing paired bracket such that the
|
||||
// Bidi_Paired_Bracket property value of the former equals the latter,
|
||||
// subject to the following constraints.
|
||||
// - both characters of a pair occur in the same isolating run sequence
|
||||
// - the closing character of a pair follows the opening character
|
||||
// - any bracket character can belong at most to one pair, the earliest possible one
|
||||
// - any bracket character not part of a pair is treated like an ordinary character
|
||||
// - pairs may nest properly, but their spans may not overlap otherwise
|
||||
|
||||
// Bracket characters with canonical decompositions are supposed to be
|
||||
// treated as if they had been normalized, to allow normalized and non-
|
||||
// normalized text to give the same result. In this implementation that step
|
||||
// is pushed out to the caller. The caller has to ensure that the pairValue
|
||||
// slices contain the rune of the opening bracket after normalization for
|
||||
// any opening or closing bracket.
|
||||
|
||||
openers *list.List // list of positions for opening brackets
|
||||
|
||||
// bracket pair positions sorted by location of opening bracket
|
||||
pairPositions bracketPairs |
||||
|
||||
codesIsolatedRun []Class // directional bidi codes for an isolated run
|
||||
indexes []int // array of index values into the original string
|
||||
|
||||
} |
||||
|
||||
// matchOpener reports whether characters at given positions form a matching
|
||||
// bracket pair.
|
||||
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool { |
||||
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]] |
||||
} |
||||
|
||||
// maxPairingDepth is the maximum number of pending opening brackets;
// locateBrackets stops pairing when another opener arrives at this depth.
const maxPairingDepth = 63
||||
|
||||
// locateBrackets locates matching bracket pairs according to BD16.
|
||||
//
|
||||
// This implementation uses a linked list instead of a stack, because, while
|
||||
// elements are added at the front (like a push) they are not generally removed
|
||||
// in atomic 'pop' operations, reducing the benefit of the stack archetype.
|
||||
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) { |
||||
// traverse the run
|
||||
// do that explicitly (not in a for-each) so we can record position
|
||||
for i, index := range p.indexes { |
||||
|
||||
// look at the bracket type for each character
|
||||
if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON { |
||||
// continue scanning
|
||||
continue |
||||
} |
||||
switch pairTypes[index] { |
||||
case bpOpen: |
||||
// check if maximum pairing depth reached
|
||||
if p.openers.Len() == maxPairingDepth { |
||||
p.openers.Init() |
||||
return |
||||
} |
||||
// remember opener location, most recent first
|
||||
p.openers.PushFront(i) |
||||
|
||||
case bpClose: |
||||
// see if there is a match
|
||||
count := 0 |
||||
for elem := p.openers.Front(); elem != nil; elem = elem.Next() { |
||||
count++ |
||||
opener := elem.Value.(int) |
||||
if p.matchOpener(pairValues, opener, i) { |
||||
// if the opener matches, add nested pair to the ordered list
|
||||
p.pairPositions = append(p.pairPositions, bracketPair{opener, i}) |
||||
// remove up to and including matched opener
|
||||
for ; count > 0; count-- { |
||||
p.openers.Remove(p.openers.Front()) |
||||
} |
||||
break |
||||
} |
||||
} |
||||
sort.Sort(p.pairPositions) |
||||
// if we get here, the closing bracket matched no openers
|
||||
// and gets ignored
|
||||
} |
||||
} |
||||
} |
||||
|
||||
// Bracket pairs within an isolating run sequence are processed as units so
|
||||
// that both the opening and the closing paired bracket in a pair resolve to
|
||||
// the same direction.
|
||||
//
|
||||
// N0. Process bracket pairs in an isolating run sequence sequentially in
|
||||
// the logical order of the text positions of the opening paired brackets
|
||||
// using the logic given below. Within this scope, bidirectional types EN
|
||||
// and AN are treated as R.
|
||||
//
|
||||
// Identify the bracket pairs in the current isolating run sequence
|
||||
// according to BD16. For each bracket-pair element in the list of pairs of
|
||||
// text positions:
|
||||
//
|
||||
// a Inspect the bidirectional types of the characters enclosed within the
|
||||
// bracket pair.
|
||||
//
|
||||
// b If any strong type (either L or R) matching the embedding direction is
|
||||
// found, set the type for both brackets in the pair to match the embedding
|
||||
// direction.
|
||||
//
|
||||
// o [ e ] o -> o e e e o
|
||||
//
|
||||
// o [ o e ] -> o e o e e
|
||||
//
|
||||
// o [ NI e ] -> o e NI e e
|
||||
//
|
||||
// c Otherwise, if a strong type (opposite the embedding direction) is
|
||||
// found, test for adjacent strong types as follows: 1 First, check
|
||||
// backwards before the opening paired bracket until the first strong type
|
||||
// (L, R, or sos) is found. If that first preceding strong type is opposite
|
||||
// the embedding direction, then set the type for both brackets in the pair
|
||||
// to that type. 2 Otherwise, set the type for both brackets in the pair to
|
||||
// the embedding direction.
|
||||
//
|
||||
// o [ o ] e -> o o o o e
|
||||
//
|
||||
// o [ o NI ] o -> o o o NI o o
|
||||
//
|
||||
// e [ o ] o -> e e o e o
|
||||
//
|
||||
// e [ o ] e -> e e o e e
|
||||
//
|
||||
// e ( o [ o ] NI ) e -> e e o o o o NI e e
|
||||
//
|
||||
// d Otherwise, do not set the type for the current bracket pair. Note that
|
||||
// if the enclosed text contains no strong types the paired brackets will
|
||||
// both resolve to the same level when resolved individually using rules N1
|
||||
// and N2.
|
||||
//
|
||||
// e ( NI ) o -> e ( NI ) o
|
||||
|
||||
// getStrongTypeN0 maps character's directional code to strong type as required
|
||||
// by rule N0.
|
||||
//
|
||||
// TODO: have separate type for "strong" directionality.
|
||||
func (p *bracketPairer) getStrongTypeN0(index int) Class { |
||||
switch p.codesIsolatedRun[index] { |
||||
// in the scope of N0, number types are treated as R
|
||||
case EN, AN, AL, R: |
||||
return R |
||||
case L: |
||||
return L |
||||
default: |
||||
return ON |
||||
} |
||||
} |
||||
|
||||
// classifyPairContent reports the strong types contained inside a Bracket Pair,
|
||||
// assuming the given embedding direction.
|
||||
//
|
||||
// It returns ON if no strong type is found. If a single strong type is found,
|
||||
// it returns this this type. Otherwise it returns the embedding direction.
|
||||
//
|
||||
// TODO: use separate type for "strong" directionality.
|
||||
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class { |
||||
dirOpposite := ON |
||||
for i := loc.opener + 1; i < loc.closer; i++ { |
||||
dir := p.getStrongTypeN0(i) |
||||
if dir == ON { |
||||
continue |
||||
} |
||||
if dir == dirEmbed { |
||||
return dir // type matching embedding direction found
|
||||
} |
||||
dirOpposite = dir |
||||
} |
||||
// return ON if no strong type found, or class opposite to dirEmbed
|
||||
return dirOpposite |
||||
} |
||||
|
||||
// classBeforePair determines which strong types are present before a Bracket
|
||||
// Pair. Return R or L if strong type found, otherwise ON.
|
||||
func (p *bracketPairer) classBeforePair(loc bracketPair) Class { |
||||
for i := loc.opener - 1; i >= 0; i-- { |
||||
if dir := p.getStrongTypeN0(i); dir != ON { |
||||
return dir |
||||
} |
||||
} |
||||
// no strong types found, return sos
|
||||
return p.sos |
||||
} |
||||
|
||||
// assignBracketType implements rule N0 for a single bracket pair.
|
||||
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) { |
||||
// rule "N0, a", inspect contents of pair
|
||||
dirPair := p.classifyPairContent(loc, dirEmbed) |
||||
|
||||
// dirPair is now L, R, or N (no strong type found)
|
||||
|
||||
// the following logical tests are performed out of order compared to
|
||||
// the statement of the rules but yield the same results
|
||||
if dirPair == ON { |
||||
return // case "d" - nothing to do
|
||||
} |
||||
|
||||
if dirPair != dirEmbed { |
||||
// case "c": strong type found, opposite - check before (c.1)
|
||||
dirPair = p.classBeforePair(loc) |
||||
if dirPair == dirEmbed || dirPair == ON { |
||||
// no strong opposite type found before - use embedding (c.2)
|
||||
dirPair = dirEmbed |
||||
} |
||||
} |
||||
// else: case "b", strong type found matching embedding,
|
||||
// no explicit action needed, as dirPair is already set to embedding
|
||||
// direction
|
||||
|
||||
// set the bracket types to the type found
|
||||
p.setBracketsToType(loc, dirPair, initialTypes) |
||||
} |
||||
|
||||
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) { |
||||
p.codesIsolatedRun[loc.opener] = dirPair |
||||
p.codesIsolatedRun[loc.closer] = dirPair |
||||
|
||||
for i := loc.opener + 1; i < loc.closer; i++ { |
||||
index := p.indexes[i] |
||||
if initialTypes[index] != NSM { |
||||
break |
||||
} |
||||
p.codesIsolatedRun[i] = dirPair |
||||
} |
||||
|
||||
for i := loc.closer + 1; i < len(p.indexes); i++ { |
||||
index := p.indexes[i] |
||||
if initialTypes[index] != NSM { |
||||
break |
||||
} |
||||
p.codesIsolatedRun[i] = dirPair |
||||
} |
||||
} |
||||
|
||||
// resolveBrackets implements rule N0 for a list of pairs.
|
||||
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) { |
||||
for _, loc := range p.pairPositions { |
||||
p.assignBracketType(loc, dirEmbed, initialTypes) |
||||
} |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,206 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package bidi |
||||
|
||||
import "unicode/utf8" |
||||
|
||||
// Properties provides access to BiDi properties of runes.
type Properties struct {
	// entry holds the class in its low four bits (see Class) and bracket
	// information in its high four bits (see IsBracket).
	entry uint8
	// last is the final byte of a 3-byte encoding, as set by Lookup and
	// LookupString; Class uses its low four bits to resolve a Control rune.
	last uint8
}
||||
|
||||
// trie is consulted by Lookup and LookupString for multi-byte encodings.
var trie = newBidiTrie(0)
||||
|
||||
// TODO: using this for bidirule reduces the running time by about 5%. Consider
|
||||
// if this is worth exposing or if we can find a way to speed up the Class
|
||||
// method.
|
||||
//
|
||||
// // CompactClass is like Class, but maps all of the BiDi control classes
|
||||
// // (LRO, RLO, LRE, RLE, PDF, LRI, RLI, FSI, PDI) to the class Control.
|
||||
// func (p Properties) CompactClass() Class {
|
||||
// return Class(p.entry & 0x0F)
|
||||
// }
|
||||
|
||||
// Class returns the Bidi class for p.
|
||||
func (p Properties) Class() Class { |
||||
c := Class(p.entry & 0x0F) |
||||
if c == Control { |
||||
c = controlByteToClass[p.last&0xF] |
||||
} |
||||
return c |
||||
} |
||||
|
||||
// IsBracket reports whether the rune is a bracket.
|
||||
func (p Properties) IsBracket() bool { return p.entry&0xF0 != 0 } |
||||
|
||||
// IsOpeningBracket reports whether the rune is an opening bracket.
|
||||
// IsBracket must return true.
|
||||
func (p Properties) IsOpeningBracket() bool { return p.entry&openMask != 0 } |
||||
|
||||
// TODO: find a better API and expose.
|
||||
func (p Properties) reverseBracket(r rune) rune { |
||||
return xorMasks[p.entry>>xorMaskShift] ^ r |
||||
} |
||||
|
||||
var controlByteToClass = [16]Class{ |
||||
0xD: LRO, // U+202D LeftToRightOverride,
|
||||
0xE: RLO, // U+202E RightToLeftOverride,
|
||||
0xA: LRE, // U+202A LeftToRightEmbedding,
|
||||
0xB: RLE, // U+202B RightToLeftEmbedding,
|
||||
0xC: PDF, // U+202C PopDirectionalFormat,
|
||||
0x6: LRI, // U+2066 LeftToRightIsolate,
|
||||
0x7: RLI, // U+2067 RightToLeftIsolate,
|
||||
0x8: FSI, // U+2068 FirstStrongIsolate,
|
||||
0x9: PDI, // U+2069 PopDirectionalIsolate,
|
||||
} |
||||
|
||||
// LookupRune returns properties for r.
|
||||
func LookupRune(r rune) (p Properties, size int) { |
||||
var buf [4]byte |
||||
n := utf8.EncodeRune(buf[:], r) |
||||
return Lookup(buf[:n]) |
||||
} |
||||
|
||||
// TODO: these lookup methods are based on the generated trie code. The returned
|
||||
// sizes have slightly different semantics from the generated code, in that it
|
||||
// always returns size==1 for an illegal UTF-8 byte (instead of the length
|
||||
// of the maximum invalid subsequence). Most Transformers, like unicode/norm,
|
||||
// leave invalid UTF-8 untouched, in which case it has performance benefits to
|
||||
// do so (without changing the semantics). Bidi requires the semantics used here
|
||||
// for the bidirule implementation to be compatible with the Go semantics.
|
||||
// They ultimately should perhaps be adopted by all trie implementations, for
|
||||
// convenience sake.
|
||||
// This unrolled code also boosts performance of the secure/bidirule package by
|
||||
// about 30%.
|
||||
// So, to remove this code:
|
||||
// - add option to trie generator to define return type.
|
||||
// - always return 1 byte size for ill-formed UTF-8 runes.
|
||||
|
||||
// Lookup returns properties for the first rune in s and the width in bytes of
|
||||
// its encoding. The size will be 0 if s does not hold enough bytes to complete
|
||||
// the encoding.
|
||||
func Lookup(s []byte) (p Properties, sz int) { |
||||
c0 := s[0] |
||||
switch { |
||||
case c0 < 0x80: // is ASCII
|
||||
return Properties{entry: bidiValues[c0]}, 1 |
||||
case c0 < 0xC2: |
||||
return Properties{}, 1 |
||||
case c0 < 0xE0: // 2-byte UTF-8
|
||||
if len(s) < 2 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2 |
||||
case c0 < 0xF0: // 3-byte UTF-8
|
||||
if len(s) < 3 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
o := uint32(i)<<6 + uint32(c1) |
||||
i = bidiIndex[o] |
||||
c2 := s[2] |
||||
if c2 < 0x80 || 0xC0 <= c2 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3 |
||||
case c0 < 0xF8: // 4-byte UTF-8
|
||||
if len(s) < 4 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
o := uint32(i)<<6 + uint32(c1) |
||||
i = bidiIndex[o] |
||||
c2 := s[2] |
||||
if c2 < 0x80 || 0xC0 <= c2 { |
||||
return Properties{}, 1 |
||||
} |
||||
o = uint32(i)<<6 + uint32(c2) |
||||
i = bidiIndex[o] |
||||
c3 := s[3] |
||||
if c3 < 0x80 || 0xC0 <= c3 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4 |
||||
} |
||||
// Illegal rune
|
||||
return Properties{}, 1 |
||||
} |
||||
|
||||
// LookupString returns properties for the first rune in s and the width in
|
||||
// bytes of its encoding. The size will be 0 if s does not hold enough bytes to
|
||||
// complete the encoding.
|
||||
func LookupString(s string) (p Properties, sz int) { |
||||
c0 := s[0] |
||||
switch { |
||||
case c0 < 0x80: // is ASCII
|
||||
return Properties{entry: bidiValues[c0]}, 1 |
||||
case c0 < 0xC2: |
||||
return Properties{}, 1 |
||||
case c0 < 0xE0: // 2-byte UTF-8
|
||||
if len(s) < 2 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c1)}, 2 |
||||
case c0 < 0xF0: // 3-byte UTF-8
|
||||
if len(s) < 3 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
o := uint32(i)<<6 + uint32(c1) |
||||
i = bidiIndex[o] |
||||
c2 := s[2] |
||||
if c2 < 0x80 || 0xC0 <= c2 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c2), last: c2}, 3 |
||||
case c0 < 0xF8: // 4-byte UTF-8
|
||||
if len(s) < 4 { |
||||
return Properties{}, 0 |
||||
} |
||||
i := bidiIndex[c0] |
||||
c1 := s[1] |
||||
if c1 < 0x80 || 0xC0 <= c1 { |
||||
return Properties{}, 1 |
||||
} |
||||
o := uint32(i)<<6 + uint32(c1) |
||||
i = bidiIndex[o] |
||||
c2 := s[2] |
||||
if c2 < 0x80 || 0xC0 <= c2 { |
||||
return Properties{}, 1 |
||||
} |
||||
o = uint32(i)<<6 + uint32(c2) |
||||
i = bidiIndex[o] |
||||
c3 := s[3] |
||||
if c3 < 0x80 || 0xC0 <= c3 { |
||||
return Properties{}, 1 |
||||
} |
||||
return Properties{entry: trie.lookupValue(uint32(i), c3)}, 4 |
||||
} |
||||
// Illegal rune
|
||||
return Properties{}, 1 |
||||
} |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,60 @@ |
||||
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
|
||||
|
||||
package bidi |
||||
|
||||
// Class is the Unicode BiDi class. Each rune has a single class.
type Class uint

const (
	L       Class = iota // LeftToRight
	R                    // RightToLeft
	EN                   // EuropeanNumber
	ES                   // EuropeanSeparator
	ET                   // EuropeanTerminator
	AN                   // ArabicNumber
	CS                   // CommonSeparator
	B                    // ParagraphSeparator
	S                    // SegmentSeparator
	WS                   // WhiteSpace
	ON                   // OtherNeutral
	BN                   // BoundaryNeutral
	NSM                  // NonspacingMark
	AL                   // ArabicLetter
	Control              // Control LRO - PDI

	// numClass is the number of classes declared above it (L through Control).
	numClass

	// The classes below continue the same iota sequence after numClass.
	LRO // LeftToRightOverride
	RLO // RightToLeftOverride
	LRE // LeftToRightEmbedding
	RLE // RightToLeftEmbedding
	PDF // PopDirectionalFormat
	LRI // LeftToRightIsolate
	RLI // RightToLeftIsolate
	FSI // FirstStrongIsolate
	PDI // PopDirectionalIsolate

	// unknownClass has all bits set.
	unknownClass = ^Class(0)
)
||||
|
||||
var controlToClass = map[rune]Class{ |
||||
0x202D: LRO, // LeftToRightOverride,
|
||||
0x202E: RLO, // RightToLeftOverride,
|
||||
0x202A: LRE, // LeftToRightEmbedding,
|
||||
0x202B: RLE, // RightToLeftEmbedding,
|
||||
0x202C: PDF, // PopDirectionalFormat,
|
||||
0x2066: LRI, // LeftToRightIsolate,
|
||||
0x2067: RLI, // RightToLeftIsolate,
|
||||
0x2068: FSI, // FirstStrongIsolate,
|
||||
0x2069: PDI, // PopDirectionalIsolate,
|
||||
} |
||||
|
||||
// A trie entry has the following bits:
|
||||
// 7..5 XOR mask for brackets
|
||||
// 4 1: Bracket open, 0: Bracket close
|
||||
// 3..0 Class type
|
||||
|
||||
const (
	openMask     = 0x10 // bit 4 of a trie entry: set for an opening bracket
	xorMaskShift = 5    // right shift that moves bits 7..5 (the XOR mask) to the low bits
)
@ -0,0 +1,508 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "unicode/utf8" |
||||
|
||||
const (
	// maxNonStarters is the limit streamSafe enforces on its non-starter count.
	maxNonStarters = 30
	// The maximum number of characters needed for a buffer is
	// maxNonStarters + 1 for the starter + 1 for the CGJ
	maxBufferSize    = maxNonStarters + 2
	maxNFCExpansion  = 3  // NFC(0x1D160)
	maxNFKCExpansion = 18 // NFKC(0xFDFA)

	// Every rune slot gets utf8.UTFMax bytes of byte-buffer space.
	maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128
)
||||
|
||||
// ssState is used for reporting the segment state after inserting a rune.
// It is returned by streamSafe.next and streamSafe.backwards.
type ssState int

const (
	// Indicates a rune was successfully added to the segment.
	ssSuccess ssState = iota
	// Indicates a rune starts a new segment and should not be added.
	ssStarter
	// Indicates a rune caused a segment overflow and a CGJ should be inserted.
	ssOverflow
)
||||
|
||||
// streamSafe implements the policy of when a CGJ should be inserted.
// Its value is a running count of non-starters, which next and backwards
// compare against maxNonStarters.
type streamSafe uint8
||||
|
||||
// first inserts the first rune of a segment. It is a faster version of next if
|
||||
// it is known p represents the first rune in a segment.
|
||||
func (ss *streamSafe) first(p Properties) { |
||||
*ss = streamSafe(p.nTrailingNonStarters()) |
||||
} |
||||
|
||||
// insert returns a ssState value to indicate whether a rune represented by p
|
||||
// can be inserted.
|
||||
func (ss *streamSafe) next(p Properties) ssState { |
||||
if *ss > maxNonStarters { |
||||
panic("streamSafe was not reset") |
||||
} |
||||
n := p.nLeadingNonStarters() |
||||
if *ss += streamSafe(n); *ss > maxNonStarters { |
||||
*ss = 0 |
||||
return ssOverflow |
||||
} |
||||
// The Stream-Safe Text Processing prescribes that the counting can stop
|
||||
// as soon as a starter is encountered. However, there are some starters,
|
||||
// like Jamo V and T, that can combine with other runes, leaving their
|
||||
// successive non-starters appended to the previous, possibly causing an
|
||||
// overflow. We will therefore consider any rune with a non-zero nLead to
|
||||
// be a non-starter. Note that it always hold that if nLead > 0 then
|
||||
// nLead == nTrail.
|
||||
if n == 0 { |
||||
*ss = streamSafe(p.nTrailingNonStarters()) |
||||
return ssStarter |
||||
} |
||||
return ssSuccess |
||||
} |
||||
|
||||
// backwards is used for checking for overflow and segment starts
|
||||
// when traversing a string backwards. Users do not need to call first
|
||||
// for the first rune. The state of the streamSafe retains the count of
|
||||
// the non-starters loaded.
|
||||
func (ss *streamSafe) backwards(p Properties) ssState { |
||||
if *ss > maxNonStarters { |
||||
panic("streamSafe was not reset") |
||||
} |
||||
c := *ss + streamSafe(p.nTrailingNonStarters()) |
||||
if c > maxNonStarters { |
||||
return ssOverflow |
||||
} |
||||
*ss = c |
||||
if p.nLeadingNonStarters() == 0 { |
||||
return ssStarter |
||||
} |
||||
return ssSuccess |
||||
} |
||||
|
||||
func (ss streamSafe) isMax() bool { |
||||
return ss == maxNonStarters |
||||
} |
||||
|
||||
// GraphemeJoiner is inserted after maxNonStarters non-starter runes.
// It is U+034F, the Combining Grapheme Joiner (CGJ).
const GraphemeJoiner = "\u034F"
||||
|
||||
// reorderBuffer is used to normalize a single segment. Characters inserted with
// insert are decomposed and reordered based on CCC. The compose method can
// be used to recombine characters. Note that the byte buffer does not hold
// the UTF-8 characters in order. Only the rune array is maintained in sorted
// order. flush writes the resulting segment to a byte array.
type reorderBuffer struct {
	rune  [maxBufferSize]Properties // Per character info.
	byte  [maxByteBufferSize]byte   // UTF-8 buffer. Referenced by runeInfo.pos.
	nbyte uint8                     // Number of bytes in use; grows by utf8.UTFMax per rune.
	ss    streamSafe                // For limiting length of non-starter sequence.
	nrune int                       // Number of runeInfos.
	f     formInfo                  // Form info copied from formTable by init/initString.

	src      input // Source set by init/initString.
	nsrc     int   // Length of the source.
	tmpBytes input // Scratch input wrapping the bytes given to insertDecomposed.

	out    []byte                    // Output set by setFlusher.
	flushF func(*reorderBuffer) bool // Callback invoked by doFlush.
}
||||
|
||||
func (rb *reorderBuffer) init(f Form, src []byte) { |
||||
rb.f = *formTable[f] |
||||
rb.src.setBytes(src) |
||||
rb.nsrc = len(src) |
||||
rb.ss = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) initString(f Form, src string) { |
||||
rb.f = *formTable[f] |
||||
rb.src.setString(src) |
||||
rb.nsrc = len(src) |
||||
rb.ss = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) { |
||||
rb.out = out |
||||
rb.flushF = f |
||||
} |
||||
|
||||
// reset discards all characters from the buffer.
|
||||
func (rb *reorderBuffer) reset() { |
||||
rb.nrune = 0 |
||||
rb.nbyte = 0 |
||||
} |
||||
|
||||
func (rb *reorderBuffer) doFlush() bool { |
||||
if rb.f.composing { |
||||
rb.compose() |
||||
} |
||||
res := rb.flushF(rb) |
||||
rb.reset() |
||||
return res |
||||
} |
||||
|
||||
// appendFlush appends the normalized segment to rb.out.
|
||||
func appendFlush(rb *reorderBuffer) bool { |
||||
for i := 0; i < rb.nrune; i++ { |
||||
start := rb.rune[i].pos |
||||
end := start + rb.rune[i].size |
||||
rb.out = append(rb.out, rb.byte[start:end]...) |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// flush appends the normalized segment to out and resets rb.
|
||||
func (rb *reorderBuffer) flush(out []byte) []byte { |
||||
for i := 0; i < rb.nrune; i++ { |
||||
start := rb.rune[i].pos |
||||
end := start + rb.rune[i].size |
||||
out = append(out, rb.byte[start:end]...) |
||||
} |
||||
rb.reset() |
||||
return out |
||||
} |
||||
|
||||
// flushCopy copies the normalized segment to buf and resets rb.
|
||||
// It returns the number of bytes written to buf.
|
||||
func (rb *reorderBuffer) flushCopy(buf []byte) int { |
||||
p := 0 |
||||
for i := 0; i < rb.nrune; i++ { |
||||
runep := rb.rune[i] |
||||
p += copy(buf[p:], rb.byte[runep.pos:runep.pos+runep.size]) |
||||
} |
||||
rb.reset() |
||||
return p |
||||
} |
||||
|
||||
// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.
|
||||
// It returns false if the buffer is not large enough to hold the rune.
|
||||
// It is used internally by insert and insertString only.
|
||||
func (rb *reorderBuffer) insertOrdered(info Properties) { |
||||
n := rb.nrune |
||||
b := rb.rune[:] |
||||
cc := info.ccc |
||||
if cc > 0 { |
||||
// Find insertion position + move elements to make room.
|
||||
for ; n > 0; n-- { |
||||
if b[n-1].ccc <= cc { |
||||
break |
||||
} |
||||
b[n] = b[n-1] |
||||
} |
||||
} |
||||
rb.nrune += 1 |
||||
pos := uint8(rb.nbyte) |
||||
rb.nbyte += utf8.UTFMax |
||||
info.pos = pos |
||||
b[n] = info |
||||
} |
||||
|
||||
// insertErr is an error code returned by insert. Using this type instead
// of error improves performance up to 20% for many of the benchmarks.
type insertErr int

const (
	iSuccess  insertErr = -iota // 0: no error
	iShortDst                   // -1: returned by insertDecomposed when doFlush reports false
	iShortSrc                   // -2
)
||||
|
||||
// insertFlush inserts the given rune in the buffer ordered by CCC.
|
||||
// If a decomposition with multiple segments are encountered, they leading
|
||||
// ones are flushed.
|
||||
// It returns a non-zero error code if the rune was not inserted.
|
||||
func (rb *reorderBuffer) insertFlush(src input, i int, info Properties) insertErr { |
||||
if rune := src.hangul(i); rune != 0 { |
||||
rb.decomposeHangul(rune) |
||||
return iSuccess |
||||
} |
||||
if info.hasDecomposition() { |
||||
return rb.insertDecomposed(info.Decomposition()) |
||||
} |
||||
rb.insertSingle(src, i, info) |
||||
return iSuccess |
||||
} |
||||
|
||||
// insertUnsafe inserts the given rune in the buffer ordered by CCC.
|
||||
// It is assumed there is sufficient space to hold the runes. It is the
|
||||
// responsibility of the caller to ensure this. This can be done by checking
|
||||
// the state returned by the streamSafe type.
|
||||
func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) { |
||||
if rune := src.hangul(i); rune != 0 { |
||||
rb.decomposeHangul(rune) |
||||
} |
||||
if info.hasDecomposition() { |
||||
// TODO: inline.
|
||||
rb.insertDecomposed(info.Decomposition()) |
||||
} else { |
||||
rb.insertSingle(src, i, info) |
||||
} |
||||
} |
||||
|
||||
// insertDecomposed inserts an entry in to the reorderBuffer for each rune
|
||||
// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.
|
||||
// It flushes the buffer on each new segment start.
|
||||
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr { |
||||
rb.tmpBytes.setBytes(dcomp) |
||||
// As the streamSafe accounting already handles the counting for modifiers,
|
||||
// we don't have to call next. However, we do need to keep the accounting
|
||||
// intact when flushing the buffer.
|
||||
for i := 0; i < len(dcomp); { |
||||
info := rb.f.info(rb.tmpBytes, i) |
||||
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() { |
||||
return iShortDst |
||||
} |
||||
i += copy(rb.byte[rb.nbyte:], dcomp[i:i+int(info.size)]) |
||||
rb.insertOrdered(info) |
||||
} |
||||
return iSuccess |
||||
} |
||||
|
||||
// insertSingle inserts an entry in the reorderBuffer for the rune at
|
||||
// position i. info is the runeInfo for the rune at position i.
|
||||
func (rb *reorderBuffer) insertSingle(src input, i int, info Properties) { |
||||
src.copySlice(rb.byte[rb.nbyte:], i, i+int(info.size)) |
||||
rb.insertOrdered(info) |
||||
} |
||||
|
||||
// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.
|
||||
func (rb *reorderBuffer) insertCGJ() { |
||||
rb.insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))}) |
||||
} |
||||
|
||||
// appendRune inserts a rune at the end of the buffer. It is used for Hangul.
|
||||
func (rb *reorderBuffer) appendRune(r rune) { |
||||
bn := rb.nbyte |
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||
rb.nbyte += utf8.UTFMax |
||||
rb.rune[rb.nrune] = Properties{pos: bn, size: uint8(sz)} |
||||
rb.nrune++ |
||||
} |
||||
|
||||
// assignRune sets a rune at position pos. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) assignRune(pos int, r rune) { |
||||
bn := rb.rune[pos].pos |
||||
sz := utf8.EncodeRune(rb.byte[bn:], rune(r)) |
||||
rb.rune[pos] = Properties{pos: bn, size: uint8(sz)} |
||||
} |
||||
|
||||
// runeAt returns the rune at position n. It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) runeAt(n int) rune { |
||||
inf := rb.rune[n] |
||||
r, _ := utf8.DecodeRune(rb.byte[inf.pos : inf.pos+inf.size]) |
||||
return r |
||||
} |
||||
|
||||
// bytesAt returns the UTF-8 encoding of the rune at position n.
|
||||
// It is used for Hangul and recomposition.
|
||||
func (rb *reorderBuffer) bytesAt(n int) []byte { |
||||
inf := rb.rune[n] |
||||
return rb.byte[inf.pos : int(inf.pos)+int(inf.size)] |
||||
} |
||||
|
||||
// For Hangul we combine algorithmically, instead of using tables.
const (
	hangulBase  = 0xAC00 // UTF-8(hangulBase) -> EA B0 80
	hangulBase0 = 0xEA
	hangulBase1 = 0xB0
	hangulBase2 = 0x80

	hangulEnd  = hangulBase + jamoLVTCount // UTF-8(0xD7A4) -> ED 9E A4
	hangulEnd0 = 0xED
	hangulEnd1 = 0x9E
	hangulEnd2 = 0xA4

	jamoLBase  = 0x1100 // UTF-8(jamoLBase) -> E1 84 80
	jamoLBase0 = 0xE1
	jamoLBase1 = 0x84
	jamoLEnd   = 0x1113
	jamoVBase  = 0x1161
	jamoVEnd   = 0x1176
	jamoTBase  = 0x11A7
	jamoTEnd   = 0x11C3

	jamoTCount   = 28
	jamoVCount   = 21
	jamoVTCount  = 21 * 28
	jamoLVTCount = 19 * 21 * 28
)
||||
|
||||
// hangulUTF8Size is the minimum number of bytes isHangul and isHangulString
// require before inspecting their argument.
const hangulUTF8Size = 3
||||
|
||||
func isHangul(b []byte) bool { |
||||
if len(b) < hangulUTF8Size { |
||||
return false |
||||
} |
||||
b0 := b[0] |
||||
if b0 < hangulBase0 { |
||||
return false |
||||
} |
||||
b1 := b[1] |
||||
switch { |
||||
case b0 == hangulBase0: |
||||
return b1 >= hangulBase1 |
||||
case b0 < hangulEnd0: |
||||
return true |
||||
case b0 > hangulEnd0: |
||||
return false |
||||
case b1 < hangulEnd1: |
||||
return true |
||||
} |
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2 |
||||
} |
||||
|
||||
func isHangulString(b string) bool { |
||||
if len(b) < hangulUTF8Size { |
||||
return false |
||||
} |
||||
b0 := b[0] |
||||
if b0 < hangulBase0 { |
||||
return false |
||||
} |
||||
b1 := b[1] |
||||
switch { |
||||
case b0 == hangulBase0: |
||||
return b1 >= hangulBase1 |
||||
case b0 < hangulEnd0: |
||||
return true |
||||
case b0 > hangulEnd0: |
||||
return false |
||||
case b1 < hangulEnd1: |
||||
return true |
||||
} |
||||
return b1 == hangulEnd1 && b[2] < hangulEnd2 |
||||
} |
||||
|
||||
// Caller must ensure len(b) >= 2.
|
||||
func isJamoVT(b []byte) bool { |
||||
// True if (rune & 0xff00) == jamoLBase
|
||||
return b[0] == jamoLBase0 && (b[1]&0xFC) == jamoLBase1 |
||||
} |
||||
|
||||
func isHangulWithoutJamoT(b []byte) bool { |
||||
c, _ := utf8.DecodeRune(b) |
||||
c -= hangulBase |
||||
return c < jamoLVTCount && c%jamoTCount == 0 |
||||
} |
||||
|
||||
// decomposeHangul writes the decomposed Hangul to buf and returns the number
|
||||
// of bytes written. len(buf) should be at least 9.
|
||||
func decomposeHangul(buf []byte, r rune) int { |
||||
const JamoUTF8Len = 3 |
||||
r -= hangulBase |
||||
x := r % jamoTCount |
||||
r /= jamoTCount |
||||
utf8.EncodeRune(buf, jamoLBase+r/jamoVCount) |
||||
utf8.EncodeRune(buf[JamoUTF8Len:], jamoVBase+r%jamoVCount) |
||||
if x != 0 { |
||||
utf8.EncodeRune(buf[2*JamoUTF8Len:], jamoTBase+x) |
||||
return 3 * JamoUTF8Len |
||||
} |
||||
return 2 * JamoUTF8Len |
||||
} |
||||
|
||||
// decomposeHangul algorithmically decomposes a Hangul rune into
|
||||
// its Jamo components.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.
|
||||
func (rb *reorderBuffer) decomposeHangul(r rune) { |
||||
r -= hangulBase |
||||
x := r % jamoTCount |
||||
r /= jamoTCount |
||||
rb.appendRune(jamoLBase + r/jamoVCount) |
||||
rb.appendRune(jamoVBase + r%jamoVCount) |
||||
if x != 0 { |
||||
rb.appendRune(jamoTBase + x) |
||||
} |
||||
} |
||||
|
||||
// combineHangul algorithmically combines Jamo character components into Hangul.
|
||||
// See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul.
|
||||
func (rb *reorderBuffer) combineHangul(s, i, k int) { |
||||
b := rb.rune[:] |
||||
bn := rb.nrune |
||||
for ; i < bn; i++ { |
||||
cccB := b[k-1].ccc |
||||
cccC := b[i].ccc |
||||
if cccB == 0 { |
||||
s = k - 1 |
||||
} |
||||
if s != k-1 && cccB >= cccC { |
||||
// b[i] is blocked by greater-equal cccX below it
|
||||
b[k] = b[i] |
||||
k++ |
||||
} else { |
||||
l := rb.runeAt(s) // also used to compare to hangulBase
|
||||
v := rb.runeAt(i) // also used to compare to jamoT
|
||||
switch { |
||||
case jamoLBase <= l && l < jamoLEnd && |
||||
jamoVBase <= v && v < jamoVEnd: |
||||
// 11xx plus 116x to LV
|
||||
rb.assignRune(s, hangulBase+ |
||||
(l-jamoLBase)*jamoVTCount+(v-jamoVBase)*jamoTCount) |
||||
case hangulBase <= l && l < hangulEnd && |
||||
jamoTBase < v && v < jamoTEnd && |
||||
((l-hangulBase)%jamoTCount) == 0: |
||||
// ACxx plus 11Ax to LVT
|
||||
rb.assignRune(s, l+v-jamoTBase) |
||||
default: |
||||
b[k] = b[i] |
||||
k++ |
||||
} |
||||
} |
||||
} |
||||
rb.nrune = k |
||||
} |
||||
|
||||
// compose recombines the runes in the buffer.
|
||||
// It should only be used to recompose a single segment, as it will not
|
||||
// handle alternations between Hangul and non-Hangul characters correctly.
|
||||
func (rb *reorderBuffer) compose() { |
||||
// UAX #15, section X5 , including Corrigendum #5
|
||||
// "In any character sequence beginning with starter S, a character C is
|
||||
// blocked from S if and only if there is some character B between S
|
||||
// and C, and either B is a starter or it has the same or higher
|
||||
// combining class as C."
|
||||
bn := rb.nrune |
||||
if bn == 0 { |
||||
return |
||||
} |
||||
k := 1 |
||||
b := rb.rune[:] |
||||
for s, i := 0, 1; i < bn; i++ { |
||||
if isJamoVT(rb.bytesAt(i)) { |
||||
// Redo from start in Hangul mode. Necessary to support
|
||||
// U+320E..U+321E in NFKC mode.
|
||||
rb.combineHangul(s, i, k) |
||||
return |
||||
} |
||||
ii := b[i] |
||||
// We can only use combineForward as a filter if we later
|
||||
// get the info for the combined character. This is more
|
||||
// expensive than using the filter. Using combinesBackward()
|
||||
// is safe.
|
||||
if ii.combinesBackward() { |
||||
cccB := b[k-1].ccc |
||||
cccC := ii.ccc |
||||
blocked := false // b[i] blocked by starter or greater or equal CCC?
|
||||
if cccB == 0 { |
||||
s = k - 1 |
||||
} else { |
||||
blocked = s != k-1 && cccB >= cccC |
||||
} |
||||
if !blocked { |
||||
combined := combine(rb.runeAt(s), rb.runeAt(i)) |
||||
if combined != 0 { |
||||
rb.assignRune(s, combined) |
||||
continue |
||||
} |
||||
} |
||||
} |
||||
b[k] = b[i] |
||||
k++ |
||||
} |
||||
rb.nrune = k |
||||
} |
@ -0,0 +1,259 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
// This file contains Form-specific logic and wrappers for data in tables.go.
|
||||
|
||||
// Rune info is stored in a separate trie per composing form. A composing form
|
||||
// and its corresponding decomposing form share the same trie. Each trie maps
|
||||
// a rune to a uint16. The values take two forms. For v >= 0x8000:
|
||||
// bits
|
||||
// 15: 1 (inverse of NFD_QC bit of qcInfo)
|
||||
// 13..7: qcInfo (see below). isYesD is always true (no decomposition).
|
||||
// 6..0: ccc (compressed CCC value).
|
||||
// For v < 0x8000, the respective rune has a decomposition and v is an index
|
||||
// into a byte array of UTF-8 decomposition sequences and additional info and
|
||||
// has the form:
|
||||
// <header> <decomp_byte>* [<tccc> [<lccc>]]
|
||||
// The header contains the number of bytes in the decomposition (excluding this
|
||||
// length byte). The two most significant bits of this length byte correspond
|
||||
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
|
||||
// The byte sequence is followed by a trailing and leading CCC if the values
|
||||
// for these are not zero. The value of v determines which ccc are appended
|
||||
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
|
||||
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
|
||||
// there is an additional leading ccc. The value of tccc itself is the
|
||||
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
|
||||
// are the number of trailing non-starters.
|
||||
|
||||
// Masks for the qcInfo flags and for the header byte of a decomposition entry.
const (
	// qcInfoMask clears all but the relevant (low six) bits in a qcInfo.
	qcInfoMask = 1<<6 - 1
	// headerLenMask extracts the length value from the header byte.
	headerLenMask = 1<<6 - 1
	// headerFlagsMask extracts the qcInfo bits (the top two bits) from the
	// header byte.
	headerFlagsMask = 0xFF &^ headerLenMask
)
||||
|
||||
// Properties provides access to normalization properties of a rune.
|
||||
type Properties struct { |
||||
pos uint8 // start position in reorderBuffer; used in composition.go
|
||||
size uint8 // length of UTF-8 encoding of this rune
|
||||
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
||||
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
||||
nLead uint8 // number of leading non-starters.
|
||||
flags qcInfo // quick check flags
|
||||
index uint16 |
||||
} |
||||
|
||||
// functions dispatchable per form
|
||||
type lookupFunc func(b input, i int) Properties |
||||
|
||||
// formInfo holds Form-specific functions and tables.
|
||||
type formInfo struct { |
||||
form Form |
||||
composing, compatibility bool // form type
|
||||
info lookupFunc |
||||
nextMain iterFunc |
||||
} |
||||
|
||||
var formTable = []*formInfo{{ |
||||
form: NFC, |
||||
composing: true, |
||||
compatibility: false, |
||||
info: lookupInfoNFC, |
||||
nextMain: nextComposed, |
||||
}, { |
||||
form: NFD, |
||||
composing: false, |
||||
compatibility: false, |
||||
info: lookupInfoNFC, |
||||
nextMain: nextDecomposed, |
||||
}, { |
||||
form: NFKC, |
||||
composing: true, |
||||
compatibility: true, |
||||
info: lookupInfoNFKC, |
||||
nextMain: nextComposed, |
||||
}, { |
||||
form: NFKD, |
||||
composing: false, |
||||
compatibility: true, |
||||
info: lookupInfoNFKC, |
||||
nextMain: nextDecomposed, |
||||
}} |
||||
|
||||
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
|
||||
// unexpected behavior for the user. For example, in NFD, there is a boundary
|
||||
// after 'a'. However, 'a' might combine with modifiers, so from the application's
|
||||
// perspective it is not a good boundary. We will therefore always use the
|
||||
// boundaries for the combining variants.
|
||||
|
||||
// BoundaryBefore returns true if this rune starts a new segment and
|
||||
// cannot combine with any rune on the left.
|
||||
func (p Properties) BoundaryBefore() bool { |
||||
if p.ccc == 0 && !p.combinesBackward() { |
||||
return true |
||||
} |
||||
// We assume that the CCC of the first character in a decomposition
|
||||
// is always non-zero if different from info.ccc and that we can return
|
||||
// false at this point. This is verified by maketables.
|
||||
return false |
||||
} |
||||
|
||||
// BoundaryAfter returns true if runes cannot combine with or otherwise
|
||||
// interact with this or previous runes.
|
||||
func (p Properties) BoundaryAfter() bool { |
||||
// TODO: loosen these conditions.
|
||||
return p.isInert() |
||||
} |
||||
|
||||
// qcInfo packs the quick check data of a rune into the low 6 bits of a byte:
//   5:    Combines forward  (0 == false, 1 == true)
//   4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//   2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//   1..0: Number of trailing non-starters.
//
// When all 6 bits are zero (and the combining class is zero as well, see
// isInert), the character is inert, meaning it is never influenced by
// normalization.
type qcInfo uint8
||||
|
||||
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 } |
||||
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 } |
||||
|
||||
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 } |
||||
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
|
||||
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
|
||||
|
||||
func (p Properties) isInert() bool { |
||||
return p.flags&qcInfoMask == 0 && p.ccc == 0 |
||||
} |
||||
|
||||
func (p Properties) multiSegment() bool { |
||||
return p.index >= firstMulti && p.index < endMulti |
||||
} |
||||
|
||||
func (p Properties) nLeadingNonStarters() uint8 { |
||||
return p.nLead |
||||
} |
||||
|
||||
func (p Properties) nTrailingNonStarters() uint8 { |
||||
return uint8(p.flags & 0x03) |
||||
} |
||||
|
||||
// Decomposition returns the decomposition for the underlying rune
|
||||
// or nil if there is none.
|
||||
func (p Properties) Decomposition() []byte { |
||||
// TODO: create the decomposition for Hangul?
|
||||
if p.index == 0 { |
||||
return nil |
||||
} |
||||
i := p.index |
||||
n := decomps[i] & headerLenMask |
||||
i++ |
||||
return decomps[i : i+uint16(n)] |
||||
} |
||||
|
||||
// Size returns the length of UTF-8 encoding of the rune.
|
||||
func (p Properties) Size() int { |
||||
return int(p.size) |
||||
} |
||||
|
||||
// CCC returns the canonical combining class of the underlying rune.
|
||||
func (p Properties) CCC() uint8 { |
||||
if p.index >= firstCCCZeroExcept { |
||||
return 0 |
||||
} |
||||
return ccc[p.ccc] |
||||
} |
||||
|
||||
// LeadCCC returns the CCC of the first rune in the decomposition.
|
||||
// If there is no decomposition, LeadCCC equals CCC.
|
||||
func (p Properties) LeadCCC() uint8 { |
||||
return ccc[p.ccc] |
||||
} |
||||
|
||||
// TrailCCC returns the CCC of the last rune in the decomposition.
|
||||
// If there is no decomposition, TrailCCC equals CCC.
|
||||
func (p Properties) TrailCCC() uint8 { |
||||
return ccc[p.tccc] |
||||
} |
||||
|
||||
// Recomposition
|
||||
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
|
||||
// This clips off the bits of three entries, but we know this will not
|
||||
// result in a collision. In the unlikely event that changes to
|
||||
// UnicodeData.txt introduce collisions, the compiler will catch it.
|
||||
// Note that the recomposition map for NFC and NFKC are identical.
|
||||
|
||||
// combine returns the combined rune or 0 if it doesn't exist.
|
||||
func combine(a, b rune) rune { |
||||
key := uint32(uint16(a))<<16 + uint32(uint16(b)) |
||||
return recompMap[key] |
||||
} |
||||
|
||||
func lookupInfoNFC(b input, i int) Properties { |
||||
v, sz := b.charinfoNFC(i) |
||||
return compInfo(v, sz) |
||||
} |
||||
|
||||
func lookupInfoNFKC(b input, i int) Properties { |
||||
v, sz := b.charinfoNFKC(i) |
||||
return compInfo(v, sz) |
||||
} |
||||
|
||||
// Properties returns properties for the first rune in s.
|
||||
func (f Form) Properties(s []byte) Properties { |
||||
if f == NFC || f == NFD { |
||||
return compInfo(nfcData.lookup(s)) |
||||
} |
||||
return compInfo(nfkcData.lookup(s)) |
||||
} |
||||
|
||||
// PropertiesString returns properties for the first rune in s.
|
||||
func (f Form) PropertiesString(s string) Properties { |
||||
if f == NFC || f == NFD { |
||||
return compInfo(nfcData.lookupString(s)) |
||||
} |
||||
return compInfo(nfkcData.lookupString(s)) |
||||
} |
||||
|
||||
// compInfo converts the information contained in v and sz
|
||||
// to a Properties. See the comment at the top of the file
|
||||
// for more information on the format.
|
||||
func compInfo(v uint16, sz int) Properties { |
||||
if v == 0 { |
||||
return Properties{size: uint8(sz)} |
||||
} else if v >= 0x8000 { |
||||
p := Properties{ |
||||
size: uint8(sz), |
||||
ccc: uint8(v), |
||||
tccc: uint8(v), |
||||
flags: qcInfo(v >> 8), |
||||
} |
||||
if p.ccc > 0 || p.combinesBackward() { |
||||
p.nLead = uint8(p.flags & 0x3) |
||||
} |
||||
return p |
||||
} |
||||
// has decomposition
|
||||
h := decomps[v] |
||||
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4 |
||||
p := Properties{size: uint8(sz), flags: f, index: v} |
||||
if v >= firstCCC { |
||||
v += uint16(h&headerLenMask) + 1 |
||||
c := decomps[v] |
||||
p.tccc = c >> 2 |
||||
p.flags |= qcInfo(c & 0x3) |
||||
if v >= firstLeadingCCC { |
||||
p.nLead = c & 0x3 |
||||
if v >= firstStarterWithNLead { |
||||
// We were tricked. Remove the decomposition.
|
||||
p.flags &= 0x03 |
||||
p.index = 0 |
||||
return p |
||||
} |
||||
p.ccc = decomps[v+1] |
||||
} |
||||
} |
||||
return p |
||||
} |
@ -0,0 +1,109 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "unicode/utf8" |
||||
|
||||
// input is a source of bytes backed by either a string or a byte slice.
// The methods use the byte slice whenever bytes is non-nil and fall back to
// str otherwise.
type input struct {
	str   string
	bytes []byte
}

// inputBytes returns an input backed by the byte slice str.
func inputBytes(str []byte) input {
	return input{bytes: str}
}

// inputString returns an input backed by the string str.
func inputString(str string) input {
	return input{str: str}
}

// setBytes makes in refer to the byte slice str and clears the string.
func (in *input) setBytes(str []byte) {
	in.str = ""
	in.bytes = str
}

// setString makes in refer to the string str and clears the byte slice.
func (in *input) setString(str string) {
	in.str = str
	in.bytes = nil
}

// _byte returns the byte at offset p.
func (in *input) _byte(p int) byte {
	if in.bytes == nil {
		return in.str[p]
	}
	return in.bytes[p]
}

// skipASCII returns the offset of the first byte in [p, max) that is not
// ASCII, or max if there is no such byte.
func (in *input) skipASCII(p, max int) int {
	if in.bytes == nil {
		for p < max && in.str[p] < utf8.RuneSelf {
			p++
		}
		return p
	}
	for p < max && in.bytes[p] < utf8.RuneSelf {
		p++
	}
	return p
}

// skipContinuationBytes returns the offset of the first byte at or after p
// that can start a rune, or the length of the input if there is none.
func (in *input) skipContinuationBytes(p int) int {
	if in.bytes == nil {
		for p < len(in.str) && !utf8.RuneStart(in.str[p]) {
			p++
		}
		return p
	}
	for p < len(in.bytes) && !utf8.RuneStart(in.bytes[p]) {
		p++
	}
	return p
}

// appendSlice appends the bytes in [b, e) of the input to buf and returns
// the extended slice.
func (in *input) appendSlice(buf []byte, b, e int) []byte {
	if in.bytes != nil {
		return append(buf, in.bytes[b:e]...)
	}
	// append accepts a string as the source of a []byte append, so the
	// string case needs no per-byte loop. The guard keeps an empty or
	// inverted range a no-op, as it was with the loop.
	if b < e {
		buf = append(buf, in.str[b:e]...)
	}
	return buf
}

// copySlice copies the bytes in [b, e) of the input into buf and returns the
// number of bytes copied.
func (in *input) copySlice(buf []byte, b, e int) int {
	if in.bytes == nil {
		return copy(buf, in.str[b:e])
	}
	return copy(buf, in.bytes[b:e])
}
||||
|
||||
func (in *input) charinfoNFC(p int) (uint16, int) { |
||||
if in.bytes == nil { |
||||
return nfcData.lookupString(in.str[p:]) |
||||
} |
||||
return nfcData.lookup(in.bytes[p:]) |
||||
} |
||||
|
||||
func (in *input) charinfoNFKC(p int) (uint16, int) { |
||||
if in.bytes == nil { |
||||
return nfkcData.lookupString(in.str[p:]) |
||||
} |
||||
return nfkcData.lookup(in.bytes[p:]) |
||||
} |
||||
|
||||
func (in *input) hangul(p int) (r rune) { |
||||
var size int |
||||
if in.bytes == nil { |
||||
if !isHangulString(in.str[p:]) { |
||||
return 0 |
||||
} |
||||
r, size = utf8.DecodeRuneInString(in.str[p:]) |
||||
} else { |
||||
if !isHangul(in.bytes[p:]) { |
||||
return 0 |
||||
} |
||||
r, size = utf8.DecodeRune(in.bytes[p:]) |
||||
} |
||||
if size != hangulUTF8Size { |
||||
return 0 |
||||
} |
||||
return r |
||||
} |
@ -0,0 +1,457 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import ( |
||||
"fmt" |
||||
"unicode/utf8" |
||||
) |
||||
|
||||
// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
|
||||
// sequence of starter and non-starter runes for the purpose of normalization.
|
||||
const MaxSegmentSize = maxByteBufferSize |
||||
|
||||
// An Iter iterates over a string or byte slice, while normalizing it
|
||||
// to a given Form.
|
||||
type Iter struct { |
||||
rb reorderBuffer |
||||
buf [maxByteBufferSize]byte |
||||
info Properties // first character saved from previous iteration
|
||||
next iterFunc // implementation of next depends on form
|
||||
asciiF iterFunc |
||||
|
||||
p int // current position in input source
|
||||
multiSeg []byte // remainder of multi-segment decomposition
|
||||
} |
||||
|
||||
type iterFunc func(*Iter) []byte |
||||
|
||||
// Init initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) Init(f Form, src []byte) { |
||||
i.p = 0 |
||||
if len(src) == 0 { |
||||
i.setDone() |
||||
i.rb.nsrc = 0 |
||||
return |
||||
} |
||||
i.multiSeg = nil |
||||
i.rb.init(f, src) |
||||
i.next = i.rb.f.nextMain |
||||
i.asciiF = nextASCIIBytes |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
} |
||||
|
||||
// InitString initializes i to iterate over src after normalizing it to Form f.
|
||||
func (i *Iter) InitString(f Form, src string) { |
||||
i.p = 0 |
||||
if len(src) == 0 { |
||||
i.setDone() |
||||
i.rb.nsrc = 0 |
||||
return |
||||
} |
||||
i.multiSeg = nil |
||||
i.rb.initString(f, src) |
||||
i.next = i.rb.f.nextMain |
||||
i.asciiF = nextASCIIString |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
} |
||||
|
||||
// Seek sets the segment to be returned by the next call to Next to start
|
||||
// at position p. It is the responsibility of the caller to set p to the
|
||||
// start of a segment.
|
||||
func (i *Iter) Seek(offset int64, whence int) (int64, error) { |
||||
var abs int64 |
||||
switch whence { |
||||
case 0: |
||||
abs = offset |
||||
case 1: |
||||
abs = int64(i.p) + offset |
||||
case 2: |
||||
abs = int64(i.rb.nsrc) + offset |
||||
default: |
||||
return 0, fmt.Errorf("norm: invalid whence") |
||||
} |
||||
if abs < 0 { |
||||
return 0, fmt.Errorf("norm: negative position") |
||||
} |
||||
if int(abs) >= i.rb.nsrc { |
||||
i.setDone() |
||||
return int64(i.p), nil |
||||
} |
||||
i.p = int(abs) |
||||
i.multiSeg = nil |
||||
i.next = i.rb.f.nextMain |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
return abs, nil |
||||
} |
||||
|
||||
// returnSlice returns a slice of the underlying input type as a byte slice.
|
||||
// If the underlying is of type []byte, it will simply return a slice.
|
||||
// If the underlying is of type string, it will copy the slice to the buffer
|
||||
// and return that.
|
||||
func (i *Iter) returnSlice(a, b int) []byte { |
||||
if i.rb.src.bytes == nil { |
||||
return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] |
||||
} |
||||
return i.rb.src.bytes[a:b] |
||||
} |
||||
|
||||
// Pos returns the byte position at which the next call to Next will commence processing.
|
||||
func (i *Iter) Pos() int { |
||||
return i.p |
||||
} |
||||
|
||||
func (i *Iter) setDone() { |
||||
i.next = nextDone |
||||
i.p = i.rb.nsrc |
||||
} |
||||
|
||||
// Done returns true if there is no more input to process.
|
||||
func (i *Iter) Done() bool { |
||||
return i.p >= i.rb.nsrc |
||||
} |
||||
|
||||
// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
|
||||
// For any input a and b for which f(a) == f(b), subsequent calls
|
||||
// to Next will return the same segments.
|
||||
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
|
||||
// Although not guaranteed, n will typically be the smallest possible n.
|
||||
func (i *Iter) Next() []byte { |
||||
return i.next(i) |
||||
} |
||||
|
||||
func nextASCIIBytes(i *Iter) []byte { |
||||
p := i.p + 1 |
||||
if p >= i.rb.nsrc { |
||||
i.setDone() |
||||
return i.rb.src.bytes[i.p:p] |
||||
} |
||||
if i.rb.src.bytes[p] < utf8.RuneSelf { |
||||
p0 := i.p |
||||
i.p = p |
||||
return i.rb.src.bytes[p0:p] |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
func nextASCIIString(i *Iter) []byte { |
||||
p := i.p + 1 |
||||
if p >= i.rb.nsrc { |
||||
i.buf[0] = i.rb.src.str[i.p] |
||||
i.setDone() |
||||
return i.buf[:1] |
||||
} |
||||
if i.rb.src.str[p] < utf8.RuneSelf { |
||||
i.buf[0] = i.rb.src.str[i.p] |
||||
i.p = p |
||||
return i.buf[:1] |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
func nextHangul(i *Iter) []byte { |
||||
p := i.p |
||||
next := p + hangulUTF8Size |
||||
if next >= i.rb.nsrc { |
||||
i.setDone() |
||||
} else if i.rb.src.hangul(next) == 0 { |
||||
i.rb.ss.next(i.info) |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
i.p = next |
||||
return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] |
||||
} |
||||
|
||||
func nextDone(i *Iter) []byte { |
||||
return nil |
||||
} |
||||
|
||||
// nextMulti is used for iterating over multi-segment decompositions
|
||||
// for decomposing normal forms.
|
||||
func nextMulti(i *Iter) []byte { |
||||
j := 0 |
||||
d := i.multiSeg |
||||
// skip first rune
|
||||
for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { |
||||
} |
||||
for j < len(d) { |
||||
info := i.rb.f.info(input{bytes: d}, j) |
||||
if info.BoundaryBefore() { |
||||
i.multiSeg = d[j:] |
||||
return d[:j] |
||||
} |
||||
j += int(info.size) |
||||
} |
||||
// treat last segment as normal decomposition
|
||||
i.next = i.rb.f.nextMain |
||||
return i.next(i) |
||||
} |
||||
|
||||
// nextMultiNorm is used for iterating over multi-segment decompositions
|
||||
// for composing normal forms.
|
||||
func nextMultiNorm(i *Iter) []byte { |
||||
j := 0 |
||||
d := i.multiSeg |
||||
for j < len(d) { |
||||
info := i.rb.f.info(input{bytes: d}, j) |
||||
if info.BoundaryBefore() { |
||||
i.rb.compose() |
||||
seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
||||
i.rb.insertUnsafe(input{bytes: d}, j, info) |
||||
i.multiSeg = d[j+int(info.size):] |
||||
return seg |
||||
} |
||||
i.rb.insertUnsafe(input{bytes: d}, j, info) |
||||
j += int(info.size) |
||||
} |
||||
i.multiSeg = nil |
||||
i.next = nextComposed |
||||
return doNormComposed(i) |
||||
} |
||||
|
||||
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
|
||||
func nextDecomposed(i *Iter) (next []byte) { |
||||
outp := 0 |
||||
inCopyStart, outCopyStart := i.p, 0 |
||||
for { |
||||
if sz := int(i.info.size); sz <= 1 { |
||||
i.rb.ss = 0 |
||||
p := i.p |
||||
i.p++ // ASCII or illegal byte. Either way, advance by 1.
|
||||
if i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
return i.returnSlice(p, i.p) |
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
||||
i.next = i.asciiF |
||||
return i.returnSlice(p, i.p) |
||||
} |
||||
outp++ |
||||
} else if d := i.info.Decomposition(); d != nil { |
||||
// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
|
||||
// Case 1: there is a leftover to copy. In this case the decomposition
|
||||
// must begin with a modifier and should always be appended.
|
||||
// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
|
||||
p := outp + len(d) |
||||
if outp > 0 { |
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||
// TODO: this condition should not be possible, but we leave it
|
||||
// in for defensive purposes.
|
||||
if p > len(i.buf) { |
||||
return i.buf[:outp] |
||||
} |
||||
} else if i.info.multiSegment() { |
||||
// outp must be 0 as multi-segment decompositions always
|
||||
// start a new segment.
|
||||
if i.multiSeg == nil { |
||||
i.multiSeg = d |
||||
i.next = nextMulti |
||||
return nextMulti(i) |
||||
} |
||||
// We are in the last segment. Treat as normal decomposition.
|
||||
d = i.multiSeg |
||||
i.multiSeg = nil |
||||
p = len(d) |
||||
} |
||||
prevCC := i.info.tccc |
||||
if i.p += sz; i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
i.info = Properties{} // Force BoundaryBefore to succeed.
|
||||
} else { |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
} |
||||
switch i.rb.ss.next(i.info) { |
||||
case ssOverflow: |
||||
i.next = nextCGJDecompose |
||||
fallthrough |
||||
case ssStarter: |
||||
if outp > 0 { |
||||
copy(i.buf[outp:], d) |
||||
return i.buf[:p] |
||||
} |
||||
return d |
||||
} |
||||
copy(i.buf[outp:], d) |
||||
outp = p |
||||
inCopyStart, outCopyStart = i.p, outp |
||||
if i.info.ccc < prevCC { |
||||
goto doNorm |
||||
} |
||||
continue |
||||
} else if r := i.rb.src.hangul(i.p); r != 0 { |
||||
outp = decomposeHangul(i.buf[:], r) |
||||
i.p += hangulUTF8Size |
||||
inCopyStart, outCopyStart = i.p, outp |
||||
if i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} else if i.rb.src.hangul(i.p) != 0 { |
||||
i.next = nextHangul |
||||
return i.buf[:outp] |
||||
} |
||||
} else { |
||||
p := outp + sz |
||||
if p > len(i.buf) { |
||||
break |
||||
} |
||||
outp = p |
||||
i.p += sz |
||||
} |
||||
if i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} |
||||
prevCC := i.info.tccc |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
if v := i.rb.ss.next(i.info); v == ssStarter { |
||||
break |
||||
} else if v == ssOverflow { |
||||
i.next = nextCGJDecompose |
||||
break |
||||
} |
||||
if i.info.ccc < prevCC { |
||||
goto doNorm |
||||
} |
||||
} |
||||
if outCopyStart == 0 { |
||||
return i.returnSlice(inCopyStart, i.p) |
||||
} else if inCopyStart < i.p { |
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||
} |
||||
return i.buf[:outp] |
||||
doNorm: |
||||
// Insert what we have decomposed so far in the reorderBuffer.
|
||||
// As we will only reorder, there will always be enough room.
|
||||
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) |
||||
i.rb.insertDecomposed(i.buf[0:outp]) |
||||
return doNormDecomposed(i) |
||||
} |
||||
|
||||
func doNormDecomposed(i *Iter) []byte { |
||||
for { |
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
if i.info.ccc == 0 { |
||||
break |
||||
} |
||||
if s := i.rb.ss.next(i.info); s == ssOverflow { |
||||
i.next = nextCGJDecompose |
||||
break |
||||
} |
||||
} |
||||
// new segment or too many combining characters: exit normalization
|
||||
return i.buf[:i.rb.flushCopy(i.buf[:])] |
||||
} |
||||
|
||||
func nextCGJDecompose(i *Iter) []byte { |
||||
i.rb.ss = 0 |
||||
i.rb.insertCGJ() |
||||
i.next = nextDecomposed |
||||
i.rb.ss.first(i.info) |
||||
buf := doNormDecomposed(i) |
||||
return buf |
||||
} |
||||
|
||||
// nextComposed is the implementation of Next for forms NFC and NFKC.
|
||||
func nextComposed(i *Iter) []byte { |
||||
outp, startp := 0, i.p |
||||
var prevCC uint8 |
||||
for { |
||||
if !i.info.isYesC() { |
||||
goto doNorm |
||||
} |
||||
prevCC = i.info.tccc |
||||
sz := int(i.info.size) |
||||
if sz == 0 { |
||||
sz = 1 // illegal rune: copy byte-by-byte
|
||||
} |
||||
p := outp + sz |
||||
if p > len(i.buf) { |
||||
break |
||||
} |
||||
outp = p |
||||
i.p += sz |
||||
if i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} else if i.rb.src._byte(i.p) < utf8.RuneSelf { |
||||
i.rb.ss = 0 |
||||
i.next = i.asciiF |
||||
break |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
if v := i.rb.ss.next(i.info); v == ssStarter { |
||||
break |
||||
} else if v == ssOverflow { |
||||
i.next = nextCGJCompose |
||||
break |
||||
} |
||||
if i.info.ccc < prevCC { |
||||
goto doNorm |
||||
} |
||||
} |
||||
return i.returnSlice(startp, i.p) |
||||
doNorm: |
||||
// reset to start position
|
||||
i.p = startp |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
i.rb.ss.first(i.info) |
||||
if i.info.multiSegment() { |
||||
d := i.info.Decomposition() |
||||
info := i.rb.f.info(input{bytes: d}, 0) |
||||
i.rb.insertUnsafe(input{bytes: d}, 0, info) |
||||
i.multiSeg = d[int(info.size):] |
||||
i.next = nextMultiNorm |
||||
return nextMultiNorm(i) |
||||
} |
||||
i.rb.ss.first(i.info) |
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||
return doNormComposed(i) |
||||
} |
||||
|
||||
func doNormComposed(i *Iter) []byte { |
||||
// First rune should already be inserted.
|
||||
for { |
||||
if i.p += int(i.info.size); i.p >= i.rb.nsrc { |
||||
i.setDone() |
||||
break |
||||
} |
||||
i.info = i.rb.f.info(i.rb.src, i.p) |
||||
if s := i.rb.ss.next(i.info); s == ssStarter { |
||||
break |
||||
} else if s == ssOverflow { |
||||
i.next = nextCGJCompose |
||||
break |
||||
} |
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||
} |
||||
i.rb.compose() |
||||
seg := i.buf[:i.rb.flushCopy(i.buf[:])] |
||||
return seg |
||||
} |
||||
|
||||
func nextCGJCompose(i *Iter) []byte { |
||||
i.rb.ss = 0 // instead of first
|
||||
i.rb.insertCGJ() |
||||
i.next = nextComposed |
||||
// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
|
||||
// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
|
||||
// If we ever change that, insert a check here.
|
||||
i.rb.ss.first(i.info) |
||||
i.rb.insertUnsafe(i.rb.src, i.p, i.info) |
||||
return doNormComposed(i) |
||||
} |
@ -0,0 +1,609 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Note: the file data_test.go that is generated should not be checked in.
|
||||
//go:generate go run maketables.go triegen.go
|
||||
//go:generate go test -tags test
|
||||
|
||||
// Package norm contains types and functions for normalizing Unicode strings.
|
||||
package norm // import "golang.org/x/text/unicode/norm"
|
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
//   NFC   Unicode Normalization Form C
//   NFD   Unicode Normalization Form D
//   NFKC  Unicode Normalization Form KC
//   NFKD  Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//   f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: http://unicode.org/reports/tr15/ and
// http://unicode.org/notes/tn5/.
type Form int

// The normalization forms, numbered consecutively from zero.
const (
	NFC Form = iota
	NFD
	NFKC
	NFKD
)
||||
|
||||
// Bytes returns f(b). May return b if f(b) = b.
|
||||
func (f Form) Bytes(b []byte) []byte { |
||||
src := inputBytes(b) |
||||
ft := formTable[f] |
||||
n, ok := ft.quickSpan(src, 0, len(b), true) |
||||
if ok { |
||||
return b |
||||
} |
||||
out := make([]byte, n, len(b)) |
||||
copy(out, b[0:n]) |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush} |
||||
return doAppendInner(&rb, n) |
||||
} |
||||
|
||||
// String returns f(s).
|
||||
func (f Form) String(s string) string { |
||||
src := inputString(s) |
||||
ft := formTable[f] |
||||
n, ok := ft.quickSpan(src, 0, len(s), true) |
||||
if ok { |
||||
return s |
||||
} |
||||
out := make([]byte, n, len(s)) |
||||
copy(out, s[0:n]) |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush} |
||||
return string(doAppendInner(&rb, n)) |
||||
} |
||||
|
||||
// IsNormal returns true if b == f(b).
|
||||
func (f Form) IsNormal(b []byte) bool { |
||||
src := inputBytes(b) |
||||
ft := formTable[f] |
||||
bp, ok := ft.quickSpan(src, 0, len(b), true) |
||||
if ok { |
||||
return true |
||||
} |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)} |
||||
rb.setFlusher(nil, cmpNormalBytes) |
||||
for bp < len(b) { |
||||
rb.out = b[bp:] |
||||
if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
||||
return false |
||||
} |
||||
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true) |
||||
} |
||||
return true |
||||
} |
||||
|
||||
func cmpNormalBytes(rb *reorderBuffer) bool { |
||||
b := rb.out |
||||
for i := 0; i < rb.nrune; i++ { |
||||
info := rb.rune[i] |
||||
if int(info.size) > len(b) { |
||||
return false |
||||
} |
||||
p := info.pos |
||||
pe := p + info.size |
||||
for ; p < pe; p++ { |
||||
if b[0] != rb.byte[p] { |
||||
return false |
||||
} |
||||
b = b[1:] |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// IsNormalString returns true if s == f(s).
|
||||
func (f Form) IsNormalString(s string) bool { |
||||
src := inputString(s) |
||||
ft := formTable[f] |
||||
bp, ok := ft.quickSpan(src, 0, len(s), true) |
||||
if ok { |
||||
return true |
||||
} |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)} |
||||
rb.setFlusher(nil, func(rb *reorderBuffer) bool { |
||||
for i := 0; i < rb.nrune; i++ { |
||||
info := rb.rune[i] |
||||
if bp+int(info.size) > len(s) { |
||||
return false |
||||
} |
||||
p := info.pos |
||||
pe := p + info.size |
||||
for ; p < pe; p++ { |
||||
if s[bp] != rb.byte[p] { |
||||
return false |
||||
} |
||||
bp++ |
||||
} |
||||
} |
||||
return true |
||||
}) |
||||
for bp < len(s) { |
||||
if bp = decomposeSegment(&rb, bp, true); bp < 0 { |
||||
return false |
||||
} |
||||
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true) |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// patchTail fixes a case where a rune may be incorrectly normalized
|
||||
// if it is followed by illegal continuation bytes. It returns the
|
||||
// patched buffer and whether the decomposition is still in progress.
|
||||
func patchTail(rb *reorderBuffer) bool { |
||||
info, p := lastRuneStart(&rb.f, rb.out) |
||||
if p == -1 || info.size == 0 { |
||||
return true |
||||
} |
||||
end := p + int(info.size) |
||||
extra := len(rb.out) - end |
||||
if extra > 0 { |
||||
// Potentially allocating memory. However, this only
|
||||
// happens with ill-formed UTF-8.
|
||||
x := make([]byte, 0) |
||||
x = append(x, rb.out[len(rb.out)-extra:]...) |
||||
rb.out = rb.out[:end] |
||||
decomposeToLastBoundary(rb) |
||||
rb.doFlush() |
||||
rb.out = append(rb.out, x...) |
||||
return false |
||||
} |
||||
buf := rb.out[p:] |
||||
rb.out = rb.out[:p] |
||||
decomposeToLastBoundary(rb) |
||||
if s := rb.ss.next(info); s == ssStarter { |
||||
rb.doFlush() |
||||
rb.ss.first(info) |
||||
} else if s == ssOverflow { |
||||
rb.doFlush() |
||||
rb.insertCGJ() |
||||
rb.ss = 0 |
||||
} |
||||
rb.insertUnsafe(inputBytes(buf), 0, info) |
||||
return true |
||||
} |
||||
|
||||
func appendQuick(rb *reorderBuffer, i int) int { |
||||
if rb.nsrc == i { |
||||
return i |
||||
} |
||||
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true) |
||||
rb.out = rb.src.appendSlice(rb.out, i, end) |
||||
return end |
||||
} |
||||
|
||||
// Append returns f(append(out, b...)).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) Append(out []byte, src ...byte) []byte { |
||||
return f.doAppend(out, inputBytes(src), len(src)) |
||||
} |
||||
|
||||
func (f Form) doAppend(out []byte, src input, n int) []byte { |
||||
if n == 0 { |
||||
return out |
||||
} |
||||
ft := formTable[f] |
||||
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
|
||||
if len(out) == 0 { |
||||
p, _ := ft.quickSpan(src, 0, n, true) |
||||
out = src.appendSlice(out, 0, p) |
||||
if p == n { |
||||
return out |
||||
} |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush} |
||||
return doAppendInner(&rb, p) |
||||
} |
||||
rb := reorderBuffer{f: *ft, src: src, nsrc: n} |
||||
return doAppend(&rb, out, 0) |
||||
} |
||||
|
||||
func doAppend(rb *reorderBuffer, out []byte, p int) []byte { |
||||
rb.setFlusher(out, appendFlush) |
||||
src, n := rb.src, rb.nsrc |
||||
doMerge := len(out) > 0 |
||||
if q := src.skipContinuationBytes(p); q > p { |
||||
// Move leading non-starters to destination.
|
||||
rb.out = src.appendSlice(rb.out, p, q) |
||||
p = q |
||||
doMerge = patchTail(rb) |
||||
} |
||||
fd := &rb.f |
||||
if doMerge { |
||||
var info Properties |
||||
if p < n { |
||||
info = fd.info(src, p) |
||||
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 { |
||||
if p == 0 { |
||||
decomposeToLastBoundary(rb) |
||||
} |
||||
p = decomposeSegment(rb, p, true) |
||||
} |
||||
} |
||||
if info.size == 0 { |
||||
rb.doFlush() |
||||
// Append incomplete UTF-8 encoding.
|
||||
return src.appendSlice(rb.out, p, n) |
||||
} |
||||
if rb.nrune > 0 { |
||||
return doAppendInner(rb, p) |
||||
} |
||||
} |
||||
p = appendQuick(rb, p) |
||||
return doAppendInner(rb, p) |
||||
} |
||||
|
||||
func doAppendInner(rb *reorderBuffer, p int) []byte { |
||||
for n := rb.nsrc; p < n; { |
||||
p = decomposeSegment(rb, p, true) |
||||
p = appendQuick(rb, p) |
||||
} |
||||
return rb.out |
||||
} |
||||
|
||||
// AppendString returns f(append(out, []byte(s))).
|
||||
// The buffer out must be nil, empty, or equal to f(out).
|
||||
func (f Form) AppendString(out []byte, src string) []byte { |
||||
return f.doAppend(out, inputString(src), len(src)) |
||||
} |
||||
|
||||
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpan(b []byte) int { |
||||
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true) |
||||
return n |
||||
} |
||||
|
||||
// Span implements transform.SpanningTransformer. It returns a boundary n such
|
||||
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
|
||||
func (f Form) Span(b []byte, atEOF bool) (n int, err error) { |
||||
n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF) |
||||
if n < len(b) { |
||||
if !ok { |
||||
err = transform.ErrEndOfSpan |
||||
} else { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) SpanString(s string, atEOF bool) (n int, err error) { |
||||
n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF) |
||||
if n < len(s) { |
||||
if !ok { |
||||
err = transform.ErrEndOfSpan |
||||
} else { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
|
||||
// whether any non-normalized parts were found. If atEOF is false, n will
|
||||
// not point past the last segment if this segment might be become
|
||||
// non-normalized by appending other runes.
|
||||
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) { |
||||
var lastCC uint8 |
||||
ss := streamSafe(0) |
||||
lastSegStart := i |
||||
for n = end; i < n; { |
||||
if j := src.skipASCII(i, n); i != j { |
||||
i = j |
||||
lastSegStart = i - 1 |
||||
lastCC = 0 |
||||
ss = 0 |
||||
continue |
||||
} |
||||
info := f.info(src, i) |
||||
if info.size == 0 { |
||||
if atEOF { |
||||
// include incomplete runes
|
||||
return n, true |
||||
} |
||||
return lastSegStart, true |
||||
} |
||||
// This block needs to be before the next, because it is possible to
|
||||
// have an overflow for runes that are starters (e.g. with U+FF9E).
|
||||
switch ss.next(info) { |
||||
case ssStarter: |
||||
lastSegStart = i |
||||
case ssOverflow: |
||||
return lastSegStart, false |
||||
case ssSuccess: |
||||
if lastCC > info.ccc { |
||||
return lastSegStart, false |
||||
} |
||||
} |
||||
if f.composing { |
||||
if !info.isYesC() { |
||||
break |
||||
} |
||||
} else { |
||||
if !info.isYesD() { |
||||
break |
||||
} |
||||
} |
||||
lastCC = info.ccc |
||||
i += int(info.size) |
||||
} |
||||
if i == n { |
||||
if !atEOF { |
||||
n = lastSegStart |
||||
} |
||||
return n, true |
||||
} |
||||
return lastSegStart, false |
||||
} |
||||
|
||||
// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
||||
// It is not guaranteed to return the largest such n.
|
||||
func (f Form) QuickSpanString(s string) int { |
||||
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true) |
||||
return n |
||||
} |
||||
|
||||
// FirstBoundary returns the position i of the first boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) FirstBoundary(b []byte) int { |
||||
return f.firstBoundary(inputBytes(b), len(b)) |
||||
} |
||||
|
||||
func (f Form) firstBoundary(src input, nsrc int) int { |
||||
i := src.skipContinuationBytes(0) |
||||
if i >= nsrc { |
||||
return -1 |
||||
} |
||||
fd := formTable[f] |
||||
ss := streamSafe(0) |
||||
// We should call ss.first here, but we can't as the first rune is
|
||||
// skipped already. This means FirstBoundary can't really determine
|
||||
// CGJ insertion points correctly. Luckily it doesn't have to.
|
||||
for { |
||||
info := fd.info(src, i) |
||||
if info.size == 0 { |
||||
return -1 |
||||
} |
||||
if s := ss.next(info); s != ssSuccess { |
||||
return i |
||||
} |
||||
i += int(info.size) |
||||
if i >= nsrc { |
||||
if !info.BoundaryAfter() && !ss.isMax() { |
||||
return -1 |
||||
} |
||||
return nsrc |
||||
} |
||||
} |
||||
} |
||||
|
||||
// FirstBoundaryInString returns the position i of the first boundary in s
|
||||
// or -1 if s contains no boundary.
|
||||
func (f Form) FirstBoundaryInString(s string) int { |
||||
return f.firstBoundary(inputString(s), len(s)) |
||||
} |
||||
|
||||
// NextBoundary reports the index of the boundary between the first and next
|
||||
// segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||
// determine this boundary.
|
||||
func (f Form) NextBoundary(b []byte, atEOF bool) int { |
||||
return f.nextBoundary(inputBytes(b), len(b), atEOF) |
||||
} |
||||
|
||||
// NextBoundaryInString reports the index of the boundary between the first and
|
||||
// next segment in b or -1 if atEOF is false and there are not enough bytes to
|
||||
// determine this boundary.
|
||||
func (f Form) NextBoundaryInString(s string, atEOF bool) int { |
||||
return f.nextBoundary(inputString(s), len(s), atEOF) |
||||
} |
||||
|
||||
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int { |
||||
if nsrc == 0 { |
||||
if atEOF { |
||||
return 0 |
||||
} |
||||
return -1 |
||||
} |
||||
fd := formTable[f] |
||||
info := fd.info(src, 0) |
||||
if info.size == 0 { |
||||
if atEOF { |
||||
return 1 |
||||
} |
||||
return -1 |
||||
} |
||||
ss := streamSafe(0) |
||||
ss.first(info) |
||||
|
||||
for i := int(info.size); i < nsrc; i += int(info.size) { |
||||
info = fd.info(src, i) |
||||
if info.size == 0 { |
||||
if atEOF { |
||||
return i |
||||
} |
||||
return -1 |
||||
} |
||||
// TODO: Using streamSafe to determine the boundary isn't the same as
|
||||
// using BoundaryBefore. Determine which should be used.
|
||||
if s := ss.next(info); s != ssSuccess { |
||||
return i |
||||
} |
||||
} |
||||
if !atEOF && !info.BoundaryAfter() && !ss.isMax() { |
||||
return -1 |
||||
} |
||||
return nsrc |
||||
} |
||||
|
||||
// LastBoundary returns the position i of the last boundary in b
|
||||
// or -1 if b contains no boundary.
|
||||
func (f Form) LastBoundary(b []byte) int { |
||||
return lastBoundary(formTable[f], b) |
||||
} |
||||
|
||||
func lastBoundary(fd *formInfo, b []byte) int { |
||||
i := len(b) |
||||
info, p := lastRuneStart(fd, b) |
||||
if p == -1 { |
||||
return -1 |
||||
} |
||||
if info.size == 0 { // ends with incomplete rune
|
||||
if p == 0 { // starts with incomplete rune
|
||||
return -1 |
||||
} |
||||
i = p |
||||
info, p = lastRuneStart(fd, b[:i]) |
||||
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
||||
return i |
||||
} |
||||
} |
||||
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
||||
return i |
||||
} |
||||
if info.BoundaryAfter() { |
||||
return i |
||||
} |
||||
ss := streamSafe(0) |
||||
v := ss.backwards(info) |
||||
for i = p; i >= 0 && v != ssStarter; i = p { |
||||
info, p = lastRuneStart(fd, b[:i]) |
||||
if v = ss.backwards(info); v == ssOverflow { |
||||
break |
||||
} |
||||
if p+int(info.size) != i { |
||||
if p == -1 { // no boundary found
|
||||
return -1 |
||||
} |
||||
return i // boundary after an illegal UTF-8 encoding
|
||||
} |
||||
} |
||||
return i |
||||
} |
||||
|
||||
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
|
||||
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
|
||||
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
|
||||
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int { |
||||
// Force one character to be consumed.
|
||||
info := rb.f.info(rb.src, sp) |
||||
if info.size == 0 { |
||||
return 0 |
||||
} |
||||
if s := rb.ss.next(info); s == ssStarter { |
||||
// TODO: this could be removed if we don't support merging.
|
||||
if rb.nrune > 0 { |
||||
goto end |
||||
} |
||||
} else if s == ssOverflow { |
||||
rb.insertCGJ() |
||||
goto end |
||||
} |
||||
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
||||
return int(err) |
||||
} |
||||
for { |
||||
sp += int(info.size) |
||||
if sp >= rb.nsrc { |
||||
if !atEOF && !info.BoundaryAfter() { |
||||
return int(iShortSrc) |
||||
} |
||||
break |
||||
} |
||||
info = rb.f.info(rb.src, sp) |
||||
if info.size == 0 { |
||||
if !atEOF { |
||||
return int(iShortSrc) |
||||
} |
||||
break |
||||
} |
||||
if s := rb.ss.next(info); s == ssStarter { |
||||
break |
||||
} else if s == ssOverflow { |
||||
rb.insertCGJ() |
||||
break |
||||
} |
||||
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess { |
||||
return int(err) |
||||
} |
||||
} |
||||
end: |
||||
if !rb.doFlush() { |
||||
return int(iShortDst) |
||||
} |
||||
return sp |
||||
} |
||||
|
||||
// lastRuneStart returns the runeInfo and position of the last
|
||||
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
||||
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) { |
||||
p := len(buf) - 1 |
||||
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- { |
||||
} |
||||
if p < 0 { |
||||
return Properties{}, -1 |
||||
} |
||||
return fd.info(inputBytes(buf), p), p |
||||
} |
||||
|
||||
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
||||
// and scans it into rb. Returns the buffer minus the last segment.
|
||||
func decomposeToLastBoundary(rb *reorderBuffer) { |
||||
fd := &rb.f |
||||
info, i := lastRuneStart(fd, rb.out) |
||||
if int(info.size) != len(rb.out)-i { |
||||
// illegal trailing continuation bytes
|
||||
return |
||||
} |
||||
if info.BoundaryAfter() { |
||||
return |
||||
} |
||||
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
|
||||
padd := 0 |
||||
ss := streamSafe(0) |
||||
p := len(rb.out) |
||||
for { |
||||
add[padd] = info |
||||
v := ss.backwards(info) |
||||
if v == ssOverflow { |
||||
// Note that if we have an overflow, it the string we are appending to
|
||||
// is not correctly normalized. In this case the behavior is undefined.
|
||||
break |
||||
} |
||||
padd++ |
||||
p -= int(info.size) |
||||
if v == ssStarter || p < 0 { |
||||
break |
||||
} |
||||
info, i = lastRuneStart(fd, rb.out[:p]) |
||||
if int(info.size) != p-i { |
||||
break |
||||
} |
||||
} |
||||
rb.ss = ss |
||||
// Copy bytes for insertion as we may need to overwrite rb.out.
|
||||
var buf [maxBufferSize * utf8.UTFMax]byte |
||||
cp := buf[:copy(buf[:], rb.out[p:])] |
||||
rb.out = rb.out[:p] |
||||
for padd--; padd >= 0; padd-- { |
||||
info = add[padd] |
||||
rb.insertUnsafe(inputBytes(cp), 0, info) |
||||
cp = cp[info.size:] |
||||
} |
||||
} |
@ -0,0 +1,125 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import "io" |
||||
|
||||
type normWriter struct { |
||||
rb reorderBuffer |
||||
w io.Writer |
||||
buf []byte |
||||
} |
||||
|
||||
// Write implements the standard write interface. If the last characters are
|
||||
// not at a normalization boundary, the bytes will be buffered for the next
|
||||
// write. The remaining bytes will be written on close.
|
||||
func (w *normWriter) Write(data []byte) (n int, err error) { |
||||
// Process data in pieces to keep w.buf size bounded.
|
||||
const chunk = 4000 |
||||
|
||||
for len(data) > 0 { |
||||
// Normalize into w.buf.
|
||||
m := len(data) |
||||
if m > chunk { |
||||
m = chunk |
||||
} |
||||
w.rb.src = inputBytes(data[:m]) |
||||
w.rb.nsrc = m |
||||
w.buf = doAppend(&w.rb, w.buf, 0) |
||||
data = data[m:] |
||||
n += m |
||||
|
||||
// Write out complete prefix, save remainder.
|
||||
// Note that lastBoundary looks back at most 31 runes.
|
||||
i := lastBoundary(&w.rb.f, w.buf) |
||||
if i == -1 { |
||||
i = 0 |
||||
} |
||||
if i > 0 { |
||||
if _, err = w.w.Write(w.buf[:i]); err != nil { |
||||
break |
||||
} |
||||
bn := copy(w.buf, w.buf[i:]) |
||||
w.buf = w.buf[:bn] |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// Close forces data that remains in the buffer to be written.
|
||||
func (w *normWriter) Close() error { |
||||
if len(w.buf) > 0 { |
||||
_, err := w.w.Write(w.buf) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
// Writer returns a new writer that implements Write(b)
|
||||
// by writing f(b) to w. The returned writer may use an
|
||||
// an internal buffer to maintain state across Write calls.
|
||||
// Calling its Close method writes any buffered data to w.
|
||||
func (f Form) Writer(w io.Writer) io.WriteCloser { |
||||
wr := &normWriter{rb: reorderBuffer{}, w: w} |
||||
wr.rb.init(f, nil) |
||||
return wr |
||||
} |
||||
|
||||
type normReader struct { |
||||
rb reorderBuffer |
||||
r io.Reader |
||||
inbuf []byte |
||||
outbuf []byte |
||||
bufStart int |
||||
lastBoundary int |
||||
err error |
||||
} |
||||
|
||||
// Read implements the standard read interface.
|
||||
func (r *normReader) Read(p []byte) (int, error) { |
||||
for { |
||||
if r.lastBoundary-r.bufStart > 0 { |
||||
n := copy(p, r.outbuf[r.bufStart:r.lastBoundary]) |
||||
r.bufStart += n |
||||
if r.lastBoundary-r.bufStart > 0 { |
||||
return n, nil |
||||
} |
||||
return n, r.err |
||||
} |
||||
if r.err != nil { |
||||
return 0, r.err |
||||
} |
||||
outn := copy(r.outbuf, r.outbuf[r.lastBoundary:]) |
||||
r.outbuf = r.outbuf[0:outn] |
||||
r.bufStart = 0 |
||||
|
||||
n, err := r.r.Read(r.inbuf) |
||||
r.rb.src = inputBytes(r.inbuf[0:n]) |
||||
r.rb.nsrc, r.err = n, err |
||||
if n > 0 { |
||||
r.outbuf = doAppend(&r.rb, r.outbuf, 0) |
||||
} |
||||
if err == io.EOF { |
||||
r.lastBoundary = len(r.outbuf) |
||||
} else { |
||||
r.lastBoundary = lastBoundary(&r.rb.f, r.outbuf) |
||||
if r.lastBoundary == -1 { |
||||
r.lastBoundary = 0 |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Reader returns a new reader that implements Read
|
||||
// by reading data from r and returning f(data).
|
||||
func (f Form) Reader(r io.Reader) io.Reader { |
||||
const chunk = 4000 |
||||
buf := make([]byte, chunk) |
||||
rr := &normReader{rb: reorderBuffer{}, r: r, inbuf: buf} |
||||
rr.rb.init(f, buf) |
||||
return rr |
||||
} |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,88 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// Reset implements the Reset method of the transform.Transformer interface.
|
||||
func (Form) Reset() {} |
||||
|
||||
// Transform implements the Transform method of the transform.Transformer
|
||||
// interface. It may need to write segments of up to MaxSegmentSize at once.
|
||||
// Users should either catch ErrShortDst and allow dst to grow or have dst be at
|
||||
// least of size MaxTransformChunkSize to be guaranteed of progress.
|
||||
func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
n := 0 |
||||
// Cap the maximum number of src bytes to check.
|
||||
b := src |
||||
eof := atEOF |
||||
if ns := len(dst); ns < len(b) { |
||||
err = transform.ErrShortDst |
||||
eof = false |
||||
b = b[:ns] |
||||
} |
||||
i, ok := formTable[f].quickSpan(inputBytes(b), n, len(b), eof) |
||||
n += copy(dst[n:], b[n:i]) |
||||
if !ok { |
||||
nDst, nSrc, err = f.transform(dst[n:], src[n:], atEOF) |
||||
return nDst + n, nSrc + n, err |
||||
} |
||||
if n < len(src) && !atEOF { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
return n, n, err |
||||
} |
||||
|
||||
func flushTransform(rb *reorderBuffer) bool { |
||||
// Write out (must fully fit in dst, or else it is an ErrShortDst).
|
||||
if len(rb.out) < rb.nrune*utf8.UTFMax { |
||||
return false |
||||
} |
||||
rb.out = rb.out[rb.flushCopy(rb.out):] |
||||
return true |
||||
} |
||||
|
||||
var errs = []error{nil, transform.ErrShortDst, transform.ErrShortSrc} |
||||
|
||||
// transform implements the transform.Transformer interface. It is only called
|
||||
// when quickSpan does not pass for a given string.
|
||||
func (f Form) transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
// TODO: get rid of reorderBuffer. See CL 23460044.
|
||||
rb := reorderBuffer{} |
||||
rb.init(f, src) |
||||
for { |
||||
// Load segment into reorder buffer.
|
||||
rb.setFlusher(dst[nDst:], flushTransform) |
||||
end := decomposeSegment(&rb, nSrc, atEOF) |
||||
if end < 0 { |
||||
return nDst, nSrc, errs[-end] |
||||
} |
||||
nDst = len(dst) - len(rb.out) |
||||
nSrc = end |
||||
|
||||
// Next quickSpan.
|
||||
end = rb.nsrc |
||||
eof := atEOF |
||||
if n := nSrc + len(dst) - nDst; n < end { |
||||
err = transform.ErrShortDst |
||||
end = n |
||||
eof = false |
||||
} |
||||
end, ok := rb.f.quickSpan(rb.src, nSrc, end, eof) |
||||
n := copy(dst[nDst:], rb.src.bytes[nSrc:end]) |
||||
nSrc += n |
||||
nDst += n |
||||
if ok { |
||||
if n < rb.nsrc && !atEOF { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,54 @@ |
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package norm |
||||
|
||||
// valueRange is one entry of a sparse block. An entry is either a header,
// which stores the stride in value and the number of ranges in lo, or a range
// [lo, hi] with the value associated with lo.
type valueRange struct {
	value uint16 // header: value:stride
	lo    byte   // header: lo:n
	hi    byte
}

// sparseBlocks holds the sparse part of a trie: offset gives, per block, the
// index in values of that block's header entry.
type sparseBlocks struct {
	values []valueRange
	offset []uint16
}
||||
|
||||
var nfcSparse = sparseBlocks{ |
||||
values: nfcSparseValues[:], |
||||
offset: nfcSparseOffset[:], |
||||
} |
||||
|
||||
var nfkcSparse = sparseBlocks{ |
||||
values: nfkcSparseValues[:], |
||||
offset: nfkcSparseOffset[:], |
||||
} |
||||
|
||||
var ( |
||||
nfcData = newNfcTrie(0) |
||||
nfkcData = newNfkcTrie(0) |
||||
) |
||||
|
||||
// lookupValue determines the type of block n and looks up the value for b.
|
||||
// For n < t.cutoff, the block is a simple lookup table. Otherwise, the block
|
||||
// is a list of ranges with an accompanying value. Given a matching range r,
|
||||
// the value for b is by r.value + (b - r.lo) * stride.
|
||||
func (t *sparseBlocks) lookup(n uint32, b byte) uint16 { |
||||
offset := t.offset[n] |
||||
header := t.values[offset] |
||||
lo := offset + 1 |
||||
hi := lo + uint16(header.lo) |
||||
for lo < hi { |
||||
m := lo + (hi-lo)/2 |
||||
r := t.values[m] |
||||
if r.lo <= b && b <= r.hi { |
||||
return r.value + uint16(b-r.lo)*header.value |
||||
} |
||||
if b < r.lo { |
||||
hi = m |
||||
} else { |
||||
lo = m + 1 |
||||
} |
||||
} |
||||
return 0 |
||||
} |
Loading…
Reference in new issue