mirror of https://github.com/go-gitea/gitea
Move modules/gzip to gitea.com/macaron/gzip (#9058)
* Move modules/gzip to gitea.com/macaron/gzip

* Fix vendor
parent ba4e8f221b
commit 9ff6312627
@@ -1,131 +0,0 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package gzip

import (
	"archive/zip"
	"bytes"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"testing"

	"gitea.com/macaron/macaron"
	gzipp "github.com/klauspost/compress/gzip"
	"github.com/stretchr/testify/assert"
)

func setup(sampleResponse []byte) (*macaron.Macaron, *[]byte) {
	m := macaron.New()
	m.Use(Middleware())
	m.Get("/", func() *[]byte { return &sampleResponse })
	return m, &sampleResponse
}

func reqNoAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte) {
	// Request without accept gzip: Should not gzip
	resp := httptest.NewRecorder()
	req, err := http.NewRequest("GET", "/", nil)
	assert.NoError(t, err)
	m.ServeHTTP(resp, req)

	_, ok := resp.HeaderMap[contentEncodingHeader]
	assert.False(t, ok)

	contentEncoding := resp.Header().Get(contentEncodingHeader)
	assert.NotContains(t, contentEncoding, "gzip")

	result := resp.Body.Bytes()
	assert.Equal(t, *sampleResponse, result)
}

func reqAcceptGzip(t *testing.T, m *macaron.Macaron, sampleResponse *[]byte, expectGzip bool) {
	// Request with accept gzip: Should gzip only when the response qualifies
	resp := httptest.NewRecorder()
	req, err := http.NewRequest("GET", "/", nil)
	assert.NoError(t, err)
	req.Header.Set(acceptEncodingHeader, "gzip")
	m.ServeHTTP(resp, req)

	_, ok := resp.HeaderMap[contentEncodingHeader]
	assert.Equal(t, ok, expectGzip)

	contentEncoding := resp.Header().Get(contentEncodingHeader)
	if expectGzip {
		assert.Contains(t, contentEncoding, "gzip")
		gzippReader, err := gzipp.NewReader(resp.Body)
		assert.NoError(t, err)
		result, err := ioutil.ReadAll(gzippReader)
		assert.NoError(t, err)
		assert.Equal(t, *sampleResponse, result)
	} else {
		assert.NotContains(t, contentEncoding, "gzip")
		result := resp.Body.Bytes()
		assert.Equal(t, *sampleResponse, result)
	}
}

func TestMiddlewareSmall(t *testing.T) {
	m, sampleResponse := setup([]byte("Small response"))

	reqNoAcceptGzip(t, m, sampleResponse)

	reqAcceptGzip(t, m, sampleResponse, false)
}

func TestMiddlewareLarge(t *testing.T) {
	b := make([]byte, MinSize+1)
	for i := range b {
		b[i] = byte(i % 256)
	}
	m, sampleResponse := setup(b)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should be gzipped as we accept gzip
	reqAcceptGzip(t, m, sampleResponse, true)
}

func TestMiddlewareGzip(t *testing.T) {
	b := make([]byte, MinSize*10)
	for i := range b {
		b[i] = byte(i % 256)
	}
	outputBuffer := bytes.NewBuffer([]byte{})
	gzippWriter := gzipp.NewWriter(outputBuffer)
	gzippWriter.Write(b)
	gzippWriter.Flush()
	gzippWriter.Close()
	output := outputBuffer.Bytes()

	m, sampleResponse := setup(output)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should not be gzipped even though we accept gzip
	reqAcceptGzip(t, m, sampleResponse, false)
}

func TestMiddlewareZip(t *testing.T) {
	b := make([]byte, MinSize*10)
	for i := range b {
		b[i] = byte(i % 256)
	}
	outputBuffer := bytes.NewBuffer([]byte{})
	zipWriter := zip.NewWriter(outputBuffer)
	fileWriter, err := zipWriter.Create("default")
	assert.NoError(t, err)
	fileWriter.Write(b)
	//fileWriter.Close()
	zipWriter.Close()
	output := outputBuffer.Bytes()

	m, sampleResponse := setup(output)

	reqNoAcceptGzip(t, m, sampleResponse)

	// This should not be gzipped even though we accept gzip
	reqAcceptGzip(t, m, sampleResponse, false)
}
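For context, here is a minimal sketch — not part of this commit — of wiring the relocated middleware into a macaron application. It assumes the usual macaron helpers (macaron.Classic, m.Run) and uses gzip.Middleware exactly as the tests above do:

package main

import (
	"gitea.com/macaron/gzip"
	"gitea.com/macaron/macaron"
)

func main() {
	m := macaron.Classic()
	// Compress responses when the client sends Accept-Encoding: gzip
	// and the body meets the middleware's size threshold (see MinSize
	// in the tests above).
	m.Use(gzip.Middleware())
	m.Get("/", func() string { return "hello, compressed world" })
	m.Run() // assumption: the default listen address is fine for a sketch
}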
@@ -0,0 +1,9 @@
module gitea.com/macaron/gzip

go 1.12

require (
	gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
	github.com/klauspost/compress v1.9.2
	github.com/stretchr/testify v1.4.0
)
@@ -0,0 +1,42 @@
gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591 h1:UbCTjPcLrNxR9LzKDjQBMT2zoxZuEnca1pZCpgeMuhQ=
gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e h1:JKmoR8x90Iww1ks85zJ1lfDGgIiMDuIptTOhJq+zKyg=
github.com/gopherjs/gopherjs v0.0.0-20181103185306-d547d1d9531e/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/jtolds/gls v4.2.1+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo=
github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
github.com/klauspost/compress v1.9.2 h1:LfVyl+ZlLlLDeQ/d2AqfGIIH4qEDu0Ed2S5GyhCWIWY=
github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY=
github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s=
github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8=
github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6xOX5DbxZEXolK+nBSvmsQwRjM=
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/ini.v1 v1.44.0 h1:YRJzTUp0kSYWUVFF5XAbDFfyiqwsl0Vb9R8TVP5eRi0=
gopkg.in/ini.v1 v1.44.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
@@ -1,32 +0,0 @@
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

// forwardCopy is like the built-in copy function except that it always goes
// forward from the start, even if the dst and src overlap.
// It is equivalent to:
//
//	for i := 0; i < n; i++ {
//		mem[dst+i] = mem[src+i]
//	}
func forwardCopy(mem []byte, dst, src, n int) {
	if dst <= src {
		copy(mem[dst:dst+n], mem[src:src+n])
		return
	}
	for {
		if dst >= src+n {
			copy(mem[dst:dst+n], mem[src:src+n])
			return
		}
		// There is some forward overlap. The destination
		// will be filled with a repeated pattern of mem[src:src+k].
		// We copy one instance of the pattern here, then repeat.
		// Each time around this loop k will double.
		k := dst - src
		copy(mem[dst:dst+k], mem[src:src+k])
		n -= k
		dst += k
	}
}
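To make the overlap case concrete, a hypothetical snippet — it would have to live inside package flate, since forwardCopy is unexported — showing the repeated-pattern behaviour described in the comments above:

// Hypothetical, inside package flate:
mem := []byte{'a', 'b', 0, 0, 0, 0}
forwardCopy(mem, 2, 0, 4) // dst=2 overlaps src=0, so k starts at 2
// mem is now []byte("ababab"): the two-byte pattern mem[0:2]
// has been repeated forward over the destination.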
@@ -1,41 +0,0 @@
//+build !noasm
//+build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

package flate

import (
	"github.com/klauspost/cpuid"
)

// crc32sse returns a hash for the first 4 bytes of the slice
// len(a) must be >= 4.
//go:noescape
func crc32sse(a []byte) uint32

// crc32sseAll calculates hashes for each 4-byte set in a.
// dst must be at least len(a) - 4 in size.
// The size is not checked by the assembly.
//go:noescape
func crc32sseAll(a []byte, dst []uint32)

// matchLenSSE4 returns the number of matching bytes in a and b
// up to length 'max'. Both slices must be at least 'max'
// bytes in size.
//
// TODO: drop the "SSE4" name, since it doesn't use any SSE instructions.
//
//go:noescape
func matchLenSSE4(a, b []byte, max int) int

// histogram accumulates a histogram of b in h.
// h must be at least 256 entries in length,
// and must be cleared before calling this function.
//go:noescape
func histogram(b []byte, h []int32)

// Detect SSE 4.2 feature.
func init() {
	useSSE42 = cpuid.CPU.SSE42()
}
@@ -1,213 +0,0 @@
//+build !noasm
//+build !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// func crc32sse(a []byte) uint32
TEXT ·crc32sse(SB), 4, $0
	MOVQ a+0(FP), R10
	XORQ BX, BX

	// CRC32 dword (R10), EBX
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP)
	RET

// func crc32sseAll(a []byte, dst []uint32)
TEXT ·crc32sseAll(SB), 4, $0
	MOVQ  a+0(FP), R8      // R8: src
	MOVQ  a_len+8(FP), R10 // input length
	MOVQ  dst+24(FP), R9   // R9: dst
	SUBQ  $4, R10
	JS    end
	JZ    one_crc
	MOVQ  R10, R13
	SHRQ  $2, R10 // len/4
	ANDQ  $3, R13 // len&3
	XORQ  BX, BX
	ADDQ  $1, R13
	TESTQ R10, R10
	JZ    rem_loop

crc_loop:
	MOVQ (R8), R11
	XORQ BX, BX
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11
	MOVQ R12, AX
	MOVQ R11, CX
	SHRQ $16, R12
	SHRQ $16, R11
	MOVQ R12, SI

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32 ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32 ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe

	MOVL BX, (R9)
	MOVL DX, 4(R9)
	MOVL DI, 8(R9)

	XORQ BX, BX
	MOVL R11, AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8
	MOVL BX, 12(R9)

	ADDQ $16, R9
	ADDQ $4, R8
	XORQ BX, BX
	SUBQ $1, R10
	JNZ  crc_loop

rem_loop:
	MOVL (R8), AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

one_crc:
	MOVQ $1, R13
	XORQ BX, BX
	JMP  rem_loop

// func matchLenSSE4(a, b []byte, max int) int
TEXT ·matchLenSSE4(SB), 4, $0
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI
	MOVQ DI, DX
	MOVQ max+48(FP), CX

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	CMPQ CX, $0
	JEQ  matchLenEnd
	MOVB (SI), AX
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

// func histogram(b []byte, h []int32)
TEXT ·histogram(SB), 4, $0
	MOVQ b+0(FP), SI     // SI: &b
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: Histogram
	MOVQ R9, R8
	SHRQ $3, R8
	JZ   hist1
	XORQ R11, R11

loop_hist8:
	MOVQ (SI), R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4)
	SHRQ $8, R10

	INCL (DI)(R10*4)

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9
	JZ   end_hist
	XORQ R10, R10

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET
@@ -1,35 +0,0 @@
//+build !amd64 noasm appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

package flate

func init() {
	useSSE42 = false
}

// crc32sse should never be called.
func crc32sse(a []byte) uint32 {
	panic("no assembler")
}

// crc32sseAll should never be called.
func crc32sseAll(a []byte, dst []uint32) {
	panic("no assembler")
}

// matchLenSSE4 should never be called.
func matchLenSSE4(a, b []byte, max int) int {
	panic("no assembler")
	return 0
}

// histogram accumulates a histogram of b in h.
//
// len(h) must be >= 256, and h's elements must be all zeroes.
func histogram(b []byte, h []int32) {
	h = h[:256]
	for _, t := range b {
		h[t]++
	}
}
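As a quick illustration — a sketch, not part of the diff — the pure-Go fallback simply counts byte frequencies:

// Hypothetical, inside package flate:
var h [256]int32 // must be zeroed and at least 256 entries long
histogram([]byte("abracadabra"), h[:])
// h['a'] == 5, h['b'] == 2, h['r'] == 2, h['c'] == 1, h['d'] == 1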
File diff suppressed because it is too large
@@ -0,0 +1,257 @@
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Modified for deflate by Klaus Post (c) 2015.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

import (
	"fmt"
	"math/bits"
)

type fastEnc interface {
	Encode(dst *tokens, src []byte)
	Reset()
}

func newFastEnc(level int) fastEnc {
	switch level {
	case 1:
		return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}}
	case 2:
		return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}}
	case 3:
		return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}}
	case 4:
		return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}}
	case 5:
		return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}}
	case 6:
		return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}}
	default:
		panic("invalid level specified")
	}
}

const (
	tableBits       = 16             // Bits used in the table
	tableSize       = 1 << tableBits // Size of the table
	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
	baseMatchOffset = 1              // The smallest match offset
	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
	maxMatchOffset  = 1 << 15        // The largest match offset

	bTableBits   = 18                                           // Bits used in the big tables
	bTableSize   = 1 << bTableBits                              // Size of the table
	allocHistory = maxMatchOffset * 10                          // Size to preallocate for history.
	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this.
)

const (
	prime3bytes = 506832829
	prime4bytes = 2654435761
	prime5bytes = 889523592379
	prime6bytes = 227718039650203
	prime7bytes = 58295818150454627
	prime8bytes = 0xcf1bbcdcb7a56463
)

func load32(b []byte, i int) uint32 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:4]
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load64(b []byte, i int) uint64 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:8]
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func load3232(b []byte, i int32) uint32 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:4]
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load6432(b []byte, i int32) uint64 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:8]
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func hash(u uint32) uint32 {
	return (u * 0x1e35a7bd) >> tableShift
}

type tableEntry struct {
	val    uint32
	offset int32
}

// fastGen maintains the table for matches,
// and the previous byte block for level 2.
// This is the generic implementation.
type fastGen struct {
	hist []byte
	cur  int32
}

func (e *fastGen) addBlock(src []byte) int32 {
	// check if we have space already
	if len(e.hist)+len(src) > cap(e.hist) {
		if cap(e.hist) == 0 {
			e.hist = make([]byte, 0, allocHistory)
		} else {
			if cap(e.hist) < maxMatchOffset*2 {
				panic("unexpected buffer size")
			}
			// Move down
			offset := int32(len(e.hist)) - maxMatchOffset
			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
			e.cur += offset
			e.hist = e.hist[:maxMatchOffset]
		}
	}
	s := int32(len(e.hist))
	e.hist = append(e.hist, src...)
	return s
}

// hash4u returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4u(u uint32, h uint8) uint32 {
	return (u * prime4bytes) >> ((32 - h) & 31)
}

type tableEntryPrev struct {
	Cur  tableEntry
	Prev tableEntry
}

// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4x64(u uint64, h uint8) uint32 {
	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
}

// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
}

// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
	return uint32((u * prime8bytes) >> ((64 - h) & 63))
}

// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash6(u uint64, h uint8) uint32 {
	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
}

// matchlen will return the match length between offsets s and t in src.
// The maximum length returned is maxMatchLength - 4.
// It is assumed that s > t, that t >= 0 and s < len(src).
func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
	if debugDecode {
		if t >= s {
			panic(fmt.Sprint("t >=s:", t, s))
		}
		if int(s) >= len(src) {
			panic(fmt.Sprint("s >= len(src):", s, len(src)))
		}
		if t < 0 {
			panic(fmt.Sprint("t < 0:", t))
		}
		if s-t > maxMatchOffset {
			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
		}
	}
	s1 := int(s) + maxMatchLength - 4
	if s1 > len(src) {
		s1 = len(src)
	}

	// Extend the match to be as long as possible.
	return int32(matchLen(src[s:s1], src[t:]))
}

// matchlenLong will return the match length between offsets s and t in src.
// It is assumed that s > t, that t >= 0 and s < len(src).
func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
	if debugDecode {
		if t >= s {
			panic(fmt.Sprint("t >=s:", t, s))
		}
		if int(s) >= len(src) {
			panic(fmt.Sprint("s >= len(src):", s, len(src)))
		}
		if t < 0 {
			panic(fmt.Sprint("t < 0:", t))
		}
		if s-t > maxMatchOffset {
			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
		}
	}
	// Extend the match to be as long as possible.
	return int32(matchLen(src[s:], src[t:]))
}

// Reset the encoding table.
func (e *fastGen) Reset() {
	if cap(e.hist) < int(maxMatchOffset*8) {
		l := maxMatchOffset * 8
		// Make it at least 1MB.
		if l < 1<<20 {
			l = 1 << 20
		}
		e.hist = make([]byte, 0, l)
	}
	// We offset current position so everything will be out of reach
	e.cur += maxMatchOffset + int32(len(e.hist))
	e.hist = e.hist[:0]
}

// matchLen returns the maximum common prefix length of a and b.
// 'a' must be the shortest of the two.
func matchLen(a, b []byte) int {
	b = b[:len(a)]
	var checked int
	if len(a) > 4 {
		// Try 4 bytes first
		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
			return bits.TrailingZeros32(diff) >> 3
		}
		// Switch to 8 byte matching.
		checked = 4
		a = a[4:]
		b = b[4:]
		for len(a) >= 8 {
			b = b[:len(a)]
			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
				return checked + (bits.TrailingZeros64(diff) >> 3)
			}
			checked += 8
			a = a[8:]
			b = b[8:]
		}
	}
	b = b[:len(a)]
	for i := range a {
		if a[i] != b[i] {
			return int(i) + checked
		}
	}
	return len(a) + checked
}
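A short sketch (not in the commit) of matchLen's contract — the first argument must be the shorter slice, and the result is the length of the common prefix:

// Hypothetical, inside package flate:
a := []byte("gopher")
b := []byte("gophers!")
n := matchLen(a, b) // a is the shorter of the two
// n == 6: all of a matches the start of b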
@@ -0,0 +1,174 @@
package flate

// fastEncL1 maintains the table for matches,
// and the previous byte block for level 2.
// This is the generic implementation.
type fastEncL1 struct {
	fastGen
	table [tableSize]tableEntry
}

// Encode uses a Snappy-like algorithm (level 1).
func (e *fastEncL1) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Protect against e.cur wraparound.
	for e.cur >= bufferReset {
		if len(e.hist) == 0 {
			for i := range e.table[:] {
				e.table[i] = tableEntry{}
			}
			e.cur = maxMatchOffset
			break
		}
		// Shift down everything in the table that isn't already too far away.
		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
		for i := range e.table[:] {
			v := e.table[i].offset
			if v <= minOff {
				v = 0
			} else {
				v = v - e.cur + maxMatchOffset
			}
			e.table[i].offset = v
		}
		e.cur = maxMatchOffset
	}

	s := e.addBlock(src)

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Override src
	src = e.hist
	nextEmit := s

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load3232(src, s)

	for {
		const skipLog = 5
		const doEvery = 2

		nextS := s
		var candidate tableEntry
		for {
			nextHash := hash(cv)
			candidate = e.table[nextHash]
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit {
				goto emitRemainder
			}

			now := load6432(src, nextS)
			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
			nextHash = hash(uint32(now))

			offset := s - (candidate.offset - e.cur)
			if offset < maxMatchOffset && cv == candidate.val {
				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
				break
			}

			// Do one right away...
			cv = uint32(now)
			s = nextS
			nextS++
			candidate = e.table[nextHash]
			now >>= 8
			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}

			offset = s - (candidate.offset - e.cur)
			if offset < maxMatchOffset && cv == candidate.val {
				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
				break
			}
			cv = uint32(now)
			s = nextS
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			t := candidate.offset - e.cur
			l := e.matchlenLong(s+4, t+4, src) + 4

			// Extend backwards
			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
				s--
				t--
				l++
			}
			if nextEmit < s {
				emitLiteral(dst, src[nextEmit:s])
			}

			// Save the match found
			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
			s += l
			nextEmit = s
			if nextS >= s {
				s = nextS + 1
			}
			if s >= sLimit {
				// Index first pair after match end.
				if int(s+l+4) < len(src) {
					cv := load3232(src, s)
					e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv}
				}
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6432(src, s-2)
			o := e.cur + s - 2
			prevHash := hash(uint32(x))
			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
			x >>= 16
			currHash := hash(uint32(x))
			candidate = e.table[currHash]
			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)}

			offset := s - (candidate.offset - e.cur)
			if offset > maxMatchOffset || uint32(x) != candidate.val {
				cv = uint32(x >> 8)
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}
		emitLiteral(dst, src[nextEmit:])
	}
}
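For orientation, a hedged sketch of how a level encoder is driven; tokens and emitLiteral are package internals defined elsewhere in this vendored code, and input is a placeholder byte slice:

// Hypothetical driver, inside package flate:
var dst tokens
enc := newFastEnc(1) // the fastEncL1 defined above
enc.Encode(&dst, input)
// dst now holds the literal/match tokens for one deflate block;
// call enc.Reset() to invalidate history before reusing the encoder.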
@@ -0,0 +1,199 @@
package flate

// fastEncL2 maintains the table for matches,
// and the previous byte block for level 2.
// This is the generic implementation.
type fastEncL2 struct {
	fastGen
	table [bTableSize]tableEntry
}

// Encode uses a similar algorithm to level 1, but is capable
// of matching across blocks, giving better compression at a small slowdown.
func (e *fastEncL2) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Protect against e.cur wraparound.
	for e.cur >= bufferReset {
		if len(e.hist) == 0 {
			for i := range e.table[:] {
				e.table[i] = tableEntry{}
			}
			e.cur = maxMatchOffset
			break
		}
		// Shift down everything in the table that isn't already too far away.
		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
		for i := range e.table[:] {
			v := e.table[i].offset
			if v <= minOff {
				v = 0
			} else {
				v = v - e.cur + maxMatchOffset
			}
			e.table[i].offset = v
		}
		e.cur = maxMatchOffset
	}

	s := e.addBlock(src)

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Override src
	src = e.hist
	nextEmit := s

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load3232(src, s)
	for {
		// skipLog controls when we start skipping if we haven't found matches in a long while.
		const skipLog = 5
		const doEvery = 2

		nextS := s
		var candidate tableEntry
		for {
			nextHash := hash4u(cv, bTableBits)
			s = nextS
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit {
				goto emitRemainder
			}
			candidate = e.table[nextHash]
			now := load6432(src, nextS)
			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
			nextHash = hash4u(uint32(now), bTableBits)

			offset := s - (candidate.offset - e.cur)
			if offset < maxMatchOffset && cv == candidate.val {
				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
				break
			}

			// Do one right away...
			cv = uint32(now)
			s = nextS
			nextS++
			candidate = e.table[nextHash]
			now >>= 8
			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}

			offset = s - (candidate.offset - e.cur)
			if offset < maxMatchOffset && cv == candidate.val {
				break
			}
			cv = uint32(now)
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			t := candidate.offset - e.cur
			l := e.matchlenLong(s+4, t+4, src) + 4

			// Extend backwards
			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
				s--
				t--
				l++
			}
			if nextEmit < s {
				emitLiteral(dst, src[nextEmit:s])
			}

			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
			s += l
			nextEmit = s
			if nextS >= s {
				s = nextS + 1
			}

			if s >= sLimit {
				// Index first pair after match end.
				if int(s+l+4) < len(src) {
					cv := load3232(src, s)
					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv}
				}
				goto emitRemainder
			}

			// Store every second hash in-between, but offset by 1.
			for i := s - l + 2; i < s-5; i += 7 {
				x := load6432(src, int32(i))
				nextHash := hash4u(uint32(x), bTableBits)
				e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)}
				// Skip one
				x >>= 16
				nextHash = hash4u(uint32(x), bTableBits)
				e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)}
				// Skip one
				x >>= 16
				nextHash = hash4u(uint32(x), bTableBits)
				e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)}
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2 to s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6432(src, s-2)
			o := e.cur + s - 2
			prevHash := hash4u(uint32(x), bTableBits)
			prevHash2 := hash4u(uint32(x>>8), bTableBits)
			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
			e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)}
			currHash := hash4u(uint32(x>>16), bTableBits)
			candidate = e.table[currHash]
			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)}

			offset := s - (candidate.offset - e.cur)
			if offset > maxMatchOffset || uint32(x>>16) != candidate.val {
				cv = uint32(x >> 24)
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}

		emitLiteral(dst, src[nextEmit:])
	}
}
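The step expression nextS = s + doEvery + (s-nextEmit)>>skipLog makes the scan accelerate through data that is not producing matches. A small, self-contained worked example (an illustration, not from the diff) using level 2's constants:

package main

import "fmt"

func main() {
	// gap = bytes scanned since the last emitted literal/match.
	for _, gap := range []int32{0, 32, 64, 256} {
		step := int32(2) + gap>>5 // doEvery = 2, skipLog = 5 at level 2
		fmt.Println(gap, "->", step) // prints 0 -> 2, 32 -> 3, 64 -> 4, 256 -> 10
	}
}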
@@ -0,0 +1,225 @@
package flate

// fastEncL3 is the level 3 encoder; its table keeps two candidates per entry.
type fastEncL3 struct {
	fastGen
	table [tableSize]tableEntryPrev
}

// Encode uses a similar algorithm to level 2, but checks up to two candidates.
func (e *fastEncL3) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 8 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Protect against e.cur wraparound.
	for e.cur >= bufferReset {
		if len(e.hist) == 0 {
			for i := range e.table[:] {
				e.table[i] = tableEntryPrev{}
			}
			e.cur = maxMatchOffset
			break
		}
		// Shift down everything in the table that isn't already too far away.
		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
		for i := range e.table[:] {
			v := e.table[i]
			if v.Cur.offset <= minOff {
				v.Cur.offset = 0
			} else {
				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
			}
			if v.Prev.offset <= minOff {
				v.Prev.offset = 0
			} else {
				v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
			}
			e.table[i] = v
		}
		e.cur = maxMatchOffset
	}

	s := e.addBlock(src)

	// Skip if too small.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Override src
	src = e.hist
	nextEmit := s

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load3232(src, s)
	for {
		const skipLog = 6
		nextS := s
		var candidate tableEntry
		for {
			nextHash := hash(cv)
			s = nextS
			nextS = s + 1 + (s-nextEmit)>>skipLog
			if nextS > sLimit {
				goto emitRemainder
			}
			candidates := e.table[nextHash]
			now := load3232(src, nextS)
			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}

			// Check both candidates
			candidate = candidates.Cur
			offset := s - (candidate.offset - e.cur)
			if cv == candidate.val {
				if offset > maxMatchOffset {
					cv = now
					// Previous will also be invalid, we have nothing.
					continue
				}
				o2 := s - (candidates.Prev.offset - e.cur)
				if cv != candidates.Prev.val || o2 > maxMatchOffset {
					break
				}
				// Both match and are valid, pick longest.
				l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
				if l2 > l1 {
					candidate = candidates.Prev
				}
				break
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset <= maxMatchOffset {
						break
					}
				}
			}
			cv = now
		}

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			t := candidate.offset - e.cur
			l := e.matchlenLong(s+4, t+4, src) + 4

			// Extend backwards
			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
				s--
				t--
				l++
			}
			if nextEmit < s {
				emitLiteral(dst, src[nextEmit:s])
			}

			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
			s += l
			nextEmit = s
			if nextS >= s {
				s = nextS + 1
			}

			if s >= sLimit {
				t += l
				// Index first pair after match end.
				if int(t+4) < len(src) && t > 0 {
					cv := load3232(src, t)
					nextHash := hash(cv)
					e.table[nextHash] = tableEntryPrev{
						Prev: e.table[nextHash].Cur,
						Cur:  tableEntry{offset: e.cur + t, val: cv},
					}
				}
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-3 to s.
			x := load6432(src, s-3)
			prevHash := hash(uint32(x))
			e.table[prevHash] = tableEntryPrev{
				Prev: e.table[prevHash].Cur,
				Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
			}
			x >>= 8
			prevHash = hash(uint32(x))

			e.table[prevHash] = tableEntryPrev{
				Prev: e.table[prevHash].Cur,
				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
			}
			x >>= 8
			prevHash = hash(uint32(x))

			e.table[prevHash] = tableEntryPrev{
				Prev: e.table[prevHash].Cur,
				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
			}
			x >>= 8
			currHash := hash(uint32(x))
			candidates := e.table[currHash]
			cv = uint32(x)
			e.table[currHash] = tableEntryPrev{
				Prev: candidates.Cur,
				Cur:  tableEntry{offset: s + e.cur, val: cv},
			}

			// Check both candidates
			candidate = candidates.Cur
			if cv == candidate.val {
				offset := s - (candidate.offset - e.cur)
				if offset <= maxMatchOffset {
					continue
				}
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset <= maxMatchOffset {
						continue
					}
				}
			}
			cv = uint32(x >> 8)
			s++
			break
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}

		emitLiteral(dst, src[nextEmit:])
	}
}
@ -0,0 +1,210 @@ |
|||||||
|
package flate |
||||||
|
|
||||||
|
import "fmt" |
||||||
|
|
||||||
|
type fastEncL4 struct { |
||||||
|
fastGen |
||||||
|
table [tableSize]tableEntry |
||||||
|
bTable [tableSize]tableEntry |
||||||
|
} |
||||||
|
|
||||||
|
func (e *fastEncL4) Encode(dst *tokens, src []byte) { |
||||||
|
const ( |
||||||
|
inputMargin = 12 - 1 |
||||||
|
minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||||
|
) |
||||||
|
|
||||||
|
// Protect against e.cur wraparound.
|
||||||
|
for e.cur >= bufferReset { |
||||||
|
if len(e.hist) == 0 { |
||||||
|
for i := range e.table[:] { |
||||||
|
e.table[i] = tableEntry{} |
||||||
|
} |
||||||
|
for i := range e.bTable[:] { |
||||||
|
e.bTable[i] = tableEntry{} |
||||||
|
} |
||||||
|
e.cur = maxMatchOffset |
||||||
|
break |
||||||
|
} |
||||||
|
// Shift down everything in the table that isn't already too far away.
|
||||||
|
minOff := e.cur + int32(len(e.hist)) - maxMatchOffset |
||||||
|
for i := range e.table[:] { |
||||||
|
v := e.table[i].offset |
||||||
|
if v <= minOff { |
||||||
|
v = 0 |
||||||
|
} else { |
||||||
|
v = v - e.cur + maxMatchOffset |
||||||
|
} |
||||||
|
e.table[i].offset = v |
||||||
|
} |
||||||
|
for i := range e.bTable[:] { |
||||||
|
v := e.bTable[i].offset |
||||||
|
if v <= minOff { |
||||||
|
v = 0 |
||||||
|
} else { |
||||||
|
v = v - e.cur + maxMatchOffset |
||||||
|
} |
||||||
|
e.bTable[i].offset = v |
||||||
|
} |
||||||
|
e.cur = maxMatchOffset |
||||||
|
} |
||||||
|
|
||||||
|
s := e.addBlock(src) |
||||||
|
|
||||||
|
// This check isn't in the Snappy implementation, but there, the caller
|
||||||
|
// instead of the callee handles this case.
|
||||||
|
if len(src) < minNonLiteralBlockSize { |
||||||
|
// We do not fill the token table.
|
||||||
|
// This will be picked up by caller.
|
||||||
|
dst.n = uint16(len(src)) |
||||||
|
return |
||||||
|
} |
||||||
|
|
||||||
|
// Override src
|
||||||
|
src = e.hist |
||||||
|
nextEmit := s |
||||||
|
|
||||||
|
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||||
|
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||||
|
// looking for copies.
|
||||||
|
sLimit := int32(len(src) - inputMargin) |
||||||
|
|
||||||
|
// nextEmit is where in src the next emitLiteral should start from.
|
||||||
|
cv := load6432(src, s) |
||||||
|
for { |
||||||
|
const skipLog = 6 |
||||||
|
const doEvery = 1 |
||||||
|
|
||||||
|
nextS := s |
||||||
|
var t int32 |
||||||
|
for { |
||||||
|
nextHashS := hash4x64(cv, tableBits) |
||||||
|
nextHashL := hash7(cv, tableBits) |
||||||
|
|
||||||
|
s = nextS |
||||||
|
nextS = s + doEvery + (s-nextEmit)>>skipLog |
||||||
|
if nextS > sLimit { |
||||||
|
goto emitRemainder |
||||||
|
} |
||||||
|
			// Fetch a short+long candidate
			sCandidate := e.table[nextHashS]
			lCandidate := e.bTable[nextHashL]
			next := load6432(src, nextS)
			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
			e.table[nextHashS] = entry
			e.bTable[nextHashL] = entry

			t = lCandidate.offset - e.cur
			if s-t < maxMatchOffset && uint32(cv) == lCandidate.val {
				// We got a long match. Use that.
				break
			}

			t = sCandidate.offset - e.cur
			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
				// Found a 4-byte match...
				lCandidate = e.bTable[hash7(next, tableBits)]

				// If the next long is a candidate, check if we should use that instead...
				lOff := nextS - (lCandidate.offset - e.cur)
				if lOff < maxMatchOffset && lCandidate.val == uint32(next) {
					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
					if l2 > l1 {
						s = nextS
						t = lCandidate.offset - e.cur
					}
				}
				break
			}
			cv = next
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		// Extend the 4-byte match as long as possible.
		l := e.matchlenLong(s+4, t+4, src) + 4

		// Extend backwards
		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
			s--
			t--
			l++
		}
		if nextEmit < s {
			emitLiteral(dst, src[nextEmit:s])
		}
		if false {
			if t >= s {
				panic("s-t")
			}
			if (s - t) > maxMatchOffset {
				panic(fmt.Sprintln("mmo", t))
			}
			if l < baseMatchLength {
				panic("bml")
			}
		}

		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
		s += l
		nextEmit = s
		if nextS >= s {
			s = nextS + 1
		}

		if s >= sLimit {
			// Index first pair after match end.
			if int(s+8) < len(src) {
				cv := load6432(src, s)
				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
			}
			goto emitRemainder
		}

		// Store every 3rd hash in-between
		if true {
			i := nextS
			if i < s-1 {
				cv := load6432(src, i)
				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
				t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
				e.bTable[hash7(cv, tableBits)] = t
				e.bTable[hash7(cv>>8, tableBits)] = t2
				e.table[hash4u(t2.val, tableBits)] = t2

				i += 3
				for ; i < s-1; i += 3 {
					cv := load6432(src, i)
					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
					t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
					e.bTable[hash7(cv, tableBits)] = t
					e.bTable[hash7(cv>>8, tableBits)] = t2
					e.table[hash4u(t2.val, tableBits)] = t2
				}
			}
		}

		// We could immediately start working at s now, but to improve
		// compression we first update the hash table at s-1 and at s.
		x := load6432(src, s-1)
		o := e.cur + s - 1
		prevHashS := hash4x64(x, tableBits)
		prevHashL := hash7(x, tableBits)
		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
		e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)}
		cv = x >> 8
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}

		emitLiteral(dst, src[nextEmit:])
	}
}
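The encoder above keys every position into two tables at once: a short (4-byte) hash and a long (7-byte) hash, preferring the long candidate because it is less likely to be a spurious collision. A minimal, self-contained sketch of that dual-table lookup follows; the hashShort/hashLong helpers are illustrative stand-ins for the package's hash4x64/hash7, not the vendored code itself.

// Sketch: dual short/long hash-table candidate lookup.
package main

import "fmt"

const tableBits = 14

// hashShort hashes the low 4 bytes (multiplicative hash, illustrative).
func hashShort(v uint64) uint32 {
	return (uint32(v) * 0x1e35a7bd) >> (32 - tableBits)
}

// hashLong hashes the low 7 bytes (constant is illustrative).
func hashLong(v uint64) uint32 {
	return uint32(((v << 8) * 0xcf1bbcdcb7a56463) >> (64 - tableBits))
}

func main() {
	var short, long [1 << tableBits]int32 // last offset per bucket; 0 means empty
	data := []byte("abcdefgh-abcdefgh")
	for s := 0; s+8 <= len(data); s++ {
		var cv uint64
		for i := 0; i < 8; i++ {
			cv |= uint64(data[s+i]) << (8 * i) // little-endian load, like load6432
		}
		hs, hl := hashShort(cv), hashLong(cv)
		if long[hl] != 0 { // prefer the long candidate: more selective
			fmt.Printf("long candidate for pos %d at %d\n", s, long[hl]-1)
		} else if short[hs] != 0 {
			fmt.Printf("short candidate for pos %d at %d\n", s, short[hs]-1)
		}
		short[hs], long[hl] = int32(s)+1, int32(s)+1 // store +1 so 0 stays "empty"
	}
}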
@@ -0,0 +1,276 @@
package flate

import "fmt"

type fastEncL5 struct {
	fastGen
	table  [tableSize]tableEntry
	bTable [tableSize]tableEntryPrev
}

func (e *fastEncL5) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Protect against e.cur wraparound.
	for e.cur >= bufferReset {
		if len(e.hist) == 0 {
			for i := range e.table[:] {
				e.table[i] = tableEntry{}
			}
			for i := range e.bTable[:] {
				e.bTable[i] = tableEntryPrev{}
			}
			e.cur = maxMatchOffset
			break
		}
		// Shift down everything in the table that isn't already too far away.
		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
		for i := range e.table[:] {
			v := e.table[i].offset
			if v <= minOff {
				v = 0
			} else {
				v = v - e.cur + maxMatchOffset
			}
			e.table[i].offset = v
		}
		for i := range e.bTable[:] {
			v := e.bTable[i]
			if v.Cur.offset <= minOff {
				v.Cur.offset = 0
				v.Prev.offset = 0
			} else {
				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
				if v.Prev.offset <= minOff {
					v.Prev.offset = 0
				} else {
					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
				}
			}
			e.bTable[i] = v
		}
		e.cur = maxMatchOffset
	}

	s := e.addBlock(src)

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Override src
	src = e.hist
	nextEmit := s

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load6432(src, s)
	for {
		const skipLog = 6
		const doEvery = 1

		nextS := s
		var l int32
		var t int32
		for {
			nextHashS := hash4x64(cv, tableBits)
			nextHashL := hash7(cv, tableBits)

			s = nextS
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit {
				goto emitRemainder
			}
			// Fetch a short+long candidate
			sCandidate := e.table[nextHashS]
			lCandidate := e.bTable[nextHashL]
			next := load6432(src, nextS)
			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
			e.table[nextHashS] = entry
			eLong := &e.bTable[nextHashL]
			eLong.Cur, eLong.Prev = entry, eLong.Cur

			nextHashS = hash4x64(next, tableBits)
			nextHashL = hash7(next, tableBits)

			t = lCandidate.Cur.offset - e.cur
			if s-t < maxMatchOffset {
				if uint32(cv) == lCandidate.Cur.val {
					// Store the next match
					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
					eLong := &e.bTable[nextHashL]
					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur

					t2 := lCandidate.Prev.offset - e.cur
					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
						l = e.matchlen(s+4, t+4, src) + 4
						ml1 := e.matchlen(s+4, t2+4, src) + 4
						if ml1 > l {
							t = t2
							l = ml1
							break
						}
					}
					break
				}
				t = lCandidate.Prev.offset - e.cur
				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
					// Store the next match
					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
					eLong := &e.bTable[nextHashL]
					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
					break
				}
			}

			t = sCandidate.offset - e.cur
			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
				// Found a 4-byte match...
				l = e.matchlen(s+4, t+4, src) + 4
				lCandidate = e.bTable[nextHashL]
				// Store the next match

				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
				eLong := &e.bTable[nextHashL]
				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur

				// If the next long is a candidate, use that...
				t2 := lCandidate.Cur.offset - e.cur
				if nextS-t2 < maxMatchOffset {
					if lCandidate.Cur.val == uint32(next) {
						ml := e.matchlen(nextS+4, t2+4, src) + 4
						if ml > l {
							t = t2
							s = nextS
							l = ml
							break
						}
					}
					// If the previous long is a candidate, use that...
					t2 = lCandidate.Prev.offset - e.cur
					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
						ml := e.matchlen(nextS+4, t2+4, src) + 4
						if ml > l {
							t = t2
							s = nextS
							l = ml
							break
						}
					}
				}
				break
			}
			cv = next
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		// Extend the 4-byte match as long as possible.
		if l == 0 {
			l = e.matchlenLong(s+4, t+4, src) + 4
		} else if l == maxMatchLength {
			l += e.matchlenLong(s+l, t+l, src)
		}
		// Extend backwards
		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
			s--
			t--
			l++
		}
		if nextEmit < s {
			emitLiteral(dst, src[nextEmit:s])
		}
		if false {
			if t >= s {
				panic(fmt.Sprintln("s-t", s, t))
			}
			if (s - t) > maxMatchOffset {
				panic(fmt.Sprintln("mmo", s-t))
			}
			if l < baseMatchLength {
				panic("bml")
			}
		}

		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
		s += l
		nextEmit = s
		if nextS >= s {
			s = nextS + 1
		}

		if s >= sLimit {
			goto emitRemainder
		}

		// Store every 3rd hash in-between.
		if true {
			const hashEvery = 3
			i := s - l + 1
			if i < s-1 {
				cv := load6432(src, i)
				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
				e.table[hash4x64(cv, tableBits)] = t
				eLong := &e.bTable[hash7(cv, tableBits)]
				eLong.Cur, eLong.Prev = t, eLong.Cur

				// Do a long at i+1
				cv >>= 8
				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
				eLong = &e.bTable[hash7(cv, tableBits)]
				eLong.Cur, eLong.Prev = t, eLong.Cur

				// We only have enough bits for a short entry at i+2
				cv >>= 8
				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
				e.table[hash4x64(cv, tableBits)] = t

				// Skip one - otherwise we risk hitting 's'
				i += 4
				for ; i < s-1; i += hashEvery {
					cv := load6432(src, i)
					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
					t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
					eLong := &e.bTable[hash7(cv, tableBits)]
					eLong.Cur, eLong.Prev = t, eLong.Cur
					e.table[hash4u(t2.val, tableBits)] = t2
				}
			}
		}

		// We could immediately start working at s now, but to improve
		// compression we first update the hash table at s-1 and at s.
		x := load6432(src, s-1)
		o := e.cur + s - 1
		prevHashS := hash4x64(x, tableBits)
		prevHashL := hash7(x, tableBits)
		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
		eLong := &e.bTable[prevHashL]
		eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur
		cv = x >> 8
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}

		emitLiteral(dst, src[nextEmit:])
	}
}
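The level-5 encoder deepens the long-hash table to two entries per bucket via tableEntryPrev: on every insert the current entry is demoted to Prev, so both of the two most recent positions hashing to a bucket remain available as candidates. A minimal sketch of that bucket discipline, using simplified illustrative types:

// Sketch: a two-deep hash bucket as used by the level 5/6 long tables.
package main

import "fmt"

type entry struct{ offset int32 }

type bucket struct{ Cur, Prev entry }

// insert demotes Cur to Prev, the same idiom as
// eLong.Cur, eLong.Prev = entry, eLong.Cur in the encoder above.
func (b *bucket) insert(e entry) {
	b.Cur, b.Prev = e, b.Cur
}

func main() {
	var b bucket
	b.insert(entry{offset: 10})
	b.insert(entry{offset: 42})
	fmt.Println(b.Cur.offset, b.Prev.offset) // 42 10: both candidates can be tried
}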
@@ -0,0 +1,279 @@
package flate

import "fmt"

type fastEncL6 struct {
	fastGen
	table  [tableSize]tableEntry
	bTable [tableSize]tableEntryPrev
}

func (e *fastEncL6) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Protect against e.cur wraparound.
	for e.cur >= bufferReset {
		if len(e.hist) == 0 {
			for i := range e.table[:] {
				e.table[i] = tableEntry{}
			}
			for i := range e.bTable[:] {
				e.bTable[i] = tableEntryPrev{}
			}
			e.cur = maxMatchOffset
			break
		}
		// Shift down everything in the table that isn't already too far away.
		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
		for i := range e.table[:] {
			v := e.table[i].offset
			if v <= minOff {
				v = 0
			} else {
				v = v - e.cur + maxMatchOffset
			}
			e.table[i].offset = v
		}
		for i := range e.bTable[:] {
			v := e.bTable[i]
			if v.Cur.offset <= minOff {
				v.Cur.offset = 0
				v.Prev.offset = 0
			} else {
				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
				if v.Prev.offset <= minOff {
					v.Prev.offset = 0
				} else {
					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
				}
			}
			e.bTable[i] = v
		}
		e.cur = maxMatchOffset
	}

	s := e.addBlock(src)

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Override src
	src = e.hist
	nextEmit := s

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load6432(src, s)
	// Repeat MUST be > 1 and within range
	repeat := int32(1)
	for {
		const skipLog = 7
		const doEvery = 1

		nextS := s
		var l int32
		var t int32
		for {
			nextHashS := hash4x64(cv, tableBits)
			nextHashL := hash7(cv, tableBits)
			s = nextS
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit {
				goto emitRemainder
			}
			// Fetch a short+long candidate
			sCandidate := e.table[nextHashS]
			lCandidate := e.bTable[nextHashL]
			next := load6432(src, nextS)
			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
			e.table[nextHashS] = entry
			eLong := &e.bTable[nextHashL]
			eLong.Cur, eLong.Prev = entry, eLong.Cur

			// Calculate hashes of 'next'
			nextHashS = hash4x64(next, tableBits)
			nextHashL = hash7(next, tableBits)

			t = lCandidate.Cur.offset - e.cur
			if s-t < maxMatchOffset {
				if uint32(cv) == lCandidate.Cur.val {
					// Long candidate matches at least 4 bytes.

					// Store the next match
					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
					eLong := &e.bTable[nextHashL]
					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur

					// Check the previous long candidate as well.
					t2 := lCandidate.Prev.offset - e.cur
					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
						l = e.matchlen(s+4, t+4, src) + 4
						ml1 := e.matchlen(s+4, t2+4, src) + 4
						if ml1 > l {
							t = t2
							l = ml1
							break
						}
					}
					break
				}
				// Current value did not match, but check if previous long value does.
				t = lCandidate.Prev.offset - e.cur
				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
					// Store the next match
					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
					eLong := &e.bTable[nextHashL]
					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
					break
				}
			}

			t = sCandidate.offset - e.cur
			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
				// Found a 4-byte match...
				l = e.matchlen(s+4, t+4, src) + 4

				// Look up next long candidate (at nextS)
				lCandidate = e.bTable[nextHashL]

				// Store the next match
				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
				eLong := &e.bTable[nextHashL]
				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur

				// Check repeat at s + repOff
				const repOff = 1
				t2 := s - repeat + repOff
				if load3232(src, t2) == uint32(cv>>(8*repOff)) {
					ml := e.matchlen(s+4+repOff, t2+4, src) + 4
					if ml > l {
						t = t2
						l = ml
						s += repOff
						// Not worth checking more.
						break
					}
				}

				// If the next long is a candidate, use that...
				t2 = lCandidate.Cur.offset - e.cur
				if nextS-t2 < maxMatchOffset {
					if lCandidate.Cur.val == uint32(next) {
						ml := e.matchlen(nextS+4, t2+4, src) + 4
						if ml > l {
							t = t2
							s = nextS
							l = ml
							// This is ok, but check previous as well.
						}
					}
					// If the previous long is a candidate, use that...
					t2 = lCandidate.Prev.offset - e.cur
					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
						ml := e.matchlen(nextS+4, t2+4, src) + 4
						if ml > l {
							t = t2
							s = nextS
							l = ml
							break
						}
					}
				}
				break
			}
			cv = next
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		// Extend the 4-byte match as long as possible.
		if l == 0 {
			l = e.matchlenLong(s+4, t+4, src) + 4
		} else if l == maxMatchLength {
			l += e.matchlenLong(s+l, t+l, src)
		}

		// Extend backwards
		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
			s--
			t--
			l++
		}
		if nextEmit < s {
			emitLiteral(dst, src[nextEmit:s])
		}
		if false {
			if t >= s {
				panic(fmt.Sprintln("s-t", s, t))
			}
			if (s - t) > maxMatchOffset {
				panic(fmt.Sprintln("mmo", s-t))
			}
			if l < baseMatchLength {
				panic("bml")
			}
		}

		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
		repeat = s - t
		s += l
		nextEmit = s
		if nextS >= s {
			s = nextS + 1
		}

		if s >= sLimit {
			// Index after match end.
			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
				cv := load6432(src, i)
				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)}
				eLong := &e.bTable[hash7(cv, tableBits)]
				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur
			}
			goto emitRemainder
		}

		// Store every long hash in-between and every second short.
		if true {
			for i := nextS + 1; i < s-1; i += 2 {
				cv := load6432(src, i)
				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
				t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
				eLong := &e.bTable[hash7(cv, tableBits)]
				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
				e.table[hash4x64(cv, tableBits)] = t
				eLong.Cur, eLong.Prev = t, eLong.Cur
				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
			}
		}

		// We could immediately start working at s now, but to improve
		// compression we first update the hash table at s-1 and at s.
		cv = load6432(src, s)
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}

		emitLiteral(dst, src[nextEmit:])
	}
}
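Level 6 additionally remembers the offset of the last emitted match (`repeat`) and probes `s - repeat` before trusting the hash candidates; this is cheap and catches data that repeats at a fixed stride. A toy illustration of that probe (a simplified sketch, not the vendored code):

// Sketch: the last-offset ("repeat") probe used by the level 6 encoder.
package main

import (
	"bytes"
	"fmt"
)

func main() {
	src := []byte("0123abcd0123abcd0123abcd")
	repeat := 8 // offset of the previous match (assumed known)
	s := 16     // current position
	// Before any hash lookup, check whether the previous offset strikes again.
	if s-repeat >= 0 && bytes.Equal(src[s:s+4], src[s-repeat:s-repeat+4]) {
		fmt.Println("repeat offset", repeat, "matches again at", s)
	}
}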
@@ -1,48 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

var reverseByte = [256]byte{
	0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
	0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
	0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
	0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
	0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
	0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
	0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
	0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
	0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
	0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
	0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
	0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
	0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
	0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
	0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
	0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
	0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
	0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
	0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
	0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
	0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
	0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
	0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
	0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
	0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
	0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
	0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
	0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
	0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
	0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
	0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
	0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
}

func reverseUint16(v uint16) uint16 {
	return uint16(reverseByte[v>>8]) | uint16(reverseByte[v&0xFF])<<8
}

func reverseBits(number uint16, bitLength byte) uint16 {
	return reverseUint16(number << uint8(16-bitLength))
}
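Bit reversal is needed because DEFLATE defines Huffman codes most-significant-bit first while the bit writer packs bytes least-significant-bit first. A quick worked check of the same semantics, using the standard library's math/bits instead of the lookup table (my substitution for the sketch):

// Worked check: reversing the 3-bit code 0b110 yields 0b011.
package main

import (
	"fmt"
	"math/bits"
)

// reverseBitsStd mirrors the table-based reverseBits above via math/bits.
func reverseBitsStd(number uint16, bitLength byte) uint16 {
	return bits.Reverse16(number << (16 - bitLength))
}

func main() {
	fmt.Printf("%03b\n", reverseBitsStd(0b110, 3)) // prints 011
}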
@@ -1,856 +0,0 @@
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Modified for deflate by Klaus Post (c) 2015.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

// emitLiteral writes a literal chunk.
func emitLiteral(dst *tokens, lit []byte) {
	ol := int(dst.n)
	for i, v := range lit {
		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
	}
	dst.n += uint16(len(lit))
}

// emitCopy writes a copy chunk.
func emitCopy(dst *tokens, offset, length int) {
	dst.tokens[dst.n] = matchToken(uint32(length-3), uint32(offset-minOffsetSize))
	dst.n++
}

type snappyEnc interface {
	Encode(dst *tokens, src []byte)
	Reset()
}

func newSnappy(level int) snappyEnc {
	switch level {
	case 1:
		return &snappyL1{}
	case 2:
		return &snappyL2{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
	case 3:
		return &snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}
	case 4:
		return &snappyL4{snappyL3{snappyGen: snappyGen{cur: maxStoreBlockSize, prev: make([]byte, 0, maxStoreBlockSize)}}}
	default:
		panic("invalid level specified")
	}
}
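newSnappy is the internal factory mapping compression levels 1-4 to the Snappy-derived encoders defined below. A hedged sketch of in-package usage; compressBlock and input are illustrative names, not part of this diff:

// Hypothetical in-package helper showing the snappyEnc life cycle.
func compressBlock(level int, input []byte) tokens {
	var dst tokens
	enc := newSnappy(level) // panics on levels outside 1-4
	enc.Encode(&dst, input) // appends literal/match tokens for input
	enc.Reset()             // drop cross-block history before an unrelated stream
	return dst
}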
const (
	tableBits  = 14             // Bits used in the table
	tableSize  = 1 << tableBits // Size of the table
	tableMask  = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
	tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.

	baseMatchOffset = 1       // The smallest match offset
	baseMatchLength = 3       // The smallest match length per the RFC section 3.2.5
	maxMatchOffset  = 1 << 15 // The largest match offset
)

func load32(b []byte, i int) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load64(b []byte, i int) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func hash(u uint32) uint32 {
	return (u * 0x1e35a7bd) >> tableShift
}
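Two tricks above are worth noting: the full slice expression b[i:i+4:len(b)] lets the compiler prove the subsequent indexed reads are in bounds, and hash is a multiplicative hash whose top tableBits bits become the table index. A quick standalone check of the hash's range:

// Check: the top 14 bits of u*0x1e35a7bd always form a valid table index.
package main

import "fmt"

const tableBits = 14
const tableShift = 32 - tableBits

func hash(u uint32) uint32 { return (u * 0x1e35a7bd) >> tableShift }

func main() {
	idx := hash(0x64636261)          // "abcd" loaded little-endian
	fmt.Println(idx < 1<<tableBits)  // true: no masking needed
}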
// snappyL1 encapsulates level 1 compression
type snappyL1 struct{}

func (e *snappyL1) Reset() {}

func (e *snappyL1) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 16 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	// Initialize the hash table.
	//
	// The table element type is uint16, as s < sLimit and sLimit < len(src)
	// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
	var table [tableSize]uint16

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	nextHash := hash(load32(src, s))

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (i.e. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		skip := 32

		nextS := s
		candidate := 0
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidate = int(table[nextHash&tableMask])
			table[nextHash&tableMask] = uint16(s)
			nextHash = hash(load32(src, nextS))
			// TODO: < should be <=, and add a test for that.
			if s-candidate < maxMatchOffset && load32(src, s) == load32(src, candidate) {
				break
			}
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		emitLiteral(dst, src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.
			base := s

			// Extend the 4-byte match as long as possible.
			//
			// This is an inlined version of Snappy's:
			//	s = extendMatch(src, candidate+4, s+4)
			s += 4
			s1 := base + maxMatchLength
			if s1 > len(src) {
				s1 = len(src)
			}
			a := src[s:s1]
			b := src[candidate+4:]
			b = b[:len(a)]
			l := len(a)
			for i := range a {
				if a[i] != b[i] {
					l = i
					break
				}
			}
			s += l

			// matchToken is flate's equivalent of Snappy's emitCopy.
			dst.tokens[dst.n] = matchToken(uint32(s-base-baseMatchLength), uint32(base-candidate-baseMatchOffset))
			dst.n++
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load64(src, s-1)
			prevHash := hash(uint32(x >> 0))
			table[prevHash&tableMask] = uint16(s - 1)
			currHash := hash(uint32(x >> 8))
			candidate = int(table[currHash&tableMask])
			table[currHash&tableMask] = uint16(s)
			// TODO: >= should be >, and add a test for that.
			if s-candidate >= maxMatchOffset || uint32(x>>8) != load32(src, candidate) {
				nextHash = hash(uint32(x >> 16))
				s++
				break
			}
		}
	}

emitRemainder:
	if nextEmit < len(src) {
		emitLiteral(dst, src[nextEmit:])
	}
}
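The skipping heuristic advances by skip>>5 bytes per probe, so the stride stays 1 for the first 32 probes and then grows. A short simulation of that schedule:

// Simulation of the match-skipping schedule described above.
package main

import "fmt"

func main() {
	skip, pos := 32, 0
	for i := 0; i < 48; i++ {
		step := skip >> 5 // stride between hash probes
		pos += step
		skip += step
	}
	fmt.Println("position after 48 probes:", pos) // 64: 32 probes at stride 1, then 16 at stride 2
}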
type tableEntry struct {
	val    uint32
	offset int32
}

func load3232(b []byte, i int32) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load6432(b []byte, i int32) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

// snappyGen maintains the table for matches,
// and the previous byte block for level 2.
// This is the generic implementation.
type snappyGen struct {
	prev []byte
	cur  int32
}

// snappyL2
type snappyL2 struct {
	snappyGen
	table [tableSize]tableEntry
}

// Encode uses a similar algorithm to level 1, but is capable
// of matching across blocks, giving better compression at a small slowdown.
func (e *snappyL2) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 16 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
	if e.cur > 1<<30 {
		for i := range e.table {
			e.table[i] = tableEntry{}
		}
		e.cur = maxStoreBlockSize
	}

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		e.cur += maxStoreBlockSize
		e.prev = e.prev[:0]
		return
	}

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := int32(0)
	s := int32(0)
	cv := load3232(src, s)
	nextHash := hash(cv)

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (i.e. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		skip := int32(32)

		nextS := s
		var candidate tableEntry
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidate = e.table[nextHash&tableMask]
			now := load3232(src, nextS)
			e.table[nextHash&tableMask] = tableEntry{offset: s + e.cur, val: cv}
			nextHash = hash(now)

			offset := s - (candidate.offset - e.cur)
			if offset >= maxMatchOffset || cv != candidate.val {
				// Out of range or not matched.
				cv = now
				continue
			}
			break
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		emitLiteral(dst, src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			s += 4
			t := candidate.offset - e.cur + 4
			l := e.matchlen(s, t, src)

			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
			dst.n++
			s += l
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6432(src, s-1)
			prevHash := hash(uint32(x))
			e.table[prevHash&tableMask] = tableEntry{offset: e.cur + s - 1, val: uint32(x)}
			x >>= 8
			currHash := hash(uint32(x))
			candidate = e.table[currHash&tableMask]
			e.table[currHash&tableMask] = tableEntry{offset: e.cur + s, val: uint32(x)}

			offset := s - (candidate.offset - e.cur)
			if offset >= maxMatchOffset || uint32(x) != candidate.val {
				cv = uint32(x >> 8)
				nextHash = hash(cv)
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		emitLiteral(dst, src[nextEmit:])
	}
	e.cur += int32(len(src))
	e.prev = e.prev[:len(src)]
	copy(e.prev, src)
}

type tableEntryPrev struct {
	Cur  tableEntry
	Prev tableEntry
}

// snappyL3
type snappyL3 struct {
	snappyGen
	table [tableSize]tableEntryPrev
}

// Encode uses a similar algorithm to level 2, but will check up to two candidates.
func (e *snappyL3) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 16 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
	if e.cur > 1<<30 {
		for i := range e.table {
			e.table[i] = tableEntryPrev{}
		}
		e.cur = maxStoreBlockSize
	}

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		e.cur += maxStoreBlockSize
		e.prev = e.prev[:0]
		return
	}

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := int32(0)
	s := int32(0)
	cv := load3232(src, s)
	nextHash := hash(cv)

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (i.e. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		skip := int32(32)

		nextS := s
		var candidate tableEntry
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidates := e.table[nextHash&tableMask]
			now := load3232(src, nextS)
			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
			nextHash = hash(now)

			// Check both candidates
			candidate = candidates.Cur
			if cv == candidate.val {
				offset := s - (candidate.offset - e.cur)
				if offset < maxMatchOffset {
					break
				}
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset < maxMatchOffset {
						break
					}
				}
			}
			cv = now
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		emitLiteral(dst, src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			s += 4
			t := candidate.offset - e.cur + 4
			l := e.matchlen(s, t, src)

			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
			dst.n++
			s += l
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2, s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6432(src, s-2)
			prevHash := hash(uint32(x))

			e.table[prevHash&tableMask] = tableEntryPrev{
				Prev: e.table[prevHash&tableMask].Cur,
				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
			}
			x >>= 8
			prevHash = hash(uint32(x))

			e.table[prevHash&tableMask] = tableEntryPrev{
				Prev: e.table[prevHash&tableMask].Cur,
				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
			}
			x >>= 8
			currHash := hash(uint32(x))
			candidates := e.table[currHash&tableMask]
			cv = uint32(x)
			e.table[currHash&tableMask] = tableEntryPrev{
				Prev: candidates.Cur,
				Cur:  tableEntry{offset: s + e.cur, val: cv},
			}

			// Check both candidates
			candidate = candidates.Cur
			if cv == candidate.val {
				offset := s - (candidate.offset - e.cur)
				if offset < maxMatchOffset {
					continue
				}
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset < maxMatchOffset {
						continue
					}
				}
			}
			cv = uint32(x >> 8)
			nextHash = hash(cv)
			s++
			break
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		emitLiteral(dst, src[nextEmit:])
	}
	e.cur += int32(len(src))
	e.prev = e.prev[:len(src)]
	copy(e.prev, src)
}

// snappyL4
type snappyL4 struct {
	snappyL3
}

// Encode uses a similar algorithm to level 3,
// but will check up to two candidates if the first isn't long enough.
func (e *snappyL4) Encode(dst *tokens, src []byte) {
	const (
		inputMargin            = 16 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
		matchLenGood           = 12
	)

	// Ensure that e.cur doesn't wrap, mainly an issue on 32 bits.
	if e.cur > 1<<30 {
		for i := range e.table {
			e.table[i] = tableEntryPrev{}
		}
		e.cur = maxStoreBlockSize
	}

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		e.cur += maxStoreBlockSize
		e.prev = e.prev[:0]
		return
	}

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int32(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := int32(0)
	s := int32(0)
	cv := load3232(src, s)
	nextHash := hash(cv)

	for {
		// Copied from the C++ snappy implementation:
		//
		// Heuristic match skipping: If 32 bytes are scanned with no matches
		// found, start looking only at every other byte. If 32 more bytes are
		// scanned (or skipped), look at every third byte, etc. When a match
		// is found, immediately go back to looking at every byte. This is a
		// small loss (~5% performance, ~0.1% density) for compressible data
		// due to more bookkeeping, but for non-compressible data (such as
		// JPEG) it's a huge win since the compressor quickly "realizes" the
		// data is incompressible and doesn't bother looking for matches
		// everywhere.
		//
		// The "skip" variable keeps track of how many bytes there are since
		// the last match; dividing it by 32 (i.e. right-shifting by five) gives
		// the number of bytes to move ahead for each iteration.
		skip := int32(32)

		nextS := s
		var candidate tableEntry
		var candidateAlt tableEntry
		for {
			s = nextS
			bytesBetweenHashLookups := skip >> 5
			nextS = s + bytesBetweenHashLookups
			skip += bytesBetweenHashLookups
			if nextS > sLimit {
				goto emitRemainder
			}
			candidates := e.table[nextHash&tableMask]
			now := load3232(src, nextS)
			e.table[nextHash&tableMask] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
			nextHash = hash(now)

			// Check both candidates
			candidate = candidates.Cur
			if cv == candidate.val {
				offset := s - (candidate.offset - e.cur)
				if offset < maxMatchOffset {
					offset = s - (candidates.Prev.offset - e.cur)
					if cv == candidates.Prev.val && offset < maxMatchOffset {
						candidateAlt = candidates.Prev
					}
					break
				}
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset < maxMatchOffset {
						break
					}
				}
			}
			cv = now
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		emitLiteral(dst, src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			s += 4
			t := candidate.offset - e.cur + 4
			l := e.matchlen(s, t, src)
			// Try alternative candidate if match length < matchLenGood.
			if l < matchLenGood-4 && candidateAlt.offset != 0 {
				t2 := candidateAlt.offset - e.cur + 4
				l2 := e.matchlen(s, t2, src)
				if l2 > l {
					l = l2
					t = t2
				}
			}
			// matchToken is flate's equivalent of Snappy's emitCopy. (length,offset)
			dst.tokens[dst.n] = matchToken(uint32(l+4-baseMatchLength), uint32(s-t-baseMatchOffset))
			dst.n++
			s += l
			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2, s-1 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6432(src, s-2)
			prevHash := hash(uint32(x))

			e.table[prevHash&tableMask] = tableEntryPrev{
				Prev: e.table[prevHash&tableMask].Cur,
				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
			}
			x >>= 8
			prevHash = hash(uint32(x))

			e.table[prevHash&tableMask] = tableEntryPrev{
				Prev: e.table[prevHash&tableMask].Cur,
				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
			}
			x >>= 8
			currHash := hash(uint32(x))
			candidates := e.table[currHash&tableMask]
			cv = uint32(x)
			e.table[currHash&tableMask] = tableEntryPrev{
				Prev: candidates.Cur,
				Cur:  tableEntry{offset: s + e.cur, val: cv},
			}

			// Check both candidates
			candidate = candidates.Cur
			candidateAlt = tableEntry{}
			if cv == candidate.val {
				offset := s - (candidate.offset - e.cur)
				if offset < maxMatchOffset {
					offset = s - (candidates.Prev.offset - e.cur)
					if cv == candidates.Prev.val && offset < maxMatchOffset {
						candidateAlt = candidates.Prev
					}
					continue
				}
			} else {
				// We only check if value mismatches.
				// Offset will always be invalid in other cases.
				candidate = candidates.Prev
				if cv == candidate.val {
					offset := s - (candidate.offset - e.cur)
					if offset < maxMatchOffset {
						continue
					}
				}
			}
			cv = uint32(x >> 8)
			nextHash = hash(cv)
			s++
			break
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		emitLiteral(dst, src[nextEmit:])
	}
	e.cur += int32(len(src))
	e.prev = e.prev[:len(src)]
	copy(e.prev, src)
}

func (e *snappyGen) matchlen(s, t int32, src []byte) int32 {
	s1 := int(s) + maxMatchLength - 4
	if s1 > len(src) {
		s1 = len(src)
	}

	// If we are inside the current block
	if t >= 0 {
		b := src[t:]
		a := src[s:s1]
		b = b[:len(a)]
		// Extend the match to be as long as possible.
		for i := range a {
			if a[i] != b[i] {
				return int32(i)
			}
		}
		return int32(len(a))
	}

	// We found a match in the previous block.
	tp := int32(len(e.prev)) + t
	if tp < 0 {
		return 0
	}

	// Extend the match to be as long as possible.
	a := src[s:s1]
	b := e.prev[tp:]
	if len(b) > len(a) {
		b = b[:len(a)]
	}
	a = a[:len(b)]
	for i := range b {
		if a[i] != b[i] {
			return int32(i)
		}
	}
	n := int32(len(b))
	a = src[s+n : s1]
	b = src[:len(a)]
	for i := range a {
		if a[i] != b[i] {
			return int32(i) + n
		}
	}
	return int32(len(a)) + n
}

// Reset the encoding table.
func (e *snappyGen) Reset() {
	e.prev = e.prev[:0]
	e.cur += maxMatchOffset + 1
}
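matchlen encodes "previous block" positions as a negative t, indexing the retained prev buffer from its end and continuing into the current block. A toy illustration of that convention (a simplified sketch without the maxMatchLength cap, not the vendored code):

// Toy illustration of the negative-offset convention in matchlen above.
package main

import "fmt"

func matchAcross(prev, cur []byte, s, t int) int {
	n := 0
	get := func(i int) byte {
		if i < 0 {
			return prev[len(prev)+i] // negative index: read from the previous block
		}
		return cur[i]
	}
	for s+n < len(cur) && get(t+n) == cur[s+n] {
		n++
	}
	return n
}

func main() {
	prev := []byte("...hello")
	cur := []byte("hello world")
	fmt.Println(matchAcross(prev, cur, 0, -5)) // 5: "hello" continues across blocks
}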
@@ -0,0 +1,252 @@
package flate

import (
	"io"
	"math"
)

const (
	maxStatelessBlock = math.MaxInt16

	slTableBits  = 13
	slTableSize  = 1 << slTableBits
	slTableShift = 32 - slTableBits
)

type statelessWriter struct {
	dst    io.Writer
	closed bool
}

func (s *statelessWriter) Close() error {
	if s.closed {
		return nil
	}
	s.closed = true
	// Emit EOF block
	return StatelessDeflate(s.dst, nil, true)
}

func (s *statelessWriter) Write(p []byte) (n int, err error) {
	err = StatelessDeflate(s.dst, p, false)
	if err != nil {
		return 0, err
	}
	return len(p), nil
}

func (s *statelessWriter) Reset(w io.Writer) {
	s.dst = w
	s.closed = false
}

// NewStatelessWriter will do compression without maintaining any state
// between Write calls. No memory is kept between Write calls, but
// compression and speed will be suboptimal.
// Because of this, the size of actual Write calls will affect output size.
func NewStatelessWriter(dst io.Writer) io.WriteCloser {
	return &statelessWriter{dst: dst}
}
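A brief usage sketch for the writer above, assuming this file is vendored at github.com/klauspost/compress/flate (errors elided for brevity):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer
	w := flate.NewStatelessWriter(&buf)

	// Each Write is compressed independently, so fewer, larger writes
	// compress better than many small ones.
	_, _ = w.Write([]byte("hello, hello, hello, world"))
	_ = w.Close() // emits the final EOF block

	fmt.Println("compressed to", buf.Len(), "bytes")
}
```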

// StatelessDeflate allows compressing directly to a Writer without retaining state.
// When it returns, everything will have been flushed.
func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
	var dst tokens
	bw := newHuffmanBitWriter(out)
	if eof && len(in) == 0 {
		// Just write an EOF block.
		// Could be faster...
		bw.writeStoredHeader(0, true)
		bw.flush()
		return bw.err
	}

	for len(in) > 0 {
		todo := in
		if len(todo) > maxStatelessBlock {
			todo = todo[:maxStatelessBlock]
		}
		in = in[len(todo):]
		// Compress
		statelessEnc(&dst, todo)
		isEof := eof && len(in) == 0

		if dst.n == 0 {
			bw.writeStoredHeader(len(todo), isEof)
			if bw.err != nil {
				return bw.err
			}
			bw.writeBytes(todo)
		} else if int(dst.n) > len(todo)-len(todo)>>4 {
			// If matching removed less than 1/16th of the input,
			// huffman compress the block without matches.
			bw.writeBlockHuff(isEof, todo, false)
		} else {
			bw.writeBlockDynamic(&dst, isEof, todo, false)
		}
		if bw.err != nil {
			return bw.err
		}
		dst.Reset()
	}
	if !eof {
		// Align.
		bw.writeStoredHeader(0, false)
	}
	bw.flush()
	return bw.err
}

func hashSL(u uint32) uint32 {
	return (u * 0x1e35a7bd) >> slTableShift
}

func load3216(b []byte, i int16) uint32 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:4]
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

func load6416(b []byte, i int16) uint64 {
	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
	b = b[i:]
	b = b[:8]
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
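The reslicing in the two loaders above (`b = b[i:]; b = b[:8]`) lets the compiler prove that one bounds check covers all eight byte reads. For illustration, they are equivalent to the standard library's little-endian loads (encoding/binary is not imported by this file; this is just a hypothetical cross-check):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func load6416(b []byte, i int16) uint64 {
	b = b[i:]
	b = b[:8] // single bounds check for the eight reads below
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}

func main() {
	p := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9}
	fmt.Println(load6416(p, 1) == binary.LittleEndian.Uint64(p[1:])) // true
}
```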

func statelessEnc(dst *tokens, src []byte) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	type tableEntry struct {
		offset int16
	}

	var table [slTableSize]tableEntry

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = uint16(len(src))
		return
	}

	s := int16(1)
	nextEmit := int16(0)
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int16(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load3216(src, s)

	for {
		const skipLog = 5
		const doEvery = 2

		nextS := s
		var candidate tableEntry
		for {
			nextHash := hashSL(cv)
			candidate = table[nextHash]
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit || nextS <= 0 {
				goto emitRemainder
			}

			now := load6416(src, nextS)
			table[nextHash] = tableEntry{offset: s}
			nextHash = hashSL(uint32(now))

			if cv == load3216(src, candidate.offset) {
				table[nextHash] = tableEntry{offset: nextS}
				break
			}

			// Do one right away...
			cv = uint32(now)
			s = nextS
			nextS++
			candidate = table[nextHash]
			now >>= 8
			table[nextHash] = tableEntry{offset: s}

			if cv == load3216(src, candidate.offset) {
				table[nextHash] = tableEntry{offset: nextS}
				break
			}
			cv = uint32(now)
			s = nextS
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			t := candidate.offset
			l := int16(matchLen(src[s+4:], src[t+4:]) + 4)

			// Extend backwards
			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
				s--
				t--
				l++
			}
			if nextEmit < s {
				emitLiteral(dst, src[nextEmit:s])
			}

			// Save the match found
			dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
			s += l
			nextEmit = s
			if nextS >= s {
				s = nextS + 1
			}
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6416(src, s-2)
			o := s - 2
			prevHash := hashSL(uint32(x))
			table[prevHash] = tableEntry{offset: o}
			x >>= 16
			currHash := hashSL(uint32(x))
			candidate = table[currHash]
			table[currHash] = tableEntry{offset: o + 2}

			if uint32(x) != load3216(src, candidate.offset) {
				cv = uint32(x >> 8)
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}
		emitLiteral(dst, src[nextEmit:])
	}
}
@@ -1,24 +0,0 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
@@ -1,8 +0,0 @@
language: go

go:
  - 1.3
  - 1.4
  - 1.5
  - 1.6
  - tip
@@ -1,22 +0,0 @@
The MIT License (MIT)

Copyright (c) 2015 Klaus Post

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,145 +0,0 @@
# cpuid
Package cpuid provides information about the CPU running the current program.

CPU features are detected on startup, and kept for fast access through the life of the application.
Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use.

You can access the CPU information by accessing the shared CPU variable of the cpuid library.

Package home: https://github.com/klauspost/cpuid

[![GoDoc][1]][2] [![Build Status][3]][4]

[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg
[2]: https://godoc.org/github.com/klauspost/cpuid
[3]: https://travis-ci.org/klauspost/cpuid.svg
[4]: https://travis-ci.org/klauspost/cpuid

# features
## CPU Instructions
* **CMOV** (i686 CMOV)
* **NX** (NX (No-Execute) bit)
* **AMD3DNOW** (AMD 3DNOW)
* **AMD3DNOWEXT** (AMD 3DNowExt)
* **MMX** (standard MMX)
* **MMXEXT** (SSE integer functions or AMD MMX ext)
* **SSE** (SSE functions)
* **SSE2** (P4 SSE functions)
* **SSE3** (Prescott SSE3 functions)
* **SSSE3** (Conroe SSSE3 functions)
* **SSE4** (Penryn SSE4.1 functions)
* **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions)
* **SSE42** (Nehalem SSE4.2 functions)
* **AVX** (AVX functions)
* **AVX2** (AVX2 functions)
* **FMA3** (Intel FMA 3)
* **FMA4** (Bulldozer FMA4 functions)
* **XOP** (Bulldozer XOP functions)
* **F16C** (Half-precision floating-point conversion)
* **BMI1** (Bit Manipulation Instruction Set 1)
* **BMI2** (Bit Manipulation Instruction Set 2)
* **TBM** (AMD Trailing Bit Manipulation)
* **LZCNT** (LZCNT instruction)
* **POPCNT** (POPCNT instruction)
* **AESNI** (Advanced Encryption Standard New Instructions)
* **CLMUL** (Carry-less Multiplication)
* **HTT** (Hyperthreading (enabled))
* **HLE** (Hardware Lock Elision)
* **RTM** (Restricted Transactional Memory)
* **RDRAND** (RDRAND instruction is available)
* **RDSEED** (RDSEED instruction is available)
* **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions))
* **SHA** (Intel SHA Extensions)
* **AVX512F** (AVX-512 Foundation)
* **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions)
* **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions)
* **AVX512PF** (AVX-512 Prefetch Instructions)
* **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions)
* **AVX512CD** (AVX-512 Conflict Detection Instructions)
* **AVX512BW** (AVX-512 Byte and Word Instructions)
* **AVX512VL** (AVX-512 Vector Length Extensions)
* **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions)
* **MPX** (Intel MPX (Memory Protection Extensions))
* **ERMS** (Enhanced REP MOVSB/STOSB)
* **RDTSCP** (RDTSCP Instruction)
* **CX16** (CMPXCHG16B Instruction)
* **SGX** (Software Guard Extensions, with activation details)

## Performance
* **RDTSCP()** Returns current cycle count. Can be used for benchmarking.
* **SSE2SLOW** (SSE2 is supported, but usually not faster)
* **SSE3SLOW** (SSE3 is supported, but usually not faster)
* **ATOM** (Atom processor, some SSSE3 instructions are slower)
* **Cache line** (Probable size of a cache line).
* **L1, L2, L3 Cache size** on newer Intel/AMD CPUs.

## CPU Vendor/VM
* **Intel**
* **AMD**
* **VIA**
* **Transmeta**
* **NSC**
* **KVM** (Kernel-based Virtual Machine)
* **MSVM** (Microsoft Hyper-V or Windows Virtual PC)
* **VMware**
* **XenHVM**

# installing

`go get github.com/klauspost/cpuid`

# example

```Go
package main

import (
	"fmt"
	"github.com/klauspost/cpuid"
)

func main() {
	// Print basic CPU information:
	fmt.Println("Name:", cpuid.CPU.BrandName)
	fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores)
	fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore)
	fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores)
	fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model)
	fmt.Println("Features:", cpuid.CPU.Features)
	fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine)
	fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes")
	fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1I, "bytes")
	fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes")
	fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes")

	// Test if we have a specific feature:
	if cpuid.CPU.SSE() {
		fmt.Println("We have Streaming SIMD Extensions")
	}
}
```

Sample output:
```
>go run main.go
Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz
PhysicalCores: 2
ThreadsPerCore: 2
LogicalCores: 4
Family 6 Model: 42
Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL
Cacheline bytes: 64
We have Streaming SIMD Extensions
```

# private package

In the "private" folder you can find an autogenerated version of the library you can include in your own packages.

For this purpose all exports are removed, and functions and constants are lowercased.

This is not the recommended way of using the library, but it is provided for convenience if using external packages is difficult for you.

# license

This code is published under an MIT license. See LICENSE file for more information.
File diff suppressed because it is too large
@@ -1,42 +0,0 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.

// +build 386,!gccgo

// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
TEXT ·asmCpuid(SB), 7, $0
	XORL CX, CX
	MOVL op+0(FP), AX
	CPUID
	MOVL AX, eax+4(FP)
	MOVL BX, ebx+8(FP)
	MOVL CX, ecx+12(FP)
	MOVL DX, edx+16(FP)
	RET

// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
TEXT ·asmCpuidex(SB), 7, $0
	MOVL op+0(FP), AX
	MOVL op2+4(FP), CX
	CPUID
	MOVL AX, eax+8(FP)
	MOVL BX, ebx+12(FP)
	MOVL CX, ecx+16(FP)
	MOVL DX, edx+20(FP)
	RET

// func asmXgetbv(index uint32) (eax, edx uint32)
TEXT ·asmXgetbv(SB), 7, $0
	MOVL index+0(FP), CX
	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
	MOVL AX, eax+4(FP)
	MOVL DX, edx+8(FP)
	RET

// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
TEXT ·asmRdtscpAsm(SB), 7, $0
	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
	MOVL AX, eax+0(FP)
	MOVL BX, ebx+4(FP)
	MOVL CX, ecx+8(FP)
	MOVL DX, edx+12(FP)
	RET
@@ -1,42 +0,0 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.

// +build amd64,!gccgo

// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
TEXT ·asmCpuid(SB), 7, $0
	XORQ CX, CX
	MOVL op+0(FP), AX
	CPUID
	MOVL AX, eax+8(FP)
	MOVL BX, ebx+12(FP)
	MOVL CX, ecx+16(FP)
	MOVL DX, edx+20(FP)
	RET

// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
TEXT ·asmCpuidex(SB), 7, $0
	MOVL op+0(FP), AX
	MOVL op2+4(FP), CX
	CPUID
	MOVL AX, eax+8(FP)
	MOVL BX, ebx+12(FP)
	MOVL CX, ecx+16(FP)
	MOVL DX, edx+20(FP)
	RET

// func asmXgetbv(index uint32) (eax, edx uint32)
TEXT ·asmXgetbv(SB), 7, $0
	MOVL index+0(FP), CX
	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
	MOVL AX, eax+8(FP)
	MOVL DX, edx+12(FP)
	RET

// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
TEXT ·asmRdtscpAsm(SB), 7, $0
	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
	MOVL AX, eax+0(FP)
	MOVL BX, ebx+4(FP)
	MOVL CX, ecx+8(FP)
	MOVL DX, edx+12(FP)
	RET
@@ -1,17 +0,0 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.

// +build 386,!gccgo amd64,!gccgo

package cpuid

func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
func asmXgetbv(index uint32) (eax, edx uint32)
func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)

func initCPU() {
	cpuid = asmCpuid
	cpuidex = asmCpuidex
	xgetbv = asmXgetbv
	rdtscpAsm = asmRdtscpAsm
}
@@ -1,23 +0,0 @@
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.

// +build !amd64,!386 gccgo

package cpuid

func initCPU() {
	cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) {
		return 0, 0, 0, 0
	}

	cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
		return 0, 0, 0, 0
	}

	xgetbv = func(index uint32) (eax, edx uint32) {
		return 0, 0
	}

	rdtscpAsm = func() (eax, ebx, ecx, edx uint32) {
		return 0, 0, 0, 0
	}
}
@@ -1,3 +0,0 @@
package cpuid

//go:generate go run private-gen.go
@@ -1,24 +0,0 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof
@@ -1,13 +0,0 @@
language: go

go:
  - 1.3
  - 1.4
  - 1.5
  - 1.6
  - 1.7
  - tip

script:
  - go test -v .
  - go test -v -race .
@@ -1,28 +0,0 @@
Copyright (c) 2012 The Go Authors. All rights reserved.
Copyright (c) 2015 Klaus Post

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
   * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -1,87 +0,0 @@
# crc32
CRC32 hash with x64 optimizations

This package is a drop-in replacement for the standard library `hash/crc32` package that features SSE 4.2 optimizations on x64 platforms for a 10x speedup.

[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)

# usage

Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.

Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
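A minimal sketch of that drop-in usage (the exported API mirrors `hash/crc32`):

```go
package main

import (
	"fmt"

	"github.com/klauspost/crc32" // instead of "hash/crc32"
)

func main() {
	data := []byte("hello world")

	// Identical calls to the standard library.
	fmt.Printf("IEEE:       %08x\n", crc32.ChecksumIEEE(data))

	tab := crc32.MakeTable(crc32.Castagnoli)
	fmt.Printf("Castagnoli: %08x\n", crc32.Checksum(data, tab))
}
```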

# changes
* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.

# performance

With *Go 1.7*, performance is equivalent to the standard library, so if you use Go 1.7 you can switch back.

For IEEE tables (the most common), there is approximately a factor 10 speedup with the "CLMUL" (Carry-less Multiplication) instruction:
```
benchmark            old ns/op     new ns/op     delta
BenchmarkCrc32KB     99955         10258         -89.74%

benchmark            old MB/s     new MB/s     speedup
BenchmarkCrc32KB     327.83       3194.20      9.74x
```

For other tables and "CLMUL"-capable machines the performance is the same as the standard library.

Here are some detailed benchmarks, comparing to the Go 1.5 standard library with and without assembler enabled.

```
Std:        Standard Go 1.5 library
Crc:        Indicates IEEE type CRC.
40B:        Size of each slice encoded.
NoAsm:      Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
Castagnoli: Castagnoli CRC type.

BenchmarkStdCrc40B-4                10000000      158 ns/op     252.88 MB/s
BenchmarkCrc40BNoAsm-4              20000000      105 ns/op     377.38 MB/s (slice8)
BenchmarkCrc40B-4                   20000000      105 ns/op     378.77 MB/s (slice8)

BenchmarkStdCrc1KB-4                  500000     3604 ns/op     284.10 MB/s
BenchmarkCrc1KBNoAsm-4               1000000     1463 ns/op     699.79 MB/s (slice8)
BenchmarkCrc1KB-4                    3000000      396 ns/op    2583.69 MB/s (asm)

BenchmarkStdCrc8KB-4                  200000    11417 ns/op     717.48 MB/s (slice8)
BenchmarkCrc8KBNoAsm-4                200000    11317 ns/op     723.85 MB/s (slice8)
BenchmarkCrc8KB-4                     500000     2919 ns/op    2805.73 MB/s (asm)

BenchmarkStdCrc32KB-4                  30000    45749 ns/op     716.24 MB/s (slice8)
BenchmarkCrc32KBNoAsm-4                30000    45109 ns/op     726.42 MB/s (slice8)
BenchmarkCrc32KB-4                    100000    11497 ns/op    2850.09 MB/s (asm)

BenchmarkStdNoAsmCastagnol40B-4     10000000      161 ns/op     246.94 MB/s
BenchmarkStdCastagnoli40B-4         50000000     28.4 ns/op    1410.69 MB/s (asm)
BenchmarkCastagnoli40BNoAsm-4       20000000      100 ns/op     398.01 MB/s (slice8)
BenchmarkCastagnoli40B-4            50000000     28.2 ns/op    1419.54 MB/s (asm)

BenchmarkStdNoAsmCastagnoli1KB-4      500000     3622 ns/op     282.67 MB/s
BenchmarkStdCastagnoli1KB-4         10000000      144 ns/op    7099.78 MB/s (asm)
BenchmarkCastagnoli1KBNoAsm-4        1000000     1475 ns/op     694.14 MB/s (slice8)
BenchmarkCastagnoli1KB-4            10000000      146 ns/op    6993.35 MB/s (asm)

BenchmarkStdNoAsmCastagnoli8KB-4       50000    28781 ns/op     284.63 MB/s
BenchmarkStdCastagnoli8KB-4          1000000     1029 ns/op    7957.89 MB/s (asm)
BenchmarkCastagnoli8KBNoAsm-4         200000    11410 ns/op     717.94 MB/s (slice8)
BenchmarkCastagnoli8KB-4             1000000     1000 ns/op    8188.71 MB/s (asm)

BenchmarkStdNoAsmCastagnoli32KB-4      10000   115426 ns/op     283.89 MB/s
BenchmarkStdCastagnoli32KB-4          300000     4065 ns/op    8059.13 MB/s (asm)
BenchmarkCastagnoli32KBNoAsm-4         30000    45171 ns/op     725.41 MB/s (slice8)
BenchmarkCastagnoli32KB-4             500000     4077 ns/op    8035.89 MB/s (asm)
```

The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.

However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.

# license

Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
@@ -1,207 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
// information.
//
// Polynomials are represented in LSB-first form, also known as reversed representation.
//
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
// for information.
package crc32

import (
	"hash"
	"sync"
)

// The size of a CRC-32 checksum in bytes.
const Size = 4

// Predefined polynomials.
const (
	// IEEE is by far and away the most common CRC-32 polynomial.
	// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
	IEEE = 0xedb88320

	// Castagnoli's polynomial, used in iSCSI.
	// Has better error detection characteristics than IEEE.
	// http://dx.doi.org/10.1109/26.231911
	Castagnoli = 0x82f63b78

	// Koopman's polynomial.
	// Also has better error detection characteristics than IEEE.
	// http://dx.doi.org/10.1109/DSN.2002.1028931
	Koopman = 0xeb31d82e
)

// Table is a 256-word table representing the polynomial for efficient processing.
type Table [256]uint32

// This file makes use of functions implemented in architecture-specific files.
// The interface that they implement is as follows:
//
//    // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
//    // algorithm is available.
//    archAvailableIEEE() bool
//
//    // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
//    // It can only be called if archAvailableIEEE() returns true.
//    archInitIEEE()
//
//    // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
//    // archInitIEEE() was previously called.
//    archUpdateIEEE(crc uint32, p []byte) uint32
//
//    // archAvailableCastagnoli reports whether an architecture-specific
//    // CRC32-C algorithm is available.
//    archAvailableCastagnoli() bool
//
//    // archInitCastagnoli initializes the architecture-specific CRC32-C
//    // algorithm. It can only be called if archAvailableCastagnoli() returns
//    // true.
//    archInitCastagnoli()
//
//    // archUpdateCastagnoli updates the given CRC32-C. It can only be called
//    // if archInitCastagnoli() was previously called.
//    archUpdateCastagnoli(crc uint32, p []byte) uint32

// castagnoliTable points to a lazily initialized Table for the Castagnoli
// polynomial. MakeTable will always return this value when asked to make a
// Castagnoli table so we can compare against it to find when the caller is
// using this polynomial.
var castagnoliTable *Table
var castagnoliTable8 *slicing8Table
var castagnoliArchImpl bool
var updateCastagnoli func(crc uint32, p []byte) uint32
var castagnoliOnce sync.Once

func castagnoliInit() {
	castagnoliTable = simpleMakeTable(Castagnoli)
	castagnoliArchImpl = archAvailableCastagnoli()

	if castagnoliArchImpl {
		archInitCastagnoli()
		updateCastagnoli = archUpdateCastagnoli
	} else {
		// Initialize the slicing-by-8 table.
		castagnoliTable8 = slicingMakeTable(Castagnoli)
		updateCastagnoli = func(crc uint32, p []byte) uint32 {
			return slicingUpdate(crc, castagnoliTable8, p)
		}
	}
}

// IEEETable is the table for the IEEE polynomial.
var IEEETable = simpleMakeTable(IEEE)

// ieeeTable8 is the slicing8Table for IEEE
var ieeeTable8 *slicing8Table
var ieeeArchImpl bool
var updateIEEE func(crc uint32, p []byte) uint32
var ieeeOnce sync.Once

func ieeeInit() {
	ieeeArchImpl = archAvailableIEEE()

	if ieeeArchImpl {
		archInitIEEE()
		updateIEEE = archUpdateIEEE
	} else {
		// Initialize the slicing-by-8 table.
		ieeeTable8 = slicingMakeTable(IEEE)
		updateIEEE = func(crc uint32, p []byte) uint32 {
			return slicingUpdate(crc, ieeeTable8, p)
		}
	}
}

// MakeTable returns a Table constructed from the specified polynomial.
// The contents of this Table must not be modified.
func MakeTable(poly uint32) *Table {
	switch poly {
	case IEEE:
		ieeeOnce.Do(ieeeInit)
		return IEEETable
	case Castagnoli:
		castagnoliOnce.Do(castagnoliInit)
		return castagnoliTable
	}
	return simpleMakeTable(poly)
}

// digest represents the partial evaluation of a checksum.
type digest struct {
	crc uint32
	tab *Table
}

// New creates a new hash.Hash32 computing the CRC-32 checksum
// using the polynomial represented by the Table.
// Its Sum method will lay the value out in big-endian byte order.
func New(tab *Table) hash.Hash32 {
	if tab == IEEETable {
		ieeeOnce.Do(ieeeInit)
	}
	return &digest{0, tab}
}

// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
// using the IEEE polynomial.
// Its Sum method will lay the value out in big-endian byte order.
func NewIEEE() hash.Hash32 { return New(IEEETable) }

func (d *digest) Size() int { return Size }

func (d *digest) BlockSize() int { return 1 }

func (d *digest) Reset() { d.crc = 0 }

// Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 {
	switch tab {
	case castagnoliTable:
		return updateCastagnoli(crc, p)
	case IEEETable:
		// Unfortunately, because IEEETable is exported, IEEE may be used without a
		// call to MakeTable. We have to make sure it gets initialized in that case.
		ieeeOnce.Do(ieeeInit)
		return updateIEEE(crc, p)
	default:
		return simpleUpdate(crc, tab, p)
	}
}

func (d *digest) Write(p []byte) (n int, err error) {
	switch d.tab {
	case castagnoliTable:
		d.crc = updateCastagnoli(d.crc, p)
	case IEEETable:
		// We only create digest objects through New() which takes care of
		// initialization in this case.
		d.crc = updateIEEE(d.crc, p)
	default:
		d.crc = simpleUpdate(d.crc, d.tab, p)
	}
	return len(p), nil
}

func (d *digest) Sum32() uint32 { return d.crc }

func (d *digest) Sum(in []byte) []byte {
	s := d.Sum32()
	return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
}

// Checksum returns the CRC-32 checksum of data
// using the polynomial represented by the Table.
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }

// ChecksumIEEE returns the CRC-32 checksum of data
// using the IEEE polynomial.
func ChecksumIEEE(data []byte) uint32 {
	ieeeOnce.Do(ieeeInit)
	return updateIEEE(0, data)
}
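For reference, a sketch of the streaming interface this file implements, shown against the standard `hash/crc32` (whose API is identical to this vendored copy):

```go
package main

import (
	"fmt"
	"hash/crc32" // same API as the vendored package above
	"io"
	"strings"
)

func main() {
	tab := crc32.MakeTable(crc32.Castagnoli)

	// Streaming: New returns a hash.Hash32 backed by the digest type above.
	h := crc32.New(tab)
	_, _ = io.Copy(h, strings.NewReader("some streamed data"))

	// One-shot Checksum is defined as Update(0, tab, data) and must agree.
	same := h.Sum32() == crc32.Checksum([]byte("some streamed data"), tab)
	fmt.Println(same) // true
}
```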
@@ -1,230 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine,!gccgo

// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
// description of the interface that each architecture-specific file
// implements.

package crc32

import "unsafe"

// This file contains the code to call the SSE 4.2 version of the Castagnoli
// and IEEE CRC.

// haveSSE41/haveSSE42/haveCLMUL are defined in crc_amd64.s and use
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
func haveSSE41() bool
func haveSSE42() bool
func haveCLMUL() bool

// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32

// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42Triple(
	crcA, crcB, crcC uint32,
	a, b, c []byte,
	rounds uint32,
) (retA uint32, retB uint32, retC uint32)

// ieeeCLMUL is defined in crc_amd64.s and uses the PCLMULQDQ
// instruction as well as SSE 4.1.
//go:noescape
func ieeeCLMUL(crc uint32, p []byte) uint32

var sse42 = haveSSE42()
var useFastIEEE = haveCLMUL() && haveSSE41()

const castagnoliK1 = 168
const castagnoliK2 = 1344

type sse42Table [4]Table

var castagnoliSSE42TableK1 *sse42Table
var castagnoliSSE42TableK2 *sse42Table

func archAvailableCastagnoli() bool {
	return sse42
}

func archInitCastagnoli() {
	if !sse42 {
		panic("arch-specific Castagnoli not available")
	}
	castagnoliSSE42TableK1 = new(sse42Table)
	castagnoliSSE42TableK2 = new(sse42Table)
	// See description in updateCastagnoli.
	//    t[0][i] = CRC(i000, O)
	//    t[1][i] = CRC(0i00, O)
	//    t[2][i] = CRC(00i0, O)
	//    t[3][i] = CRC(000i, O)
	// where O is a sequence of K zeros.
	var tmp [castagnoliK2]byte
	for b := 0; b < 4; b++ {
		for i := 0; i < 256; i++ {
			val := uint32(i) << uint32(b*8)
			castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
			castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
		}
	}
}

// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
// table given) with the given initial crc value. This corresponds to
// CRC(crc, O) in the description in updateCastagnoli.
func castagnoliShift(table *sse42Table, crc uint32) uint32 {
	return table[3][crc>>24] ^
		table[2][(crc>>16)&0xFF] ^
		table[1][(crc>>8)&0xFF] ^
		table[0][crc&0xFF]
}

func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
	if !sse42 {
		panic("not available")
	}

	// This method is inspired from the algorithm in Intel's white paper:
	//    "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
	// The same strategy of splitting the buffer in three is used but the
	// combining calculation is different; the complete derivation is explained
	// below.
	//
	// -- The basic idea --
	//
	// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
	// time. In recent Intel architectures the instruction takes 3 cycles;
	// however the processor can pipeline up to three instructions if they
	// don't depend on each other.
	//
	// Roughly this means that we can process three buffers in about the same
	// time we can process one buffer.
	//
	// The idea is then to split the buffer in three, CRC the three pieces
	// separately and then combine the results.
	//
	// Combining the results requires precomputed tables, so we must choose a
	// fixed buffer length to optimize. The longer the length, the faster; but
	// only buffers longer than this length will use the optimization. We choose
	// two cutoffs and compute tables for both:
	//  - one around 512: 168*3=504
	//  - one around 4KB: 1344*3=4032
	//
	// -- The nitty gritty --
	//
	// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
	// initial non-inverted CRC I). This function has the following properties:
	//   (a) CRC(I, AB) = CRC(CRC(I, A), B)
	//   (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
	//
	// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
	// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
	// bytes.
	//
	// CRC(I, ABC) = CRC(I, ABO xor C)
	//             = CRC(I, ABO) xor CRC(0, C)
	//             = CRC(CRC(I, AB), O) xor CRC(0, C)
	//             = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
	//             = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
	//             = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
	//
	// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
	// and CRC(0, C) efficiently. We just need to find a way to quickly compute
	// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
	// values; since we can't have a 32-bit table, we break it up into four
	// 8-bit tables:
	//
	//    CRC(uvwx, O) = CRC(u000, O) xor
	//                   CRC(0v00, O) xor
	//                   CRC(00w0, O) xor
	//                   CRC(000x, O)
	//
	// We can compute tables corresponding to the four terms for all 8-bit
	// values.

	crc = ^crc

	// If a buffer is long enough to use the optimization, process the first few
	// bytes to align the buffer to an 8 byte boundary (if necessary).
	if len(p) >= castagnoliK1*3 {
		delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
		if delta != 0 {
			delta = 8 - delta
			crc = castagnoliSSE42(crc, p[:delta])
			p = p[delta:]
		}
	}

	// Process 3*K2 at a time.
	for len(p) >= castagnoliK2*3 {
		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
		crcA, crcB, crcC := castagnoliSSE42Triple(
			crc, 0, 0,
			p, p[castagnoliK2:], p[castagnoliK2*2:],
			castagnoliK2/24)

		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
		crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
		crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
		p = p[castagnoliK2*3:]
	}

	// Process 3*K1 at a time.
	for len(p) >= castagnoliK1*3 {
		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
		crcA, crcB, crcC := castagnoliSSE42Triple(
			crc, 0, 0,
			p, p[castagnoliK1:], p[castagnoliK1*2:],
			castagnoliK1/24)

		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
		crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
		crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
		p = p[castagnoliK1*3:]
	}

	// Use the simple implementation for what's left.
	crc = castagnoliSSE42(crc, p)
	return ^crc
}
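The derivation above leans on the prefix property (a): CRC(I, AB) = CRC(CRC(I, A), B). A quick sanity check of that property using the standard `hash/crc32`, which hides the inversion convention inside Update:

```go
package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	tab := crc32.MakeTable(crc32.Castagnoli)
	a, b := []byte("first half, "), []byte("second half")

	// CRC of the concatenation AB in one pass...
	whole := crc32.Checksum(append(append([]byte{}, a...), b...), tab)

	// ...equals CRC of A continued over B, i.e. CRC(CRC(I, A), B).
	split := crc32.Update(crc32.Update(0, tab, a), tab, b)

	fmt.Println(whole == split) // true
}
```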

func archAvailableIEEE() bool {
	return useFastIEEE
}

var archIeeeTable8 *slicing8Table

func archInitIEEE() {
	if !useFastIEEE {
		panic("not available")
	}
	// We still use slicing-by-8 for small buffers.
	archIeeeTable8 = slicingMakeTable(IEEE)
}

func archUpdateIEEE(crc uint32, p []byte) uint32 {
	if !useFastIEEE {
		panic("not available")
	}

	if len(p) >= 64 {
		left := len(p) & 15
		do := len(p) - left
		crc = ^ieeeCLMUL(^crc, p[:do])
		p = p[do:]
	}
	if len(p) == 0 {
		return crc
	}
	return slicingUpdate(crc, archIeeeTable8, p)
}
@@ -1,319 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build gc

#define NOSPLIT 4
#define RODATA  8

// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX     // CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL   less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ   aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	SUBQ $1, BX
	XORQ $7, BX

	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI

align_2:
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL   less_than_8

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET

// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ  loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET

// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX
	CPUID
	SHRQ $20, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// func haveCLMUL() bool
TEXT ·haveCLMUL(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX
	CPUID
	SHRQ $1, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// func haveSSE41() bool
TEXT ·haveSSE41(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX
	CPUID
	SHRQ $19, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET

// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
DATA r2r1kp<>+0(SB)/8, $0x154442bd4
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
DATA r4r3kp<>+0(SB)/8, $0x1751997d0
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
DATA rupolykp<>+0(SB)/8, $0x1db710641
DATA rupolykp<>+8(SB)/8, $0x1f7011641
DATA r5kp<>+0(SB)/8, $0x163cd6124

GLOBL r2r1kp<>(SB), RODATA, $16
GLOBL r4r3kp<>(SB), RODATA, $16
GLOBL rupolykp<>(SB), RODATA, $16
GLOBL r5kp<>(SB), RODATA, $8

// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
	MOVL crc+0(FP), X0     // Initial CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR  X0, X1
	ADDQ  $64, SI  // buf+=64
	SUBQ  $64, CX  // len-=64
	CMPQ  CX, $64  // Less than 64 bytes left
	JB    remain64

	MOVOA r2r1kp<>+0(SB), X0

loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $0x40, DI
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left?
	JGE  loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3kp<>+0(SB), X0

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X2, X1

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X3, X1

	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X4, X1

	// If there is less than 16 bytes left we are done
	CMPQ CX, $16
	JB   finish

	// Encode 16 bytes
remain16:
	MOVOU     (SI), X10
	MOVOA     X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR      X5, X1
	PXOR      X10, X1
	SUBQ      $16, CX
	ADDQ      $16, SI
	CMPQ      CX, $16
	JGE       remain16

finish:
	// Fold final result into 32 bits and return it
	PCMPEQB   X3, X3
	PCLMULQDQ $1, X1, X0
	PSRLDQ    $8, X1
	PXOR      X0, X1

	MOVOA X1, X2
	MOVQ  r5kp<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ    $4, X2
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	MOVOA rupolykp<>+0(SB), X0

	MOVOA     X1, X2
	PAND      X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND      X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR      X2, X1

	// PEXTRD $1, X1, AX (SSE 4.1)
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x16; BYTE $0xc8; BYTE $0x01
	MOVL AX, ret+32(FP)

	RET
@@ -1,43 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine,!gccgo

package crc32

// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC.

// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool

// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32

var sse42 = haveSSE42()

func archAvailableCastagnoli() bool {
	return sse42
}

func archInitCastagnoli() {
	if !sse42 {
		panic("not available")
	}
	// No initialization necessary.
}

func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
	if !sse42 {
		panic("not available")
	}
	return castagnoliSSE42(crc, p)
}

func archAvailableIEEE() bool                    { return false }
func archInitIEEE()                              { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
@ -1,67 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build gc

#define NOSPLIT 4
#define RODATA 8

// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX   // CRC value
	MOVL p+4(FP), SI     // data pointer
	MOVL p_len+8(FP), CX // len(p)

	NOTL AX

	// If there are fewer than 8 bytes to process, we do it byte-by-byte.
	CMPQ CX, $8
	JL   cleanup

	// Process individual bytes until the input is 8-byte aligned.
startup:
	MOVQ SI, BX
	ANDQ $7, BX
	JZ   aligned

	CRC32B (SI), AX
	DECQ   CX
	INCQ   SI
	JMP    startup

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL   cleanup

	CRC32Q (SI), AX
	ADDQ   $8, SI
	SUBQ   $8, CX
	JMP    aligned

cleanup:
	// We may have some bytes left over that we process one at a time.
	CMPQ CX, $0
	JE   done

	CRC32B (SI), AX
	INCQ   SI
	DECQ   CX
	JMP    cleanup

done:
	NOTL AX
	MOVL AX, ret+16(FP)
	RET

// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX
	CPUID
	SHRQ $20, CX
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
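Editor's note: haveSSE42 above issues CPUID with EAX=1 and tests bit 20 of ECX. In present-day Go the same probe can be done without hand-written assembly; a minimal sketch, assuming the golang.org/x/sys/cpu module is available.

package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// cpu.X86.HasSSE42 reflects the same CPUID bit (leaf 1, ECX bit 20)
	// that the haveSSE42 assembly stub checks.
	fmt.Println("SSE4.2 available:", cpu.X86.HasSSE42)
}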
@ -1,89 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains CRC32 algorithms that are not specific to any architecture
// and don't use hardware acceleration.
//
// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
//
// The slicing-by-8 algorithm is a faster implementation that uses a bigger
// table (8*256*4 bytes).

package crc32

// simpleMakeTable allocates and constructs a Table for the specified
// polynomial. The table is suitable for use with the simple algorithm
// (simpleUpdate).
func simpleMakeTable(poly uint32) *Table {
	t := new(Table)
	simplePopulateTable(poly, t)
	return t
}

// simplePopulateTable constructs a Table for the specified polynomial, suitable
// for use with simpleUpdate.
func simplePopulateTable(poly uint32, t *Table) {
	for i := 0; i < 256; i++ {
		crc := uint32(i)
		for j := 0; j < 8; j++ {
			if crc&1 == 1 {
				crc = (crc >> 1) ^ poly
			} else {
				crc >>= 1
			}
		}
		t[i] = crc
	}
}

// simpleUpdate uses the simple algorithm to update the CRC, given a table that
// was previously computed using simpleMakeTable.
func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
	crc = ^crc
	for _, v := range p {
		crc = tab[byte(crc)^v] ^ (crc >> 8)
	}
	return ^crc
}

// Use slicing-by-8 when payload >= this value.
const slicing8Cutoff = 16

// slicing8Table is an array of 8 Tables, used by the slicing-by-8 algorithm.
type slicing8Table [8]Table

// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
func slicingMakeTable(poly uint32) *slicing8Table {
	t := new(slicing8Table)
	simplePopulateTable(poly, &t[0])
	for i := 0; i < 256; i++ {
		crc := t[0][i]
		for j := 1; j < 8; j++ {
			crc = t[0][crc&0xFF] ^ (crc >> 8)
			t[j][i] = crc
		}
	}
	return t
}

// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
// table that was previously computed using slicingMakeTable.
func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
	if len(p) >= slicing8Cutoff {
		crc = ^crc
		for len(p) > 8 {
			crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
			crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
				tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
				tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
			p = p[8:]
		}
		crc = ^crc
	}
	if len(p) == 0 {
		return crc
	}
	return simpleUpdate(crc, &tab[0], p)
}
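Editor's note: a quick way to sanity-check the simple algorithm above is to compare it against the standard library for the IEEE polynomial. A self-contained sketch; the local makeTable/update copies mirror the unexported helpers above, and both printed checksums should match.

package main

import (
	"fmt"
	"hash/crc32"
)

// makeTable mirrors simplePopulateTable above.
func makeTable(poly uint32) *[256]uint32 {
	var t [256]uint32
	for i := 0; i < 256; i++ {
		crc := uint32(i)
		for j := 0; j < 8; j++ {
			if crc&1 == 1 {
				crc = (crc >> 1) ^ poly
			} else {
				crc >>= 1
			}
		}
		t[i] = crc
	}
	return &t
}

// update mirrors simpleUpdate above.
func update(crc uint32, tab *[256]uint32, p []byte) uint32 {
	crc = ^crc
	for _, v := range p {
		crc = tab[byte(crc)^v] ^ (crc >> 8)
	}
	return ^crc
}

func main() {
	data := []byte("hello, world")
	tab := makeTable(crc32.IEEE) // 0xedb88320, same polynomial as the package
	fmt.Printf("simple: %08x\n", update(0, tab, data))
	fmt.Printf("stdlib: %08x\n", crc32.ChecksumIEEE(data))
}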
@ -1,15 +0,0 @@
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !amd64,!amd64p32,!s390x

package crc32

func archAvailableIEEE() bool                    { return false }
func archInitIEEE()                              { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }

func archAvailableCastagnoli() bool                    { return false }
func archInitCastagnoli()                              { panic("not available") }
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
@ -1,91 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x

package crc32

const (
	vxMinLen    = 64
	vxAlignMask = 15 // align to 16 bytes
)

// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool

var hasVX = hasVectorFacility()

// vectorizedCastagnoli implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedCastagnoli(crc uint32, p []byte) uint32

// vectorizedIEEE implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedIEEE(crc uint32, p []byte) uint32

func archAvailableCastagnoli() bool {
	return hasVX
}

var archCastagnoliTable8 *slicing8Table

func archInitCastagnoli() {
	if !hasVX {
		panic("not available")
	}
	// We still use slicing-by-8 for small buffers.
	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
}

// archUpdateCastagnoli calculates the checksum of p using
// vectorizedCastagnoli.
func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
	if !hasVX {
		panic("not available")
	}
	// Use vectorized function if data length is above threshold.
	if len(p) >= vxMinLen {
		aligned := len(p) & ^vxAlignMask
		crc = vectorizedCastagnoli(crc, p[:aligned])
		p = p[aligned:]
	}
	if len(p) == 0 {
		return crc
	}
	return slicingUpdate(crc, archCastagnoliTable8, p)
}

func archAvailableIEEE() bool {
	return hasVX
}

var archIeeeTable8 *slicing8Table

func archInitIEEE() {
	if !hasVX {
		panic("not available")
	}
	// We still use slicing-by-8 for small buffers.
	archIeeeTable8 = slicingMakeTable(IEEE)
}

// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
func archUpdateIEEE(crc uint32, p []byte) uint32 {
	if !hasVX {
		panic("not available")
	}
	// Use vectorized function if data length is above threshold.
	if len(p) >= vxMinLen {
		aligned := len(p) & ^vxAlignMask
		crc = vectorizedIEEE(crc, p[:aligned])
		p = p[aligned:]
	}
	if len(p) == 0 {
		return crc
	}
	return slicingUpdate(crc, archIeeeTable8, p)
}
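Editor's note: the `len(p) & ^vxAlignMask` expression above rounds the length down to a multiple of 16, so the vector routine only ever sees whole 16-byte blocks and the remainder goes to slicing-by-8. A small runnable sketch of that split, reusing the constants from the file above:

package main

import "fmt"

const (
	vxMinLen    = 64 // same thresholds as the deleted file
	vxAlignMask = 15 // align to 16 bytes
)

func main() {
	for _, n := range []int{63, 64, 100, 250} {
		if n < vxMinLen {
			fmt.Printf("len=%3d: slicing-by-8 only\n", n)
			continue
		}
		aligned := n &^ vxAlignMask // largest multiple of 16 <= n
		fmt.Printf("len=%3d: vectorized=%d bytes, slicing-by-8 tail=%d bytes\n",
			n, aligned, n-aligned)
	}
}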
@ -1,249 +0,0 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x

#include "textflag.h"

// Vector register range containing CRC-32 constants

#define CONST_PERM_LE2BE V9
#define CONST_R2R1       V10
#define CONST_R4R3       V11
#define CONST_R5         V12
#define CONST_RU_POLY    V13
#define CONST_CRC_POLY   V14

// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//            IEEE: P'(x) = 0x0edb88320
//      Castagnoli: P'(x) = 0x082f63b78

// IEEE polynomial constants
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1

GLOBL ·crcleconskp(SB), RODATA, $144

// Castagnoli polynomial constants
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1

GLOBL ·crccleconskp(SB), RODATA, $144

// func hasVectorFacility() bool
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND   $0x40, R1
	BEQ   novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB   $1, ret+0(FP) // have vx
	RET

novector:
	MOVB $0, ret+0(FP) // no vx
	RET

// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
//      R3:    Input buffer pointer, performance might be improved if the
//             buffer is on a doubleword boundary.
//      R4:    Length of the buffer, must be 64 bytes or greater.
//
// Register usage:
//
//      R5:      CRC-32 constant pool base pointer.
//      V0:      Initial CRC value and intermediate constants and results.
//      V1..V4:  Data for CRC computation.
//      V5..V8:  Next data chunks that are fetched from the input buffer.
//
//      V9..V14: CRC-32 constants.

// func vectorizedIEEE(crc uint32, p []byte) uint32
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
	MOVD  p+8(FP), R3      // data pointer
	MOVD  p_len+16(FP), R4 // len(p)

	MOVD $·crcleconskp(SB), R5
	BR   vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
	MOVD  p+8(FP), R3      // data pointer
	MOVD  p_len+16(FP), R4 // len(p)

	// R5: CRC-32 constant pool base pointer, constant is used to reduce crc
	MOVD $·crccleconskp(SB), R5
	BR   vectorizedBody<>(SB)

TEXT vectorizedBody<>(SB), NOSPLIT, $0
	XOR $0xffffffff, R2 // NOTW R2
	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY

	// Load the initial CRC value into the rightmost word of V0
	VZERO V0
	VLVGF $3, R2, V0

	// Crash if the input size is less than 64 bytes.
	CMP R4, $64
	BLT crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM 0(R3), V1, V4 // 64 bytes into V1..V4

	// Reflect the data if the CRC operation is in the bit-reflected domain
	VPERM V1, V1, CONST_PERM_LE2BE, V1
	VPERM V2, V2, CONST_PERM_LE2BE, V2
	VPERM V3, V3, CONST_PERM_LE2BE, V3
	VPERM V4, V4, CONST_PERM_LE2BE, V4

	VX  V0, V1, V1 // V1 ^= CRC
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4

	// Check remaining buffer size and jump to proper folding method
	CMP R4, $64
	BLT less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM   0(R3), V5, V8
	VPERM V5, V5, CONST_PERM_LE2BE, V5
	VPERM V6, V6, CONST_PERM_LE2BE, V6
	VPERM V7, V7, CONST_PERM_LE2BE, V7
	VPERM V8, V8, CONST_PERM_LE2BE, V8

	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in V0. The intermediate result is
	// then folded (accumulated) with the next data chunk in V5 and
	// stored in V1. Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG CONST_R2R1, V1, V5, V1
	VGFMAG CONST_R2R1, V2, V6, V2
	VGFMAG CONST_R2R1, V3, V7, V3
	VGFMAG CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4 // LEN = LEN - 64

	CMP R4, $64
	BGE fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG CONST_R4R3, V1, V2, V1
	VGFMAG CONST_R4R3, V1, V3, V1
	VGFMAG CONST_R4R3, V1, V4, V1

	// Check whether to continue with 64-bit folding
	CMP R4, $16
	BLT final_fold

fold_16bytes_loop:
	VL    0(R3), V2 // Load next data chunk
	VPERM V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD $16, R3
	ADD $-16, R4

	// Process remaining data chunks
	CMP R4, $16
	BGE fold_16bytes_loop

final_fold:
	VLEIB $7, $0x40, V9
	VSRLB V9, CONST_R4R3, V0
	VLEIG $0, $1, V0

	VGFMG V0, V1, V1

	VLEIB  $7, $0x20, V9        // Shift by words
	VSRLB  V9, V1, V2           // Store remaining bits in V2
	VUPLLF V1, V1               // Split rightmost doubleword
	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2

	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), the degree-32 generator polynomial, and the reduction
	// constant u. The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// (A Go transcription of these three steps appears after this listing.)
	//
	// Note: To compensate for the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register. The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.

	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF V1, V2
	VGFMG  CONST_RU_POLY, V2, V2

	// Compute the GF(2) product of the CRC polynomial in V0 with T1(x) in
	// V2 and XOR the intermediate result, T2(x), with the value in V1.
	// The final result is in the rightmost word of V2.

	VUPLLF V2, V2
	VGFMAG CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF $2, V2, R2
	XOR   $0xffffffff, R2 // NOTW R2
	MOVWZ R2, ret+32(FP)
	RET

crash:
	MOVD $0, (R0) // input size is less than 64 bytes
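Editor's note: the Barrett reduction comment in the listing above reads almost directly as code. Below is a hedged Go transcription of the three documented steps in the plain (non-reflected) domain; the constants u and polyP in main are placeholders, not the reduction constants any CRC variant actually uses, and the bit-reflected variant the s390x code runs shifts in the opposite direction.

package main

import "fmt"

// clmulLo returns the low 64 bits of the carry-less (GF(2)) product of a and
// b. For the operand widths used below (degree <= 31 times degree <= 32) the
// product never exceeds 64 bits, so this is exact.
func clmulLo(a, b uint64) uint64 {
	var lo uint64
	for i := uint(0); i < 64; i++ {
		if b&(1<<i) != 0 {
			lo ^= a << i
		}
	}
	return lo
}

// barrettMod32 transcribes the three steps from the comment block above.
func barrettMod32(r, u, polyP uint64) uint32 {
	t1 := clmulLo(r>>32, u)      // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	t2 := clmulLo(t1>>32, polyP) // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	return uint32(r ^ t2)        // 3. C(x)  = R(x) XOR T2(x) mod x^32
}

func main() {
	// Symbolic demonstration only: u is a placeholder, polyP is the
	// non-reflected IEEE generator polynomial.
	const u, polyP = 0x100000000, 0x104C11DB7
	fmt.Printf("%08x\n", barrettMod32(0xfeedfacecafebeef, u, polyP))
}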