Language statistics bar for repositories (#8037)

* Implementation for calculating language statistics

Impement saving code language statistics to database

Implement rendering langauge stats

Add primary laguage to show in repository list

Implement repository stats indexer queue

Add indexer test

Refactor to use queue module

* Do not timeout for queues
pull/10216/head^2
Lauris BH 5 years ago committed by GitHub
parent 37892be635
commit ad2642a8aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      go.mod
  2. 9
      go.sum
  3. 3
      integrations/testlogger.go
  4. 2
      models/migrations/migrations.go
  5. 45
      models/migrations/v127.go
  6. 1
      models/models.go
  7. 5
      models/repo.go
  8. 77
      models/repo_indexer.go
  9. 137
      models/repo_language_stats.go
  10. 27
      models/repo_list.go
  11. 116
      modules/git/repo_language_stats.go
  12. 2
      modules/indexer/code/bleve.go
  13. 7
      modules/indexer/code/git.go
  14. 2
      modules/indexer/code/queue.go
  15. 54
      modules/indexer/stats/db.go
  16. 85
      modules/indexer/stats/indexer.go
  17. 42
      modules/indexer/stats/indexer_test.go
  18. 43
      modules/indexer/stats/queue.go
  19. 7
      modules/notification/indexer/indexer.go
  20. 1
      options/locale/locale_en-US.ini
  21. 4
      routers/init.go
  22. 1
      routers/org/home.go
  23. 15
      routers/repo/view.go
  24. 1
      routers/user/profile.go
  25. 3
      templates/explore/repo_list.tmpl
  26. 27
      templates/repo/sub_menu.tmpl
  27. 11
      vendor/github.com/src-d/enry/v2/.gitignore
  28. 132
      vendor/github.com/src-d/enry/v2/.travis.yml
  29. 61
      vendor/github.com/src-d/enry/v2/CONTRIBUTING.md
  30. 25
      vendor/github.com/src-d/enry/v2/DCO
  31. 201
      vendor/github.com/src-d/enry/v2/LICENSE
  32. 1
      vendor/github.com/src-d/enry/v2/MAINTAINERS
  33. 82
      vendor/github.com/src-d/enry/v2/Makefile
  34. 328
      vendor/github.com/src-d/enry/v2/README.md
  35. 107
      vendor/github.com/src-d/enry/v2/classifier.go
  36. 472
      vendor/github.com/src-d/enry/v2/common.go
  37. 783
      vendor/github.com/src-d/enry/v2/data/alias.go
  38. 254
      vendor/github.com/src-d/enry/v2/data/colors.go
  39. 7
      vendor/github.com/src-d/enry/v2/data/commit.go
  40. 1358
      vendor/github.com/src-d/enry/v2/data/content.go
  41. 3
      vendor/github.com/src-d/enry/v2/data/doc.go
  42. 26
      vendor/github.com/src-d/enry/v2/data/documentation.go
  43. 1629
      vendor/github.com/src-d/enry/v2/data/extension.go
  44. 241
      vendor/github.com/src-d/enry/v2/data/filename.go
  45. 170527
      vendor/github.com/src-d/enry/v2/data/frequencies.go
  46. 35
      vendor/github.com/src-d/enry/v2/data/heuristics.go
  47. 124
      vendor/github.com/src-d/enry/v2/data/interpreter.go
  48. 226
      vendor/github.com/src-d/enry/v2/data/mimeType.go
  49. 109
      vendor/github.com/src-d/enry/v2/data/rule/rule.go
  50. 526
      vendor/github.com/src-d/enry/v2/data/type.go
  51. 166
      vendor/github.com/src-d/enry/v2/data/vendor.go
  52. 16
      vendor/github.com/src-d/enry/v2/enry.go
  53. 11
      vendor/github.com/src-d/enry/v2/go.mod
  54. 17
      vendor/github.com/src-d/enry/v2/go.sum
  55. 7
      vendor/github.com/src-d/enry/v2/internal/tokenizer/common.go
  56. 2226
      vendor/github.com/src-d/enry/v2/internal/tokenizer/flex/lex.linguist_yy.c
  57. 336
      vendor/github.com/src-d/enry/v2/internal/tokenizer/flex/lex.linguist_yy.h
  58. 15
      vendor/github.com/src-d/enry/v2/internal/tokenizer/flex/linguist.h
  59. 71
      vendor/github.com/src-d/enry/v2/internal/tokenizer/flex/tokenize_c.go
  60. 214
      vendor/github.com/src-d/enry/v2/internal/tokenizer/tokenize.go
  61. 15
      vendor/github.com/src-d/enry/v2/internal/tokenizer/tokenize_c.go
  62. 17
      vendor/github.com/src-d/enry/v2/regex/oniguruma.go
  63. 17
      vendor/github.com/src-d/enry/v2/regex/standard.go
  64. 84
      vendor/github.com/src-d/enry/v2/utils.go
  65. 20
      vendor/github.com/src-d/go-oniguruma/.travis.yml
  66. 19
      vendor/github.com/src-d/go-oniguruma/LICENSE
  67. 20
      vendor/github.com/src-d/go-oniguruma/README.md
  68. 184
      vendor/github.com/src-d/go-oniguruma/chelper.c
  69. 14
      vendor/github.com/src-d/go-oniguruma/chelper.h
  70. 27
      vendor/github.com/src-d/go-oniguruma/constants.go
  71. 1
      vendor/github.com/src-d/go-oniguruma/go.mod
  72. 36
      vendor/github.com/src-d/go-oniguruma/quotemeta.go
  73. 668
      vendor/github.com/src-d/go-oniguruma/regex.go
  74. 22
      vendor/github.com/toqueteos/trie/LICENSE.txt
  75. 7
      vendor/github.com/toqueteos/trie/README.md
  76. 1
      vendor/github.com/toqueteos/trie/go.mod
  77. 102
      vendor/github.com/toqueteos/trie/trie.go
  78. 24
      vendor/gopkg.in/toqueteos/substring.v1/.gitignore
  79. 11
      vendor/gopkg.in/toqueteos/substring.v1/.travis.yml
  80. 22
      vendor/gopkg.in/toqueteos/substring.v1/LICENSE
  81. 80
      vendor/gopkg.in/toqueteos/substring.v1/README.md
  82. 229
      vendor/gopkg.in/toqueteos/substring.v1/bytes.go
  83. 10
      vendor/gopkg.in/toqueteos/substring.v1/lib.go
  84. 216
      vendor/gopkg.in/toqueteos/substring.v1/string.go
  85. 13
      vendor/modules.txt
  86. 8
      web_src/js/index.js
  87. 10
      web_src/less/_base.less
  88. 20
      web_src/less/_repository.less
  89. 6
      web_src/less/themes/theme-arc-green.less

@ -85,6 +85,7 @@ require (
github.com/sergi/go-diff v1.0.0
github.com/shurcooL/httpfs v0.0.0-20190527155220-6a4d4a70508b // indirect
github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd
github.com/src-d/enry/v2 v2.1.0
github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2 // indirect
github.com/stretchr/testify v1.4.0
github.com/tecbot/gorocksdb v0.0.0-20181010114359-8752a9433481 // indirect

@ -508,8 +508,13 @@ github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnIn
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/spf13/viper v1.4.0 h1:yXHLWeravcrgGyFSyCgdYpXQ9dR9c/WED3pg1RhxqEU=
github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE=
github.com/src-d/enry v1.7.3 h1:jG2fmEaQaURh0qqU/sn82BRzVa6d4EVHJIw6gc98bak=
github.com/src-d/enry/v2 v2.1.0 h1:z1L8t+B8bh3mmjPkJrgOTnVRpFGmTPJsplHX9wAn6BI=
github.com/src-d/enry/v2 v2.1.0/go.mod h1:qQeCMRwzMF3ckeGr+h0tJLdxXnq+NVZsIDMELj0t028=
github.com/src-d/gcfg v1.4.0 h1:xXbNR5AlLSA315x2UO+fTSSAXCDf+Ar38/6oyGbDKQ4=
github.com/src-d/gcfg v1.4.0/go.mod h1:p/UMsR43ujA89BJY9duynAwIpvqEujIH/jFlfL7jWoI=
github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww=
github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2 h1:JNEGSiWg6D3lcBCMCBqN3ELniXujt+0QNHLhNnO0w3s=
github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2/go.mod h1:mjqs7N0Q6m5HpR7QfXVBZXZWSqTjQLeTujjA/xUp2uw=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
@ -530,6 +535,8 @@ github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhV
github.com/tinylib/msgp v1.1.0 h1:9fQd+ICuRIu/ue4vxJZu6/LzxN0HwMds2nq/0cFvxHU=
github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ=
github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM=
github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ=
@ -747,6 +754,8 @@ gopkg.in/testfixtures.v2 v2.5.0 h1:N08B7l2GzFQenyYbzqthDnKAA+cmb17iAZhhFxr7JHw=
gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=

@ -13,7 +13,6 @@ import (
"strings"
"sync"
"testing"
"time"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/queue"
@ -101,7 +100,7 @@ func PrintCurrentTest(t testing.TB, skip ...int) func() {
}
writerCloser.setT(&t)
return func() {
if err := queue.GetManager().FlushAll(context.Background(), 20*time.Second); err != nil {
if err := queue.GetManager().FlushAll(context.Background(), -1); err != nil {
t.Errorf("Flushing queues failed with error %v", err)
}
_ = writerCloser.Close()

@ -186,6 +186,8 @@ var migrations = []Migration{
NewMigration("Add some columns on review for migration", addReviewMigrateInfo),
// v126 -> v127
NewMigration("Fix topic repository count", fixTopicRepositoryCount),
// v127 -> v128
NewMigration("add repository code language statistics", addLanguageStats),
}
// Migrate database to current version

@ -0,0 +1,45 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package migrations
import (
"fmt"
"code.gitea.io/gitea/modules/timeutil"
"xorm.io/xorm"
)
func addLanguageStats(x *xorm.Engine) error {
// LanguageStat see models/repo_language_stats.go
type LanguageStat struct {
ID int64 `xorm:"pk autoincr"`
RepoID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
CommitID string
IsPrimary bool
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"`
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"`
Color string `xorm:"-"`
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"`
}
type RepoIndexerType int
// RepoIndexerStatus see models/repo_stats_indexer.go
type RepoIndexerStatus struct {
ID int64 `xorm:"pk autoincr"`
RepoID int64 `xorm:"INDEX(s)"`
CommitSha string `xorm:"VARCHAR(40)"`
IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"`
}
if err := x.Sync2(new(LanguageStat)); err != nil {
return fmt.Errorf("Sync2: %v", err)
}
if err := x.Sync2(new(RepoIndexerStatus)); err != nil {
return fmt.Errorf("Sync2: %v", err)
}
return nil
}

@ -116,6 +116,7 @@ func init() {
new(OAuth2AuthorizationCode),
new(OAuth2Grant),
new(Task),
new(LanguageStat),
)
gonicNames := []string{"SSL", "UID"}

@ -177,6 +177,7 @@ type Repository struct {
RenderingMetas map[string]string `xorm:"-"`
Units []*RepoUnit `xorm:"-"`
PrimaryLanguage *LanguageStat `xorm:"-"`
IsFork bool `xorm:"INDEX NOT NULL DEFAULT false"`
ForkID int64 `xorm:"INDEX"`
@ -185,7 +186,8 @@ type Repository struct {
TemplateID int64 `xorm:"INDEX"`
TemplateRepo *Repository `xorm:"-"`
Size int64 `xorm:"NOT NULL DEFAULT 0"`
IndexerStatus *RepoIndexerStatus `xorm:"-"`
CodeIndexerStatus *RepoIndexerStatus `xorm:"-"`
StatsIndexerStatus *RepoIndexerStatus `xorm:"-"`
IsFsckEnabled bool `xorm:"NOT NULL DEFAULT true"`
CloseIssuesViaCommitInAnyBranch bool `xorm:"NOT NULL DEFAULT false"`
Topics []string `xorm:"TEXT JSON"`
@ -1504,6 +1506,7 @@ func DeleteRepository(doer *User, uid, repoID int64) error {
&Notification{RepoID: repoID},
&CommitStatus{RepoID: repoID},
&RepoIndexerStatus{RepoID: repoID},
&LanguageStat{RepoID: repoID},
&Comment{RefRepoID: repoID},
&Task{RepoID: repoID},
); err != nil {

@ -10,21 +10,32 @@ import (
"xorm.io/builder"
)
// RepoIndexerType specifies the repository indexer type
type RepoIndexerType int
const (
// RepoIndexerTypeCode code indexer
RepoIndexerTypeCode RepoIndexerType = iota // 0
// RepoIndexerTypeStats repository stats indexer
RepoIndexerTypeStats // 1
)
// RepoIndexerStatus status of a repo's entry in the repo indexer
// For now, implicitly refers to default branch
type RepoIndexerStatus struct {
ID int64 `xorm:"pk autoincr"`
RepoID int64 `xorm:"INDEX"`
RepoID int64 `xorm:"INDEX(s)"`
CommitSha string `xorm:"VARCHAR(40)"`
IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"`
}
// GetUnindexedRepos returns repos which do not have an indexer status
func GetUnindexedRepos(maxRepoID int64, page, pageSize int) ([]int64, error) {
func GetUnindexedRepos(indexerType RepoIndexerType, maxRepoID int64, page, pageSize int) ([]int64, error) {
ids := make([]int64, 0, 50)
cond := builder.Cond(builder.IsNull{
"repo_indexer_status.id",
})
sess := x.Table("repository").Join("LEFT OUTER", "repo_indexer_status", "repository.id = repo_indexer_status.repo_id")
sess := x.Table("repository").Join("LEFT OUTER", "repo_indexer_status", "repository.id = repo_indexer_status.repo_id AND repo_indexer_status.indexer_type = ?", indexerType)
if maxRepoID > 0 {
cond = builder.And(cond, builder.Lte{
"repository.id": maxRepoID,
@ -43,40 +54,64 @@ func GetUnindexedRepos(maxRepoID int64, page, pageSize int) ([]int64, error) {
return ids, err
}
// GetIndexerStatus loads repo codes indxer status
func (repo *Repository) GetIndexerStatus() error {
if repo.IndexerStatus != nil {
return nil
// getIndexerStatus loads repo codes indxer status
func (repo *Repository) getIndexerStatus(e Engine, indexerType RepoIndexerType) (*RepoIndexerStatus, error) {
switch indexerType {
case RepoIndexerTypeCode:
if repo.CodeIndexerStatus != nil {
return repo.CodeIndexerStatus, nil
}
status := &RepoIndexerStatus{RepoID: repo.ID}
has, err := x.Get(status)
case RepoIndexerTypeStats:
if repo.StatsIndexerStatus != nil {
return repo.StatsIndexerStatus, nil
}
}
status := &RepoIndexerStatus{RepoID: repo.ID, IndexerType: indexerType}
has, err := e.Get(status)
if err != nil {
return err
return nil, err
} else if !has {
status.CommitSha = ""
}
repo.IndexerStatus = status
return nil
switch indexerType {
case RepoIndexerTypeCode:
repo.CodeIndexerStatus = status
case RepoIndexerTypeStats:
repo.StatsIndexerStatus = status
}
return status, nil
}
// UpdateIndexerStatus updates indexer status
func (repo *Repository) UpdateIndexerStatus(sha string) error {
if err := repo.GetIndexerStatus(); err != nil {
// GetIndexerStatus loads repo codes indxer status
func (repo *Repository) GetIndexerStatus(indexerType RepoIndexerType) (*RepoIndexerStatus, error) {
return repo.getIndexerStatus(x, indexerType)
}
// updateIndexerStatus updates indexer status
func (repo *Repository) updateIndexerStatus(e Engine, indexerType RepoIndexerType, sha string) error {
status, err := repo.getIndexerStatus(e, indexerType)
if err != nil {
return fmt.Errorf("UpdateIndexerStatus: Unable to getIndexerStatus for repo: %s Error: %v", repo.FullName(), err)
}
if len(repo.IndexerStatus.CommitSha) == 0 {
repo.IndexerStatus.CommitSha = sha
_, err := x.Insert(repo.IndexerStatus)
if len(status.CommitSha) == 0 {
status.CommitSha = sha
_, err := e.Insert(status)
if err != nil {
return fmt.Errorf("UpdateIndexerStatus: Unable to insert repoIndexerStatus for repo: %s Sha: %s Error: %v", repo.FullName(), sha, err)
}
return nil
}
repo.IndexerStatus.CommitSha = sha
_, err := x.ID(repo.IndexerStatus.ID).Cols("commit_sha").
Update(repo.IndexerStatus)
status.CommitSha = sha
_, err = e.ID(status.ID).Cols("commit_sha").
Update(status)
if err != nil {
return fmt.Errorf("UpdateIndexerStatus: Unable to update repoIndexerStatus for repo: %s Sha: %s Error: %v", repo.FullName(), sha, err)
}
return nil
}
// UpdateIndexerStatus updates indexer status
func (repo *Repository) UpdateIndexerStatus(indexerType RepoIndexerType, sha string) error {
return repo.updateIndexerStatus(x, indexerType, sha)
}

@ -0,0 +1,137 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package models
import (
"math"
"strings"
"code.gitea.io/gitea/modules/timeutil"
"github.com/src-d/enry/v2"
)
// LanguageStat describes language statistics of a repository
type LanguageStat struct {
ID int64 `xorm:"pk autoincr"`
RepoID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
CommitID string
IsPrimary bool
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"`
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"`
Color string `xorm:"-"`
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"`
}
// LanguageStatList defines a list of language statistics
type LanguageStatList []*LanguageStat
func (stats LanguageStatList) loadAttributes() {
for i := range stats {
stats[i].Color = enry.GetColor(stats[i].Language)
}
}
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) {
stats := make(LanguageStatList, 0, 6)
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil {
return nil, err
}
stats.loadAttributes()
return stats, nil
}
// GetLanguageStats returns the language statistics for a repository
func (repo *Repository) GetLanguageStats() (LanguageStatList, error) {
return repo.getLanguageStats(x)
}
// GetTopLanguageStats returns the top language statistics for a repository
func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) {
stats, err := repo.getLanguageStats(x)
if err != nil {
return nil, err
}
topstats := make(LanguageStatList, 0, limit)
var other float32
for i := range stats {
if stats[i].Language == "other" || len(topstats) >= limit {
other += stats[i].Percentage
continue
}
topstats = append(topstats, stats[i])
}
if other > 0 {
topstats = append(topstats, &LanguageStat{
RepoID: repo.ID,
Language: "other",
Color: "#cccccc",
Percentage: float32(math.Round(float64(other)*10) / 10),
})
}
return topstats, nil
}
// UpdateLanguageStats updates the language statistics for repository
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error {
sess := x.NewSession()
if err := sess.Begin(); err != nil {
return err
}
defer sess.Close()
oldstats, err := repo.getLanguageStats(sess)
if err != nil {
return err
}
var topLang string
var p float32
for lang, perc := range stats {
if perc > p {
p = perc
topLang = strings.ToLower(lang)
}
}
for lang, perc := range stats {
upd := false
llang := strings.ToLower(lang)
for _, s := range oldstats {
// Update already existing language
if strings.ToLower(s.Language) == llang {
s.CommitID = commitID
s.IsPrimary = llang == topLang
s.Percentage = perc
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil {
return err
}
upd = true
break
}
}
// Insert new language
if !upd {
if _, err := sess.Insert(&LanguageStat{
RepoID: repo.ID,
CommitID: commitID,
IsPrimary: llang == topLang,
Language: lang,
Percentage: perc,
}); err != nil {
return err
}
}
}
// Delete old languages
if _, err := sess.Where("`id` IN (SELECT `id` FROM `language_stat` WHERE `repo_id` = ? AND `commit_id` != ?)", repo.ID, commitID).Delete(&LanguageStat{}); err != nil {
return err
}
if err = repo.updateIndexerStatus(sess, RepoIndexerTypeStats, commitID); err != nil {
return err
}
return sess.Commit()
}

@ -46,11 +46,14 @@ func (repos RepositoryList) loadAttributes(e Engine) error {
return nil
}
// Load owners.
set := make(map[int64]struct{})
repoIDs := make([]int64, len(repos))
for i := range repos {
set[repos[i].OwnerID] = struct{}{}
repoIDs[i] = repos[i].ID
}
// Load owners.
users := make(map[int64]*User, len(set))
if err := e.
Where("id > 0").
@ -61,6 +64,25 @@ func (repos RepositoryList) loadAttributes(e Engine) error {
for i := range repos {
repos[i].Owner = users[repos[i].OwnerID]
}
// Load primary language.
stats := make(LanguageStatList, 0, len(repos))
if err := e.
Where("`is_primary` = ? AND `language` != ?", true, "other").
In("`repo_id`", repoIDs).
Find(&stats); err != nil {
return fmt.Errorf("find primary languages: %v", err)
}
stats.loadAttributes()
for i := range repos {
for _, st := range stats {
if st.RepoID == repos[i].ID {
repos[i].PrimaryLanguage = st
break
}
}
}
return nil
}
@ -119,7 +141,6 @@ type SearchRepoOptions struct {
OrderBy SearchOrderBy
Private bool // Include private repositories in results
StarredByID int64
IsProfile bool
AllPublic bool // Include also all public repositories of users and public organisations
AllLimited bool // Include also all public repositories of limited organisations
// None -> include collaborative AND non-collaborative
@ -306,11 +327,9 @@ func SearchRepository(opts *SearchRepoOptions) (RepositoryList, int64, error) {
return nil, 0, fmt.Errorf("Repo: %v", err)
}
if !opts.IsProfile {
if err = repos.loadAttributes(sess); err != nil {
return nil, 0, fmt.Errorf("LoadAttributes: %v", err)
}
}
return repos, count, nil
}

@ -0,0 +1,116 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package git
import (
"bytes"
"io"
"io/ioutil"
"math"
"path/filepath"
"github.com/src-d/enry/v2"
"gopkg.in/src-d/go-git.v4"
"gopkg.in/src-d/go-git.v4/plumbing"
"gopkg.in/src-d/go-git.v4/plumbing/object"
)
const fileSizeLimit int64 = 16 * 1024 * 1024
// GetLanguageStats calculates language stats for git repository at specified commit
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) {
r, err := git.PlainOpen(repo.Path)
if err != nil {
return nil, err
}
rev, err := r.ResolveRevision(plumbing.Revision(commitID))
if err != nil {
return nil, err
}
commit, err := r.CommitObject(*rev)
if err != nil {
return nil, err
}
tree, err := commit.Tree()
if err != nil {
return nil, err
}
sizes := make(map[string]int64)
var total int64
err = tree.Files().ForEach(func(f *object.File) error {
if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
return nil
}
// TODO: Use .gitattributes file for linguist overrides
language, ok := enry.GetLanguageByExtension(f.Name)
if !ok {
if language, ok = enry.GetLanguageByFilename(f.Name); !ok {
content, err := readFile(f, fileSizeLimit)
if err != nil {
return nil
}
language = enry.GetLanguage(filepath.Base(f.Name), content)
if language == enry.OtherLanguage {
return nil
}
}
}
if language != "" {
sizes[language] += f.Size
total += f.Size
}
return nil
})
if err != nil {
return nil, err
}
stats := make(map[string]float32)
var otherPerc float32 = 100
for language, size := range sizes {
perc := float32(math.Round(float64(size)/float64(total)*1000) / 10)
if perc <= 0.1 {
continue
}
otherPerc -= perc
stats[language] = perc
}
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
if otherPerc > 0 {
stats["other"] = otherPerc
}
return stats, nil
}
func readFile(f *object.File, limit int64) ([]byte, error) {
r, err := f.Reader()
if err != nil {
return nil, err
}
defer r.Close()
if limit <= 0 {
return ioutil.ReadAll(r)
}
size := f.Size
if limit > 0 && size > limit {
size = limit
}
buf := bytes.NewBuffer(nil)
buf.Grow(int(size))
_, err = io.Copy(buf, io.LimitReader(r, limit))
return buf.Bytes(), err
}

@ -267,7 +267,7 @@ func (b *BleveIndexer) Index(repoID int64) error {
if err = batch.Flush(); err != nil {
return err
}
return repo.UpdateIndexerStatus(sha)
return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
}
// Delete deletes indexes by ids

@ -35,11 +35,12 @@ func getDefaultBranchSha(repo *models.Repository) (string, error) {
// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *models.Repository, revision string) (*repoChanges, error) {
if err := repo.GetIndexerStatus(); err != nil {
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeCode)
if err != nil {
return nil, err
}
if len(repo.IndexerStatus.CommitSha) == 0 {
if len(status.CommitSha) == 0 {
return genesisChanges(repo, revision)
}
return nonGenesisChanges(repo, revision)
@ -98,7 +99,7 @@ func genesisChanges(repo *models.Repository, revision string) (*repoChanges, err
// nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
diffCmd := git.NewCommand("diff", "--name-status",
repo.IndexerStatus.CommitSha, revision)
repo.CodeIndexerStatus.CommitSha, revision)
stdout, err := diffCmd.RunInDir(repo.RepoPath())
if err != nil {
// previous commit sha may have been removed by a force push, so

@ -109,7 +109,7 @@ func populateRepoIndexer() {
return
default:
}
ids, err := models.GetUnindexedRepos(maxRepoID, 0, 50)
ids, err := models.GetUnindexedRepos(models.RepoIndexerTypeCode, maxRepoID, 0, 50)
if err != nil {
log.Error("populateRepoIndexer: %v", err)
return

@ -0,0 +1,54 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package stats
import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
)
// DBIndexer implements Indexer interface to use database's like search
type DBIndexer struct {
}
// Index repository status function
func (db *DBIndexer) Index(id int64) error {
repo, err := models.GetRepositoryByID(id)
if err != nil {
return err
}
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats)
if err != nil {
return err
}
gitRepo, err := git.OpenRepository(repo.RepoPath())
if err != nil {
return err
}
defer gitRepo.Close()
// Get latest commit for default branch
commitID, err := gitRepo.GetBranchCommitID(repo.DefaultBranch)
if err != nil {
return err
}
// Do not recalculate stats if already calculated for this commit
if status.CommitSha == commitID {
return nil
}
// Calculate and save language statistics to database
stats, err := gitRepo.GetLanguageStats(commitID)
if err != nil {
return err
}
return repo.UpdateLanguageStats(commitID, stats)
}
// Close dummy function
func (db *DBIndexer) Close() {
}

@ -0,0 +1,85 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package stats
import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
)
// Indexer defines an interface to index repository stats
type Indexer interface {
Index(id int64) error
Close()
}
// indexer represents a indexer instance
var indexer Indexer
// Init initialize the repo indexer
func Init() error {
indexer = &DBIndexer{}
if err := initStatsQueue(); err != nil {
return err
}
go populateRepoIndexer()
return nil
}
// populateRepoIndexer populate the repo indexer with pre-existing data. This
// should only be run when the indexer is created for the first time.
func populateRepoIndexer() {
log.Info("Populating the repo stats indexer with existing repositories")
isShutdown := graceful.GetManager().IsShutdown()
exist, err := models.IsTableNotEmpty("repository")
if err != nil {
log.Fatal("System error: %v", err)
} else if !exist {
return
}
var maxRepoID int64
if maxRepoID, err = models.GetMaxID("repository"); err != nil {
log.Fatal("System error: %v", err)
}
// start with the maximum existing repo ID and work backwards, so that we
// don't include repos that are created after gitea starts; such repos will
// already be added to the indexer, and we don't need to add them again.
for maxRepoID > 0 {
select {
case <-isShutdown:
log.Info("Repository Stats Indexer population shutdown before completion")
return
default:
}
ids, err := models.GetUnindexedRepos(models.RepoIndexerTypeStats, maxRepoID, 0, 50)
if err != nil {
log.Error("populateRepoIndexer: %v", err)
return
} else if len(ids) == 0 {
break
}
for _, id := range ids {
select {
case <-isShutdown:
log.Info("Repository Stats Indexer population shutdown before completion")
return
default:
}
if err := statsQueue.Push(id); err != nil {
log.Error("statsQueue.Push: %v", err)
}
maxRepoID = id - 1
}
}
log.Info("Done (re)populating the repo stats indexer with existing repositories")
}

@ -0,0 +1,42 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package stats
import (
"path/filepath"
"testing"
"time"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/setting"
"gopkg.in/ini.v1"
"github.com/stretchr/testify/assert"
)
func TestMain(m *testing.M) {
models.MainTest(m, filepath.Join("..", "..", ".."))
}
func TestRepoStatsIndex(t *testing.T) {
assert.NoError(t, models.PrepareTestDatabase())
setting.Cfg = ini.Empty()
setting.NewQueueService()
err := Init()
assert.NoError(t, err)
time.Sleep(5 * time.Second)
repo, err := models.GetRepositoryByID(1)
assert.NoError(t, err)
langs, err := repo.GetTopLanguageStats(5)
assert.NoError(t, err)
assert.Len(t, langs, 1)
assert.Equal(t, "other", langs[0].Language)
assert.Equal(t, float32(100), langs[0].Percentage)
}

@ -0,0 +1,43 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package stats
import (
"fmt"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/queue"
)
// statsQueue represents a queue to handle repository stats updates
var statsQueue queue.Queue
// handle passed PR IDs and test the PRs
func handle(data ...queue.Data) {
for _, datum := range data {
opts := datum.(int64)
if err := indexer.Index(opts); err != nil {
log.Error("stats queue idexer.Index(%d) failed: %v", opts, err)
}
}
}
func initStatsQueue() error {
statsQueue = queue.CreateQueue("repo_stats_update", handle, int64(0)).(queue.Queue)
if statsQueue == nil {
return fmt.Errorf("Unable to create repo_stats_update Queue")
}
go graceful.GetManager().RunWithShutdownFns(statsQueue.Run)
return nil
}
// UpdateRepoIndexer update a repository's entries in the indexer
func UpdateRepoIndexer(repo *models.Repository) error {
return statsQueue.Push(repo.ID)
}

@ -9,6 +9,7 @@ import (
"code.gitea.io/gitea/modules/git"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
issue_indexer "code.gitea.io/gitea/modules/indexer/issues"
stats_indexer "code.gitea.io/gitea/modules/indexer/stats"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification/base"
"code.gitea.io/gitea/modules/repository"
@ -117,12 +118,18 @@ func (r *indexerNotifier) NotifyMigrateRepository(doer *models.User, u *models.U
if setting.Indexer.RepoIndexerEnabled && !repo.IsEmpty {
code_indexer.UpdateRepoIndexer(repo)
}
if err := stats_indexer.UpdateRepoIndexer(repo); err != nil {
log.Error("stats_indexer.UpdateRepoIndexer(%d) failed: %v", repo.ID, err)
}
}
func (r *indexerNotifier) NotifyPushCommits(pusher *models.User, repo *models.Repository, refName, oldCommitID, newCommitID string, commits *repository.PushCommits) {
if setting.Indexer.RepoIndexerEnabled && refName == git.BranchPrefix+repo.DefaultBranch {
code_indexer.UpdateRepoIndexer(repo)
}
if err := stats_indexer.UpdateRepoIndexer(repo); err != nil {
log.Error("stats_indexer.UpdateRepoIndexer(%d) failed: %v", repo.ID, err)
}
}
func (r *indexerNotifier) NotifyIssueChangeContent(doer *models.User, issue *models.Issue, oldContent string) {

@ -641,6 +641,7 @@ forks = Forks
pick_reaction = Pick your reaction
reactions_more = and %d more
unit_disabled = The site administrator has disabled this repository section.
language_other = Other
template.items = Template Items
template.git_content = Git Content (Default Branch)

@ -19,6 +19,7 @@ import (
"code.gitea.io/gitea/modules/highlight"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
issue_indexer "code.gitea.io/gitea/modules/indexer/issues"
stats_indexer "code.gitea.io/gitea/modules/indexer/stats"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/markup/external"
@ -111,6 +112,9 @@ func GlobalInit(ctx context.Context) {
cron.NewContext()
issue_indexer.InitIssueIndexer(false)
code_indexer.Init()
if err := stats_indexer.Init(); err != nil {
log.Fatal("Failed to initialize repository stats indexer queue: %v", err)
}
mirror_service.InitSyncMirrors()
webhook.InitDeliverHooks()
if err := pull_service.Init(); err != nil {

@ -85,7 +85,6 @@ func Home(ctx *context.Context) {
OrderBy: orderBy,
Private: ctx.IsSigned,
Actor: ctx.User,
IsProfile: true,
IncludeDescription: setting.UI.SearchRepoDescription,
})
if err != nil {

@ -457,6 +457,16 @@ func Home(ctx *context.Context) {
ctx.NotFound("Home", fmt.Errorf(ctx.Tr("units.error.no_unit_allowed_repo")))
}
func renderLanguageStats(ctx *context.Context) {
langs, err := ctx.Repo.Repository.GetTopLanguageStats(5)
if err != nil {
ctx.ServerError("Repo.GetTopLanguageStats", err)
return
}
ctx.Data["LanguageStats"] = langs
}
func renderCode(ctx *context.Context) {
ctx.Data["PageIsViewCode"] = true
@ -497,6 +507,11 @@ func renderCode(ctx *context.Context) {
return
}
renderLanguageStats(ctx)
if ctx.Written() {
return
}
if entry.IsDir() {
renderDirectory(ctx, treeLink)
} else {

@ -220,7 +220,6 @@ func Profile(ctx *context.Context) {
OwnerID: ctxUser.ID,
OrderBy: orderBy,
Private: ctx.IsSigned,
IsProfile: true,
Collaborate: util.OptionalBoolFalse,
TopicOnly: topicOnly,
IncludeDescription: setting.UI.SearchRepoDescription,

@ -21,6 +21,9 @@
{{end}}
{{end}}
<div class="ui right metas">
{{if .PrimaryLanguage }}
<span class="text grey"><i class="color-icon" style="background-color: {{.PrimaryLanguage.Color}}"></i>{{ .PrimaryLanguage.Language }}</span>
{{end}}
<span class="text grey"><i class="octicon octicon-star"></i> {{.NumStars}}</span>
<span class="text grey"><i class="octicon octicon-git-branch"></i> {{.NumForks}}</span>
</div>

@ -1,4 +1,5 @@
<div class="ui segment sub-menu">
<div class="ui segments">
<div class="ui segment sub-menu repository-menu">
<div class="ui two horizontal center link list">
{{if and (.Permission.CanRead $.UnitTypeCode) (not .IsEmptyRepo)}}
<div class="item{{if .PageIsCommits}} active{{end}}">
@ -14,4 +15,28 @@
</div>
{{end}}
</div>
</div>
{{if and (.Permission.CanRead $.UnitTypeCode) (not .IsEmptyRepo) .LanguageStats }}
<div class="ui segment sub-menu language-stats-details" style="display: none">
<div class="ui horizontal center link list">
{{range .LanguageStats}}
<div class="item">
<i class="color-icon" style="background-color: {{ .Color }}"></i>
<span class="ui"><b>
{{if eq .Language "other" }}
{{ $.i18n.Tr "repo.language_other" }}
{{else}}
{{ .Language }}
{{end}}
</b> {{ .Percentage }}%</span>
</div>
{{end}}
</div>
</div>
<a class="ui segment language-stats">
{{range .LanguageStats}}
<div class="bar" style="width: {{ .Percentage }}%; background-color: {{ .Color }}">&nbsp;</div>
{{end}}
</a>
{{end}}
</div>

@ -0,0 +1,11 @@
.linguist
benchmarks/output
.ci
Makefile.main
.shared
.idea
.docsrv-resources
build/
vendor/
java/lib/
.vscode/

@ -0,0 +1,132 @@
dist: trusty
language: go
go:
- '1.12.x'
- '1.11.x'
env:
global:
- GO_VERSION_FOR_JVM='1.11.x'
- CGO_ENABLED=0
- GO111MODULE=on
- ONIGURUMA_VERSION='6.9.1'
matrix:
- ONIGURUMA=0
- ONIGURUMA=1
matrix:
fast_finish: true
stages:
- name: test
- name: release
if: tag IS present
- name: publish
if: tag IS present
stage: test
install:
- >
if [[ "${ONIGURUMA}" -gt 0 ]]; then
export CGO_ENABLED=1
export GO_TAGS='oniguruma'
# install oniguruma manually as trusty has only ancient 5.x
sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
fi;
script:
- make test-coverage
after_success:
- bash <(curl -s https://codecov.io/bash)
jobs:
include:
- name: 'java unit-tests'
stage: test
language: scala
jdk: oraclejdk8
install:
- export CGO_ENABLED=1
- eval "$(curl -sL https://raw.githubusercontent.com/travis-ci/gimme/master/gimme | GIMME_GO_VERSION=$GO_VERSION_FOR_JVM bash)"
- go version
before_script:
- cd java
- make
script:
- make test
- name: 'linux packages'
stage: release
install:
- go version
script: make packages
deploy:
provider: releases
api_key:
secure: $GITHUB_TOKEN
file_glob: true
file: build/*.tar.gz
skip_cleanup: true
on:
tags: true
- name: 'linux shared lib'
stage: release
install:
- go version
script: make linux-shared
deploy:
provider: releases
api_key:
secure: $GITHUB_TOKEN
file:
- ./.shared/linux-x86-64/libenry.so
skip_cleanup: true
on:
tags: true
- name: 'macOS shared lib'
stage: release
env:
- OSXCROSS_PACKAGE="osxcross_3034f7149716d815bc473d0a7b35d17e4cf175aa.tar.gz"
- OSXCROSS_URL="https://github.com/bblfsh/client-scala/releases/download/v1.5.2/${OSXCROSS_PACKAGE}"
- PATH="/$HOME/osxcross/bin:$PATH"
install:
- go version
- sudo apt-get update
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils
- cd ${HOME}
- curl -sfSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf -
- cd $GOPATH/src/github.com/src-d/enry
script: make darwin-shared
deploy:
provider: releases
api_key:
secure: $GITHUB_TOKEN
file: ./.shared/darwin/libenry.dylib
skip_cleanup: true
on:
tags: true
- name: 'java: publish to maven'
stage: publish
language: scala
jdk: oraclejdk8
install:
- export CGO_ENABLED=1
- eval "$(curl -sL https://raw.githubusercontent.com/travis-ci/gimme/master/gimme | GIMME_GO_VERSION=$GO_VERSION_FOR_JVM bash)"
- go version
before_script:
- cd java
- make
- curl -o ./shared/linux-x86-64/libenry.so -sfL "https://github.com/$TRAVIS_REPO_SLUG/releases/download/$TRAVIS_TAG/libenry.so" || travis_terminate 1
- mkdir -p ./shared/darwin
- curl -o ./shared/darwin/libenry.dylib -sfL "https://github.com/$TRAVIS_REPO_SLUG/releases/download/$TRAVIS_TAG/libenry.dylib" || travis_terminate 1
- openssl aes-256-cbc -K $encrypted_a0e1c69dbbc7_key -iv $encrypted_a0e1c69dbbc7_iv -in key.asc.enc -out key.asc -d
- gpg --no-default-keyring --primary-keyring ./project/.gnupg/pubring.gpg --secret-keyring ./project/.gnupg/secring.gpg --keyring ./project/.gnupg/pubring.gpg --fingerprint --import key.asc
script:
- make test # ensure the shared objects are functional
- ./sbt publishLocal
- ./sbt publishSigned
- ./sbt sonatypeRelease

@ -0,0 +1,61 @@
# source{d} Contributing Guidelines
source{d} projects accept contributions via GitHub pull requests.
This document outlines some of the
conventions on development workflow, commit message formatting, contact points,
and other resources to make it easier to get your contribution accepted.
## Certificate of Origin
By contributing to this project, you agree to the [Developer Certificate of
Origin (DCO)](DCO). This document was created by the Linux Kernel community and is a
simple statement that you, as a contributor, have the legal right to make the
contribution.
In order to show your agreement with the DCO you should include at the end of the commit message,
the following line: `Signed-off-by: John Doe <john.doe@example.com>`, using your real name.
This can be done easily using the [`-s`](https://github.com/git/git/blob/b2c150d3aa82f6583b9aadfecc5f8fa1c74aca09/Documentation/git-commit.txt#L154-L161) flag on the `git commit`.
If you find yourself pushed a few commits without `Signed-off-by`, you can still add it afterwards. We wrote a manual which can help: [fix-DCO.md](https://github.com/src-d/guide/blob/master/developer-community/fix-DCO.md).
## Support Channels
The official support channels, for both users and contributors, are:
- GitHub issues: each repository has its own list of issues.
- Slack: join the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM) community.
*Before opening a new issue or submitting a new pull request, it's helpful to
search the project - it's likely that another user has already reported the
issue you're facing, or it's a known issue that we're already aware of.
## How to Contribute
Pull Requests (PRs) are the main and exclusive way to contribute code to source{d} projects.
In order for a PR to be accepted it needs to pass this list of requirements:
- The contribution must be correctly explained with natural language and providing a minimum working example that reproduces it.
- All PRs must be written idiomaticly:
- for Go: formatted according to [gofmt](https://golang.org/cmd/gofmt/), and without any warnings from [go lint](https://github.com/golang/lint) nor [go vet](https://golang.org/cmd/vet/)
- for other languages, similar constraints apply.
- They should in general include tests, and those shall pass.
- If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged.
- If the PR is a new feature, it has to come with a suite of unit tests, that tests the new functionality.
- In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS) of the project.
### Format of the commit message
Every commit message should describe what was changed, under which context and, if applicable, the GitHub issue it relates to:
```
plumbing: packp, Skip argument validations for unknown capabilities. Fixes #623
```
The format can be described more formally as follows:
```
<package>: <subpackage>, <what changed>. [Fixes #<issue-number>]
```

@ -0,0 +1,25 @@
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license
indicated in the file; or
(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source
license and I have the right under that license to submit that
work with modifications, whether created in whole or in part
by me, under the same open source license (unless I am
permitted to submit under a different license), as indicated
in the file; or
(c) The contribution was provided directly to me by some other
person who certified (a), (b) or (c) and I have not modified
it.
(d) I understand and agree that this project and the contribution
are public and that a record of the contribution (including all
personal information I submit with it, including my sign-off) is
maintained indefinitely and may be redistributed consistent with
this project or the open source license(s) involved.

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2017 Sourced Technologies, S.L.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1 @@
Alexander Bezzubov <alex@sourced.tech> (@bzz)

@ -0,0 +1,82 @@
# Package configuration
PROJECT = enry
COMMANDS = cmd/enry
# Including ci Makefile
CI_REPOSITORY ?= https://github.com/src-d/ci.git
CI_BRANCH ?= v1
CI_PATH ?= .ci
MAKEFILE := $(CI_PATH)/Makefile.main
$(MAKEFILE):
git clone --quiet --depth 1 -b $(CI_BRANCH) $(CI_REPOSITORY) $(CI_PATH);
-include $(MAKEFILE)
# Docsrv: configure the languages whose api-doc can be auto generated
LANGUAGES = go
# Docs: do not edit this
DOCS_REPOSITORY := https://github.com/src-d/docs
SHARED_PATH ?= $(shell pwd)/.docsrv-resources
DOCS_PATH ?= $(SHARED_PATH)/.docs
$(DOCS_PATH)/Makefile.inc:
git clone --quiet --depth 1 $(DOCS_REPOSITORY) $(DOCS_PATH);
-include $(DOCS_PATH)/Makefile.inc
LINGUIST_PATH = .linguist
# shared objects
RESOURCES_DIR=./.shared
LINUX_DIR=$(RESOURCES_DIR)/linux-x86-64
LINUX_SHARED_LIB=$(LINUX_DIR)/libenry.so
DARWIN_DIR=$(RESOURCES_DIR)/darwin
DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
HEADER_FILE=libenry.h
NATIVE_LIB=./shared/enry.go
$(LINGUIST_PATH):
git clone https://github.com/github/linguist.git $@
clean-linguist:
rm -rf $(LINGUIST_PATH)
clean-shared:
rm -rf $(RESOURCES_DIR)
clean: clean-linguist clean-shared
code-generate: $(LINGUIST_PATH)
mkdir -p data && \
go run internal/code-generator/main.go
ENRY_TEST_REPO="$${PWD}/.linguist" go test -v \
-run Test_GeneratorTestSuite \
./internal/code-generator/generator \
-testify.m TestUpdateGeneratorTestSuiteGold \
-update_gold
benchmarks: $(LINGUIST_PATH)
go test -run=NONE -bench=. && \
benchmarks/linguist-total.rb
benchmarks-samples: $(LINGUIST_PATH)
go test -run=NONE -bench=. -benchtime=5us && \
benchmarks/linguist-samples.rb
benchmarks-slow: $(LINGUIST_PATH)
mkdir -p benchmarks/output && \
go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h > benchmarks/output/enry_samples.bench && \
benchmarks/linguist-samples.rb 5 > benchmarks/output/linguist_samples.bench
linux-shared: $(LINUX_SHARED_LIB)
darwin-shared: $(DARWIN_SHARED_LIB)
$(DARWIN_SHARED_LIB):
mkdir -p $(DARWIN_DIR) && \
CC="o64-clang" CXX="o64-clang++" CGO_ENABLED=1 GOOS=darwin go build -buildmode=c-shared -o $(DARWIN_SHARED_LIB) $(NATIVE_LIB) && \
mv $(DARWIN_DIR)/$(HEADER_FILE) $(RESOURCES_DIR)/$(HEADER_FILE)
$(LINUX_SHARED_LIB):
mkdir -p $(LINUX_DIR) && \
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -buildmode=c-shared -o $(LINUX_SHARED_LIB) $(NATIVE_LIB) && \
mv $(LINUX_DIR)/$(HEADER_FILE) $(RESOURCES_DIR)/$(HEADER_FILE)
.PHONY: benchmarks benchmarks-samples benchmarks-slow

@ -0,0 +1,328 @@
# enry [![GoDoc](https://godoc.org/github.com/src-d/enry?status.svg)](https://godoc.org/github.com/src-d/enry) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry)
File programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*.
* [Installation](#installation)
* [Examples](#examples)
* [CLI](#cli)
* [Java bindings](#java-bindings)
* [Python bindings](#python-bindings)
* [Divergences from linguist](#divergences-from-linguist)
* [Benchmarks](#benchmarks)
* [Why Enry?](#why-enry)
* [Development](#development)
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
* [Misc](#misc)
* [Benchmark](#benchmark)
* [Faster regexp engine (optional)](#faster-regexp-engine-optional)
* [License](#license)
Installation
------------
The recommended way to install enry is to either [download a release](https://github.com/src-d/enry/releases) or
```
go get github.com/src-d/enry/cmd/enry
```
This project is now part of [source{d} Engine](https://sourced.tech/engine),
which provides the simplest way to get started with a single command.
Visit [sourced.tech/engine](https://sourced.tech/engine) for more information.
Examples
------------
```go
lang, safe := enry.GetLanguageByExtension("foo.go")
fmt.Println(lang, safe)
// result: Go true
lang, safe := enry.GetLanguageByContent("foo.m", []byte("<matlab-code>"))
fmt.Println(lang, safe)
// result: Matlab true
lang, safe := enry.GetLanguageByContent("bar.m", []byte("<objective-c-code>"))
fmt.Println(lang, safe)
// result: Objective-C true
// all strategies together
lang := enry.GetLanguage("foo.cpp", []byte("<cpp-code>"))
// result: C++ true
```
Note that the returned boolean value `safe` is set either to `true`, if there is only one possible language detected, or to `false` otherwise.
To get a list of possible languages for a given file, you can use the plural version of the detecting functions.
```go
langs := enry.GetLanguages("foo.h", []byte("<cpp-code>"))
// result: []string{"C", "C++", "Objective-C}
langs := enry.GetLanguagesByExtension("foo.asc", []byte("<content>"), nil)
// result: []string{"AGS Script", "AsciiDoc", "Public Key"}
langs := enry.GetLanguagesByFilename("Gemfile", []byte("<content>"), []string{})
// result: []string{"Ruby"}
```
CLI
------------
You can use enry as a command,
```bash
$ enry --help
enry v2.0.0 build: 05-08-2019_20_40_35 commit: 6ccf0b6, based on linguist commit: e456098
enry, A simple (and faster) implementation of github/linguist
usage: enry [-mode=(file|line|byte)] [-prog] <path>
enry [-mode=(file|line|byte)] [-prog] [-json] [-breakdown] <path>
enry [-mode=(file|line|byte)] [-prog] [-json] [-breakdown]
enry [-version]
```
and on repository root, it'll return an output similar to *linguist*'s output,
```bash
$ enry
97.71% Go
1.60% C
0.31% Shell
0.22% Java
0.07% Ruby
0.05% Makefile
0.04% Scala
0.01% Gnuplot
```
but not only the output; its flags are also the same as *linguist*'s ones,
```bash
$ enry --breakdown
97.71% Go
1.60% C
0.31% Shell
0.22% Java
0.07% Ruby
0.05% Makefile
0.04% Scala
0.01% Gnuplot
Scala
java/build.sbt
java/project/plugins.sbt
Java
java/src/main/java/tech/sourced/enry/Enry.java
java/src/main/java/tech/sourced/enry/GoUtils.java
java/src/main/java/tech/sourced/enry/Guess.java
java/src/test/java/tech/sourced/enry/EnryTest.java
Makefile
Makefile
java/Makefile
Go
benchmark_test.go
```
even the JSON flag,
```bash
$ enry --json | jq .
{
"C": [
"internal/tokenizer/flex/lex.linguist_yy.c",
"internal/tokenizer/flex/lex.linguist_yy.h",
"internal/tokenizer/flex/linguist.h",
"python/_c_enry.c",
"python/enry.c"
],
"Gnuplot": [
"benchmarks/plot-histogram.gp"
],
"Go": [
"benchmark_test.go",
```
Note that enry's CLI **_doesn't need a git repository to work_**, which is intentionally different from the linguist.
## Java bindings
Generated Java bindings using a C shared library and JNI are available under [`java`](https://github.com/src-d/enry/blob/master/java) and published on Maven at [tech.sourced:enry-java](https://mvnrepository.com/artifact/tech.sourced/enry-java) for macOS and linux.
## Python bindings
Generated Python bindings using a C shared library and cffi are not available yet and are WIP under [src-d/enry#154](https://github.com/src-d/enry/issues/154).
Divergences from linguist
------------
The `enry` library is based on the data from `github/linguist` version **v7.5.1**.
As opposed to linguist, `enry` [CLI tool](#cli) does *not* require a full Git repository in the filesystem in order to report languages.
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from linguist:
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194).
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
* Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as linguist does
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
Benchmarks
------------
Enry's language detection has been compared with Linguist's one. In order to do that, Linguist's project directory [*linguist/samples*](https://github.com/github/linguist/tree/master/samples) was used as a set of files to run benchmarks against.
We got these results:
![histogram](benchmarks/histogram/distribution.png)
The histogram shows the number of files detected (y-axis) per time interval bucket (x-axis). As one can see, most of the files were detected faster by enry.
We found few cases where enry turns slower than linguist due to
Go regexp engine being slower than Ruby's, based on [oniguruma](https://github.com/kkos/oniguruma) library, written in C.
See [instructions](#misc) for running enry with oniguruma.
Why Enry?
------------
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/?ref_=tt_cl_t2) is one of the main characters. Henry is a linguist and at the very beginning of the movie enjoys guessing the origin of people based on their accent.
"Enry Iggins" is how [Eliza Doolittle](http://www.imdb.com/character/ch0011720/?ref_=tt_cl_t1), [pronounces](https://www.youtube.com/watch?v=pwNKyTktDIE) the name of the Professor during the first half of the movie.
## Development
To build enry's CLI run:
make build
this will generate a binary in the project's root directory called `enry`.
To run the tests:
make test
### Sync with github/linguist upstream
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
In order to update to the latest release of linguist do:
```bash
$ git clone https://github.com/github/linguist.git .linguist
$ cd .linguist; git checkout <release-tag>; cd ..
# put the new release's commit sha in the generator_test.go (to re-generate .gold test fixtures)
# https://github.com/src-d/enry/blob/13d3d66d37a87f23a013246a1b0678c9ee3d524b/internal/code-generator/generator/generator_test.go#L18
$ make code-generate
```
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files:
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time.
When submitting a pull request syncing up to a new release, please make sure it only contains the changes in
the generated files (in [data](https://github.com/src-d/enry/blob/master/data) subdirectory).
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](##divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc).
## Misc
<details>
### Benchmark
All benchmark scripts are in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory.
#### Dependencies
As benchmarks depend on Ruby and Github-Linguist gem make sure you have:
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
#### Quick benchmark
To run quicker benchmarks you can either:
make benchmarks
to get average times for the main detection function and strategies for the whole samples set or:
make benchmarks-samples
if you want to see measures per sample file.
#### Full benchmark
If you want to reproduce the same benchmarks as reported above:
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram.
### Faster regexp engine (optional)
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine.
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project.
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library.
On macOS with [Homebrew](https://brew.sh/), it is:
```
brew install oniguruma
```
On Ubuntu, it is
```
sudo apt install libonig-dev
```
To build enry with Oniguruma regexps use the `oniguruma` build tag
```
go get -v -t --tags oniguruma ./...
```
and then rebuild the project.
</details>
License
------------
Apache License, Version 2.0. See [LICENSE](LICENSE)

@ -0,0 +1,107 @@
package enry
import (
"math"
"sort"
"github.com/src-d/enry/v2/internal/tokenizer"
)
// Classifier is the interface in charge to detect the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
Classify(content []byte, candidates map[string]float64) (languages []string)
}
type classifier struct {
languagesLogProbabilities map[string]float64
tokensLogProbabilities map[string]map[string]float64
tokensTotal float64
}
type scoredLanguage struct {
language string
score float64
}
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
var languages map[string]float64
if len(candidates) == 0 {
languages = c.knownLangs()
} else {
languages = make(map[string]float64, len(candidates))
for candidate, weight := range candidates {
if lang, ok := GetLanguageByAlias(candidate); ok {
candidate = lang
}
languages[candidate] = weight
}
}
empty := len(content) == 0
scoredLangs := make([]*scoredLanguage, 0, len(languages))
var tokens []string
if !empty {
tokens = tokenizer.Tokenize(content)
}
for language := range languages {
score := c.languagesLogProbabilities[language]
if !empty {
score += c.tokensLogProbability(tokens, language)
}
scoredLangs = append(scoredLangs, &scoredLanguage{
language: language,
score: score,
})
}
return sortLanguagesByScore(scoredLangs)
}
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
sort.Stable(byScore(scoredLangs))
sortedLanguages := make([]string, 0, len(scoredLangs))
for _, scoredLang := range scoredLangs {
sortedLanguages = append(sortedLanguages, scoredLang.language)
}
return sortedLanguages
}
func (c *classifier) knownLangs() map[string]float64 {
langs := make(map[string]float64, len(c.languagesLogProbabilities))
for lang := range c.languagesLogProbabilities {
langs[lang]++
}
return langs
}
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
var sum float64
for _, token := range tokens {
sum += c.tokenProbability(token, language)
}
return sum
}
func (c *classifier) tokenProbability(token, language string) float64 {
tokenProb, ok := c.tokensLogProbabilities[language][token]
if !ok {
tokenProb = math.Log(1.000000 / c.tokensTotal)
}
return tokenProb
}
type byScore []*scoredLanguage
func (b byScore) Len() int { return len(b) }
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score }

@ -0,0 +1,472 @@
package enry
import (
"bufio"
"bytes"
"path/filepath"
"strings"
"github.com/src-d/enry/v2/data"
"github.com/src-d/enry/v2/regex"
)
// OtherLanguage is used as a zero value when a function can not return a specific language.
const OtherLanguage = ""
// Strategy type fix the signature for the functions that can be used as a strategy.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
// DefaultStrategies is a sequence of strategies used by GetLanguage to detect languages.
var DefaultStrategies = []Strategy{
GetLanguagesByModeline,
GetLanguagesByFilename,
GetLanguagesByShebang,
GetLanguagesByExtension,
GetLanguagesByContent,
GetLanguagesByClassifier,
}
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
var DefaultClassifier Classifier = &classifier{
languagesLogProbabilities: data.LanguagesLogProbabilities,
tokensLogProbabilities: data.TokensLogProbabilities,
tokensTotal: data.TokensTotal,
}
// GetLanguage applies a sequence of strategies based on the given filename and content
// to find out the most probably language to return.
func GetLanguage(filename string, content []byte) (language string) {
languages := GetLanguages(filename, content)
return firstLanguage(languages)
}
func firstLanguage(languages []string) string {
for _, l := range languages {
if l != "" {
return l
}
}
return OtherLanguage
}
// GetLanguageByModeline returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByModeline(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil)
}
// GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil)
}
// GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByVimModeline(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil)
}
// GetLanguageByFilename returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByFilename(filename string) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil)
}
// GetLanguageByShebang returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByShebang(content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil)
}
// GetLanguageByExtension returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByExtension(filename string) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil)
}
// GetLanguageByContent returns detected language. If there are more than one possibles languages
// it returns the first language by alphabetically order and safe to false.
func GetLanguageByContent(filename string, content []byte) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil)
}
// GetLanguageByClassifier returns the most probably language detected for the given content. It uses
// DefaultClassifier, if no candidates are provided it returns OtherLanguage.
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) {
return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates)
}
func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) {
languages := strategy(filename, content, candidates)
return getFirstLanguageAndSafe(languages)
}
func getFirstLanguageAndSafe(languages []string) (language string, safe bool) {
language = firstLanguage(languages)
safe = len(languages) == 1
return
}
// GetLanguageBySpecificClassifier returns the most probably language for the given content using
// classifier to detect language.
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) {
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier)
return getFirstLanguageAndSafe(languages)
}
// GetLanguages applies a sequence of strategies based on the given filename and content
// to find out the most probably languages to return.
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
// The function won't read the file, given an empty content.
func GetLanguages(filename string, content []byte) []string {
if IsBinary(content) {
return nil
}
var languages []string
candidates := []string{}
for _, strategy := range DefaultStrategies {
languages = strategy(filename, content, candidates)
if len(languages) == 1 {
return languages
}
if len(languages) > 0 {
candidates = append(candidates, languages...)
}
}
return languages
}
// GetLanguagesByModeline returns a slice of possible languages for the given content.
// It complies with the signature to be a Strategy type.
func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string {
headFoot := getHeaderAndFooter(content)
var languages []string
for _, getLang := range modelinesFunc {
languages = getLang("", headFoot, candidates)
if len(languages) > 0 {
break
}
}
return languages
}
var modelinesFunc = []Strategy{
GetLanguagesByEmacsModeline,
GetLanguagesByVimModeline,
}
func getHeaderAndFooter(content []byte) []byte {
const searchScope = 5
if len(content) == 0 {
return content
}
if bytes.Count(content, []byte("\n")) < 2*searchScope {
return content
}
header := headScope(content, searchScope)
footer := footScope(content, searchScope)
headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:]))
headerAndFooter = append(headerAndFooter, content[:header]...)
headerAndFooter = append(headerAndFooter, content[footer:]...)
return headerAndFooter
}
func headScope(content []byte, scope int) (index int) {
for i := 0; i < scope; i++ {
eol := bytes.IndexAny(content, "\n")
content = content[eol+1:]
index += eol
}
return index + scope - 1
}
func footScope(content []byte, scope int) (index int) {
for i := 0; i < scope; i++ {
index = bytes.LastIndexAny(content, "\n")
content = content[:index]
}
return index + 1
}
var (
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`)
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`)
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`)
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`)
)
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
// It complies with the signature to be a Strategy type.
func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string {
matched := reEmacsModeline.FindAllSubmatch(content, -1)
if matched == nil {
return nil
}
// only take the last matched line, discard previous lines
lastLineMatched := matched[len(matched)-1][1]
matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched)
var alias string
if matchedAlias != nil {
alias = string(matchedAlias[1])
} else {
alias = string(lastLineMatched)
}
language, ok := GetLanguageByAlias(alias)
if !ok {
return nil
}
return []string{language}
}
// GetLanguagesByVimModeline returns a slice of possible languages for the given content.
// It complies with the signature to be a Strategy type.
func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string {
matched := reVimModeline.FindAllSubmatch(content, -1)
if matched == nil {
return nil
}
// only take the last matched line, discard previous lines
lastLineMatched := matched[len(matched)-1][1]
matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1)
if matchedAlias == nil {
return nil
}
alias := string(matchedAlias[0][1])
if len(matchedAlias) > 1 {
// cases:
// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
for _, match := range matchedAlias {
otherAlias := string(match[1])
if otherAlias != alias {
return nil
}
}
}
language, ok := GetLanguageByAlias(alias)
if !ok {
return nil
}
return []string{language}
}
// GetLanguagesByFilename returns a slice of possible languages for the given filename.
// It complies with the signature to be a Strategy type.
func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string {
if filename == "" {
return nil
}
return data.LanguagesByFilename[filepath.Base(filename)]
}
// GetLanguagesByShebang returns a slice of possible languages for the given content.
// It complies with the signature to be a Strategy type.
func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) {
interpreter := getInterpreter(content)
return data.LanguagesByInterpreter[interpreter]
}
var (
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`)
pythonVersion = regex.MustCompile(`python\d\.\d+`)
)
func getInterpreter(data []byte) (interpreter string) {
line := getFirstLine(data)
if !hasShebang(line) {
return ""
}
// skip shebang
line = bytes.TrimSpace(line[2:])
splitted := bytes.Fields(line)
if len(splitted) == 0 {
return ""
}
if bytes.Contains(splitted[0], []byte("env")) {
if len(splitted) > 1 {
interpreter = string(splitted[1])
}
} else {
splittedPath := bytes.Split(splitted[0], []byte{'/'})
interpreter = string(splittedPath[len(splittedPath)-1])
}
if interpreter == "sh" {
interpreter = lookForMultilineExec(data)
}
if pythonVersion.MatchString(interpreter) {
interpreter = interpreter[:strings.Index(interpreter, `.`)]
}
// If osascript is called with argument -l it could be different language so do not relay on it
// To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) {
interpreter = ""
}
return
}
func getFirstLine(data []byte) []byte {
buf := bufio.NewScanner(bytes.NewReader(data))
buf.Scan()
line := buf.Bytes()
if err := buf.Err(); err != nil {
return nil
}
return line
}
func hasShebang(line []byte) bool {
const shebang = `#!`
prefix := []byte(shebang)
return bytes.HasPrefix(line, prefix)
}
func lookForMultilineExec(data []byte) string {
const magicNumOfLines = 5
interpreter := "sh"
buf := bufio.NewScanner(bytes.NewReader(data))
for i := 0; i < magicNumOfLines && buf.Scan(); i++ {
line := buf.Bytes()
if shebangExecHack.Match(line) {
interpreter = shebangExecHack.FindStringSubmatch(string(line))[1]
break
}
}
if err := buf.Err(); err != nil {
return interpreter
}
return interpreter
}
// GetLanguagesByExtension returns a slice of possible languages for the given filename.
// It complies with the signature to be a Strategy type.
func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string {
if !strings.Contains(filename, ".") {
return nil
}
filename = strings.ToLower(filename)
dots := getDotIndexes(filename)
for _, dot := range dots {
ext := filename[dot:]
languages, ok := data.LanguagesByExtension[ext]
if ok {
return languages
}
}
return nil
}
func getDotIndexes(filename string) []int {
dots := make([]int, 0, 2)
for i, letter := range filename {
if letter == rune('.') {
dots = append(dots, i)
}
}
return dots
}
// GetLanguagesByContent returns a slice of languages for the given content.
// It is a Strategy that uses content-based regexp heuristics and a filename extension.
func GetLanguagesByContent(filename string, content []byte, _ []string) []string {
if filename == "" {
return nil
}
ext := strings.ToLower(filepath.Ext(filename))
heuristic, ok := data.ContentHeuristics[ext]
if !ok {
return nil
}
return heuristic.Match(content)
}
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
// decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) {
if len(candidates) == 0 {
return nil
}
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier)
}
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) {
mapCandidates := make(map[string]float64)
for _, candidate := range candidates {
mapCandidates[candidate]++
}
return classifier.Classify(content, mapCandidates)
}
// GetLanguageExtensions returns the different extensions being used by the language.
func GetLanguageExtensions(language string) []string {
return data.ExtensionsByLanguage[language]
}
// Type represent language's type. Either data, programming, markup, prose, or unknown.
type Type int
// Type's values.
const (
Unknown Type = iota
Data
Programming
Markup
Prose
)
// GetLanguageType returns the type of the given language.
func GetLanguageType(language string) (langType Type) {
intType, ok := data.LanguagesType[language]
langType = Type(intType)
if !ok {
langType = Unknown
}
return langType
}
// GetLanguageByAlias returns either the language related to the given alias and ok set to true
// or Otherlanguage and ok set to false if the alias is not recognized.
func GetLanguageByAlias(alias string) (lang string, ok bool) {
lang, ok = data.LanguageByAlias(alias)
if !ok {
lang = OtherLanguage
}
return
}

@ -0,0 +1,783 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
import "strings"
// LanguageByAliasMap keeps alias for different languages and use the name of the languages as an alias too.
// All the keys (alias or not) are written in lower case and the whitespaces has been replaced by underscores.
var LanguageByAliasMap = map[string]string{
"1c_enterprise": "1C Enterprise",
"abap": "ABAP",
"abl": "OpenEdge ABL",
"abnf": "ABNF",
"abuild": "Alpine Abuild",
"acfm": "Adobe Font Metrics",
"aconf": "ApacheConf",
"actionscript": "ActionScript",
"actionscript3": "ActionScript",
"actionscript_3": "ActionScript",
"ada": "Ada",
"ada2005": "Ada",
"ada95": "Ada",
"adobe_composite_font_metrics": "Adobe Font Metrics",
"adobe_font_metrics": "Adobe Font Metrics",
"adobe_multiple_font_metrics": "Adobe Font Metrics",
"advpl": "xBase",
"afdko": "OpenType Feature File",
"agda": "Agda",
"ags": "AGS Script",
"ags_script": "AGS Script",
"ahk": "AutoHotkey",
"alloy": "Alloy",
"alpine_abuild": "Alpine Abuild",
"altium": "Altium Designer",
"altium_designer": "Altium Designer",
"amfm": "Adobe Font Metrics",
"ampl": "AMPL",
"angelscript": "AngelScript",
"ant_build_system": "Ant Build System",
"antlr": "ANTLR",
"apache": "ApacheConf",
"apacheconf": "ApacheConf",
"apex": "Apex",
"api_blueprint": "API Blueprint",
"apkbuild": "Alpine Abuild",
"apl": "APL",
"apollo_guidance_computer": "Apollo Guidance Computer",
"applescript": "AppleScript",
"arc": "Arc",
"arexx": "REXX",
"as3": "ActionScript",
"asciidoc": "AsciiDoc",
"asm": "Assembly",
"asn.1": "ASN.1",
"asp": "ASP",
"aspectj": "AspectJ",
"aspx": "ASP",
"aspx-vb": "ASP",
"assembly": "Assembly",
"asymptote": "Asymptote",
"ats": "ATS",
"ats2": "ATS",
"au3": "AutoIt",
"augeas": "Augeas",
"autoconf": "M4Sugar",
"autohotkey": "AutoHotkey",
"autoit": "AutoIt",
"autoit3": "AutoIt",
"autoitscript": "AutoIt",
"awk": "Awk",
"b3d": "BlitzBasic",
"ballerina": "Ballerina",
"bash": "Shell",
"bash_session": "ShellSession",
"bat": "Batchfile",
"batch": "Batchfile",
"batchfile": "Batchfile",
"befunge": "Befunge",
"bison": "Bison",
"bitbake": "BitBake",
"blade": "Blade",
"blitz3d": "BlitzBasic",
"blitzbasic": "BlitzBasic",
"blitzmax": "BlitzMax",
"blitzplus": "BlitzBasic",
"bluespec": "Bluespec",
"bmax": "BlitzMax",
"boo": "Boo",
"bplus": "BlitzBasic",
"brainfuck": "Brainfuck",
"brightscript": "Brightscript",
"bro": "Zeek",
"bsdmake": "Makefile",
"byond": "DM",
"c": "C",
"c#": "C#",
"c++": "C++",
"c++-objdump": "Cpp-ObjDump",
"c-objdump": "C-ObjDump",
"c2hs": "C2hs Haskell",
"c2hs_haskell": "C2hs Haskell",
"cabal": "Cabal Config",
"cabal_config": "Cabal Config",
"cap'n_proto": "Cap'n Proto",
"carto": "CartoCSS",
"cartocss": "CartoCSS",
"ceylon": "Ceylon",
"cfc": "ColdFusion CFC",
"cfm": "ColdFusion",
"cfml": "ColdFusion",
"chapel": "Chapel",
"charity": "Charity",
"chpl": "Chapel",
"chuck": "ChucK",
"cirru": "Cirru",
"clarion": "Clarion",
"clean": "Clean",
"click": "Click",
"clipper": "xBase",
"clips": "CLIPS",
"clojure": "Clojure",
"closure_templates": "Closure Templates",
"cloud_firestore_security_rules": "Cloud Firestore Security Rules",
"cmake": "CMake",
"cobol": "COBOL",
"coffee": "CoffeeScript",
"coffee-script": "CoffeeScript",
"coffeescript": "CoffeeScript",
"coldfusion": "ColdFusion",
"coldfusion_cfc": "ColdFusion CFC",
"coldfusion_html": "ColdFusion",
"collada": "COLLADA",
"common_lisp": "Common Lisp",
"common_workflow_language": "Common Workflow Language",
"component_pascal": "Component Pascal",
"conll": "CoNLL-U",
"conll-u": "CoNLL-U",
"conll-x": "CoNLL-U",
"console": "ShellSession",
"cool": "Cool",
"coq": "Coq",
"cperl": "Perl",
"cpp": "C++",
"cpp-objdump": "Cpp-ObjDump",
"creole": "Creole",
"crystal": "Crystal",
"csharp": "C#",
"cson": "CSON",
"csound": "Csound",
"csound-csd": "Csound Document",
"csound-orc": "Csound",
"csound-sco": "Csound Score",
"csound_document": "Csound Document",
"csound_score": "Csound Score",
"css": "CSS",
"csv": "CSV",
"cucumber": "Gherkin",
"cuda": "Cuda",
"cweb": "CWeb",
"cwl": "Common Workflow Language",
"cycript": "Cycript",
"cython": "Cython",
"d": "D",
"d-objdump": "D-ObjDump",
"darcs_patch": "Darcs Patch",
"dart": "Dart",
"dataweave": "DataWeave",
"dcl": "DIGITAL Command Language",
"delphi": "Component Pascal",
"desktop": "desktop",
"dhall": "Dhall",
"diff": "Diff",
"digital_command_language": "DIGITAL Command Language",
"django": "HTML+Django",
"dm": "DM",
"dns_zone": "DNS Zone",
"dockerfile": "Dockerfile",
"dogescript": "Dogescript",
"dosbatch": "Batchfile",
"dosini": "INI",
"dpatch": "Darcs Patch",
"dtrace": "DTrace",
"dtrace-script": "DTrace",
"dylan": "Dylan",
"e": "E",
"eagle": "Eagle",
"easybuild": "Easybuild",
"ebnf": "EBNF",
"ec": "eC",
"ecere_projects": "Ecere Projects",
"ecl": "ECL",
"eclipse": "ECLiPSe",
"ecr": "HTML+ECR",
"editor-config": "EditorConfig",
"editorconfig": "EditorConfig",
"edje_data_collection": "Edje Data Collection",
"edn": "edn",
"eeschema_schematic": "KiCad Schematic",
"eex": "HTML+EEX",
"eiffel": "Eiffel",
"ejs": "EJS",
"elisp": "Emacs Lisp",
"elixir": "Elixir",
"elm": "Elm",
"emacs": "Emacs Lisp",
"emacs_lisp": "Emacs Lisp",
"emberscript": "EmberScript",
"eml": "EML",
"eq": "EQ",
"erb": "HTML+ERB",
"erlang": "Erlang",
"f#": "F#",
"f*": "F*",
"factor": "Factor",
"fancy": "Fancy",
"fantom": "Fantom",
"figfont": "FIGlet Font",
"figlet_font": "FIGlet Font",
"filebench_wml": "Filebench WML",
"filterscript": "Filterscript",
"fish": "fish",
"flex": "Lex",
"flux": "FLUX",
"formatted": "Formatted",
"forth": "Forth",
"fortran": "Fortran",
"foxpro": "xBase",
"freemarker": "FreeMarker",
"frege": "Frege",
"fsharp": "F#",
"fstar": "F*",
"ftl": "FreeMarker",
"fundamental": "Text",
"g-code": "G-code",
"game_maker_language": "Game Maker Language",
"gams": "GAMS",
"gap": "GAP",
"gcc_machine_description": "GCC Machine Description",
"gdb": "GDB",
"gdscript": "GDScript",
"genie": "Genie",
"genshi": "Genshi",
"gentoo_ebuild": "Gentoo Ebuild",
"gentoo_eclass": "Gentoo Eclass",
"gerber_image": "Gerber Image",
"gettext_catalog": "Gettext Catalog",
"gf": "Grammatical Framework",
"gherkin": "Gherkin",
"git-ignore": "Ignore List",
"git_attributes": "Git Attributes",
"git_config": "Git Config",
"gitattributes": "Git Attributes",
"gitconfig": "Git Config",
"gitignore": "Ignore List",
"gitmodules": "Git Config",
"glsl": "GLSL",
"glyph": "Glyph",
"glyph_bitmap_distribution_format": "Glyph Bitmap Distribution Format",
"gn": "GN",
"gnuplot": "Gnuplot",
"go": "Go",
"golang": "Go",
"golo": "Golo",
"gosu": "Gosu",
"grace": "Grace",
"gradle": "Gradle",
"grammatical_framework": "Grammatical Framework",
"graph_modeling_language": "Graph Modeling Language",
"graphql": "GraphQL",
"graphviz_(dot)": "Graphviz (DOT)",
"groff": "Roff",
"groovy": "Groovy",
"groovy_server_pages": "Groovy Server Pages",
"gsp": "Groovy Server Pages",
"hack": "Hack",
"haml": "Haml",
"handlebars": "Handlebars",
"haproxy": "HAProxy",
"harbour": "Harbour",
"haskell": "Haskell",
"haxe": "Haxe",
"hbs": "Handlebars",
"hcl": "HCL",
"hiveql": "HiveQL",
"hlsl": "HLSL",
"holyc": "HolyC",
"html": "HTML",
"html+django": "HTML+Django",
"html+django/jinja": "HTML+Django",
"html+ecr": "HTML+ECR",
"html+eex": "HTML+EEX",
"html+erb": "HTML+ERB",
"html+jinja": "HTML+Django",
"html+php": "HTML+PHP",
"html+razor": "HTML+Razor",
"html+ruby": "RHTML",
"htmlbars": "Handlebars",
"htmldjango": "HTML+Django",
"http": "HTTP",
"hxml": "HXML",
"hy": "Hy",
"hylang": "Hy",
"hyphy": "HyPhy",
"i7": "Inform 7",
"idl": "IDL",
"idris": "Idris",
"ignore": "Ignore List",
"ignore_list": "Ignore List",
"igor": "IGOR Pro",
"igor_pro": "IGOR Pro",
"igorpro": "IGOR Pro",
"inc": "PHP",
"inform7": "Inform 7",
"inform_7": "Inform 7",
"ini": "INI",
"inno_setup": "Inno Setup",
"io": "Io",
"ioke": "Ioke",
"ipython_notebook": "Jupyter Notebook",
"irc": "IRC log",
"irc_log": "IRC log",
"irc_logs": "IRC log",
"isabelle": "Isabelle",
"isabelle_root": "Isabelle ROOT",
"j": "J",
"jasmin": "Jasmin",
"java": "Java",
"java_properties": "Java Properties",
"java_server_page": "Groovy Server Pages",
"java_server_pages": "Java Server Pages",
"javascript": "JavaScript",
"javascript+erb": "JavaScript+ERB",
"jflex": "JFlex",
"jison": "Jison",
"jison_lex": "Jison Lex",
"jolie": "Jolie",
"jruby": "Ruby",
"js": "JavaScript",
"json": "JSON",
"json5": "JSON5",
"json_with_comments": "JSON with Comments",
"jsonc": "JSON with Comments",
"jsoniq": "JSONiq",
"jsonld": "JSONLD",
"jsonnet": "Jsonnet",
"jsp": "Java Server Pages",
"jsx": "JSX",
"julia": "Julia",
"jupyter_notebook": "Jupyter Notebook",
"kicad_layout": "KiCad Layout",
"kicad_legacy_layout": "KiCad Legacy Layout",
"kicad_schematic": "KiCad Schematic",
"kit": "Kit",
"kotlin": "Kotlin",
"krl": "KRL",
"labview": "LabVIEW",
"lasso": "Lasso",
"lassoscript": "Lasso",
"latex": "TeX",
"latte": "Latte",
"lean": "Lean",
"less": "Less",
"lex": "Lex",
"lfe": "LFE",
"lhaskell": "Literate Haskell",
"lhs": "Literate Haskell",
"lilypond": "LilyPond",
"limbo": "Limbo",
"linker_script": "Linker Script",
"linux_kernel_module": "Linux Kernel Module",
"liquid": "Liquid",
"lisp": "Common Lisp",
"litcoffee": "Literate CoffeeScript",
"literate_agda": "Literate Agda",
"literate_coffeescript": "Literate CoffeeScript",
"literate_haskell": "Literate Haskell",
"live-script": "LiveScript",
"livescript": "LiveScript",
"llvm": "LLVM",
"logos": "Logos",
"logtalk": "Logtalk",
"lolcode": "LOLCODE",
"lookml": "LookML",
"loomscript": "LoomScript",
"ls": "LiveScript",
"lsl": "LSL",
"ltspice_symbol": "LTspice Symbol",
"lua": "Lua",
"m": "M",
"m4": "M4",
"m4sugar": "M4Sugar",
"macruby": "Ruby",
"make": "Makefile",
"makefile": "Makefile",
"mako": "Mako",
"man": "Roff",
"man-page": "Roff",
"man_page": "Roff",
"manpage": "Roff",
"markdown": "Markdown",
"marko": "Marko",
"markojs": "Marko",
"mask": "Mask",
"mathematica": "Mathematica",
"matlab": "MATLAB",
"maven_pom": "Maven POM",
"max": "Max",
"max/msp": "Max",
"maxmsp": "Max",
"maxscript": "MAXScript",
"mcfunction": "mcfunction",
"mdoc": "Roff",
"mediawiki": "MediaWiki",
"mercury": "Mercury",
"meson": "Meson",
"metal": "Metal",
"mf": "Makefile",
"minid": "MiniD",
"mirah": "Mirah",
"mma": "Mathematica",
"modelica": "Modelica",
"modula-2": "Modula-2",
"modula-3": "Modula-3",
"module_management_system": "Module Management System",
"monkey": "Monkey",
"moocode": "Moocode",
"moonscript": "MoonScript",
"motorola_68k_assembly": "Motorola 68K Assembly",
"mql4": "MQL4",
"mql5": "MQL5",
"mtml": "MTML",
"muf": "MUF",
"mumps": "M",
"mupad": "mupad",
"myghty": "Myghty",
"nanorc": "nanorc",
"nasm": "Assembly",
"ncl": "NCL",
"nearley": "Nearley",
"nemerle": "Nemerle",
"nesc": "nesC",
"netlinx": "NetLinx",
"netlinx+erb": "NetLinx+ERB",
"netlogo": "NetLogo",
"newlisp": "NewLisp",
"nextflow": "Nextflow",
"nginx": "Nginx",
"nginx_configuration_file": "Nginx",
"nim": "Nim",
"ninja": "Ninja",
"nit": "Nit",
"nix": "Nix",
"nixos": "Nix",
"njk": "HTML+Django",
"nl": "NL",
"node": "JavaScript",
"nroff": "Roff",
"nsis": "NSIS",
"nu": "Nu",
"numpy": "NumPy",
"nunjucks": "HTML+Django",
"nush": "Nu",
"nvim": "Vim script",
"obj-c": "Objective-C",
"obj-c++": "Objective-C++",
"obj-j": "Objective-J",
"objc": "Objective-C",
"objc++": "Objective-C++",
"objdump": "ObjDump",
"objective-c": "Objective-C",
"objective-c++": "Objective-C++",
"objective-j": "Objective-J",
"objectivec": "Objective-C",
"objectivec++": "Objective-C++",
"objectivej": "Objective-J",
"objectpascal": "Component Pascal",
"objectscript": "ObjectScript",
"objj": "Objective-J",
"ocaml": "OCaml",
"octave": "MATLAB",
"omgrofl": "Omgrofl",
"oncrpc": "RPC",
"ooc": "ooc",
"opa": "Opa",
"opal": "Opal",
"opencl": "OpenCL",
"openedge": "OpenEdge ABL",
"openedge_abl": "OpenEdge ABL",
"openrc": "OpenRC runscript",
"openrc_runscript": "OpenRC runscript",
"openscad": "OpenSCAD",
"opentype_feature_file": "OpenType Feature File",
"org": "Org",
"osascript": "AppleScript",
"ox": "Ox",
"oxygene": "Oxygene",
"oz": "Oz",
"p4": "P4",
"pan": "Pan",
"pandoc": "Markdown",
"papyrus": "Papyrus",
"parrot": "Parrot",
"parrot_assembly": "Parrot Assembly",
"parrot_internal_representation": "Parrot Internal Representation",
"pascal": "Pascal",
"pasm": "Parrot Assembly",
"pawn": "Pawn",
"pcbnew": "KiCad Layout",
"pep8": "Pep8",
"perl": "Perl",
"perl6": "Perl 6",
"perl_6": "Perl 6",
"php": "PHP",
"pic": "Pic",
"pickle": "Pickle",
"picolisp": "PicoLisp",
"piglatin": "PigLatin",
"pike": "Pike",
"pir": "Parrot Internal Representation",
"plpgsql": "PLpgSQL",
"plsql": "PLSQL",
"pod": "Pod",
"pod_6": "Pod 6",
"pogoscript": "PogoScript",
"pony": "Pony",
"posh": "PowerShell",
"postcss": "PostCSS",
"postscr": "PostScript",
"postscript": "PostScript",
"pot": "Gettext Catalog",
"pov-ray": "POV-Ray SDL",
"pov-ray_sdl": "POV-Ray SDL",
"povray": "POV-Ray SDL",
"powerbuilder": "PowerBuilder",
"powershell": "PowerShell",
"processing": "Processing",
"progress": "OpenEdge ABL",
"prolog": "Prolog",
"propeller_spin": "Propeller Spin",
"protobuf": "Protocol Buffer",
"protocol_buffer": "Protocol Buffer",
"protocol_buffers": "Protocol Buffer",
"public_key": "Public Key",
"pug": "Pug",
"puppet": "Puppet",
"pure_data": "Pure Data",
"purebasic": "PureBasic",
"purescript": "PureScript",
"pwsh": "PowerShell",
"pycon": "Python console",
"pyrex": "Cython",
"python": "Python",
"python3": "Python",
"python_console": "Python console",
"python_traceback": "Python traceback",
"q": "q",
"qmake": "QMake",
"qml": "QML",
"quake": "Quake",
"r": "R",
"racket": "Racket",
"ragel": "Ragel",
"ragel-rb": "Ragel",
"ragel-ruby": "Ragel",
"rake": "Ruby",
"raml": "RAML",
"rascal": "Rascal",
"raw": "Raw token data",
"raw_token_data": "Raw token data",
"razor": "HTML+Razor",
"rb": "Ruby",
"rbx": "Ruby",
"rdoc": "RDoc",
"realbasic": "REALbasic",
"reason": "Reason",
"rebol": "Rebol",
"red": "Red",
"red/system": "Red",
"redcode": "Redcode",
"regex": "Regular Expression",
"regexp": "Regular Expression",
"regular_expression": "Regular Expression",
"ren'py": "Ren'Py",
"renderscript": "RenderScript",
"renpy": "Ren'Py",
"restructuredtext": "reStructuredText",
"rexx": "REXX",
"rhtml": "RHTML",
"rich_text_format": "Rich Text Format",
"ring": "Ring",
"rmarkdown": "RMarkdown",
"robotframework": "RobotFramework",
"roff": "Roff",
"roff_manpage": "Roff Manpage",
"rouge": "Rouge",
"rpc": "RPC",
"rpcgen": "RPC",
"rpm_spec": "RPM Spec",
"rs-274x": "Gerber Image",
"rscript": "R",
"rss": "XML",
"rst": "reStructuredText",
"ruby": "Ruby",
"runoff": "RUNOFF",
"rust": "Rust",
"rusthon": "Python",
"sage": "Sage",
"salt": "SaltStack",
"saltstack": "SaltStack",
"saltstate": "SaltStack",
"sas": "SAS",
"sass": "Sass",
"scala": "Scala",
"scaml": "Scaml",
"scheme": "Scheme",
"scilab": "Scilab",
"scss": "SCSS",
"sed": "sed",
"self": "Self",
"sh": "Shell",
"shaderlab": "ShaderLab",
"shell": "Shell",
"shell-script": "Shell",
"shellsession": "ShellSession",
"shen": "Shen",
"slash": "Slash",
"slice": "Slice",
"slim": "Slim",
"smali": "Smali",
"smalltalk": "Smalltalk",
"smarty": "Smarty",
"sml": "Standard ML",
"smt": "SMT",
"snippet": "YASnippet",
"solidity": "Solidity",
"sourcemod": "SourcePawn",
"sourcepawn": "SourcePawn",
"soy": "Closure Templates",
"sparql": "SPARQL",
"specfile": "RPM Spec",
"spline_font_database": "Spline Font Database",
"splus": "R",
"sqf": "SQF",
"sql": "SQL",
"sqlpl": "SQLPL",
"squeak": "Smalltalk",
"squirrel": "Squirrel",
"srecode_template": "SRecode Template",
"ssh_config": "SSH Config",
"stan": "Stan",
"standard_ml": "Standard ML",
"stata": "Stata",
"ston": "STON",
"stylus": "Stylus",
"subrip_text": "SubRip Text",
"sugarss": "SugarSS",
"supercollider": "SuperCollider",
"svelte": "Svelte",
"svg": "SVG",
"swift": "Swift",
"systemverilog": "SystemVerilog",
"tcl": "Tcl",
"tcsh": "Tcsh",
"tea": "Tea",
"terra": "Terra",
"terraform": "HCL",
"tex": "TeX",
"text": "Text",
"textile": "Textile",
"thrift": "Thrift",
"ti_program": "TI Program",
"tl": "Type Language",
"tla": "TLA",
"toml": "TOML",
"troff": "Roff",
"ts": "TypeScript",
"tsql": "TSQL",
"tsx": "TSX",
"turing": "Turing",
"turtle": "Turtle",
"twig": "Twig",
"txl": "TXL",
"type_language": "Type Language",
"typescript": "TypeScript",
"udiff": "Diff",
"unified_parallel_c": "Unified Parallel C",
"unity3d_asset": "Unity3D Asset",
"unix_assembly": "Unix Assembly",
"uno": "Uno",
"unrealscript": "UnrealScript",
"ur": "UrWeb",
"ur/web": "UrWeb",
"urweb": "UrWeb",
"vala": "Vala",
"vb.net": "Visual Basic",
"vbnet": "Visual Basic",
"vcl": "VCL",
"verilog": "Verilog",
"vhdl": "VHDL",
"vim": "Vim script",
"vim_script": "Vim script",
"viml": "Vim script",
"visual_basic": "Visual Basic",
"volt": "Volt",
"vue": "Vue",
"wasm": "WebAssembly",
"wast": "WebAssembly",
"wavefront_material": "Wavefront Material",
"wavefront_object": "Wavefront Object",
"wdl": "wdl",
"web_ontology_language": "Web Ontology Language",
"webassembly": "WebAssembly",
"webidl": "WebIDL",
"webvtt": "WebVTT",
"winbatch": "Batchfile",
"windows_registry_entries": "Windows Registry Entries",
"wisp": "wisp",
"wollok": "Wollok",
"world_of_warcraft_addon_data": "World of Warcraft Addon Data",
"wsdl": "XML",
"x10": "X10",
"x_bitmap": "X BitMap",
"x_font_directory_index": "X Font Directory Index",
"x_pixmap": "X PixMap",
"xbase": "xBase",
"xbm": "X BitMap",
"xc": "XC",
"xcompose": "XCompose",
"xdr": "RPC",
"xhtml": "HTML",
"xml": "XML",
"xml+genshi": "Genshi",
"xml+kid": "Genshi",
"xojo": "Xojo",
"xpages": "XPages",
"xpm": "X PixMap",
"xproc": "XProc",
"xquery": "XQuery",
"xs": "XS",
"xsd": "XML",
"xsl": "XSLT",
"xslt": "XSLT",
"xten": "X10",
"xtend": "Xtend",
"yacc": "Yacc",
"yaml": "YAML",
"yang": "YANG",
"yara": "YARA",
"yas": "YASnippet",
"yasnippet": "YASnippet",
"yml": "YAML",
"zap": "ZAP",
"zeek": "Zeek",
"zenscript": "ZenScript",
"zephir": "Zephir",
"zig": "Zig",
"zil": "ZIL",
"zimpl": "Zimpl",
"zsh": "Shell",
}
// LanguageByAlias looks up the language name by it's alias or name.
// It mirrors the logic of github linguist and is needed e.g for heuristcs.yml
// that mixes names and aliases in a language field (see XPM example).
func LanguageByAlias(langOrAlias string) (lang string, ok bool) {
k := convertToAliasKey(langOrAlias)
lang, ok = LanguageByAliasMap[k]
return
}
// convertToAliasKey converts language name to a key in LanguageByAliasMap.
// Following
// - internal.code-generator.generator.convertToAliasKey()
// - GetLanguageByAlias()
// conventions.
// It is here to avoid dependency on "generate" and "enry" packages.
func convertToAliasKey(langName string) string {
ak := strings.SplitN(langName, `,`, 2)[0]
ak = strings.Replace(ak, ` `, `_`, -1)
ak = strings.ToLower(ak)
return ak
}

@ -0,0 +1,254 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
var LanguagesColor = map[string]string{
"1C Enterprise": "#814CCC",
"ABAP": "#E8274B",
"AGS Script": "#B9D9FF",
"AMPL": "#E6EFBB",
"ANTLR": "#9DC3FF",
"API Blueprint": "#2ACCA8",
"APL": "#5A8164",
"ASP": "#6a40fd",
"ATS": "#1ac620",
"ActionScript": "#882B0F",
"Ada": "#02f88c",
"Agda": "#315665",
"Alloy": "#64C800",
"AngelScript": "#C7D7DC",
"AppleScript": "#101F1F",
"Arc": "#aa2afe",
"AspectJ": "#a957b0",
"Assembly": "#6E4C13",
"Asymptote": "#4a0c0c",
"AutoHotkey": "#6594b9",
"AutoIt": "#1C3552",
"Ballerina": "#FF5000",
"Batchfile": "#C1F12E",
"BlitzMax": "#cd6400",
"Boo": "#d4bec1",
"Brainfuck": "#2F2530",
"C": "#555555",
"C#": "#178600",
"C++": "#f34b7d",
"CSS": "#563d7c",
"Ceylon": "#dfa535",
"Chapel": "#8dc63f",
"Cirru": "#ccccff",
"Clarion": "#db901e",
"Clean": "#3F85AF",
"Click": "#E4E6F3",
"Clojure": "#db5855",
"CoffeeScript": "#244776",
"ColdFusion": "#ed2cd6",
"Common Lisp": "#3fb68b",
"Common Workflow Language": "#B5314C",
"Component Pascal": "#B0CE4E",
"Crystal": "#000100",
"Cuda": "#3A4E3A",
"D": "#ba595e",
"DM": "#447265",
"Dart": "#00B4AB",
"DataWeave": "#003a52",
"Dhall": "#dfafff",
"Dockerfile": "#384d54",
"Dogescript": "#cca760",
"Dylan": "#6c616e",
"E": "#ccce35",
"ECL": "#8a1267",
"EQ": "#a78649",
"Eiffel": "#946d57",
"Elixir": "#6e4a7e",
"Elm": "#60B5CC",
"Emacs Lisp": "#c065db",
"EmberScript": "#FFF4F3",
"Erlang": "#B83998",
"F#": "#b845fc",
"F*": "#572e30",
"FLUX": "#88ccff",
"Factor": "#636746",
"Fancy": "#7b9db4",
"Fantom": "#14253c",
"Forth": "#341708",
"Fortran": "#4d41b1",
"FreeMarker": "#0050b2",
"Frege": "#00cafe",
"G-code": "#D08CF2",
"GDScript": "#355570",
"Game Maker Language": "#71b417",
"Genie": "#fb855d",
"Gherkin": "#5B2063",
"Glyph": "#c1ac7f",
"Gnuplot": "#f0a9f0",
"Go": "#00ADD8",
"Golo": "#88562A",
"Gosu": "#82937f",
"Grammatical Framework": "#79aa7a",
"Groovy": "#e69f56",
"HTML": "#e34c26",
"Hack": "#878787",
"Harbour": "#0e60e3",
"Haskell": "#5e5086",
"Haxe": "#df7900",
"HiveQL": "#dce200",
"HolyC": "#ffefaf",
"Hy": "#7790B2",
"IDL": "#a3522f",
"Idris": "#b30000",
"Io": "#a9188d",
"Ioke": "#078193",
"Isabelle": "#FEFE00",
"J": "#9EEDFF",
"JSONiq": "#40d47e",
"Java": "#b07219",
"JavaScript": "#f1e05a",
"Jolie": "#843179",
"Jsonnet": "#0064bd",
"Julia": "#a270ba",
"Jupyter Notebook": "#DA5B0B",
"KRL": "#28430A",
"Kotlin": "#F18E33",
"LFE": "#4C3023",
"LLVM": "#185619",
"LOLCODE": "#cc9900",
"LSL": "#3d9970",
"Lasso": "#999999",
"Lex": "#DBCA00",
"LiveScript": "#499886",
"LookML": "#652B81",
"Lua": "#000080",
"MATLAB": "#e16737",
"MAXScript": "#00a6a6",
"MQL4": "#62A8D6",
"MQL5": "#4A76B8",
"MTML": "#b7e1f4",
"Makefile": "#427819",
"Mask": "#f97732",
"Max": "#c4a79c",
"Mercury": "#ff2b2b",
"Meson": "#007800",
"Metal": "#8f14e9",
"Mirah": "#c7a938",
"Modula-3": "#223388",
"NCL": "#28431f",
"Nearley": "#990000",
"Nemerle": "#3d3c6e",
"NetLinx": "#0aa0ff",
"NetLinx+ERB": "#747faa",
"NetLogo": "#ff6375",
"NewLisp": "#87AED7",
"Nextflow": "#3ac486",
"Nim": "#37775b",
"Nit": "#009917",
"Nix": "#7e7eff",
"Nu": "#c9df40",
"OCaml": "#3be133",
"ObjectScript": "#424893",
"Objective-C": "#438eff",
"Objective-C++": "#6866fb",
"Objective-J": "#ff0c5a",
"Omgrofl": "#cabbff",
"Opal": "#f7ede0",
"Oxygene": "#cdd0e3",
"Oz": "#fab738",
"P4": "#7055b5",
"PHP": "#4F5D95",
"PLSQL": "#dad8d8",
"Pan": "#cc0000",
"Papyrus": "#6600cc",
"Parrot": "#f3ca0a",
"Pascal": "#E3F171",
"Pawn": "#dbb284",
"Pep8": "#C76F5B",
"Perl": "#0298c3",
"Perl 6": "#0000fb",
"PigLatin": "#fcd7de",
"Pike": "#005390",
"PogoScript": "#d80074",
"PostScript": "#da291c",
"PowerBuilder": "#8f0f8d",
"PowerShell": "#012456",
"Processing": "#0096D8",
"Prolog": "#74283c",
"Propeller Spin": "#7fa2a7",
"Puppet": "#302B6D",
"PureBasic": "#5a6986",
"PureScript": "#1D222D",
"Python": "#3572A5",
"QML": "#44a51c",
"Quake": "#882233",
"R": "#198CE7",
"RAML": "#77d9fb",
"RUNOFF": "#665a4e",
"Racket": "#3c5caa",
"Ragel": "#9d5200",
"Rascal": "#fffaa0",
"Rebol": "#358a5b",
"Red": "#f50000",
"Ren'Py": "#ff7f7f",
"Ring": "#2D54CB",
"Roff": "#ecdebe",
"Rouge": "#cc0088",
"Ruby": "#701516",
"Rust": "#dea584",
"SAS": "#B34936",
"SQF": "#3F3F3F",
"SRecode Template": "#348a34",
"SaltStack": "#646464",
"Scala": "#c22d40",
"Scheme": "#1e4aec",
"Self": "#0579aa",
"Shell": "#89e051",
"Shen": "#120F14",
"Slash": "#007eff",
"Slice": "#003fa2",
"Smalltalk": "#596706",
"Solidity": "#AA6746",
"SourcePawn": "#5c7611",
"Squirrel": "#800000",
"Stan": "#b2011d",
"Standard ML": "#dc566d",
"SuperCollider": "#46390b",
"Swift": "#ffac45",
"SystemVerilog": "#DAE1C2",
"TI Program": "#A0AA87",
"Tcl": "#e4cc98",
"TeX": "#3D6117",
"Terra": "#00004c",
"Turing": "#cf142b",
"TypeScript": "#2b7489",
"UnrealScript": "#a54c4d",
"VCL": "#148AA8",
"VHDL": "#adb2cb",
"Vala": "#fbe5cd",
"Verilog": "#b2b7f8",
"Vim script": "#199f4b",
"Visual Basic": "#945db7",
"Volt": "#1F1F1F",
"Vue": "#2c3e50",
"WebAssembly": "#04133b",
"Wollok": "#a23738",
"X10": "#4B6BEF",
"XC": "#99DA07",
"XQuery": "#5232e7",
"XSLT": "#EB8CEB",
"YARA": "#220000",
"YASnippet": "#32AB90",
"Yacc": "#4B6C4B",
"ZAP": "#0d665e",
"ZIL": "#dc75e5",
"ZenScript": "#00BCD1",
"Zephir": "#118f9e",
"Zig": "#ec915c",
"eC": "#913960",
"mcfunction": "#E22837",
"nesC": "#94B0C7",
"ooc": "#b0b77e",
"q": "#0040cd",
"sed": "#64b970",
"wdl": "#42f1f4",
"wisp": "#7582D1",
"xBase": "#403a40",
}

@ -0,0 +1,7 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
// linguist's commit from which files were generated.
var LinguistCommit = "3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d"

File diff suppressed because it is too large Load Diff

@ -0,0 +1,3 @@
// Package data contains only auto-generated data-structures for all the language
// identification strategies from the Linguist project sources.
package data

@ -0,0 +1,26 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
import "gopkg.in/toqueteos/substring.v1"
var DocumentationMatchers = substring.Or(
substring.Regexp(`^[Dd]ocs?/`),
substring.Regexp(`(^|/)[Dd]ocumentation/`),
substring.Regexp(`(^|/)[Gg]roovydoc/`),
substring.Regexp(`(^|/)[Jj]avadoc/`),
substring.Regexp(`^[Mm]an/`),
substring.Regexp(`^[Ee]xamples/`),
substring.Regexp(`^[Dd]emos?/`),
substring.Regexp(`(^|/)inst/doc/`),
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`),
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`),
substring.Regexp(`(^|/)COPYING(\.|$)`),
substring.Regexp(`(^|/)INSTALL(\.|$)`),
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`),
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`),
substring.Regexp(`(^|/)README(\.|$)`),
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`),
substring.Regexp(`^[Ss]amples?/`),
)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,241 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
var LanguagesByFilename = map[string][]string{
".Rprofile": {"R"},
".XCompose": {"XCompose"},
".abbrev_defs": {"Emacs Lisp"},
".arcconfig": {"JSON"},
".atomignore": {"Ignore List"},
".babelignore": {"Ignore List"},
".babelrc": {"JSON with Comments"},
".bash_aliases": {"Shell"},
".bash_history": {"Shell"},
".bash_logout": {"Shell"},
".bash_profile": {"Shell"},
".bashrc": {"Shell"},
".bzrignore": {"Ignore List"},
".clang-format": {"YAML"},
".clang-tidy": {"YAML"},
".classpath": {"XML"},
".coffeelintignore": {"Ignore List"},
".cproject": {"XML"},
".cshrc": {"Shell"},
".cvsignore": {"Ignore List"},
".dockerignore": {"Ignore List"},
".editorconfig": {"EditorConfig"},
".emacs": {"Emacs Lisp"},
".emacs.desktop": {"Emacs Lisp"},
".eslintignore": {"Ignore List"},
".eslintrc.json": {"JSON with Comments"},
".factor-boot-rc": {"Factor"},
".factor-rc": {"Factor"},
".gclient": {"Python"},
".gemrc": {"YAML"},
".gitattributes": {"Git Attributes"},
".gitconfig": {"Git Config"},
".gitignore": {"Ignore List"},
".gitmodules": {"Git Config"},
".gn": {"GN"},
".gnus": {"Emacs Lisp"},
".gvimrc": {"Vim script"},
".htaccess": {"ApacheConf"},
".htmlhintrc": {"JSON"},
".irbrc": {"Ruby"},
".jscsrc": {"JSON with Comments"},
".jshintrc": {"JSON with Comments"},
".jslintrc": {"JSON with Comments"},
".login": {"Shell"},
".nanorc": {"nanorc"},
".nodemonignore": {"Ignore List"},
".npmignore": {"Ignore List"},
".nvimrc": {"Vim script"},
".php": {"PHP"},
".php_cs": {"PHP"},
".php_cs.dist": {"PHP"},
".prettierignore": {"Ignore List"},
".profile": {"Shell"},
".project": {"XML"},
".pryrc": {"Ruby"},
".spacemacs": {"Emacs Lisp"},
".stylelintignore": {"Ignore List"},
".tern-config": {"JSON"},
".tern-project": {"JSON"},
".vimrc": {"Vim script"},
".viper": {"Emacs Lisp"},
".vscodeignore": {"Ignore List"},
".watchmanconfig": {"JSON"},
".zlogin": {"Shell"},
".zlogout": {"Shell"},
".zprofile": {"Shell"},
".zshenv": {"Shell"},
".zshrc": {"Shell"},
"9fs": {"Shell"},
"APKBUILD": {"Alpine Abuild"},
"App.config": {"XML"},
"Appraisals": {"Ruby"},
"BSDmakefile": {"Makefile"},
"BUCK": {"Python"},
"BUILD": {"Python"},
"BUILD.bazel": {"Python"},
"Berksfile": {"Ruby"},
"Brewfile": {"Ruby"},
"Buildfile": {"Ruby"},
"CMakeLists.txt": {"CMake"},
"COPYING": {"Text"},
"COPYING.regex": {"Text"},
"COPYRIGHT.regex": {"Text"},
"Cakefile": {"CoffeeScript"},
"Capfile": {"Ruby"},
"Cargo.lock": {"TOML"},
"Cask": {"Emacs Lisp"},
"Dangerfile": {"Ruby"},
"Deliverfile": {"Ruby"},
"Dockerfile": {"Dockerfile"},
"Emakefile": {"Erlang"},
"FONTLOG": {"Text"},
"Fakefile": {"Fancy"},
"Fastfile": {"Ruby"},
"GNUmakefile": {"Makefile"},
"Gemfile": {"Ruby"},
"Gemfile.lock": {"Ruby"},
"Gopkg.lock": {"TOML"},
"Guardfile": {"Ruby"},
"INSTALL": {"Text"},
"INSTALL.mysql": {"Text"},
"Jakefile": {"JavaScript"},
"Jarfile": {"Ruby"},
"Jenkinsfile": {"Groovy"},
"Kbuild": {"Makefile"},
"LICENSE": {"Text"},
"LICENSE.mysql": {"Text"},
"Makefile": {"Makefile"},
"Makefile.PL": {"Perl"},
"Makefile.am": {"Makefile"},
"Makefile.boot": {"Makefile"},
"Makefile.frag": {"Makefile"},
"Makefile.in": {"Makefile"},
"Makefile.inc": {"Makefile"},
"Makefile.wat": {"Makefile"},
"Mavenfile": {"Ruby"},
"Modulefile": {"Puppet"},
"NEWS": {"Text"},
"Notebook": {"Jupyter Notebook"},
"NuGet.config": {"XML"},
"Nukefile": {"Nu"},
"PKGBUILD": {"Shell"},
"Phakefile": {"PHP"},
"Podfile": {"Ruby"},
"Project.ede": {"Emacs Lisp"},
"Puppetfile": {"Ruby"},
"README.1ST": {"Text"},
"README.me": {"Text"},
"README.mysql": {"Text"},
"ROOT": {"Isabelle ROOT"},
"Rakefile": {"Ruby"},
"Rexfile": {"Perl"},
"SConscript": {"Python"},
"SConstruct": {"Python"},
"Settings.StyleCop": {"XML"},
"Slakefile": {"LiveScript"},
"Snakefile": {"Python"},
"Snapfile": {"Ruby"},
"Thorfile": {"Ruby"},
"Vagrantfile": {"Ruby"},
"WORKSPACE": {"Python"},
"Web.Debug.config": {"XML"},
"Web.Release.config": {"XML"},
"Web.config": {"XML"},
"XCompose": {"XCompose"},
"_emacs": {"Emacs Lisp"},
"_vimrc": {"Vim script"},
"abbrev_defs": {"Emacs Lisp"},
"ack": {"Perl"},
"ant.xml": {"Ant Build System"},
"apache2.conf": {"ApacheConf"},
"bash_aliases": {"Shell"},
"bash_logout": {"Shell"},
"bash_profile": {"Shell"},
"bashrc": {"Shell"},
"build.xml": {"Ant Build System"},
"buildfile": {"Ruby"},
"buildozer.spec": {"INI"},
"cabal.config": {"Cabal Config"},
"cabal.project": {"Cabal Config"},
"click.me": {"Text"},
"composer.lock": {"JSON"},
"configure.ac": {"M4Sugar"},
"contents.lr": {"Markdown"},
"cpanfile": {"Perl"},
"cshrc": {"Shell"},
"delete.me": {"Text"},
"descrip.mmk": {"Module Management System"},
"descrip.mms": {"Module Management System"},
"encodings.dir": {"X Font Directory Index"},
"expr-dist": {"R"},
"firestore.rules": {"Cloud Firestore Security Rules"},
"fonts.alias": {"X Font Directory Index"},
"fonts.dir": {"X Font Directory Index"},
"fonts.scale": {"X Font Directory Index"},
"fp-lib-table": {"KiCad Layout"},
"gitignore-global": {"Ignore List"},
"gitignore_global": {"Ignore List"},
"glide.lock": {"YAML"},
"go.mod": {"Text"},
"go.sum": {"Text"},
"gradlew": {"Shell"},
"gvimrc": {"Vim script"},
"haproxy.cfg": {"HAProxy"},
"httpd.conf": {"ApacheConf"},
"jsconfig.json": {"JSON with Comments"},
"keep.me": {"Text"},
"language-configuration.json": {"JSON with Comments"},
"ld.script": {"Linker Script"},
"login": {"Shell"},
"m3makefile": {"Quake"},
"m3overrides": {"Quake"},
"makefile": {"Makefile"},
"makefile.sco": {"Makefile"},
"man": {"Shell"},
"mcmod.info": {"JSON"},
"meson.build": {"Meson"},
"meson_options.txt": {"Meson"},
"mix.lock": {"Elixir"},
"mkfile": {"Makefile"},
"mmn": {"Roff"},
"mmt": {"Roff"},
"nanorc": {"nanorc"},
"nextflow.config": {"Nextflow"},
"nginx.conf": {"Nginx"},
"nim.cfg": {"Nim"},
"nvimrc": {"Vim script"},
"owh": {"Tcl"},
"packages.config": {"XML"},
"pom.xml": {"Maven POM"},
"profile": {"Shell"},
"read.me": {"Text"},
"readme.1st": {"Text"},
"rebar.config": {"Erlang"},
"rebar.config.lock": {"Erlang"},
"rebar.lock": {"Erlang"},
"riemann.config": {"Clojure"},
"ssh-config": {"SSH Config"},
"ssh_config": {"SSH Config"},
"sshconfig": {"SSH Config"},
"sshconfig.snip": {"SSH Config"},
"sshd-config": {"SSH Config"},
"sshd_config": {"SSH Config"},
"starfield": {"Tcl"},
"test.me": {"Text"},
"tsconfig.json": {"JSON with Comments"},
"vimrc": {"Vim script"},
"wscript": {"Python"},
"xcompose": {"XCompose"},
"zlogin": {"Shell"},
"zlogout": {"Shell"},
"zprofile": {"Shell"},
"zshenv": {"Shell"},
"zshrc": {"Shell"},
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,35 @@
package data
import "github.com/src-d/enry/v2/data/rule"
// Heuristics implements a rule-based content matching engine.
// Heuristics is a number of sequntially applied rule.Heuristic where a
// matching one disambiguages language(s) for a single file extension.
type Heuristics []rule.Heuristic
// Match returns languages identified by the matching rule of the heuristic.
func (hs Heuristics) Match(data []byte) []string {
var matchedLangs []string
for _, heuristic := range hs {
if heuristic.Match(data) {
for _, langOrAlias := range heuristic.Languages() {
lang, ok := LanguageByAlias(langOrAlias)
if !ok { // should never happen
// reaching here means language name/alias in heuristics.yml
// is not consistent with languages.yml
// but we do not surface any such error at the API
continue
}
matchedLangs = append(matchedLangs, lang)
}
break
}
}
return matchedLangs
}
// matchString is a convenience used only in tests.
func (hs *Heuristics) matchString(data string) []string {
return hs.Match([]byte(data))
}

@ -0,0 +1,124 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
var LanguagesByInterpreter = map[string][]string{
"Rscript": {"R"},
"apl": {"APL"},
"aplx": {"APL"},
"ash": {"Shell"},
"asy": {"Asymptote"},
"awk": {"Awk"},
"bash": {"Shell"},
"bigloo": {"Scheme"},
"boolector": {"SMT"},
"ccl": {"Common Lisp"},
"chakra": {"JavaScript"},
"chicken": {"Scheme"},
"clisp": {"Common Lisp"},
"coffee": {"CoffeeScript"},
"cperl": {"Perl"},
"crystal": {"Crystal"},
"csi": {"Scheme"},
"cvc4": {"SMT"},
"cwl-runner": {"Common Workflow Language"},
"d8": {"JavaScript"},
"dart": {"Dart"},
"dash": {"Shell"},
"deno": {"TypeScript"},
"dtrace": {"DTrace"},
"dyalog": {"APL"},
"ecl": {"Common Lisp"},
"elixir": {"Elixir"},
"escript": {"Erlang"},
"fish": {"fish"},
"gawk": {"Awk"},
"gerbv": {"Gerber Image"},
"gerbview": {"Gerber Image"},
"gn": {"GN"},
"gnuplot": {"Gnuplot"},
"gosh": {"Scheme"},
"groovy": {"Groovy"},
"gsed": {"sed"},
"guile": {"Scheme"},
"hy": {"Hy"},
"instantfpc": {"Pascal"},
"io": {"Io"},
"ioke": {"Ioke"},
"jconsole": {"J"},
"jolie": {"Jolie"},
"jruby": {"Ruby"},
"js": {"JavaScript"},
"julia": {"Julia"},
"ksh": {"Shell"},
"lisp": {"Common Lisp"},
"lsl": {"LSL"},
"lua": {"Lua", "Terra"},
"macruby": {"Ruby"},
"make": {"Makefile"},
"mathsat5": {"SMT"},
"mawk": {"Awk"},
"minised": {"sed"},
"mksh": {"Shell"},
"mmi": {"Mercury"},
"moon": {"MoonScript"},
"nawk": {"Awk"},
"newlisp": {"NewLisp"},
"nextflow": {"Nextflow"},
"node": {"JavaScript"},
"nush": {"Nu"},
"ocaml": {"OCaml", "Reason"},
"ocamlrun": {"OCaml"},
"ocamlscript": {"OCaml"},
"openrc-run": {"OpenRC runscript"},
"opensmt": {"SMT"},
"osascript": {"AppleScript"},
"parrot": {"Parrot Assembly", "Parrot Internal Representation"},
"pdksh": {"Shell"},
"perl": {"Perl", "Pod"},
"perl6": {"Perl 6", "Pod 6"},
"php": {"PHP"},
"picolisp": {"PicoLisp"},
"pike": {"Pike"},
"pil": {"PicoLisp"},
"pwsh": {"PowerShell"},
"python": {"Python"},
"python2": {"Python"},
"python3": {"Python"},
"qmake": {"QMake"},
"r6rs": {"Scheme"},
"racket": {"Racket"},
"rake": {"Ruby"},
"rbx": {"Ruby"},
"rc": {"Shell"},
"regina": {"REXX"},
"rexx": {"REXX"},
"rhino": {"JavaScript"},
"ruby": {"Ruby"},
"rune": {"E"},
"runhaskell": {"Haskell"},
"sbcl": {"Common Lisp"},
"scala": {"Scala"},
"scheme": {"Scheme"},
"sclang": {"SuperCollider"},
"scsynth": {"SuperCollider"},
"sed": {"sed"},
"sh": {"Shell"},
"smt-rat": {"SMT"},
"smtinterpol": {"SMT"},
"ssed": {"sed"},
"stp": {"SMT"},
"swipl": {"Prolog"},
"tcc": {"C"},
"tclsh": {"Tcl"},
"ts-node": {"TypeScript"},
"v8": {"JavaScript"},
"v8-shell": {"JavaScript"},
"verit": {"SMT"},
"wish": {"Tcl"},
"yap": {"Prolog"},
"yices2": {"SMT"},
"z3": {"SMT"},
"zsh": {"Shell"},
}

@ -0,0 +1,226 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
var LanguagesMime = map[string]string{
"AGS Script": "text/x-c++src",
"APL": "text/apl",
"ASN.1": "text/x-ttcn-asn",
"ASP": "application/x-aspx",
"Alpine Abuild": "text/x-sh",
"AngelScript": "text/x-c++src",
"Ant Build System": "application/xml",
"Apex": "text/x-java",
"Asymptote": "text/x-kotlin",
"Brainfuck": "text/x-brainfuck",
"C": "text/x-csrc",
"C#": "text/x-csharp",
"C++": "text/x-c++src",
"C2hs Haskell": "text/x-haskell",
"CMake": "text/x-cmake",
"COBOL": "text/x-cobol",
"COLLADA": "text/xml",
"CSON": "text/x-coffeescript",
"CSS": "text/css",
"Cabal Config": "text/x-haskell",
"ChucK": "text/x-java",
"Clojure": "text/x-clojure",
"Closure Templates": "text/x-soy",
"Cloud Firestore Security Rules": "text/css",
"CoffeeScript": "text/x-coffeescript",
"Common Lisp": "text/x-common-lisp",
"Common Workflow Language": "text/x-yaml",
"Component Pascal": "text/x-pascal",
"Crystal": "text/x-crystal",
"Cuda": "text/x-c++src",
"Cycript": "text/javascript",
"Cython": "text/x-cython",
"D": "text/x-d",
"DTrace": "text/x-csrc",
"Dart": "application/dart",
"Dhall": "text/x-haskell",
"Diff": "text/x-diff",
"Dockerfile": "text/x-dockerfile",
"Dylan": "text/x-dylan",
"EBNF": "text/x-ebnf",
"ECL": "text/x-ecl",
"EQ": "text/x-csharp",
"Eagle": "text/xml",
"Easybuild": "text/x-python",
"Ecere Projects": "application/json",
"EditorConfig": "text/x-properties",
"Edje Data Collection": "application/json",
"Eiffel": "text/x-eiffel",
"Elm": "text/x-elm",
"Emacs Lisp": "text/x-common-lisp",
"EmberScript": "text/x-coffeescript",
"Erlang": "text/x-erlang",
"F#": "text/x-fsharp",
"Factor": "text/x-factor",
"Forth": "text/x-forth",
"Fortran": "text/x-fortran",
"GCC Machine Description": "text/x-common-lisp",
"GN": "text/x-python",
"Game Maker Language": "text/x-c++src",
"Genshi": "text/xml",
"Gentoo Ebuild": "text/x-sh",
"Gentoo Eclass": "text/x-sh",
"Git Attributes": "text/x-sh",
"Git Config": "text/x-properties",
"Glyph": "text/x-tcl",
"Go": "text/x-go",
"Grammatical Framework": "text/x-haskell",
"Groovy": "text/x-groovy",
"Groovy Server Pages": "application/x-jsp",
"HCL": "text/x-ruby",
"HTML": "text/html",
"HTML+Django": "text/x-django",
"HTML+ECR": "text/html",
"HTML+EEX": "text/html",
"HTML+ERB": "application/x-erb",
"HTML+PHP": "application/x-httpd-php",
"HTML+Razor": "text/html",
"HTTP": "message/http",
"Hack": "application/x-httpd-php",
"Haml": "text/x-haml",
"Haskell": "text/x-haskell",
"Haxe": "text/x-haxe",
"HolyC": "text/x-csrc",
"IDL": "text/x-idl",
"INI": "text/x-properties",
"IRC log": "text/mirc",
"Ignore List": "text/x-sh",
"JSON": "application/json",
"JSON with Comments": "text/javascript",
"JSON5": "application/json",
"JSONLD": "application/json",
"JSONiq": "application/json",
"JSX": "text/jsx",
"Java": "text/x-java",
"Java Properties": "text/x-properties",
"Java Server Pages": "application/x-jsp",
"JavaScript": "text/javascript",
"JavaScript+ERB": "application/javascript",
"Julia": "text/x-julia",
"Jupyter Notebook": "application/json",
"KiCad Layout": "text/x-common-lisp",
"Kit": "text/html",
"Kotlin": "text/x-kotlin",
"LFE": "text/x-common-lisp",
"LTspice Symbol": "text/x-spreadsheet",
"LabVIEW": "text/xml",
"Latte": "text/x-smarty",
"Less": "text/css",
"Literate Haskell": "text/x-literate-haskell",
"LiveScript": "text/x-livescript",
"LookML": "text/x-yaml",
"Lua": "text/x-lua",
"M": "text/x-mumps",
"MATLAB": "text/x-octave",
"MTML": "text/html",
"MUF": "text/x-forth",
"Makefile": "text/x-cmake",
"Markdown": "text/x-gfm",
"Marko": "text/html",
"Mathematica": "text/x-mathematica",
"Maven POM": "text/xml",
"Max": "application/json",
"Metal": "text/x-c++src",
"Mirah": "text/x-ruby",
"Modelica": "text/x-modelica",
"NSIS": "text/x-nsis",
"NetLogo": "text/x-common-lisp",
"NewLisp": "text/x-common-lisp",
"Nginx": "text/x-nginx-conf",
"Nu": "text/x-scheme",
"NumPy": "text/x-python",
"OCaml": "text/x-ocaml",
"Objective-C": "text/x-objectivec",
"Objective-C++": "text/x-objectivec",
"OpenCL": "text/x-csrc",
"OpenRC runscript": "text/x-sh",
"Oz": "text/x-oz",
"PHP": "application/x-httpd-php",
"PLSQL": "text/x-plsql",
"PLpgSQL": "text/x-sql",
"Pascal": "text/x-pascal",
"Perl": "text/x-perl",
"Perl 6": "text/x-perl",
"Pic": "text/troff",
"Pod": "text/x-perl",
"PowerShell": "application/x-powershell",
"Protocol Buffer": "text/x-protobuf",
"Public Key": "application/pgp",
"Pug": "text/x-pug",
"Puppet": "text/x-puppet",
"PureScript": "text/x-haskell",
"Python": "text/x-python",
"R": "text/x-rsrc",
"RAML": "text/x-yaml",
"RHTML": "application/x-erb",
"RMarkdown": "text/x-gfm",
"RPM Spec": "text/x-rpm-spec",
"Reason": "text/x-rustsrc",
"Roff": "text/troff",
"Roff Manpage": "text/troff",
"Rouge": "text/x-clojure",
"Ruby": "text/x-ruby",
"Rust": "text/x-rustsrc",
"SAS": "text/x-sas",
"SCSS": "text/x-scss",
"SPARQL": "application/sparql-query",
"SQL": "text/x-sql",
"SQLPL": "text/x-sql",
"SRecode Template": "text/x-common-lisp",
"SVG": "text/xml",
"Sage": "text/x-python",
"SaltStack": "text/x-yaml",
"Sass": "text/x-sass",
"Scala": "text/x-scala",
"Scheme": "text/x-scheme",
"Shell": "text/x-sh",
"ShellSession": "text/x-sh",
"Slim": "text/x-slim",
"Smalltalk": "text/x-stsrc",
"Smarty": "text/x-smarty",
"Squirrel": "text/x-c++src",
"Standard ML": "text/x-ocaml",
"Svelte": "text/html",
"Swift": "text/x-swift",
"SystemVerilog": "text/x-systemverilog",
"TOML": "text/x-toml",
"TSX": "text/jsx",
"Tcl": "text/x-tcl",
"Tcsh": "text/x-sh",
"TeX": "text/x-stex",
"Terra": "text/x-lua",
"Textile": "text/x-textile",
"Turtle": "text/turtle",
"Twig": "text/x-twig",
"TypeScript": "application/typescript",
"Unified Parallel C": "text/x-csrc",
"Unity3D Asset": "text/x-yaml",
"Uno": "text/x-csharp",
"UnrealScript": "text/x-java",
"VHDL": "text/x-vhdl",
"Verilog": "text/x-verilog",
"Visual Basic": "text/x-vb",
"Volt": "text/x-d",
"WebAssembly": "text/x-common-lisp",
"WebIDL": "text/x-webidl",
"Windows Registry Entries": "text/x-properties",
"X BitMap": "text/x-csrc",
"X PixMap": "text/x-csrc",
"XC": "text/x-csrc",
"XML": "text/xml",
"XPages": "text/xml",
"XProc": "text/xml",
"XQuery": "application/xquery",
"XS": "text/x-csrc",
"XSLT": "text/xml",
"YAML": "text/x-yaml",
"edn": "text/x-clojure",
"reStructuredText": "text/x-rst",
"wisp": "text/x-clojure",
}

@ -0,0 +1,109 @@
// Package rule contains rule-based heuristic implementations.
// It is used in the generated code in content.go for disambiguation of languages
// with colliding extensions, based on regexps from Linguist data.
package rule
// Heuristic consist of (a number of) rules where each, if matches,
// identifes content as belonging to a programming language(s).
type Heuristic interface {
Matcher
Languages() []string
}
// Matcher checks if the data matches (number of) pattern.
// Every heuristic rule below implements this interface.
// A regexp.Regexp satisfies this interface and can be used instead.
type Matcher interface {
Match(data []byte) bool
}
// languages struct incapsulate data common to every Matcher: all languages
// that it identifies.
type languages struct {
langs []string
}
// Languages returns all languages, identified by this Matcher.
func (l languages) Languages() []string {
return l.langs
}
// MatchingLanguages is a helper to create new languages.
func MatchingLanguages(langs ...string) languages {
return languages{langs}
}
// Implements a Heuristic.
type or struct {
languages
pattern Matcher
}
// Or rule matches, if a single matching pattern exists.
// It recives only one pattern as it relies on compile-time optimization that
// represtes union with | inside a single regexp.
func Or(l languages, r Matcher) Heuristic {
return or{l, r}
}
// Match implements rule.Matcher.
func (r or) Match(data []byte) bool {
return r.pattern.Match(data)
}
// Implements a Heuristic.
type and struct {
languages
patterns []Matcher
}
// And rule matches, if each of the patterns does match.
func And(l languages, m ...Matcher) Heuristic {
return and{l, m}
}
// Match implements data.Matcher.
func (r and) Match(data []byte) bool {
for _, p := range r.patterns {
if !p.Match(data) {
return false
}
}
return true
}
// Implements a Heuristic.
type not struct {
languages
Patterns []Matcher
}
// Not rule matches if none of the patterns match.
func Not(l languages, r ...Matcher) Heuristic {
return not{l, r}
}
// Match implements data.Matcher.
func (r not) Match(data []byte) bool {
for _, p := range r.Patterns {
if p.Match(data) {
return false
}
}
return true
}
// Implements a Heuristic.
type always struct {
languages
}
// Always rule always matches. Often is used as a default fallback.
func Always(l languages) Heuristic {
return always{l}
}
// Match implements Matcher.
func (r always) Match(data []byte) bool {
return true
}

@ -0,0 +1,526 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
var LanguagesType = map[string]int{
"1C Enterprise": 2,
"ABAP": 2,
"ABNF": 1,
"AGS Script": 2,
"AMPL": 2,
"ANTLR": 2,
"API Blueprint": 3,
"APL": 2,
"ASN.1": 1,
"ASP": 2,
"ATS": 2,
"ActionScript": 2,
"Ada": 2,
"Adobe Font Metrics": 1,
"Agda": 2,
"Alloy": 2,
"Alpine Abuild": 2,
"Altium Designer": 1,
"AngelScript": 2,
"Ant Build System": 1,
"ApacheConf": 1,
"Apex": 2,
"Apollo Guidance Computer": 2,
"AppleScript": 2,
"Arc": 2,
"AsciiDoc": 4,
"AspectJ": 2,
"Assembly": 2,
"Asymptote": 2,
"Augeas": 2,
"AutoHotkey": 2,
"AutoIt": 2,
"Awk": 2,
"Ballerina": 2,
"Batchfile": 2,
"Befunge": 2,
"Bison": 2,
"BitBake": 2,
"Blade": 3,
"BlitzBasic": 2,
"BlitzMax": 2,
"Bluespec": 2,
"Boo": 2,
"Brainfuck": 2,
"Brightscript": 2,
"C": 2,
"C#": 2,
"C++": 2,
"C-ObjDump": 1,
"C2hs Haskell": 2,
"CLIPS": 2,
"CMake": 2,
"COBOL": 2,
"COLLADA": 1,
"CSON": 1,
"CSS": 3,
"CSV": 1,
"CWeb": 2,
"Cabal Config": 1,
"Cap'n Proto": 2,
"CartoCSS": 2,
"Ceylon": 2,
"Chapel": 2,
"Charity": 2,
"ChucK": 2,
"Cirru": 2,
"Clarion": 2,
"Clean": 2,
"Click": 2,
"Clojure": 2,
"Closure Templates": 3,
"Cloud Firestore Security Rules": 1,
"CoNLL-U": 1,
"CoffeeScript": 2,
"ColdFusion": 2,
"ColdFusion CFC": 2,
"Common Lisp": 2,
"Common Workflow Language": 2,
"Component Pascal": 2,
"Cool": 2,
"Coq": 2,
"Cpp-ObjDump": 1,
"Creole": 4,
"Crystal": 2,
"Csound": 2,
"Csound Document": 2,
"Csound Score": 2,
"Cuda": 2,
"Cycript": 2,
"Cython": 2,
"D": 2,
"D-ObjDump": 1,
"DIGITAL Command Language": 2,
"DM": 2,
"DNS Zone": 1,
"DTrace": 2,
"Darcs Patch": 1,
"Dart": 2,
"DataWeave": 2,
"Dhall": 2,
"Diff": 1,
"Dockerfile": 2,
"Dogescript": 2,
"Dylan": 2,
"E": 2,
"EBNF": 1,
"ECL": 2,
"ECLiPSe": 2,
"EJS": 3,
"EML": 1,
"EQ": 2,
"Eagle": 1,
"Easybuild": 1,
"Ecere Projects": 1,
"EditorConfig": 1,
"Edje Data Collection": 1,
"Eiffel": 2,
"Elixir": 2,
"Elm": 2,
"Emacs Lisp": 2,
"EmberScript": 2,
"Erlang": 2,
"F#": 2,
"F*": 2,
"FIGlet Font": 1,
"FLUX": 2,
"Factor": 2,
"Fancy": 2,
"Fantom": 2,
"Filebench WML": 2,
"Filterscript": 2,
"Formatted": 1,
"Forth": 2,
"Fortran": 2,
"FreeMarker": 2,
"Frege": 2,
"G-code": 2,
"GAMS": 2,
"GAP": 2,
"GCC Machine Description": 2,
"GDB": 2,
"GDScript": 2,
"GLSL": 2,
"GN": 1,
"Game Maker Language": 2,
"Genie": 2,
"Genshi": 2,
"Gentoo Ebuild": 2,
"Gentoo Eclass": 2,
"Gerber Image": 1,
"Gettext Catalog": 4,
"Gherkin": 2,
"Git Attributes": 1,
"Git Config": 1,
"Glyph": 2,
"Glyph Bitmap Distribution Format": 1,
"Gnuplot": 2,
"Go": 2,
"Golo": 2,
"Gosu": 2,
"Grace": 2,
"Gradle": 1,
"Grammatical Framework": 2,
"Graph Modeling Language": 1,
"GraphQL": 1,
"Graphviz (DOT)": 1,
"Groovy": 2,
"Groovy Server Pages": 2,
"HAProxy": 1,
"HCL": 2,
"HLSL": 2,
"HTML": 3,
"HTML+Django": 3,
"HTML+ECR": 3,
"HTML+EEX": 3,
"HTML+ERB": 3,
"HTML+PHP": 3,
"HTML+Razor": 3,
"HTTP": 1,
"HXML": 1,
"Hack": 2,
"Haml": 3,
"Handlebars": 3,
"Harbour": 2,
"Haskell": 2,
"Haxe": 2,
"HiveQL": 2,
"HolyC": 2,
"Hy": 2,
"HyPhy": 2,
"IDL": 2,
"IGOR Pro": 2,
"INI": 1,
"IRC log": 1,
"Idris": 2,
"Ignore List": 1,
"Inform 7": 2,
"Inno Setup": 2,
"Io": 2,
"Ioke": 2,
"Isabelle": 2,
"Isabelle ROOT": 2,
"J": 2,
"JFlex": 2,
"JSON": 1,
"JSON with Comments": 1,
"JSON5": 1,
"JSONLD": 1,
"JSONiq": 2,
"JSX": 2,
"Jasmin": 2,
"Java": 2,
"Java Properties": 1,
"Java Server Pages": 2,
"JavaScript": 2,
"JavaScript+ERB": 2,
"Jison": 2,
"Jison Lex": 2,
"Jolie": 2,
"Jsonnet": 2,
"Julia": 2,
"Jupyter Notebook": 3,
"KRL": 2,
"KiCad Layout": 1,
"KiCad Legacy Layout": 1,
"KiCad Schematic": 1,
"Kit": 3,
"Kotlin": 2,
"LFE": 2,
"LLVM": 2,
"LOLCODE": 2,
"LSL": 2,
"LTspice Symbol": 1,
"LabVIEW": 2,
"Lasso": 2,
"Latte": 3,
"Lean": 2,
"Less": 3,
"Lex": 2,
"LilyPond": 2,
"Limbo": 2,
"Linker Script": 1,
"Linux Kernel Module": 1,
"Liquid": 3,
"Literate Agda": 2,
"Literate CoffeeScript": 2,
"Literate Haskell": 2,
"LiveScript": 2,
"Logos": 2,
"Logtalk": 2,
"LookML": 2,
"LoomScript": 2,
"Lua": 2,
"M": 2,
"M4": 2,
"M4Sugar": 2,
"MATLAB": 2,
"MAXScript": 2,
"MQL4": 2,
"MQL5": 2,
"MTML": 3,
"MUF": 2,
"Makefile": 2,
"Mako": 2,
"Markdown": 4,
"Marko": 3,
"Mask": 3,
"Mathematica": 2,
"Maven POM": 1,
"Max": 2,
"MediaWiki": 4,
"Mercury": 2,
"Meson": 2,
"Metal": 2,
"MiniD": 2,
"Mirah": 2,
"Modelica": 2,
"Modula-2": 2,
"Modula-3": 2,
"Module Management System": 2,
"Monkey": 2,
"Moocode": 2,
"MoonScript": 2,
"Motorola 68K Assembly": 2,
"Myghty": 2,
"NCL": 2,
"NL": 1,
"NSIS": 2,
"Nearley": 2,
"Nemerle": 2,
"NetLinx": 2,
"NetLinx+ERB": 2,
"NetLogo": 2,
"NewLisp": 2,
"Nextflow": 2,
"Nginx": 1,
"Nim": 2,
"Ninja": 1,
"Nit": 2,
"Nix": 2,
"Nu": 2,
"NumPy": 2,
"OCaml": 2,
"ObjDump": 1,
"ObjectScript": 2,
"Objective-C": 2,
"Objective-C++": 2,
"Objective-J": 2,
"Omgrofl": 2,
"Opa": 2,
"Opal": 2,
"OpenCL": 2,
"OpenEdge ABL": 2,
"OpenRC runscript": 2,
"OpenSCAD": 2,
"OpenType Feature File": 1,
"Org": 4,
"Ox": 2,
"Oxygene": 2,
"Oz": 2,
"P4": 2,
"PHP": 2,
"PLSQL": 2,
"PLpgSQL": 2,
"POV-Ray SDL": 2,
"Pan": 2,
"Papyrus": 2,
"Parrot": 2,
"Parrot Assembly": 2,
"Parrot Internal Representation": 2,
"Pascal": 2,
"Pawn": 2,
"Pep8": 2,
"Perl": 2,
"Perl 6": 2,
"Pic": 3,
"Pickle": 1,
"PicoLisp": 2,
"PigLatin": 2,
"Pike": 2,
"Pod": 4,
"Pod 6": 4,
"PogoScript": 2,
"Pony": 2,
"PostCSS": 3,
"PostScript": 3,
"PowerBuilder": 2,
"PowerShell": 2,
"Processing": 2,
"Prolog": 2,
"Propeller Spin": 2,
"Protocol Buffer": 1,
"Public Key": 1,
"Pug": 3,
"Puppet": 2,
"Pure Data": 1,
"PureBasic": 2,
"PureScript": 2,
"Python": 2,
"Python console": 2,
"Python traceback": 1,
"QML": 2,
"QMake": 2,
"Quake": 2,
"R": 2,
"RAML": 3,
"RDoc": 4,
"REALbasic": 2,
"REXX": 2,
"RHTML": 3,
"RMarkdown": 4,
"RPC": 2,
"RPM Spec": 1,
"RUNOFF": 3,
"Racket": 2,
"Ragel": 2,
"Rascal": 2,
"Raw token data": 1,
"Reason": 2,
"Rebol": 2,
"Red": 2,
"Redcode": 2,
"Regular Expression": 1,
"Ren'Py": 2,
"RenderScript": 2,
"Rich Text Format": 3,
"Ring": 2,
"RobotFramework": 2,
"Roff": 3,
"Roff Manpage": 3,
"Rouge": 2,
"Ruby": 2,
"Rust": 2,
"SAS": 2,
"SCSS": 3,
"SMT": 2,
"SPARQL": 1,
"SQF": 2,
"SQL": 1,
"SQLPL": 2,
"SRecode Template": 3,
"SSH Config": 1,
"STON": 1,
"SVG": 1,
"Sage": 2,
"SaltStack": 2,
"Sass": 3,
"Scala": 2,
"Scaml": 3,
"Scheme": 2,
"Scilab": 2,
"Self": 2,
"ShaderLab": 2,
"Shell": 2,
"ShellSession": 2,
"Shen": 2,
"Slash": 2,
"Slice": 2,
"Slim": 3,
"Smali": 2,
"Smalltalk": 2,
"Smarty": 2,
"Solidity": 2,
"SourcePawn": 2,
"Spline Font Database": 1,
"Squirrel": 2,
"Stan": 2,
"Standard ML": 2,
"Stata": 2,
"Stylus": 3,
"SubRip Text": 1,
"SugarSS": 3,
"SuperCollider": 2,
"Svelte": 3,
"Swift": 2,
"SystemVerilog": 2,
"TI Program": 2,
"TLA": 2,
"TOML": 1,
"TSQL": 2,
"TSX": 2,
"TXL": 2,
"Tcl": 2,
"Tcsh": 2,
"TeX": 3,
"Tea": 3,
"Terra": 2,
"Text": 4,
"Textile": 4,
"Thrift": 2,
"Turing": 2,
"Turtle": 1,
"Twig": 3,
"Type Language": 1,
"TypeScript": 2,
"Unified Parallel C": 2,
"Unity3D Asset": 1,
"Unix Assembly": 2,
"Uno": 2,
"UnrealScript": 2,
"UrWeb": 2,
"VCL": 2,
"VHDL": 2,
"Vala": 2,
"Verilog": 2,
"Vim script": 2,
"Visual Basic": 2,
"Volt": 2,
"Vue": 3,
"Wavefront Material": 1,
"Wavefront Object": 1,
"Web Ontology Language": 1,
"WebAssembly": 2,
"WebIDL": 2,
"WebVTT": 1,
"Windows Registry Entries": 1,
"Wollok": 2,
"World of Warcraft Addon Data": 1,
"X BitMap": 1,
"X Font Directory Index": 1,
"X PixMap": 1,
"X10": 2,
"XC": 2,
"XCompose": 1,
"XML": 1,
"XPages": 1,
"XProc": 2,
"XQuery": 2,
"XS": 2,
"XSLT": 2,
"Xojo": 2,
"Xtend": 2,
"YAML": 1,
"YANG": 1,
"YARA": 2,
"YASnippet": 3,
"Yacc": 2,
"ZAP": 2,
"ZIL": 2,
"Zeek": 2,
"ZenScript": 2,
"Zephir": 2,
"Zig": 2,
"Zimpl": 2,
"desktop": 1,
"eC": 2,
"edn": 1,
"fish": 2,
"mcfunction": 2,
"mupad": 2,
"nanorc": 1,
"nesC": 2,
"ooc": 2,
"q": 2,
"reStructuredText": 4,
"sed": 2,
"wdl": 2,
"wisp": 2,
"xBase": 2,
}

@ -0,0 +1,166 @@
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
package data
import "gopkg.in/toqueteos/substring.v1"
var VendorMatchers = substring.Or(
substring.Regexp(`(^|/)cache/`),
substring.Regexp(`^[Dd]ependencies/`),
substring.Regexp(`(^|/)dist/`),
substring.Regexp(`^deps/`),
substring.Regexp(`(^|/)configure$`),
substring.Regexp(`(^|/)config.guess$`),
substring.Regexp(`(^|/)config.sub$`),
substring.Regexp(`(^|/)aclocal.m4`),
substring.Regexp(`(^|/)libtool.m4`),
substring.Regexp(`(^|/)ltoptions.m4`),
substring.Regexp(`(^|/)ltsugar.m4`),
substring.Regexp(`(^|/)ltversion.m4`),
substring.Regexp(`(^|/)lt~obsolete.m4`),
substring.Regexp(`cpplint.py`),
substring.Regexp(`node_modules/`),
substring.Regexp(`bower_components/`),
substring.Regexp(`^rebar$`),
substring.Regexp(`erlang.mk`),
substring.Regexp(`Godeps/_workspace/`),
substring.Regexp(`(^|/)testdata/`),
substring.Regexp(`.indent.pro`),
substring.Regexp(`(\.|-)min\.(js|css)$`),
substring.Regexp(`([^\s]*)import\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
substring.Regexp(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
substring.Regexp(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)foundation\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)normalize\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)skeleton\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)animate\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)materialize\.(css|less|scss|styl|js)$`),
substring.Regexp(`(^|/)select2/.*\.(css|scss|js)$`),
substring.Regexp(`(^|/)bulma\.(css|sass|scss)$`),
substring.Regexp(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
substring.Regexp(`vendors?/`),
substring.Regexp(`extern(al)?/`),
substring.Regexp(`(^|/)[Vv]+endor/`),
substring.Regexp(`^debian/`),
substring.Regexp(`run.n$`),
substring.Regexp(`bootstrap-datepicker/`),
substring.Regexp(`(^|/)jquery([^.]*)\.js$`),
substring.Regexp(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
substring.Regexp(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
substring.Regexp(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
substring.Regexp(`jquery.fn.gantt.js`),
substring.Regexp(`jquery.fancybox.(js|css)`),
substring.Regexp(`fuelux.js`),
substring.Regexp(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
substring.Regexp(`jquery.dataTables.js`),
substring.Regexp(`bootbox.js`),
substring.Regexp(`pdf.worker.js`),
substring.Regexp(`(^|/)slick\.\w+.js$`),
substring.Regexp(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
substring.Regexp(`leaflet.draw-src.js`),
substring.Regexp(`leaflet.draw.css`),
substring.Regexp(`Control.FullScreen.css`),
substring.Regexp(`Control.FullScreen.js`),
substring.Regexp(`leaflet.spin.js`),
substring.Regexp(`wicket-leaflet.js`),
substring.Regexp(`.sublime-project`),
substring.Regexp(`.sublime-workspace`),
substring.Regexp(`.vscode`),
substring.Regexp(`(^|/)prototype(.*)\.js$`),
substring.Regexp(`(^|/)effects\.js$`),
substring.Regexp(`(^|/)controls\.js$`),
substring.Regexp(`(^|/)dragdrop\.js$`),
substring.Regexp(`(.*?)\.d\.ts$`),
substring.Regexp(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
substring.Regexp(`(^|/)dojo\.js$`),
substring.Regexp(`(^|/)MochiKit\.js$`),
substring.Regexp(`(^|/)yahoo-([^.]*)\.js$`),
substring.Regexp(`(^|/)yui([^.]*)\.js$`),
substring.Regexp(`(^|/)ckeditor\.js$`),
substring.Regexp(`(^|/)tiny_mce([^.]*)\.js$`),
substring.Regexp(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
substring.Regexp(`(^|/)ace-builds/`),
substring.Regexp(`(^|/)fontello(.*?)\.css$`),
substring.Regexp(`(^|/)MathJax/`),
substring.Regexp(`(^|/)Chart\.js$`),
substring.Regexp(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
substring.Regexp(`(^|/)shBrush([^.]*)\.js$`),
substring.Regexp(`(^|/)shCore\.js$`),
substring.Regexp(`(^|/)shLegacy\.js$`),
substring.Regexp(`(^|/)angular([^.]*)\.js$`),
substring.Regexp(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
substring.Regexp(`(^|/)react(-[^.]*)?\.js$`),
substring.Regexp(`(^|/)flow-typed/.*\.js$`),
substring.Regexp(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
substring.Regexp(`(^|/)modernizr\.custom\.\d+\.js$`),
substring.Regexp(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
substring.Regexp(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
substring.Regexp(`(^|/)admin_media/`),
substring.Regexp(`(^|/)env/`),
substring.Regexp(`^fabfile\.py$`),
substring.Regexp(`^waf$`),
substring.Regexp(`^.osx$`),
substring.Regexp(`\.xctemplate/`),
substring.Regexp(`\.imageset/`),
substring.Regexp(`(^|/)Carthage/`),
substring.Regexp(`(^|/)Sparkle/`),
substring.Regexp(`Crashlytics.framework/`),
substring.Regexp(`Fabric.framework/`),
substring.Regexp(`BuddyBuildSDK.framework/`),
substring.Regexp(`Realm.framework`),
substring.Regexp(`RealmSwift.framework`),
substring.Regexp(`gitattributes$`),
substring.Regexp(`gitignore$`),
substring.Regexp(`gitmodules$`),
substring.Regexp(`(^|/)gradlew$`),
substring.Regexp(`(^|/)gradlew\.bat$`),
substring.Regexp(`(^|/)gradle/wrapper/`),
substring.Regexp(`(^|/)mvnw$`),
substring.Regexp(`(^|/)mvnw\.cmd$`),
substring.Regexp(`(^|/)\.mvn/wrapper/`),
substring.Regexp(`-vsdoc\.js$`),
substring.Regexp(`\.intellisense\.js$`),
substring.Regexp(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
substring.Regexp(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
substring.Regexp(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
substring.Regexp(`^[Pp]ackages\/.+\.\d+\/`),
substring.Regexp(`(^|/)extjs/.*?\.js$`),
substring.Regexp(`(^|/)extjs/.*?\.xml$`),
substring.Regexp(`(^|/)extjs/.*?\.txt$`),
substring.Regexp(`(^|/)extjs/.*?\.html$`),
substring.Regexp(`(^|/)extjs/.*?\.properties$`),
substring.Regexp(`(^|/)extjs/.sencha/`),
substring.Regexp(`(^|/)extjs/docs/`),
substring.Regexp(`(^|/)extjs/builds/`),
substring.Regexp(`(^|/)extjs/cmd/`),
substring.Regexp(`(^|/)extjs/examples/`),
substring.Regexp(`(^|/)extjs/locale/`),
substring.Regexp(`(^|/)extjs/packages/`),
substring.Regexp(`(^|/)extjs/plugins/`),
substring.Regexp(`(^|/)extjs/resources/`),
substring.Regexp(`(^|/)extjs/src/`),
substring.Regexp(`(^|/)extjs/welcome/`),
substring.Regexp(`(^|/)html5shiv\.js$`),
substring.Regexp(`^[Tt]ests?/fixtures/`),
substring.Regexp(`^[Ss]pecs?/fixtures/`),
substring.Regexp(`(^|/)cordova([^.]*)\.js$`),
substring.Regexp(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
substring.Regexp(`foundation(\..*)?\.js$`),
substring.Regexp(`^Vagrantfile$`),
substring.Regexp(`.[Dd][Ss]_[Ss]tore$`),
substring.Regexp(`^vignettes/`),
substring.Regexp(`^inst/extdata/`),
substring.Regexp(`octicons.css`),
substring.Regexp(`sprockets-octicons.scss`),
substring.Regexp(`(^|/)activator$`),
substring.Regexp(`(^|/)activator\.bat$`),
substring.Regexp(`proguard.pro`),
substring.Regexp(`proguard-rules.pro`),
substring.Regexp(`^puphpet/`),
substring.Regexp(`(^|/)\.google_apis/`),
substring.Regexp(`^Jenkinsfile$`),
)

@ -0,0 +1,16 @@
/*
Package enry implements multiple strategies for programming language identification.
Identification is made based on file name and file content using a seriece
of strategies to narrow down possible option.
Each strategy is available as a separate API call, as well as a main enty point
GetLanguage(filename string, content []byte) (language string)
It is a port of the https://github.com/github/linguist from Ruby.
Upstream Linguist YAML files are used to generate datastructures for data
package.
*/
package enry // import "github.com/src-d/enry/v2"
//go:generate make code-generate

@ -0,0 +1,11 @@
module github.com/src-d/enry/v2
go 1.12
require (
github.com/src-d/go-oniguruma v1.1.0
github.com/stretchr/testify v1.3.0
github.com/toqueteos/trie v1.0.0 // indirect
gopkg.in/toqueteos/substring.v1 v1.0.2
gopkg.in/yaml.v2 v2.2.2
)

@ -0,0 +1,17 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww=
github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

@ -0,0 +1,7 @@
// Package tokenizer implements file tokenization used by the enry content
// classifier. This package is an implementation detail of enry and should not
// be imported by other packages.
package tokenizer
// ByteLimit defines the maximum prefix of an input text that will be tokenized.
const ByteLimit = 100000

File diff suppressed because it is too large Load Diff

@ -0,0 +1,336 @@
#ifndef linguist_yyHEADER_H
#define linguist_yyHEADER_H 1
#define linguist_yyIN_HEADER 1
#line 6 "lex.linguist_yy.h"
#define YY_INT_ALIGNED short int
/* A lexical scanner generated by flex */
#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5
#define YY_FLEX_SUBMINOR_VERSION 35
#if YY_FLEX_SUBMINOR_VERSION > 0
#define FLEX_BETA
#endif
/* First, we deal with platform-specific or compiler-specific issues. */
/* begin standard C headers. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
/* end standard C headers. */
/* flex integer type definitions */
#ifndef FLEXINT_H
#define FLEXINT_H
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
* if you want the limit (max/min) macros for int types.
*/
#ifndef __STDC_LIMIT_MACROS
#define __STDC_LIMIT_MACROS 1
#endif
#include <inttypes.h>
typedef int8_t flex_int8_t;
typedef uint8_t flex_uint8_t;
typedef int16_t flex_int16_t;
typedef uint16_t flex_uint16_t;
typedef int32_t flex_int32_t;
typedef uint32_t flex_uint32_t;
typedef uint64_t flex_uint64_t;
#else
typedef signed char flex_int8_t;
typedef short int flex_int16_t;
typedef int flex_int32_t;
typedef unsigned char flex_uint8_t;
typedef unsigned short int flex_uint16_t;
typedef unsigned int flex_uint32_t;
#endif /* ! C99 */
/* Limits of integral types. */
#ifndef INT8_MIN
#define INT8_MIN (-128)
#endif
#ifndef INT16_MIN
#define INT16_MIN (-32767-1)
#endif
#ifndef INT32_MIN
#define INT32_MIN (-2147483647-1)
#endif
#ifndef INT8_MAX
#define INT8_MAX (127)
#endif
#ifndef INT16_MAX
#define INT16_MAX (32767)
#endif
#ifndef INT32_MAX
#define INT32_MAX (2147483647)
#endif
#ifndef UINT8_MAX
#define UINT8_MAX (255U)
#endif
#ifndef UINT16_MAX
#define UINT16_MAX (65535U)
#endif
#ifndef UINT32_MAX
#define UINT32_MAX (4294967295U)
#endif
#endif /* ! FLEXINT_H */
#ifdef __cplusplus
/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST
#else /* ! __cplusplus */
/* C99 requires __STDC__ to be defined as 1. */
#if defined (__STDC__)
#define YY_USE_CONST
#endif /* defined (__STDC__) */
#endif /* ! __cplusplus */
#ifdef YY_USE_CONST
#define yyconst const
#else
#define yyconst
#endif
/* An opaque pointer. */
#ifndef YY_TYPEDEF_YY_SCANNER_T
#define YY_TYPEDEF_YY_SCANNER_T
typedef void* yyscan_t;
#endif
/* For convenience, these vars (plus the bison vars far below)
are macros in the reentrant scanner. */
#define yyin yyg->yyin_r
#define yyout yyg->yyout_r
#define yyextra yyg->yyextra_r
#define yyleng yyg->yyleng_r
#define yytext yyg->yytext_r
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno)
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column)
#define yy_flex_debug yyg->yy_flex_debug_r
/* Size of default input buffer. */
#ifndef YY_BUF_SIZE
#define YY_BUF_SIZE 16384
#endif
#ifndef YY_TYPEDEF_YY_BUFFER_STATE
#define YY_TYPEDEF_YY_BUFFER_STATE
typedef struct yy_buffer_state *YY_BUFFER_STATE;
#endif
#ifndef YY_TYPEDEF_YY_SIZE_T
#define YY_TYPEDEF_YY_SIZE_T
typedef size_t yy_size_t;
#endif
#ifndef YY_STRUCT_YY_BUFFER_STATE
#define YY_STRUCT_YY_BUFFER_STATE
struct yy_buffer_state
{
FILE *yy_input_file;
char *yy_ch_buf; /* input buffer */
char *yy_buf_pos; /* current position in input buffer */
/* Size of input buffer in bytes, not including room for EOB
* characters.
*/
yy_size_t yy_buf_size;
/* Number of characters read into yy_ch_buf, not including EOB
* characters.
*/
yy_size_t yy_n_chars;
/* Whether we "own" the buffer - i.e., we know we created it,
* and can realloc() it to grow it, and should free() it to
* delete it.
*/
int yy_is_our_buffer;
/* Whether this is an "interactive" input source; if so, and
* if we're using stdio for input, then we want to use getc()
* instead of fread(), to make sure we stop fetching input after
* each newline.
*/
int yy_is_interactive;
/* Whether we're considered to be at the beginning of a line.
* If so, '^' rules will be active on the next match, otherwise
* not.
*/
int yy_at_bol;
int yy_bs_lineno; /**< The line count. */
int yy_bs_column; /**< The column count. */
/* Whether to try to fill the input buffer when we reach the
* end of it.
*/
int yy_fill_buffer;
int yy_buffer_status;
};
#endif /* !YY_STRUCT_YY_BUFFER_STATE */
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner );
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner );
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner );
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner );
void linguist_yypop_buffer_state (yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner );
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner );
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner );
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner );
void linguist_yyfree (void * ,yyscan_t yyscanner );
/* Begin user sect3 */
#define yytext_ptr yytext_r
#ifdef YY_HEADER_EXPORT_START_CONDITIONS
#define INITIAL 0
#define sgml 1
#define c_comment 2
#define xml_comment 3
#define haskell_comment 4
#define ocaml_comment 5
#define python_dcomment 6
#define python_scomment 7
#endif
#ifndef YY_NO_UNISTD_H
/* Special case for "unistd.h", since it is non-ANSI. We include it way
* down here because we want the user's section 1 to have been scanned first.
* The user has a chance to override it with an option.
*/
#include <unistd.h>
#endif
#define YY_EXTRA_TYPE struct tokenizer_extra *
int linguist_yylex_init (yyscan_t* scanner);
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner);
/* Accessor methods to globals.
These are made visible to non-reentrant scanners for convenience. */
int linguist_yylex_destroy (yyscan_t yyscanner );
int linguist_yyget_debug (yyscan_t yyscanner );
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner );
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner );
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner );
FILE *linguist_yyget_in (yyscan_t yyscanner );
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner );
FILE *linguist_yyget_out (yyscan_t yyscanner );
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner );
yy_size_t linguist_yyget_leng (yyscan_t yyscanner );
char *linguist_yyget_text (yyscan_t yyscanner );
int linguist_yyget_lineno (yyscan_t yyscanner );
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner );
/* Macros after this point can all be overridden by user definitions in
* section 1.
*/
#ifndef YY_SKIP_YYWRAP
#ifdef __cplusplus
extern "C" int linguist_yywrap (yyscan_t yyscanner );
#else
extern int linguist_yywrap (yyscan_t yyscanner );
#endif
#endif
#ifndef yytext_ptr
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner);
#endif
#ifdef YY_NEED_STRLEN
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner);
#endif
#ifndef YY_NO_INPUT
#endif
/* Amount of stuff to slurp up with each read. */
#ifndef YY_READ_BUF_SIZE
#define YY_READ_BUF_SIZE 8192
#endif
/* Number of entries by which start-condition stack grows. */
#ifndef YY_START_STACK_INCR
#define YY_START_STACK_INCR 25
#endif
/* Default declaration of generated scanner - a define so the user can
* easily add parameters.
*/
#ifndef YY_DECL
#define YY_DECL_IS_OURS 1
extern int linguist_yylex (yyscan_t yyscanner);
#define YY_DECL int linguist_yylex (yyscan_t yyscanner)
#endif /* !YY_DECL */
/* yy_get_previous_state - get the state just before the EOB char was reached */
#undef YY_NEW_FILE
#undef YY_FLUSH_BUFFER
#undef yy_set_bol
#undef yy_new_buffer
#undef yy_set_interactive
#undef YY_DO_BEFORE_ACTION
#ifdef YY_DECL_IS_OURS
#undef YY_DECL_IS_OURS
#undef YY_DECL
#endif
#line 118 "tokenizer.l"
#line 335 "lex.linguist_yy.h"
#undef linguist_yyIN_HEADER
#endif /* linguist_yyHEADER_H */

@ -0,0 +1,15 @@
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
enum tokenizer_type {
NO_ACTION,
REGULAR_TOKEN,
SHEBANG_TOKEN,
SGML_TOKEN,
};
struct tokenizer_extra {
char *token;
enum tokenizer_type type;
};
// TODO(bzz) port Win support from
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0

@ -0,0 +1,71 @@
package flex
// #include <stdlib.h>
// #include "linguist.h"
// #include "lex.linguist_yy.h"
// int linguist_yywrap(yyscan_t yyscanner) {
// return 1;
// }
import "C"
import "unsafe"
const maxTokenLen = 32 // bytes
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
func TokenizeFlex(content []byte) []string {
var buf C.YY_BUFFER_STATE
var scanner C.yyscan_t
var extra C.struct_tokenizer_extra
var _len C.ulong
var r C.int
_len = C.ulong(len(content))
cs := C.CBytes(content)
defer C.free(unsafe.Pointer(cs))
C.linguist_yylex_init_extra(&extra, &scanner)
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner)
ary := []string{}
for {
extra._type = C.NO_ACTION
extra.token = nil
r = C.linguist_yylex(scanner)
switch extra._type {
case C.NO_ACTION:
break
case C.REGULAR_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len)))
}
C.free(unsafe.Pointer(extra.token))
break
case C.SHEBANG_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len))
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
case C.SGML_TOKEN:
_len = C.strlen(extra.token)
if _len <= maxTokenLen {
s := C.GoStringN(extra.token, (C.int)(_len)) + ">"
ary = append(ary, s)
}
C.free(unsafe.Pointer(extra.token))
break
}
if r == 0 {
break
}
}
C.linguist_yy_delete_buffer(buf, scanner)
C.linguist_yylex_destroy(scanner)
return ary
}

@ -0,0 +1,214 @@
// +build !flex
package tokenizer
import (
"bytes"
"github.com/src-d/enry/v2/regex"
)
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
//
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
// differences between this function and the Linguist output.
func Tokenize(content []byte) []string {
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
// Copy the input so that changes wrought by the tokenization steps do not
// modify the caller's copy of the input. See #196.
content = append([]byte(nil), content...)
tokens := make([][]byte, 0, 50)
for _, extract := range extractTokens {
var extractedTokens [][]byte
content, extractedTokens = extract(content)
tokens = append(tokens, extractedTokens...)
}
return toString(tokens)
}
func toString(tokens [][]byte) []string {
stokens := make([]string, 0, len(tokens))
for _, token := range tokens {
stokens = append(stokens, string(token))
}
return stokens
}
var (
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){
// The order to must be this
extractAndReplaceShebang,
extractAndReplaceSGML,
skipCommentsAndLiterals,
extractAndReplacePunctuation,
extractAndReplaceRegular,
extractAndReplaceOperator,
extractRemainders,
}
// Differences between golang regexp and oniguruma:
// 1. no (?s) in oniguruma - makes dot match \n
// 2. no (?U) in oniguruma - ungreedy *
// 3. (?m) implies dot matches \n in oniguruma
// 4. oniguruma handles \w differently - impossible, but true
//
// Workarounds:
// 1. (.|\n)
// 2. replace * with *?
// 3. replace . with [^\n]
// 4. replace \w with [0-9A-Za-z_]
//
// Original golang regexps:
//
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
//
// These regexps were converted to work in the same way for both engines:
//
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`)
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`)
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`)
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`)
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`)
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`)
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`)
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`)
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`)
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
regexToSkip = []regex.EnryRegexp{
// The order must be this
reLiteralStringQuotes,
reMultilineComment,
reSingleLineComment,
reLiteralNumber,
}
)
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) {
var shebangTokens [][]byte
matches := reShebang.FindAllSubmatch(content, -1)
if matches != nil {
shebangTokens = make([][]byte, 0, 2)
for _, match := range matches {
shebangToken := getShebangToken(match)
shebangTokens = append(shebangTokens, shebangToken)
}
reShebang.ReplaceAll(content, []byte(` `))
}
return content, shebangTokens
}
func getShebangToken(matchedShebang [][]byte) []byte {
const prefix = `SHEBANG#!`
var token []byte
for i := 1; i < len(matchedShebang); i++ {
if len(matchedShebang[i]) > 0 {
token = matchedShebang[i]
break
}
}
tokenShebang := append([]byte(prefix), token...)
return tokenShebang
}
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) {
tokens := re.FindAll(content, -1)
content = re.ReplaceAll(content, []byte(` `))
return content, tokens
}
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, rePunctuation)
}
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, reRegularToken)
}
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) {
return commonExtractAndReplace(content, reOperators)
}
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) {
var SGMLTokens [][]byte
matches := reSGML.FindAllSubmatch(content, -1)
if matches != nil {
SGMLTokens = make([][]byte, 0, 2)
for _, match := range matches {
if reSGMLComment.Match(match[0]) {
continue
}
token := append(match[1], '>')
SGMLTokens = append(SGMLTokens, token)
attributes := getSGMLAttributes(match[0])
SGMLTokens = append(SGMLTokens, attributes...)
}
content = reSGML.ReplaceAll(content, []byte(` `))
}
return content, SGMLTokens
}
func getSGMLAttributes(SGMLTag []byte) [][]byte {
var attributes [][]byte
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1)
if matches != nil {
attributes = make([][]byte, 0, 5)
for _, match := range matches {
if len(match[1]) != 0 {
attributes = append(attributes, match[1])
}
if len(match[2]) != 0 {
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1)
attributes = append(attributes, loneAttributes...)
}
}
}
return attributes
}
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) {
for _, skip := range regexToSkip {
content = skip.ReplaceAll(content, []byte(` `))
}
return content, nil
}
func extractRemainders(content []byte) ([]byte, [][]byte) {
splitted := bytes.Fields(content)
remainderTokens := make([][]byte, 0, len(splitted)*3)
for _, remainder := range splitted {
remainders := bytes.Split(remainder, nil)
remainderTokens = append(remainderTokens, remainders...)
}
return content, remainderTokens
}

@ -0,0 +1,15 @@
// +build flex
package tokenizer
import "github.com/src-d/enry/v2/internal/tokenizer/flex"
// Tokenize returns lexical tokens from content. The tokens returned match what
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
func Tokenize(content []byte) []string {
if len(content) > ByteLimit {
content = content[:ByteLimit]
}
return flex.TokenizeFlex(content)
}

@ -0,0 +1,17 @@
// +build oniguruma
package regex
import (
rubex "github.com/src-d/go-oniguruma"
)
type EnryRegexp = *rubex.Regexp
func MustCompile(str string) EnryRegexp {
return rubex.MustCompileASCII(str)
}
func QuoteMeta(s string) string {
return rubex.QuoteMeta(s)
}

@ -0,0 +1,17 @@
// +build !oniguruma
package regex
import (
"regexp"
)
type EnryRegexp = *regexp.Regexp
func MustCompile(str string) EnryRegexp {
return regexp.MustCompile(str)
}
func QuoteMeta(s string) string {
return regexp.QuoteMeta(s)
}

@ -0,0 +1,84 @@
package enry
import (
"bytes"
"path/filepath"
"strings"
"github.com/src-d/enry/v2/data"
)
const binSniffLen = 8000
var configurationLanguages = map[string]bool{
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true,
}
// IsConfiguration tells if filename is in one of the configuration languages.
func IsConfiguration(path string) bool {
language, _ := GetLanguageByExtension(path)
_, is := configurationLanguages[language]
return is
}
// IsImage tells if a given file is an image (PNG, JPEG or GIF format).
func IsImage(path string) bool {
extension := filepath.Ext(path)
if extension == ".png" || extension == ".jpg" || extension == ".jpeg" || extension == ".gif" {
return true
}
return false
}
// GetMIMEType returns a MIME type of a given file based on its languages.
func GetMIMEType(path string, language string) string {
if mime, ok := data.LanguagesMime[language]; ok {
return mime
}
if IsImage(path) {
return "image/" + filepath.Ext(path)[1:]
}
return "text/plain"
}
// IsDocumentation returns whether or not path is a documentation path.
func IsDocumentation(path string) bool {
return data.DocumentationMatchers.Match(path)
}
// IsDotFile returns whether or not path has dot as a prefix.
func IsDotFile(path string) bool {
base := filepath.Base(filepath.Clean(path))
return strings.HasPrefix(base, ".") && base != "."
}
// IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool {
return data.VendorMatchers.Match(path)
}
// IsBinary detects if data is a binary value based on:
// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198
func IsBinary(data []byte) bool {
if len(data) > binSniffLen {
data = data[:binSniffLen]
}
if bytes.IndexByte(data, byte(0)) == -1 {
return false
}
return true
}
// GetColor returns a HTML color code of a given language.
func GetColor(language string) string {
if color, ok := data.LanguagesColor[language]; ok {
return color
}
return "#cccccc"
}

@ -0,0 +1,20 @@
dist: trusty
language: go
go:
- '1.11.x'
- '1.12.x'
env:
global:
- LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH}
- GO111MODULE=on
- ONIGURUMA_VERSION='6.9.1'
before_install: # install oniguruma manually as trusty has only ancient 5.x
- sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
- sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb"
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
- sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb"
script:
- go test -v --cover -race

@ -0,0 +1,19 @@
Copyright (C) 2011 by Zhigang Chen
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

@ -0,0 +1,20 @@
## go-oniguruma
<a href="https://travis-ci.org/src-d/go-oniguruma"><img alt="Build Status" src="https://travis-ci.org/src-d/go-oniguruma.svg?branch=master" /></a>
This repository is a fork of [moovweb/rubex](https://github.com/moovweb/rubex/tree/go1) - a simple regular expression library (based on [oniguruma](https://github.com/kkos/oniguruma)) that supports Ruby's regex syntax.
The _rubex_ was originally created by Zhigang Chen (zhigang.chen@moovweb.com or zhigangc@gmail.com). It implements all the public functions of Go's Regexp package, except LiteralPrefix.
By the benchmark tests in regexp, the library is 40% to 10X faster than Regexp on all but one test. Unlike Go's regexp, this library supports named capture groups and also allow `"\\1"` and `"\\k<name>"` in replacement strings.
The library calls the _oniguruma_ regex library for regex pattern searching. All replacement code is done in Go.
### Install all (_oniguruma_ and _rubex_):
```sh
# linux (debian/ubuntu/...)
sudo apt-get install libonig-dev
# osx (homebrew)
brew install oniguruma
go install -i .
```

@ -0,0 +1,184 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#ifdef BENCHMARK_CHELP
#include <sys/time.h>
#endif
#include "chelper.h"
int NewOnigRegex( char *pattern, int pattern_length, int option,
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) {
int ret = ONIG_NORMAL;
int error_msg_len = 0;
OnigUChar *pattern_start = (OnigUChar *) pattern;
OnigUChar *pattern_end = (OnigUChar *) (pattern + pattern_length);
*error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo));
memset(*error_info, 0, sizeof(OnigErrorInfo));
onig_initialize_encoding(*encoding);
*error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));
memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));
*region = onig_region_new();
ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info);
if (ret != ONIG_NORMAL) {
error_msg_len = onig_error_code_to_str((unsigned char*)(*error_buffer), ret, *error_info);
if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) {
error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1;
}
(*error_buffer)[error_msg_len] = '\0';
}
return ret;
}
int SearchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) {
int ret = ONIG_MISMATCH;
int error_msg_len = 0;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
#endif
OnigUChar *str_start = (OnigUChar *) str;
OnigUChar *str_end = (OnigUChar *) (str_start + str_length);
OnigUChar *search_start = (OnigUChar *)(str_start + offset);
OnigUChar *search_end = str_end;
#ifdef BENCHMARK_CHELP
gettimeofday(&tim1, NULL);
#endif
ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option);
if (ret < 0 && error_buffer != NULL) {
error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info);
if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) {
error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1;
}
error_buffer[error_msg_len] = '\0';
}
else if (captures != NULL) {
int i;
int count = 0;
for (i = 0; i < region->num_regs; i++) {
captures[2*count] = region->beg[i];
captures[2*count+1] = region->end[i];
count ++;
}
*numCaptures = count;
}
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
printf("%ld microseconds elapsed\n", t);
#endif
return ret;
}
int MatchOnigRegex(void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region) {
int ret = ONIG_MISMATCH;
int error_msg_len = 0;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
#endif
OnigUChar *str_start = (OnigUChar *) str;
OnigUChar *str_end = (OnigUChar *) (str_start + str_length);
OnigUChar *search_start = (OnigUChar *)(str_start + offset);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim1, NULL);
#endif
ret = onig_match(regex, str_start, str_end, search_start, region, option);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
printf("%ld microseconds elapsed\n", t);
#endif
return ret;
}
int LookupOnigCaptureByName(char *name, int name_length,
OnigRegex regex, OnigRegion *region) {
int ret = ONIGERR_UNDEFINED_NAME_REFERENCE;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
#endif
OnigUChar *name_start = (OnigUChar *) name;
OnigUChar *name_end = (OnigUChar *) (name_start + name_length);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim1, NULL);
#endif
ret = onig_name_to_backref_number(regex, name_start, name_end, region);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
printf("%ld microseconds elapsed\n", t);
#endif
return ret;
}
typedef struct {
char *nameBuffer;
int bufferOffset;
int bufferSize;
int *numbers;
int numIndex;
} group_info_t;
int name_callback(const UChar* name, const UChar* name_end,
int ngroup_num, int* group_nums,
regex_t* reg, void* arg)
{
int nameLen, offset, newOffset;
group_info_t *groupInfo;
groupInfo = (group_info_t*) arg;
offset = groupInfo->bufferOffset;
nameLen = name_end - name;
newOffset = offset + nameLen;
//if there are already names, add a ";"
if (offset > 0) {
newOffset += 1;
}
if (newOffset <= groupInfo->bufferSize) {
if (offset > 0) {
groupInfo->nameBuffer[offset] = ';';
offset += 1;
}
memcpy(&groupInfo->nameBuffer[offset], name, nameLen);
}
groupInfo->bufferOffset = newOffset;
if (ngroup_num > 0) {
groupInfo->numbers[groupInfo->numIndex] = group_nums[ngroup_num-1];
} else {
groupInfo->numbers[groupInfo->numIndex] = -1;
}
groupInfo->numIndex += 1;
return 0; /* 0: continue */
}
int GetCaptureNames(OnigRegex reg, void *buffer, int bufferSize, int* groupNumbers) {
int ret;
group_info_t groupInfo;
groupInfo.nameBuffer = (char*)buffer;
groupInfo.bufferOffset = 0;
groupInfo.bufferSize = bufferSize;
groupInfo.numbers = groupNumbers;
groupInfo.numIndex = 0;
onig_foreach_name(reg, name_callback, (void* )&groupInfo);
return groupInfo.bufferOffset;
}

@ -0,0 +1,14 @@
#include <oniguruma.h>
extern int NewOnigRegex( char *pattern, int pattern_length, int option,
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer);
extern int SearchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures);
extern int MatchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region);
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region);
extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers);

@ -0,0 +1,27 @@
package rubex
const (
ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE
/* options */
ONIG_OPTION_NONE = 0
ONIG_OPTION_IGNORECASE = 1
ONIG_OPTION_EXTEND = (ONIG_OPTION_IGNORECASE << 1)
ONIG_OPTION_MULTILINE = (ONIG_OPTION_EXTEND << 1)
ONIG_OPTION_SINGLELINE = (ONIG_OPTION_MULTILINE << 1)
ONIG_OPTION_FIND_LONGEST = (ONIG_OPTION_SINGLELINE << 1)
ONIG_OPTION_FIND_NOT_EMPTY = (ONIG_OPTION_FIND_LONGEST << 1)
ONIG_OPTION_NEGATE_SINGLELINE = (ONIG_OPTION_FIND_NOT_EMPTY << 1)
ONIG_OPTION_DONT_CAPTURE_GROUP = (ONIG_OPTION_NEGATE_SINGLELINE << 1)
ONIG_OPTION_CAPTURE_GROUP = (ONIG_OPTION_DONT_CAPTURE_GROUP << 1)
/* options (search time) */
ONIG_OPTION_NOTBOL = (ONIG_OPTION_CAPTURE_GROUP << 1)
ONIG_OPTION_NOTEOL = (ONIG_OPTION_NOTBOL << 1)
ONIG_OPTION_POSIX_REGION = (ONIG_OPTION_NOTEOL << 1)
ONIG_OPTION_MAXBIT = ONIG_OPTION_POSIX_REGION /* limit */
ONIG_NORMAL = 0
ONIG_MISMATCH = -1
ONIG_MISMATCH_STR = "mismatch"
ONIGERR_UNDEFINED_NAME_REFERENCE = -217
)

@ -0,0 +1 @@
module github.com/src-d/go-oniguruma

@ -0,0 +1,36 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
// QuoteMeta func is copied here to avoid linking the entire Regexp library.
package rubex
func special(c int) bool {
for _, r := range `\.+*?()|[]^$` {
if c == int(r) {
return true
}
}
return false
}
// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
b := make([]byte, 2*len(s))
// A byte loop is correct because all metacharacters are ASCII.
j := 0
for i := 0; i < len(s); i++ {
if special(int(s[i])) {
b[j] = '\\'
j++
}
b[j] = s[i]
j++
}
return string(b[0:j])
}

@ -0,0 +1,668 @@
package rubex
/*
#cgo CFLAGS: -I/usr/local/include
#cgo LDFLAGS: -L/usr/local/lib -lonig
#include <stdlib.h>
#include <oniguruma.h>
#include "chelper.h"
*/
import "C"
import (
"bytes"
"errors"
"fmt"
"io"
"log"
"runtime"
"strconv"
"sync"
"unicode/utf8"
"unsafe"
)
type strRange []int
const numMatchStartSize = 4
const numReadBufferStartSize = 256
var mutex sync.Mutex
type MatchData struct {
count int
indexes [][]int32
}
type NamedGroupInfo map[string]int
type Regexp struct {
pattern string
regex C.OnigRegex
region *C.OnigRegion
encoding C.OnigEncoding
errorInfo *C.OnigErrorInfo
errorBuf *C.char
matchData *MatchData
namedGroupInfo NamedGroupInfo
}
// NewRegexp creates and initializes a new Regexp with the given pattern and option.
func NewRegexp(pattern string, option int) (re *Regexp, err error) {
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option)
}
// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII.
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) {
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option)
}
func initRegexp(re *Regexp, option int) (*Regexp, error) {
var err error
patternCharPtr := C.CString(re.pattern)
defer C.free(unsafe.Pointer(patternCharPtr))
mutex.Lock()
defer mutex.Unlock()
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf)
if errorCode != C.ONIG_NORMAL {
err = errors.New(C.GoString(re.errorBuf))
} else {
err = nil
numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1
re.matchData = &MatchData{}
re.matchData.indexes = make([][]int32, numMatchStartSize)
for i := 0; i < numMatchStartSize; i++ {
re.matchData.indexes[i] = make([]int32, numCapturesInPattern*2)
}
re.namedGroupInfo = re.getNamedGroupInfo()
runtime.SetFinalizer(re, (*Regexp).Free)
}
return re, err
}
func Compile(str string) (*Regexp, error) {
return NewRegexp(str, ONIG_OPTION_DEFAULT)
}
func MustCompile(str string) *Regexp {
regexp, error := NewRegexp(str, ONIG_OPTION_DEFAULT)
if error != nil {
panic("regexp: compiling " + str + ": " + error.Error())
}
return regexp
}
func CompileWithOption(str string, option int) (*Regexp, error) {
return NewRegexp(str, option)
}
func MustCompileWithOption(str string, option int) *Regexp {
regexp, error := NewRegexp(str, option)
if error != nil {
panic("regexp: compiling " + str + ": " + error.Error())
}
return regexp
}
// MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII.
func MustCompileASCII(str string) *Regexp {
regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT)
if error != nil {
panic("regexp: compiling " + str + ": " + error.Error())
}
return regexp
}
func (re *Regexp) Free() {
mutex.Lock()
if re.regex != nil {
C.onig_free(re.regex)
re.regex = nil
}
if re.region != nil {
C.onig_region_free(re.region, 1)
re.region = nil
}
mutex.Unlock()
if re.errorInfo != nil {
C.free(unsafe.Pointer(re.errorInfo))
re.errorInfo = nil
}
if re.errorBuf != nil {
C.free(unsafe.Pointer(re.errorBuf))
re.errorBuf = nil
}
}
func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) {
numNamedGroups := int(C.onig_number_of_names(re.regex))
//when any named capture exisits, there is no numbered capture even if there are unnamed captures
if numNamedGroups > 0 {
namedGroupInfo = make(map[string]int)
//try to get the names
bufferSize := len(re.pattern) * 2
nameBuffer := make([]byte, bufferSize)
groupNumbers := make([]int32, numNamedGroups)
bufferPtr := unsafe.Pointer(&nameBuffer[0])
numbersPtr := unsafe.Pointer(&groupNumbers[0])
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr)))
if length > 0 {
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";"))
if len(namesAsBytes) != numNamedGroups {
log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes))
}
for i, nameAsBytes := range namesAsBytes {
name := string(nameAsBytes)
namedGroupInfo[name] = int(groupNumbers[i])
}
} else {
log.Fatalf("could not get the capture group names from %q", re.String())
}
}
return
}
func (re *Regexp) groupNameToId(name string) (id int) {
if re.namedGroupInfo == nil {
id = ONIGERR_UNDEFINED_NAME_REFERENCE
} else {
id = re.namedGroupInfo[name]
}
return
}
func (re *Regexp) processMatch(numCaptures int) (match []int32) {
if numCaptures <= 0 {
panic("cannot have 0 captures when processing a match")
}
matchData := re.matchData
return matchData.indexes[matchData.count][:numCaptures*2]
}
func (re *Regexp) ClearMatchData() {
matchData := re.matchData
matchData.count = 0
}
func (re *Regexp) find(b []byte, n int, offset int) (match []int) {
if n == 0 {
b = []byte{0}
}
ptr := unsafe.Pointer(&b[0])
matchData := re.matchData
capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0]))
numCaptures := int32(0)
numCapturesPtr := unsafe.Pointer(&numCaptures)
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr)))
if pos >= 0 {
if numCaptures <= 0 {
panic("cannot have 0 captures when processing a match")
}
match2 := matchData.indexes[matchData.count][:numCaptures*2]
match = make([]int, len(match2))
for i := range match2 {
match[i] = int(match2[i])
}
numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1
if numCapturesInPattern != numCaptures {
log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures)
}
}
return
}
func getCapture(b []byte, beg int, end int) []byte {
if beg < 0 || end < 0 {
return nil
}
return b[beg:end]
}
func (re *Regexp) match(b []byte, n int, offset int) bool {
re.ClearMatchData()
if n == 0 {
b = []byte{0}
}
ptr := unsafe.Pointer(&b[0])
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil)))
return pos >= 0
}
func (re *Regexp) findAll(b []byte, n int) (matches [][]int) {
re.ClearMatchData()
if n < 0 {
n = len(b)
}
matchData := re.matchData
offset := 0
for offset <= n {
if matchData.count >= len(matchData.indexes) {
length := len(matchData.indexes[0])
matchData.indexes = append(matchData.indexes, make([]int32, length))
}
if match := re.find(b, n, offset); len(match) > 0 {
matchData.count += 1
//move offset to the ending index of the current match and prepare to find the next non-overlapping match
offset = match[1]
//if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here.
if match[0] == match[1] {
if offset < n && offset >= 0 {
//there are more bytes, so move offset by a word
_, width := utf8.DecodeRune(b[offset:])
offset += width
} else {
//search is over, exit loop
break
}
}
} else {
break
}
}
matches2 := matchData.indexes[:matchData.count]
matches = make([][]int, len(matches2))
for i, v := range matches2 {
matches[i] = make([]int, len(v))
for j, v2 := range v {
matches[i][j] = int(v2)
}
}
return
}
func (re *Regexp) FindIndex(b []byte) []int {
re.ClearMatchData()
match := re.find(b, len(b), 0)
if len(match) == 0 {
return nil
}
return match[:2]
}
func (re *Regexp) Find(b []byte) []byte {
loc := re.FindIndex(b)
if loc == nil {
return nil
}
return getCapture(b, loc[0], loc[1])
}
func (re *Regexp) FindString(s string) string {
b := []byte(s)
mb := re.Find(b)
if mb == nil {
return ""
}
return string(mb)
}
func (re *Regexp) FindStringIndex(s string) []int {
b := []byte(s)
return re.FindIndex(b)
}
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
matches := re.findAll(b, n)
if len(matches) == 0 {
return nil
}
return matches
}
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
matches := re.FindAllIndex(b, n)
if matches == nil {
return nil
}
matchBytes := make([][]byte, 0, len(matches))
for _, match := range matches {
matchBytes = append(matchBytes, getCapture(b, match[0], match[1]))
}
return matchBytes
}
func (re *Regexp) FindAllString(s string, n int) []string {
b := []byte(s)
matches := re.FindAllIndex(b, n)
if matches == nil {
return nil
}
matchStrings := make([]string, 0, len(matches))
for _, match := range matches {
m := getCapture(b, match[0], match[1])
if m == nil {
matchStrings = append(matchStrings, "")
} else {
matchStrings = append(matchStrings, string(m))
}
}
return matchStrings
}
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
b := []byte(s)
return re.FindAllIndex(b, n)
}
func (re *Regexp) findSubmatchIndex(b []byte) (match []int) {
re.ClearMatchData()
match = re.find(b, len(b), 0)
return
}
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
match := re.findSubmatchIndex(b)
if len(match) == 0 {
return nil
}
return match
}
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
match := re.findSubmatchIndex(b)
if match == nil {
return nil
}
length := len(match) / 2
if length == 0 {
return nil
}
results := make([][]byte, 0, length)
for i := 0; i < length; i++ {
results = append(results, getCapture(b, match[2*i], match[2*i+1]))
}
return results
}
func (re *Regexp) FindStringSubmatch(s string) []string {
b := []byte(s)
match := re.findSubmatchIndex(b)
if match == nil {
return nil
}
length := len(match) / 2
if length == 0 {
return nil
}
results := make([]string, 0, length)
for i := 0; i < length; i++ {
cap := getCapture(b, match[2*i], match[2*i+1])
if cap == nil {
results = append(results, "")
} else {
results = append(results, string(cap))
}
}
return results
}
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
b := []byte(s)
return re.FindSubmatchIndex(b)
}
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
matches := re.findAll(b, n)
if len(matches) == 0 {
return nil
}
return matches
}
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
matches := re.findAll(b, n)
if len(matches) == 0 {
return nil
}
allCapturedBytes := make([][][]byte, 0, len(matches))
for _, match := range matches {
length := len(match) / 2
capturedBytes := make([][]byte, 0, length)
for i := 0; i < length; i++ {
capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1]))
}
allCapturedBytes = append(allCapturedBytes, capturedBytes)
}
return allCapturedBytes
}
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
b := []byte(s)
matches := re.findAll(b, n)
if len(matches) == 0 {
return nil
}
allCapturedStrings := make([][]string, 0, len(matches))
for _, match := range matches {
length := len(match) / 2
capturedStrings := make([]string, 0, length)
for i := 0; i < length; i++ {
cap := getCapture(b, match[2*i], match[2*i+1])
if cap == nil {
capturedStrings = append(capturedStrings, "")
} else {
capturedStrings = append(capturedStrings, string(cap))
}
}
allCapturedStrings = append(allCapturedStrings, capturedStrings)
}
return allCapturedStrings
}
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
b := []byte(s)
return re.FindAllSubmatchIndex(b, n)
}
func (re *Regexp) Match(b []byte) bool {
return re.match(b, len(b), 0)
}
func (re *Regexp) MatchString(s string) bool {
b := []byte(s)
return re.Match(b)
}
func (re *Regexp) NumSubexp() int {
return (int)(C.onig_number_of_captures(re.regex))
}
func (re *Regexp) getNamedCapture(name []byte, capturedBytes [][]byte) []byte {
nameStr := string(name)
capNum := re.groupNameToId(nameStr)
if capNum < 0 || capNum >= len(capturedBytes) {
panic(fmt.Sprintf("capture group name (%q) has error\n", nameStr))
}
return capturedBytes[capNum]
}
func (re *Regexp) getNumberedCapture(num int, capturedBytes [][]byte) []byte {
//when named capture groups exist, numbered capture groups returns ""
if re.namedGroupInfo == nil && num <= (len(capturedBytes)-1) && num >= 0 {
return capturedBytes[num]
}
return ([]byte)("")
}
func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
replLen := len(repl)
newRepl := make([]byte, 0, replLen*3)
inEscapeMode := false
inGroupNameMode := false
groupName := make([]byte, 0, replLen)
for index := 0; index < replLen; index += 1 {
ch := repl[index]
if inGroupNameMode && ch == byte('<') {
} else if inGroupNameMode && ch == byte('>') {
inGroupNameMode = false
groupNameStr := string(groupName)
capBytes := capturedBytes[groupNameStr]
newRepl = append(newRepl, capBytes...)
groupName = groupName[:0] //reset the name
} else if inGroupNameMode {
groupName = append(groupName, ch)
} else if inEscapeMode && ch <= byte('9') && byte('1') <= ch {
capNumStr := string(ch)
capBytes := capturedBytes[capNumStr]
newRepl = append(newRepl, capBytes...)
} else if inEscapeMode && ch == byte('k') && (index+1) < replLen && repl[index+1] == byte('<') {
inGroupNameMode = true
inEscapeMode = false
index += 1 //bypass the next char '<'
} else if inEscapeMode {
newRepl = append(newRepl, '\\')
newRepl = append(newRepl, ch)
} else if ch != '\\' {
newRepl = append(newRepl, ch)
}
if ch == byte('\\') || inEscapeMode {
inEscapeMode = !inEscapeMode
}
}
return newRepl
}
func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map[string][]byte) []byte) []byte {
srcLen := len(src)
matches := re.findAll(src, srcLen)
if len(matches) == 0 {
return src
}
dest := make([]byte, 0, srcLen)
for i, match := range matches {
length := len(match) / 2
capturedBytes := make(map[string][]byte)
if re.namedGroupInfo == nil {
for j := 0; j < length; j++ {
capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1])
}
} else {
for name, j := range re.namedGroupInfo {
capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1])
}
}
matchBytes := getCapture(src, match[0], match[1])
newRepl := replFunc(repl, matchBytes, capturedBytes)
prevEnd := 0
if i > 0 {
prevMatch := matches[i-1][:2]
prevEnd = prevMatch[1]
}
if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen {
dest = append(dest, src[prevEnd:match[0]]...)
}
dest = append(dest, newRepl...)
}
lastEnd := matches[len(matches)-1][1]
if lastEnd < srcLen && lastEnd >= 0 {
dest = append(dest, src[lastEnd:]...)
}
return dest
}
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
return re.replaceAll(src, repl, fillCapturedValues)
}
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
return repl(matchBytes)
})
}
func (re *Regexp) ReplaceAllString(src, repl string) string {
return string(re.ReplaceAll([]byte(src), []byte(repl)))
}
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
srcB := []byte(src)
destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte {
return []byte(repl(string(matchBytes)))
})
return string(destB)
}
func (re *Regexp) String() string {
return re.pattern
}
func grow_buffer(b []byte, offset int, n int) []byte {
if offset+n > cap(b) {
buf := make([]byte, 2*cap(b)+n)
copy(buf, b[:offset])
return buf
}
return b
}
func fromReader(r io.RuneReader) []byte {
b := make([]byte, numReadBufferStartSize)
offset := 0
var err error = nil
for err == nil {
rune, runeWidth, err := r.ReadRune()
if err == nil {
b = grow_buffer(b, offset, runeWidth)
writeWidth := utf8.EncodeRune(b[offset:], rune)
if runeWidth != writeWidth {
panic("reading rune width not equal to the written rune width")
}
offset += writeWidth
} else {
break
}
}
return b[:offset]
}
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
b := fromReader(r)
return re.FindIndex(b)
}
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
b := fromReader(r)
return re.FindSubmatchIndex(b)
}
func (re *Regexp) MatchReader(r io.RuneReader) bool {
b := fromReader(r)
return re.Match(b)
}
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
//no easy way to implement this
return "", false
}
func MatchString(pattern string, s string) (matched bool, error error) {
re, err := Compile(pattern)
if err != nil {
return false, err
}
return re.MatchString(s), nil
}
func (re *Regexp) Gsub(src, repl string) string {
srcBytes := ([]byte)(src)
replBytes := ([]byte)(repl)
replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues)
return string(replaced)
}
func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string {
srcBytes := ([]byte)(src)
replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte {
capturedStrings := make(map[string]string)
for name, capBytes := range capturedBytes {
capturedStrings[name] = string(capBytes)
}
matchString := string(matchBytes)
return ([]byte)(replFunc(matchString, capturedStrings))
})
return string(replaced)
}

@ -0,0 +1,22 @@
Copyright (c) 2013 Caleb Spare
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

@ -0,0 +1,7 @@
# Trie
[![GoDoc](http://godoc.org/github.com/toqueteos/trie?status.png)](http://godoc.org/github.com/toqueteos/trie)
This is a fork of https://github.com/cespare/go-trie that adds the `PrefixIndex` method.
It's required for https://github.com/toqueteos/substring.

@ -0,0 +1 @@
module github.com/toqueteos/trie

@ -0,0 +1,102 @@
// Package trie is an implementation of a trie (prefix tree) data structure over byte slices. It provides a
// small and simple API for usage as a set as well as a 'Node' API for walking the trie.
package trie
// A Trie is a a prefix tree.
type Trie struct {
root *Node
}
// New construct a new, empty Trie ready for use.
func New() *Trie {
return &Trie{
root: &Node{},
}
}
// Insert puts b into the Trie. It returns true if the element was not previously in t.
func (t *Trie) Insert(b []byte) bool {
n := t.root
for _, c := range b {
next, ok := n.Walk(c)
if !ok {
next = &Node{}
n.branches[c] = next
n.hasChildren = true
}
n = next
}
if n.terminal {
return false
}
n.terminal = true
return true
}
// Contains checks t for membership of b.
func (t *Trie) Contains(b []byte) bool {
n := t.root
for _, c := range b {
next, ok := n.Walk(c)
if !ok {
return false
}
n = next
}
return n.terminal
}
// PrefixIndex walks through `b` until a prefix is found (terminal node) or it is exhausted.
func (t *Trie) PrefixIndex(b []byte) int {
var idx int
n := t.root
for _, c := range b {
next, ok := n.Walk(c)
if !ok {
return -1
}
if next.terminal {
return idx
}
n = next
idx++
}
if !n.terminal {
idx = -1
}
return idx
}
// Root returns the root node of a Trie. A valid Trie (i.e., constructed with New), always has a non-nil root
// node.
func (t *Trie) Root() *Node {
return t.root
}
// A Node represents a logical vertex in the trie structure.
type Node struct {
branches [256]*Node
terminal bool
hasChildren bool
}
// Walk returns the node reached along edge c, if one exists. The ok value indicates whether such a node
// exist.
func (n *Node) Walk(c byte) (next *Node, ok bool) {
next = n.branches[int(c)]
return next, (next != nil)
}
// Terminal indicates whether n is terminal in the trie (that is, whether the path from the root to n
// represents an element in the set). For instance, if the root node is terminal, then []byte{} is in the
// trie.
func (n *Node) Terminal() bool {
return n.terminal
}
// Leaf indicates whether n is a leaf node in the trie (that is, whether it has children). A leaf node must be
// terminal (else it would not exist). Logically, if n is a leaf node then the []byte represented by the path
// from the root to n is not a proper prefix of any element of the trie.
func (n *Node) Leaf() bool {
return !n.hasChildren
}

@ -0,0 +1,24 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof

@ -0,0 +1,11 @@
language: go
go:
- 1.2
- 1.3
- 1.4
- tip
script:
- go get launchpad.net/gocheck
- go test

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2015 Carlos Cobo
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,80 @@
# substring [![Build Status](https://travis-ci.org/toqueteos/substring.png?branch=master)](https://travis-ci.org/toqueteos/substring) [![GoDoc](http://godoc.org/github.com/toqueteos/substring?status.png)](http://godoc.org/github.com/toqueteos/substring) [![GitHub release](https://img.shields.io/github/release/toqueteos/substring.svg)](https://github.com/toqueteos/substring/releases)
Simple and composable alternative to [regexp](http://golang.org/pkg/regexp/) package for fast substring searches.
## Installation
The recommended way to install substring
```
go get -t gopkg.in/toqueteos/substring.v1
```
The `-t` flag is for fetching [gocheck](https://gopkg.in/check.v1), required for tests and benchmarks.
## Examples
A basic example with two matchers:
```go
package main
import (
"fmt"
"regexp"
"gopkg.in/toqueteos/substring.v1"
)
func main() {
m1 := substring.After("assets/", substring.Or(
substring.Has("jquery"),
substring.Has("angular"),
substring.Suffixes(".js", ".css", ".html"),
))
fmt.Println(m1.Match("assets/angular/foo/bar")) //Prints: true
fmt.Println(m1.Match("assets/js/file.js")) //Prints: true
fmt.Println(m1.Match("assets/style/bar.css")) //Prints: true
fmt.Println(m1.Match("assets/foo/bar.html")) //Prints: false
fmt.Println(m1.Match("assets/js/qux.json")) //Prints: false
fmt.Println(m1.Match("core/file.html")) //Prints: false
fmt.Println(m1.Match("foobar/that.jsx")) //Prints: false
m2 := substring.After("vendor/", substring.Suffixes(".css", ".js", ".less"))
fmt.Println(m2.Match("foo/vendor/bar/qux.css")) //Prints: true
fmt.Println(m2.Match("foo/var/qux.less")) //Prints: false
re := regexp.MustCompile(`vendor\/.*\.(css|js|less)$`)
fmt.Println(re.MatchString("foo/vendor/bar/qux.css")) //Prints: true
fmt.Println(re.MatchString("foo/var/qux.less")) //Prints: false
}
```
## How fast?
It may vary depending on your use case but 1~2 orders of magnitude faster than `regexp` is pretty common.
Test it out for yourself by running `go test -check.b`!
```
$ go test -check.b
PASS: lib_test.go:18: LibSuite.BenchmarkExample1 10000000 221 ns/op
PASS: lib_test.go:23: LibSuite.BenchmarkExample2 10000000 229 ns/op
PASS: lib_test.go:28: LibSuite.BenchmarkExample3 10000000 216 ns/op
PASS: lib_test.go:33: LibSuite.BenchmarkExample4 10000000 208 ns/op
PASS: lib_test.go:38: LibSuite.BenchmarkExample5 20000000 82.1 ns/op
PASS: lib_test.go:48: LibSuite.BenchmarkExampleRe1 500000 4136 ns/op
PASS: lib_test.go:53: LibSuite.BenchmarkExampleRe2 500000 5222 ns/op
PASS: lib_test.go:58: LibSuite.BenchmarkExampleRe3 500000 5116 ns/op
PASS: lib_test.go:63: LibSuite.BenchmarkExampleRe4 500000 4020 ns/op
PASS: lib_test.go:68: LibSuite.BenchmarkExampleRe5 10000000 226 ns/op
OK: 10 passed
PASS
ok gopkg.in/toqueteos/substring.v1 23.471s
```
License
-------
MIT, see [LICENSE](LICENSE)

@ -0,0 +1,229 @@
package substring
import (
"bytes"
"regexp"
"github.com/toqueteos/trie"
)
type BytesMatcher interface {
Match(b []byte) bool
MatchIndex(b []byte) int
}
// regexp
type regexpBytes struct{ re *regexp.Regexp }
func BytesRegexp(pat string) *regexpBytes { return &regexpBytes{regexp.MustCompile(pat)} }
func (m *regexpBytes) Match(b []byte) bool { return m.re.Match(b) }
func (m *regexpBytes) MatchIndex(b []byte) int {
found := m.re.FindIndex(b)
if found != nil {
return found[1]
}
return -1
}
// exact
type exactBytes struct{ pat []byte }
func BytesExact(pat string) *exactBytes { return &exactBytes{[]byte(pat)} }
func (m *exactBytes) Match(b []byte) bool {
l, r := len(m.pat), len(b)
if l != r {
return false
}
for i := 0; i < l; i++ {
if b[i] != m.pat[i] {
return false
}
}
return true
}
func (m *exactBytes) MatchIndex(b []byte) int {
if m.Match(b) {
return len(b)
}
return -1
}
// any, search `s` in `.Match(pat)`
type anyBytes struct {
pat []byte
}
func BytesAny(pat string) *anyBytes { return &anyBytes{[]byte(pat)} }
func (m *anyBytes) Match(b []byte) bool { return bytes.Index(m.pat, b) >= 0 }
func (m *anyBytes) MatchIndex(b []byte) int {
if idx := bytes.Index(m.pat, b); idx >= 0 {
return idx + len(b)
}
return -1
}
// has, search `pat` in `.Match(s)`
type hasBytes struct {
pat []byte
}
func BytesHas(pat string) *hasBytes { return &hasBytes{[]byte(pat)} }
func (m *hasBytes) Match(b []byte) bool { return bytes.Index(b, m.pat) >= 0 }
func (m *hasBytes) MatchIndex(b []byte) int {
if idx := bytes.Index(b, m.pat); idx >= 0 {
return idx + len(m.pat)
}
return -1
}
// prefix
type prefixBytes struct{ pat []byte }
func BytesPrefix(pat string) *prefixBytes { return &prefixBytes{[]byte(pat)} }
func (m *prefixBytes) Match(b []byte) bool { return bytes.HasPrefix(b, m.pat) }
func (m *prefixBytes) MatchIndex(b []byte) int {
if bytes.HasPrefix(b, m.pat) {
return len(m.pat)
}
return -1
}
// prefixes
type prefixesBytes struct {
t *trie.Trie
}
func BytesPrefixes(pats ...string) *prefixesBytes {
t := trie.New()
for _, pat := range pats {
t.Insert([]byte(pat))
}
return &prefixesBytes{t}
}
func (m *prefixesBytes) Match(b []byte) bool { return m.t.PrefixIndex(b) >= 0 }
func (m *prefixesBytes) MatchIndex(b []byte) int {
if idx := m.t.PrefixIndex(b); idx >= 0 {
return idx
}
return -1
}
// suffix
type suffixBytes struct{ pat []byte }
func BytesSuffix(pat string) *suffixBytes { return &suffixBytes{[]byte(pat)} }
func (m *suffixBytes) Match(b []byte) bool { return bytes.HasSuffix(b, m.pat) }
func (m *suffixBytes) MatchIndex(b []byte) int {
if bytes.HasSuffix(b, m.pat) {
return len(m.pat)
}
return -1
}
// suffixes
type suffixesBytes struct {
t *trie.Trie
}
func BytesSuffixes(pats ...string) *suffixesBytes {
t := trie.New()
for _, pat := range pats {
t.Insert(reverse([]byte(pat)))
}
return &suffixesBytes{t}
}
func (m *suffixesBytes) Match(b []byte) bool {
return m.t.PrefixIndex(reverse(b)) >= 0
}
func (m *suffixesBytes) MatchIndex(b []byte) int {
if idx := m.t.PrefixIndex(reverse(b)); idx >= 0 {
return idx
}
return -1
}
// after
type afterBytes struct {
first []byte
matcher BytesMatcher
}
func BytesAfter(first string, m BytesMatcher) *afterBytes { return &afterBytes{[]byte(first), m} }
func (a *afterBytes) Match(b []byte) bool {
if idx := bytes.Index(b, a.first); idx >= 0 {
return a.matcher.Match(b[idx+len(a.first):])
}
return false
}
func (a *afterBytes) MatchIndex(b []byte) int {
if idx := bytes.Index(b, a.first); idx >= 0 {
return idx + a.matcher.MatchIndex(b[idx:])
}
return -1
}
// and, returns true iff all matchers return true
type andBytes struct{ matchers []BytesMatcher }
func BytesAnd(m ...BytesMatcher) *andBytes { return &andBytes{m} }
func (a *andBytes) Match(b []byte) bool {
for _, m := range a.matchers {
if !m.Match(b) {
return false
}
}
return true
}
func (a *andBytes) MatchIndex(b []byte) int {
longest := 0
for _, m := range a.matchers {
if idx := m.MatchIndex(b); idx < 0 {
return -1
} else if idx > longest {
longest = idx
}
}
return longest
}
// or, returns true iff any matcher returns true
type orBytes struct{ matchers []BytesMatcher }
func BytesOr(m ...BytesMatcher) *orBytes { return &orBytes{m} }
func (o *orBytes) Match(b []byte) bool {
for _, m := range o.matchers {
if m.Match(b) {
return true
}
}
return false
}
func (o *orBytes) MatchIndex(b []byte) int {
for _, m := range o.matchers {
if idx := m.MatchIndex(b); idx >= 0 {
return idx
}
}
return -1
}
type suffixGroupBytes struct {
suffix BytesMatcher
matchers []BytesMatcher
}
func BytesSuffixGroup(s string, m ...BytesMatcher) *suffixGroupBytes {
return &suffixGroupBytes{BytesSuffix(s), m}
}
func (sg *suffixGroupBytes) Match(b []byte) bool {
if sg.suffix.Match(b) {
return BytesOr(sg.matchers...).Match(b)
}
return false
}
func (sg *suffixGroupBytes) MatchIndex(b []byte) int {
if sg.suffix.MatchIndex(b) >= 0 {
return BytesOr(sg.matchers...).MatchIndex(b)
}
return -1
}

@ -0,0 +1,10 @@
package substring
// reverse is a helper fn for Suffixes
func reverse(b []byte) []byte {
n := len(b)
for i := 0; i < n/2; i++ {
b[i], b[n-1-i] = b[n-1-i], b[i]
}
return b
}

@ -0,0 +1,216 @@
package substring
import (
"regexp"
"strings"
"github.com/toqueteos/trie"
)
type StringsMatcher interface {
Match(s string) bool
MatchIndex(s string) int
}
// regexp
type regexpString struct{ re *regexp.Regexp }
func Regexp(pat string) *regexpString { return &regexpString{regexp.MustCompile(pat)} }
func (m *regexpString) Match(s string) bool { return m.re.MatchString(s) }
func (m *regexpString) MatchIndex(s string) int {
found := m.re.FindStringIndex(s)
if found != nil {
return found[1]
}
return -1
}
// exact
type exactString struct{ pat string }
func Exact(pat string) *exactString { return &exactString{pat} }
func (m *exactString) Match(s string) bool { return m.pat == s }
func (m *exactString) MatchIndex(s string) int {
if m.pat == s {
return len(s)
}
return -1
}
// any, search `s` in `.Match(pat)`
type anyString struct{ pat string }
func Any(pat string) *anyString { return &anyString{pat} }
func (m *anyString) Match(s string) bool {
return strings.Index(m.pat, s) >= 0
}
func (m *anyString) MatchIndex(s string) int {
if idx := strings.Index(m.pat, s); idx >= 0 {
return idx + len(s)
}
return -1
}
// has, search `pat` in `.Match(s)`
type hasString struct{ pat string }
func Has(pat string) *hasString { return &hasString{pat} }
func (m *hasString) Match(s string) bool {
return strings.Index(s, m.pat) >= 0
}
func (m *hasString) MatchIndex(s string) int {
if idx := strings.Index(s, m.pat); idx >= 0 {
return idx + len(m.pat)
}
return -1
}
// prefix
type prefixString struct{ pat string }
func Prefix(pat string) *prefixString { return &prefixString{pat} }
func (m *prefixString) Match(s string) bool { return strings.HasPrefix(s, m.pat) }
func (m *prefixString) MatchIndex(s string) int {
if strings.HasPrefix(s, m.pat) {
return len(m.pat)
}
return -1
}
// prefixes
type prefixesString struct{ t *trie.Trie }
func Prefixes(pats ...string) *prefixesString {
t := trie.New()
for _, pat := range pats {
t.Insert([]byte(pat))
}
return &prefixesString{t}
}
func (m *prefixesString) Match(s string) bool { return m.t.PrefixIndex([]byte(s)) >= 0 }
func (m *prefixesString) MatchIndex(s string) int {
if idx := m.t.PrefixIndex([]byte(s)); idx >= 0 {
return idx
}
return -1
}
// suffix
type suffixString struct{ pat string }
func Suffix(pat string) *suffixString { return &suffixString{pat} }
func (m *suffixString) Match(s string) bool { return strings.HasSuffix(s, m.pat) }
func (m *suffixString) MatchIndex(s string) int {
if strings.HasSuffix(s, m.pat) {
return len(m.pat)
}
return -1
}
// suffixes
type suffixesString struct{ t *trie.Trie }
func Suffixes(pats ...string) *suffixesString {
t := trie.New()
for _, pat := range pats {
t.Insert(reverse([]byte(pat)))
}
return &suffixesString{t}
}
func (m *suffixesString) Match(s string) bool {
return m.t.PrefixIndex(reverse([]byte(s))) >= 0
}
func (m *suffixesString) MatchIndex(s string) int {
if idx := m.t.PrefixIndex(reverse([]byte(s))); idx >= 0 {
return idx
}
return -1
}
// after
type afterString struct {
first string
matcher StringsMatcher
}
func After(first string, m StringsMatcher) *afterString {
return &afterString{first, m}
}
func (a *afterString) Match(s string) bool {
if idx := strings.Index(s, a.first); idx >= 0 {
return a.matcher.Match(s[idx+len(a.first):])
}
return false
}
func (a *afterString) MatchIndex(s string) int {
if idx := strings.Index(s, a.first); idx >= 0 {
return idx + a.matcher.MatchIndex(s[idx+len(a.first):])
}
return -1
}
// and, returns true iff all matchers return true
type andString struct{ matchers []StringsMatcher }
func And(m ...StringsMatcher) *andString { return &andString{m} }
func (a *andString) Match(s string) bool {
for _, m := range a.matchers {
if !m.Match(s) {
return false
}
}
return true
}
func (a *andString) MatchIndex(s string) int {
longest := 0
for _, m := range a.matchers {
if idx := m.MatchIndex(s); idx < 0 {
return -1
} else if idx > longest {
longest = idx
}
}
return longest
}
// or, returns true iff any matcher returns true
type orString struct{ matchers []StringsMatcher }
func Or(m ...StringsMatcher) *orString { return &orString{m} }
func (o *orString) Match(s string) bool {
for _, m := range o.matchers {
if m.Match(s) {
return true
}
}
return false
}
func (o *orString) MatchIndex(s string) int {
for _, m := range o.matchers {
if idx := m.MatchIndex(s); idx >= 0 {
return idx
}
}
return -1
}
type suffixGroupString struct {
suffix StringsMatcher
matchers []StringsMatcher
}
func SuffixGroup(s string, m ...StringsMatcher) *suffixGroupString {
return &suffixGroupString{Suffix(s), m}
}
func (sg *suffixGroupString) Match(s string) bool {
if sg.suffix.Match(s) {
return Or(sg.matchers...).Match(s)
}
return false
}
func (sg *suffixGroupString) MatchIndex(s string) int {
if sg.suffix.MatchIndex(s) >= 0 {
return Or(sg.matchers...).MatchIndex(s)
}
return -1
}

13
vendor/modules.txt vendored

@ -386,11 +386,20 @@ github.com/spf13/jwalterweatherman
github.com/spf13/pflag
# github.com/spf13/viper v1.4.0
github.com/spf13/viper
# github.com/src-d/enry/v2 v2.1.0
github.com/src-d/enry/v2
github.com/src-d/enry/v2/data
github.com/src-d/enry/v2/data/rule
github.com/src-d/enry/v2/internal/tokenizer
github.com/src-d/enry/v2/internal/tokenizer/flex
github.com/src-d/enry/v2/regex
# github.com/src-d/gcfg v1.4.0
github.com/src-d/gcfg
github.com/src-d/gcfg/scanner
github.com/src-d/gcfg/token
github.com/src-d/gcfg/types
# github.com/src-d/go-oniguruma v1.1.0
github.com/src-d/go-oniguruma
# github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2
github.com/steveyen/gtreap
# github.com/stretchr/testify v1.4.0
@ -411,6 +420,8 @@ github.com/syndtr/goleveldb/leveldb/table
github.com/syndtr/goleveldb/leveldb/util
# github.com/tinylib/msgp v1.1.0
github.com/tinylib/msgp/msgp
# github.com/toqueteos/trie v1.0.0
github.com/toqueteos/trie
# github.com/toqueteos/webbrowser v1.2.0
github.com/toqueteos/webbrowser
# github.com/tstranex/u2f v1.0.0
@ -607,6 +618,8 @@ gopkg.in/src-d/go-git.v4/utils/merkletrie/internal/frame
gopkg.in/src-d/go-git.v4/utils/merkletrie/noder
# gopkg.in/testfixtures.v2 v2.5.0
gopkg.in/testfixtures.v2
# gopkg.in/toqueteos/substring.v1 v1.0.2
gopkg.in/toqueteos/substring.v1
# gopkg.in/warnings.v0 v0.1.2
gopkg.in/warnings.v0
# gopkg.in/yaml.v2 v2.2.2

@ -1127,6 +1127,14 @@ function initRepository() {
}
});
}
// Language stats
if ($('.language-stats').length > 0) {
$('.language-stats').on('click', (e) => {
e.preventDefault();
$('.language-stats-details, .repository-menu').slideToggle();
});
}
}
function initMigration() {

@ -1182,3 +1182,13 @@ i.icon.centerlock {
.ui.popup .ui.label {
margin-bottom: 0.4em;
}
.color-icon {
padding-right: 0.7em;
padding-left: 0.5em;
margin-right: 0.5em;
margin-left: 0.5em;
display: inline;
border: 1px solid #ccc;
border-radius: 500em;
}

@ -1920,6 +1920,10 @@
}
}
span.ui {
color: black;
}
&.active {
background: rgba(0, 0, 0, 0.05);
}
@ -1998,6 +2002,22 @@
flex: auto;
}
}
.segment.language-stats {
padding: 0;
height: 0.6em;
display: flex;
white-space: nowrap;
width: 100%;
.bar {
white-space: nowrap;
border: 0;
padding: 0;
margin: 0;
height: 0.6em;
}
}
}
// End of .repository

@ -65,8 +65,12 @@
color: #7f7f7f;
}
.repository .ui.segment.sub-menu .list .item a {
.repository .ui.segment.sub-menu .list .item {
color: #dbdbdb;
a,
span.ui {
color: #dbdbdb;
}
}
.ui.horizontal.segments > .segment {

Loading…
Cancel
Save