mirror of https://github.com/go-gitea/gitea
Language statistics bar for repositories (#8037)
* Implementation for calculating language statistics Implement saving code language statistics to database Implement rendering language stats Add primary language to show in repository list Implement repository stats indexer queue Add indexer test Refactor to use queue module * Do not timeout for queues pull/10216/head^2
parent
37892be635
commit
ad2642a8aa
@ -0,0 +1,45 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package migrations |
||||
|
||||
import ( |
||||
"fmt" |
||||
|
||||
"code.gitea.io/gitea/modules/timeutil" |
||||
|
||||
"xorm.io/xorm" |
||||
) |
||||
|
||||
func addLanguageStats(x *xorm.Engine) error { |
||||
// LanguageStat see models/repo_language_stats.go
|
||||
type LanguageStat struct { |
||||
ID int64 `xorm:"pk autoincr"` |
||||
RepoID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` |
||||
CommitID string |
||||
IsPrimary bool |
||||
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` |
||||
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` |
||||
Color string `xorm:"-"` |
||||
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` |
||||
} |
||||
|
||||
type RepoIndexerType int |
||||
|
||||
// RepoIndexerStatus see models/repo_stats_indexer.go
|
||||
type RepoIndexerStatus struct { |
||||
ID int64 `xorm:"pk autoincr"` |
||||
RepoID int64 `xorm:"INDEX(s)"` |
||||
CommitSha string `xorm:"VARCHAR(40)"` |
||||
IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` |
||||
} |
||||
|
||||
if err := x.Sync2(new(LanguageStat)); err != nil { |
||||
return fmt.Errorf("Sync2: %v", err) |
||||
} |
||||
if err := x.Sync2(new(RepoIndexerStatus)); err != nil { |
||||
return fmt.Errorf("Sync2: %v", err) |
||||
} |
||||
return nil |
||||
} |
@ -0,0 +1,137 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package models |
||||
|
||||
import ( |
||||
"math" |
||||
"strings" |
||||
|
||||
"code.gitea.io/gitea/modules/timeutil" |
||||
|
||||
"github.com/src-d/enry/v2" |
||||
) |
||||
|
||||
// LanguageStat describes language statistics of a repository
|
||||
type LanguageStat struct { |
||||
ID int64 `xorm:"pk autoincr"` |
||||
RepoID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` |
||||
CommitID string |
||||
IsPrimary bool |
||||
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` |
||||
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` |
||||
Color string `xorm:"-"` |
||||
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` |
||||
} |
||||
|
||||
// LanguageStatList defines a list of language statistics
|
||||
type LanguageStatList []*LanguageStat |
||||
|
||||
func (stats LanguageStatList) loadAttributes() { |
||||
for i := range stats { |
||||
stats[i].Color = enry.GetColor(stats[i].Language) |
||||
} |
||||
} |
||||
|
||||
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { |
||||
stats := make(LanguageStatList, 0, 6) |
||||
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { |
||||
return nil, err |
||||
} |
||||
stats.loadAttributes() |
||||
return stats, nil |
||||
} |
||||
|
||||
// GetLanguageStats returns the language statistics for a repository
|
||||
func (repo *Repository) GetLanguageStats() (LanguageStatList, error) { |
||||
return repo.getLanguageStats(x) |
||||
} |
||||
|
||||
// GetTopLanguageStats returns the top language statistics for a repository
|
||||
func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) { |
||||
stats, err := repo.getLanguageStats(x) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
topstats := make(LanguageStatList, 0, limit) |
||||
var other float32 |
||||
for i := range stats { |
||||
if stats[i].Language == "other" || len(topstats) >= limit { |
||||
other += stats[i].Percentage |
||||
continue |
||||
} |
||||
topstats = append(topstats, stats[i]) |
||||
} |
||||
if other > 0 { |
||||
topstats = append(topstats, &LanguageStat{ |
||||
RepoID: repo.ID, |
||||
Language: "other", |
||||
Color: "#cccccc", |
||||
Percentage: float32(math.Round(float64(other)*10) / 10), |
||||
}) |
||||
} |
||||
return topstats, nil |
||||
} |
||||
|
||||
// UpdateLanguageStats updates the language statistics for repository
|
||||
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { |
||||
sess := x.NewSession() |
||||
if err := sess.Begin(); err != nil { |
||||
return err |
||||
} |
||||
defer sess.Close() |
||||
|
||||
oldstats, err := repo.getLanguageStats(sess) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
var topLang string |
||||
var p float32 |
||||
for lang, perc := range stats { |
||||
if perc > p { |
||||
p = perc |
||||
topLang = strings.ToLower(lang) |
||||
} |
||||
} |
||||
|
||||
for lang, perc := range stats { |
||||
upd := false |
||||
llang := strings.ToLower(lang) |
||||
for _, s := range oldstats { |
||||
// Update already existing language
|
||||
if strings.ToLower(s.Language) == llang { |
||||
s.CommitID = commitID |
||||
s.IsPrimary = llang == topLang |
||||
s.Percentage = perc |
||||
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { |
||||
return err |
||||
} |
||||
upd = true |
||||
break |
||||
} |
||||
} |
||||
// Insert new language
|
||||
if !upd { |
||||
if _, err := sess.Insert(&LanguageStat{ |
||||
RepoID: repo.ID, |
||||
CommitID: commitID, |
||||
IsPrimary: llang == topLang, |
||||
Language: lang, |
||||
Percentage: perc, |
||||
}); err != nil { |
||||
return err |
||||
} |
||||
} |
||||
} |
||||
// Delete old languages
|
||||
if _, err := sess.Where("`id` IN (SELECT `id` FROM `language_stat` WHERE `repo_id` = ? AND `commit_id` != ?)", repo.ID, commitID).Delete(&LanguageStat{}); err != nil { |
||||
return err |
||||
} |
||||
|
||||
if err = repo.updateIndexerStatus(sess, RepoIndexerTypeStats, commitID); err != nil { |
||||
return err |
||||
} |
||||
|
||||
return sess.Commit() |
||||
} |
@ -0,0 +1,116 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package git |
||||
|
||||
import ( |
||||
"bytes" |
||||
"io" |
||||
"io/ioutil" |
||||
"math" |
||||
"path/filepath" |
||||
|
||||
"github.com/src-d/enry/v2" |
||||
"gopkg.in/src-d/go-git.v4" |
||||
"gopkg.in/src-d/go-git.v4/plumbing" |
||||
"gopkg.in/src-d/go-git.v4/plumbing/object" |
||||
) |
||||
|
||||
const fileSizeLimit int64 = 16 * 1024 * 1024 |
||||
|
||||
// GetLanguageStats calculates language stats for git repository at specified commit
|
||||
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { |
||||
r, err := git.PlainOpen(repo.Path) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
rev, err := r.ResolveRevision(plumbing.Revision(commitID)) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
commit, err := r.CommitObject(*rev) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
tree, err := commit.Tree() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
sizes := make(map[string]int64) |
||||
var total int64 |
||||
err = tree.Files().ForEach(func(f *object.File) error { |
||||
if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || |
||||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { |
||||
return nil |
||||
} |
||||
|
||||
// TODO: Use .gitattributes file for linguist overrides
|
||||
|
||||
language, ok := enry.GetLanguageByExtension(f.Name) |
||||
if !ok { |
||||
if language, ok = enry.GetLanguageByFilename(f.Name); !ok { |
||||
content, err := readFile(f, fileSizeLimit) |
||||
if err != nil { |
||||
return nil |
||||
} |
||||
|
||||
language = enry.GetLanguage(filepath.Base(f.Name), content) |
||||
if language == enry.OtherLanguage { |
||||
return nil |
||||
} |
||||
} |
||||
} |
||||
|
||||
if language != "" { |
||||
sizes[language] += f.Size |
||||
total += f.Size |
||||
} |
||||
|
||||
return nil |
||||
}) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
stats := make(map[string]float32) |
||||
var otherPerc float32 = 100 |
||||
for language, size := range sizes { |
||||
perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) |
||||
if perc <= 0.1 { |
||||
continue |
||||
} |
||||
otherPerc -= perc |
||||
stats[language] = perc |
||||
} |
||||
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) |
||||
if otherPerc > 0 { |
||||
stats["other"] = otherPerc |
||||
} |
||||
return stats, nil |
||||
} |
||||
|
||||
func readFile(f *object.File, limit int64) ([]byte, error) { |
||||
r, err := f.Reader() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
defer r.Close() |
||||
|
||||
if limit <= 0 { |
||||
return ioutil.ReadAll(r) |
||||
} |
||||
|
||||
size := f.Size |
||||
if limit > 0 && size > limit { |
||||
size = limit |
||||
} |
||||
buf := bytes.NewBuffer(nil) |
||||
buf.Grow(int(size)) |
||||
_, err = io.Copy(buf, io.LimitReader(r, limit)) |
||||
return buf.Bytes(), err |
||||
} |
@ -0,0 +1,54 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package stats |
||||
|
||||
import ( |
||||
"code.gitea.io/gitea/models" |
||||
"code.gitea.io/gitea/modules/git" |
||||
) |
||||
|
||||
// DBIndexer implements the Indexer interface by calculating repository
// language statistics and saving them to the database. It carries no state.
type DBIndexer struct{}
||||
|
||||
// Index repository status function
|
||||
func (db *DBIndexer) Index(id int64) error { |
||||
repo, err := models.GetRepositoryByID(id) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
|
||||
gitRepo, err := git.OpenRepository(repo.RepoPath()) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
defer gitRepo.Close() |
||||
|
||||
// Get latest commit for default branch
|
||||
commitID, err := gitRepo.GetBranchCommitID(repo.DefaultBranch) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
|
||||
// Do not recalculate stats if already calculated for this commit
|
||||
if status.CommitSha == commitID { |
||||
return nil |
||||
} |
||||
|
||||
// Calculate and save language statistics to database
|
||||
stats, err := gitRepo.GetLanguageStats(commitID) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
return repo.UpdateLanguageStats(commitID, stats) |
||||
} |
||||
|
||||
// Close dummy function
|
||||
func (db *DBIndexer) Close() { |
||||
} |
@ -0,0 +1,85 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package stats |
||||
|
||||
import ( |
||||
"code.gitea.io/gitea/models" |
||||
"code.gitea.io/gitea/modules/graceful" |
||||
"code.gitea.io/gitea/modules/log" |
||||
) |
||||
|
||||
// Indexer is the interface implemented by repository stats indexers.
type Indexer interface {
	// Index (re)indexes the stats of the repository with the given ID.
	Index(id int64) error
	// Close is called to shut the indexer down.
	Close()
}

// indexer is the package-wide indexer instance; it is assigned by Init.
var indexer Indexer
||||
|
||||
// Init initialize the repo indexer
|
||||
func Init() error { |
||||
indexer = &DBIndexer{} |
||||
|
||||
if err := initStatsQueue(); err != nil { |
||||
return err |
||||
} |
||||
|
||||
go populateRepoIndexer() |
||||
|
||||
return nil |
||||
} |
||||
|
||||
// populateRepoIndexer populate the repo indexer with pre-existing data. This
|
||||
// should only be run when the indexer is created for the first time.
|
||||
func populateRepoIndexer() { |
||||
log.Info("Populating the repo stats indexer with existing repositories") |
||||
|
||||
isShutdown := graceful.GetManager().IsShutdown() |
||||
|
||||
exist, err := models.IsTableNotEmpty("repository") |
||||
if err != nil { |
||||
log.Fatal("System error: %v", err) |
||||
} else if !exist { |
||||
return |
||||
} |
||||
|
||||
var maxRepoID int64 |
||||
if maxRepoID, err = models.GetMaxID("repository"); err != nil { |
||||
log.Fatal("System error: %v", err) |
||||
} |
||||
|
||||
// start with the maximum existing repo ID and work backwards, so that we
|
||||
// don't include repos that are created after gitea starts; such repos will
|
||||
// already be added to the indexer, and we don't need to add them again.
|
||||
for maxRepoID > 0 { |
||||
select { |
||||
case <-isShutdown: |
||||
log.Info("Repository Stats Indexer population shutdown before completion") |
||||
return |
||||
default: |
||||
} |
||||
ids, err := models.GetUnindexedRepos(models.RepoIndexerTypeStats, maxRepoID, 0, 50) |
||||
if err != nil { |
||||
log.Error("populateRepoIndexer: %v", err) |
||||
return |
||||
} else if len(ids) == 0 { |
||||
break |
||||
} |
||||
for _, id := range ids { |
||||
select { |
||||
case <-isShutdown: |
||||
log.Info("Repository Stats Indexer population shutdown before completion") |
||||
return |
||||
default: |
||||
} |
||||
if err := statsQueue.Push(id); err != nil { |
||||
log.Error("statsQueue.Push: %v", err) |
||||
} |
||||
maxRepoID = id - 1 |
||||
} |
||||
} |
||||
log.Info("Done (re)populating the repo stats indexer with existing repositories") |
||||
} |
@ -0,0 +1,42 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package stats |
||||
|
||||
import ( |
||||
"path/filepath" |
||||
"testing" |
||||
"time" |
||||
|
||||
"code.gitea.io/gitea/models" |
||||
"code.gitea.io/gitea/modules/setting" |
||||
|
||||
"gopkg.in/ini.v1" |
||||
|
||||
"github.com/stretchr/testify/assert" |
||||
) |
||||
|
||||
func TestMain(m *testing.M) { |
||||
models.MainTest(m, filepath.Join("..", "..", "..")) |
||||
} |
||||
|
||||
func TestRepoStatsIndex(t *testing.T) { |
||||
assert.NoError(t, models.PrepareTestDatabase()) |
||||
setting.Cfg = ini.Empty() |
||||
|
||||
setting.NewQueueService() |
||||
|
||||
err := Init() |
||||
assert.NoError(t, err) |
||||
|
||||
time.Sleep(5 * time.Second) |
||||
|
||||
repo, err := models.GetRepositoryByID(1) |
||||
assert.NoError(t, err) |
||||
langs, err := repo.GetTopLanguageStats(5) |
||||
assert.NoError(t, err) |
||||
assert.Len(t, langs, 1) |
||||
assert.Equal(t, "other", langs[0].Language) |
||||
assert.Equal(t, float32(100), langs[0].Percentage) |
||||
} |
@ -0,0 +1,43 @@ |
||||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package stats |
||||
|
||||
import ( |
||||
"fmt" |
||||
|
||||
"code.gitea.io/gitea/models" |
||||
"code.gitea.io/gitea/modules/graceful" |
||||
"code.gitea.io/gitea/modules/log" |
||||
"code.gitea.io/gitea/modules/queue" |
||||
) |
||||
|
||||
// statsQueue represents a queue to handle repository stats updates
|
||||
var statsQueue queue.Queue |
||||
|
||||
// handle passed PR IDs and test the PRs
|
||||
func handle(data ...queue.Data) { |
||||
for _, datum := range data { |
||||
opts := datum.(int64) |
||||
if err := indexer.Index(opts); err != nil { |
||||
log.Error("stats queue idexer.Index(%d) failed: %v", opts, err) |
||||
} |
||||
} |
||||
} |
||||
|
||||
func initStatsQueue() error { |
||||
statsQueue = queue.CreateQueue("repo_stats_update", handle, int64(0)).(queue.Queue) |
||||
if statsQueue == nil { |
||||
return fmt.Errorf("Unable to create repo_stats_update Queue") |
||||
} |
||||
|
||||
go graceful.GetManager().RunWithShutdownFns(statsQueue.Run) |
||||
|
||||
return nil |
||||
} |
||||
|
||||
// UpdateRepoIndexer update a repository's entries in the indexer
|
||||
func UpdateRepoIndexer(repo *models.Repository) error { |
||||
return statsQueue.Push(repo.ID) |
||||
} |
@ -0,0 +1,11 @@ |
||||
.linguist |
||||
benchmarks/output |
||||
.ci |
||||
Makefile.main |
||||
.shared |
||||
.idea |
||||
.docsrv-resources |
||||
build/ |
||||
vendor/ |
||||
java/lib/ |
||||
.vscode/ |
@ -0,0 +1,132 @@ |
||||
dist: trusty |
||||
language: go |
||||
go: |
||||
- '1.12.x' |
||||
- '1.11.x' |
||||
env: |
||||
global: |
||||
- GO_VERSION_FOR_JVM='1.11.x' |
||||
- CGO_ENABLED=0 |
||||
- GO111MODULE=on |
||||
- ONIGURUMA_VERSION='6.9.1' |
||||
matrix: |
||||
- ONIGURUMA=0 |
||||
- ONIGURUMA=1 |
||||
matrix: |
||||
fast_finish: true |
||||
|
||||
stages: |
||||
- name: test |
||||
- name: release |
||||
if: tag IS present |
||||
- name: publish |
||||
if: tag IS present |
||||
|
||||
stage: test |
||||
install: |
||||
- > |
||||
if [[ "${ONIGURUMA}" -gt 0 ]]; then |
||||
export CGO_ENABLED=1 |
||||
export GO_TAGS='oniguruma' |
||||
# install oniguruma manually as trusty has only ancient 5.x |
||||
sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 |
||||
wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
fi; |
||||
script: |
||||
- make test-coverage |
||||
after_success: |
||||
- bash <(curl -s https://codecov.io/bash) |
||||
|
||||
jobs: |
||||
include: |
||||
- name: 'java unit-tests' |
||||
stage: test |
||||
language: scala |
||||
jdk: oraclejdk8 |
||||
install: |
||||
- export CGO_ENABLED=1 |
||||
- eval "$(curl -sL https://raw.githubusercontent.com/travis-ci/gimme/master/gimme | GIMME_GO_VERSION=$GO_VERSION_FOR_JVM bash)" |
||||
- go version |
||||
before_script: |
||||
- cd java |
||||
- make |
||||
script: |
||||
- make test |
||||
|
||||
- name: 'linux packages' |
||||
stage: release |
||||
install: |
||||
- go version |
||||
script: make packages |
||||
deploy: |
||||
provider: releases |
||||
api_key: |
||||
secure: $GITHUB_TOKEN |
||||
file_glob: true |
||||
file: build/*.tar.gz |
||||
skip_cleanup: true |
||||
on: |
||||
tags: true |
||||
|
||||
- name: 'linux shared lib' |
||||
stage: release |
||||
install: |
||||
- go version |
||||
script: make linux-shared |
||||
deploy: |
||||
provider: releases |
||||
api_key: |
||||
secure: $GITHUB_TOKEN |
||||
file: |
||||
- ./.shared/linux-x86-64/libenry.so |
||||
skip_cleanup: true |
||||
on: |
||||
tags: true |
||||
|
||||
- name: 'macOS shared lib' |
||||
stage: release |
||||
env: |
||||
- OSXCROSS_PACKAGE="osxcross_3034f7149716d815bc473d0a7b35d17e4cf175aa.tar.gz" |
||||
- OSXCROSS_URL="https://github.com/bblfsh/client-scala/releases/download/v1.5.2/${OSXCROSS_PACKAGE}" |
||||
- PATH="/$HOME/osxcross/bin:$PATH" |
||||
install: |
||||
- go version |
||||
- sudo apt-get update |
||||
- sudo apt-get install -y --no-install-recommends clang g++ gcc gcc-multilib libc6-dev libc6-dev-i386 mingw-w64 patch xz-utils |
||||
- cd ${HOME} |
||||
- curl -sfSL ${OSXCROSS_URL} | tar -C ${HOME} -xzf - |
||||
- cd $GOPATH/src/github.com/src-d/enry |
||||
script: make darwin-shared |
||||
deploy: |
||||
provider: releases |
||||
api_key: |
||||
secure: $GITHUB_TOKEN |
||||
file: ./.shared/darwin/libenry.dylib |
||||
skip_cleanup: true |
||||
on: |
||||
tags: true |
||||
|
||||
- name: 'java: publish to maven' |
||||
stage: publish |
||||
language: scala |
||||
jdk: oraclejdk8 |
||||
install: |
||||
- export CGO_ENABLED=1 |
||||
- eval "$(curl -sL https://raw.githubusercontent.com/travis-ci/gimme/master/gimme | GIMME_GO_VERSION=$GO_VERSION_FOR_JVM bash)" |
||||
- go version |
||||
before_script: |
||||
- cd java |
||||
- make |
||||
- curl -o ./shared/linux-x86-64/libenry.so -sfL "https://github.com/$TRAVIS_REPO_SLUG/releases/download/$TRAVIS_TAG/libenry.so" || travis_terminate 1 |
||||
- mkdir -p ./shared/darwin |
||||
- curl -o ./shared/darwin/libenry.dylib -sfL "https://github.com/$TRAVIS_REPO_SLUG/releases/download/$TRAVIS_TAG/libenry.dylib" || travis_terminate 1 |
||||
- openssl aes-256-cbc -K $encrypted_a0e1c69dbbc7_key -iv $encrypted_a0e1c69dbbc7_iv -in key.asc.enc -out key.asc -d |
||||
- gpg --no-default-keyring --primary-keyring ./project/.gnupg/pubring.gpg --secret-keyring ./project/.gnupg/secring.gpg --keyring ./project/.gnupg/pubring.gpg --fingerprint --import key.asc |
||||
script: |
||||
- make test # ensure the shared objects are functional |
||||
- ./sbt publishLocal |
||||
- ./sbt publishSigned |
||||
- ./sbt sonatypeRelease |
@ -0,0 +1,61 @@ |
||||
# source{d} Contributing Guidelines |
||||
|
||||
source{d} projects accept contributions via GitHub pull requests. |
||||
This document outlines some of the |
||||
conventions on development workflow, commit message formatting, contact points, |
||||
and other resources to make it easier to get your contribution accepted. |
||||
|
||||
## Certificate of Origin |
||||
|
||||
By contributing to this project, you agree to the [Developer Certificate of |
||||
Origin (DCO)](DCO). This document was created by the Linux Kernel community and is a |
||||
simple statement that you, as a contributor, have the legal right to make the |
||||
contribution. |
||||
|
||||
In order to show your agreement with the DCO you should include at the end of the commit message, |
||||
the following line: `Signed-off-by: John Doe <john.doe@example.com>`, using your real name. |
||||
|
||||
This can be done easily using the [`-s`](https://github.com/git/git/blob/b2c150d3aa82f6583b9aadfecc5f8fa1c74aca09/Documentation/git-commit.txt#L154-L161) flag on the `git commit`. |
||||
|
||||
If you find that you have pushed a few commits without `Signed-off-by`, you can still add it afterwards. We wrote a manual which can help: [fix-DCO.md](https://github.com/src-d/guide/blob/master/developer-community/fix-DCO.md). |
||||
|
||||
## Support Channels |
||||
|
||||
The official support channels, for both users and contributors, are: |
||||
|
||||
- GitHub issues: each repository has its own list of issues. |
||||
- Slack: join the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM) community. |
||||
|
||||
*Before opening a new issue or submitting a new pull request, it's helpful to |
||||
search the project - it's likely that another user has already reported the |
||||
issue you're facing, or it's a known issue that we're already aware of.* |
||||
|
||||
|
||||
## How to Contribute |
||||
|
||||
Pull Requests (PRs) are the main and exclusive way to contribute code to source{d} projects. |
||||
In order for a PR to be accepted it needs to pass this list of requirements: |
||||
|
||||
- The contribution must be correctly explained with natural language and providing a minimum working example that reproduces it. |
||||
- All PRs must be written idiomatically: |
||||
- for Go: formatted according to [gofmt](https://golang.org/cmd/gofmt/), and without any warnings from [go lint](https://github.com/golang/lint) nor [go vet](https://golang.org/cmd/vet/) |
||||
- for other languages, similar constraints apply. |
||||
- They should in general include tests, and those shall pass. |
||||
- If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged. |
||||
- If the PR is a new feature, it has to come with a suite of unit tests, that tests the new functionality. |
||||
- In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS) of the project. |
||||
|
||||
|
||||
### Format of the commit message |
||||
|
||||
Every commit message should describe what was changed, under which context and, if applicable, the GitHub issue it relates to: |
||||
|
||||
``` |
||||
plumbing: packp, Skip argument validations for unknown capabilities. Fixes #623 |
||||
``` |
||||
|
||||
The format can be described more formally as follows: |
||||
|
||||
``` |
||||
<package>: <subpackage>, <what changed>. [Fixes #<issue-number>] |
||||
``` |
@ -0,0 +1,25 @@ |
||||
Developer's Certificate of Origin 1.1 |
||||
|
||||
By making a contribution to this project, I certify that: |
||||
|
||||
(a) The contribution was created in whole or in part by me and I |
||||
have the right to submit it under the open source license |
||||
indicated in the file; or |
||||
|
||||
(b) The contribution is based upon previous work that, to the best |
||||
of my knowledge, is covered under an appropriate open source |
||||
license and I have the right under that license to submit that |
||||
work with modifications, whether created in whole or in part |
||||
by me, under the same open source license (unless I am |
||||
permitted to submit under a different license), as indicated |
||||
in the file; or |
||||
|
||||
(c) The contribution was provided directly to me by some other |
||||
person who certified (a), (b) or (c) and I have not modified |
||||
it. |
||||
|
||||
(d) I understand and agree that this project and the contribution |
||||
are public and that a record of the contribution (including all |
||||
personal information I submit with it, including my sign-off) is |
||||
maintained indefinitely and may be redistributed consistent with |
||||
this project or the open source license(s) involved. |
@ -0,0 +1,201 @@ |
||||
Apache License |
||||
Version 2.0, January 2004 |
||||
http://www.apache.org/licenses/ |
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
||||
1. Definitions. |
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, |
||||
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by |
||||
the copyright owner that is granting the License. |
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all |
||||
other entities that control, are controlled by, or are under common |
||||
control with that entity. For the purposes of this definition, |
||||
"control" means (i) the power, direct or indirect, to cause the |
||||
direction or management of such entity, whether by contract or |
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity |
||||
exercising permissions granted by this License. |
||||
|
||||
"Source" form shall mean the preferred form for making modifications, |
||||
including but not limited to software source code, documentation |
||||
source, and configuration files. |
||||
|
||||
"Object" form shall mean any form resulting from mechanical |
||||
transformation or translation of a Source form, including but |
||||
not limited to compiled object code, generated documentation, |
||||
and conversions to other media types. |
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or |
||||
Object form, made available under the License, as indicated by a |
||||
copyright notice that is included in or attached to the work |
||||
(an example is provided in the Appendix below). |
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object |
||||
form, that is based on (or derived from) the Work and for which the |
||||
editorial revisions, annotations, elaborations, or other modifications |
||||
represent, as a whole, an original work of authorship. For the purposes |
||||
of this License, Derivative Works shall not include works that remain |
||||
separable from, or merely link (or bind by name) to the interfaces of, |
||||
the Work and Derivative Works thereof. |
||||
|
||||
"Contribution" shall mean any work of authorship, including |
||||
the original version of the Work and any modifications or additions |
||||
to that Work or Derivative Works thereof, that is intentionally |
||||
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
or by an individual or Legal Entity authorized to submit on behalf of |
||||
the copyright owner. For the purposes of this definition, "submitted" |
||||
means any form of electronic, verbal, or written communication sent |
||||
to the Licensor or its representatives, including but not limited to |
||||
communication on electronic mailing lists, source code control systems, |
||||
and issue tracking systems that are managed by, or on behalf of, the |
||||
Licensor for the purpose of discussing and improving the Work, but |
||||
excluding communication that is conspicuously marked or otherwise |
||||
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
on behalf of whom a Contribution has been received by Licensor and |
||||
subsequently incorporated within the Work. |
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
copyright license to reproduce, prepare Derivative Works of, |
||||
publicly display, publicly perform, sublicense, and distribute the |
||||
Work and such Derivative Works in Source or Object form. |
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
(except as stated in this section) patent license to make, have made, |
||||
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
where such license applies only to those patent claims licensable |
||||
by such Contributor that are necessarily infringed by their |
||||
Contribution(s) alone or by combination of their Contribution(s) |
||||
with the Work to which such Contribution(s) was submitted. If You |
||||
institute patent litigation against any entity (including a |
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
or a Contribution incorporated within the Work constitutes direct |
||||
or contributory patent infringement, then any patent licenses |
||||
granted to You under this License for that Work shall terminate |
||||
as of the date such litigation is filed. |
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the |
||||
Work or Derivative Works thereof in any medium, with or without |
||||
modifications, and in Source or Object form, provided that You |
||||
meet the following conditions: |
||||
|
||||
(a) You must give any other recipients of the Work or |
||||
Derivative Works a copy of this License; and |
||||
|
||||
(b) You must cause any modified files to carry prominent notices |
||||
stating that You changed the files; and |
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works |
||||
that You distribute, all copyright, patent, trademark, and |
||||
attribution notices from the Source form of the Work, |
||||
excluding those notices that do not pertain to any part of |
||||
the Derivative Works; and |
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its |
||||
distribution, then any Derivative Works that You distribute must |
||||
include a readable copy of the attribution notices contained |
||||
within such NOTICE file, excluding those notices that do not |
||||
pertain to any part of the Derivative Works, in at least one |
||||
of the following places: within a NOTICE text file distributed |
||||
as part of the Derivative Works; within the Source form or |
||||
documentation, if provided along with the Derivative Works; or, |
||||
within a display generated by the Derivative Works, if and |
||||
wherever such third-party notices normally appear. The contents |
||||
of the NOTICE file are for informational purposes only and |
||||
do not modify the License. You may add Your own attribution |
||||
notices within Derivative Works that You distribute, alongside |
||||
or as an addendum to the NOTICE text from the Work, provided |
||||
that such additional attribution notices cannot be construed |
||||
as modifying the License. |
||||
|
||||
You may add Your own copyright statement to Your modifications and |
||||
may provide additional or different license terms and conditions |
||||
for use, reproduction, or distribution of Your modifications, or |
||||
for any such Derivative Works as a whole, provided Your use, |
||||
reproduction, and distribution of the Work otherwise complies with |
||||
the conditions stated in this License. |
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
any Contribution intentionally submitted for inclusion in the Work |
||||
by You to the Licensor shall be under the terms and conditions of |
||||
this License, without any additional terms or conditions. |
||||
Notwithstanding the above, nothing herein shall supersede or modify |
||||
the terms of any separate license agreement you may have executed |
||||
with Licensor regarding such Contributions. |
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade |
||||
names, trademarks, service marks, or product names of the Licensor, |
||||
except as required for reasonable and customary use in describing the |
||||
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
agreed to in writing, Licensor provides the Work (and each |
||||
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
implied, including, without limitation, any warranties or conditions |
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
appropriateness of using or redistributing the Work and assume any |
||||
risks associated with Your exercise of permissions under this License. |
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, |
||||
whether in tort (including negligence), contract, or otherwise, |
||||
unless required by applicable law (such as deliberate and grossly |
||||
negligent acts) or agreed to in writing, shall any Contributor be |
||||
liable to You for damages, including any direct, indirect, special, |
||||
incidental, or consequential damages of any character arising as a |
||||
result of this License or out of the use or inability to use the |
||||
Work (including but not limited to damages for loss of goodwill, |
||||
work stoppage, computer failure or malfunction, or any and all |
||||
other commercial damages or losses), even if such Contributor |
||||
has been advised of the possibility of such damages. |
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing |
||||
the Work or Derivative Works thereof, You may choose to offer, |
||||
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
or other liability obligations and/or rights consistent with this |
||||
License. However, in accepting such obligations, You may act only |
||||
on Your own behalf and on Your sole responsibility, not on behalf |
||||
of any other Contributor, and only if You agree to indemnify, |
||||
defend, and hold each Contributor harmless for any liability |
||||
incurred by, or claims asserted against, such Contributor by reason |
||||
of your accepting any such warranty or additional liability. |
||||
|
||||
END OF TERMS AND CONDITIONS |
||||
|
||||
APPENDIX: How to apply the Apache License to your work. |
||||
|
||||
To apply the Apache License to your work, attach the following |
||||
boilerplate notice, with the fields enclosed by brackets "{}" |
||||
replaced with your own identifying information. (Don't include |
||||
the brackets!) The text should be enclosed in the appropriate |
||||
comment syntax for the file format. We also recommend that a |
||||
file or class name and description of purpose be included on the |
||||
same "printed page" as the copyright notice for easier |
||||
identification within third-party archives. |
||||
|
||||
Copyright 2017 Sourced Technologies, S.L. |
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
you may not use this file except in compliance with the License. |
||||
You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
@ -0,0 +1 @@ |
||||
Alexander Bezzubov <alex@sourced.tech> (@bzz) |
@ -0,0 +1,82 @@ |
||||
# Package configuration
|
||||
PROJECT = enry
|
||||
COMMANDS = cmd/enry
|
||||
|
||||
# Including ci Makefile
|
||||
CI_REPOSITORY ?= https://github.com/src-d/ci.git
|
||||
CI_BRANCH ?= v1
|
||||
CI_PATH ?= .ci
|
||||
MAKEFILE := $(CI_PATH)/Makefile.main
|
||||
$(MAKEFILE): |
||||
git clone --quiet --depth 1 -b $(CI_BRANCH) $(CI_REPOSITORY) $(CI_PATH);
|
||||
-include $(MAKEFILE) |
||||
|
||||
# Docsrv: configure the languages whose api-doc can be auto generated
|
||||
LANGUAGES = go
|
||||
# Docs: do not edit this
|
||||
DOCS_REPOSITORY := https://github.com/src-d/docs
|
||||
SHARED_PATH ?= $(shell pwd)/.docsrv-resources
|
||||
DOCS_PATH ?= $(SHARED_PATH)/.docs
|
||||
$(DOCS_PATH)/Makefile.inc: |
||||
git clone --quiet --depth 1 $(DOCS_REPOSITORY) $(DOCS_PATH);
|
||||
-include $(DOCS_PATH)/Makefile.inc |
||||
|
||||
LINGUIST_PATH = .linguist
|
||||
|
||||
# shared objects
|
||||
RESOURCES_DIR=./.shared
|
||||
LINUX_DIR=$(RESOURCES_DIR)/linux-x86-64
|
||||
LINUX_SHARED_LIB=$(LINUX_DIR)/libenry.so
|
||||
DARWIN_DIR=$(RESOURCES_DIR)/darwin
|
||||
DARWIN_SHARED_LIB=$(DARWIN_DIR)/libenry.dylib
|
||||
HEADER_FILE=libenry.h
|
||||
NATIVE_LIB=./shared/enry.go
|
||||
|
||||
$(LINGUIST_PATH): |
||||
git clone https://github.com/github/linguist.git $@
|
||||
|
||||
clean-linguist: |
||||
rm -rf $(LINGUIST_PATH)
|
||||
|
||||
clean-shared: |
||||
rm -rf $(RESOURCES_DIR)
|
||||
|
||||
clean: clean-linguist clean-shared |
||||
|
||||
code-generate: $(LINGUIST_PATH) |
||||
mkdir -p data && \
|
||||
go run internal/code-generator/main.go
|
||||
ENRY_TEST_REPO="$${PWD}/.linguist" go test -v \
|
||||
-run Test_GeneratorTestSuite \
|
||||
./internal/code-generator/generator \
|
||||
-testify.m TestUpdateGeneratorTestSuiteGold \
|
||||
-update_gold
|
||||
|
||||
benchmarks: $(LINGUIST_PATH) |
||||
go test -run=NONE -bench=. && \
|
||||
benchmarks/linguist-total.rb
|
||||
|
||||
benchmarks-samples: $(LINGUIST_PATH) |
||||
go test -run=NONE -bench=. -benchtime=5us && \
|
||||
benchmarks/linguist-samples.rb
|
||||
|
||||
benchmarks-slow: $(LINGUIST_PATH) |
||||
mkdir -p benchmarks/output && \
|
||||
go test -run=NONE -bench=. -slow -benchtime=100ms -timeout=100h > benchmarks/output/enry_samples.bench && \
|
||||
benchmarks/linguist-samples.rb 5 > benchmarks/output/linguist_samples.bench
|
||||
|
||||
linux-shared: $(LINUX_SHARED_LIB) |
||||
|
||||
darwin-shared: $(DARWIN_SHARED_LIB) |
||||
|
||||
$(DARWIN_SHARED_LIB): |
||||
mkdir -p $(DARWIN_DIR) && \
|
||||
CC="o64-clang" CXX="o64-clang++" CGO_ENABLED=1 GOOS=darwin go build -buildmode=c-shared -o $(DARWIN_SHARED_LIB) $(NATIVE_LIB) && \
|
||||
mv $(DARWIN_DIR)/$(HEADER_FILE) $(RESOURCES_DIR)/$(HEADER_FILE)
|
||||
|
||||
$(LINUX_SHARED_LIB): |
||||
mkdir -p $(LINUX_DIR) && \
|
||||
CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -buildmode=c-shared -o $(LINUX_SHARED_LIB) $(NATIVE_LIB) && \
|
||||
mv $(LINUX_DIR)/$(HEADER_FILE) $(RESOURCES_DIR)/$(HEADER_FILE)
|
||||
|
||||
.PHONY: benchmarks benchmarks-samples benchmarks-slow |
@ -0,0 +1,328 @@ |
||||
# enry [![GoDoc](https://godoc.org/github.com/src-d/enry?status.svg)](https://godoc.org/github.com/src-d/enry) [![Build Status](https://travis-ci.com/src-d/enry.svg?branch=master)](https://travis-ci.com/src-d/enry) [![codecov](https://codecov.io/gh/src-d/enry/branch/master/graph/badge.svg)](https://codecov.io/gh/src-d/enry) |
||||
|
||||
File programming language detector and toolbox to ignore binary or vendored files. *enry* started as a port to _Go_ of the original [linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved *2x performance*. |
||||
|
||||
* [Installation](#installation) |
||||
* [Examples](#examples) |
||||
* [CLI](#cli) |
||||
* [Java bindings](#java-bindings) |
||||
* [Python bindings](#python-bindings) |
||||
* [Divergences from linguist](#divergences-from-linguist) |
||||
* [Benchmarks](#benchmarks) |
||||
* [Why Enry?](#why-enry) |
||||
* [Development](#development) |
||||
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream) |
||||
* [Misc](#misc) |
||||
* [Benchmark](#benchmark) |
||||
* [Faster regexp engine (optional)](#faster-regexp-engine-optional) |
||||
* [License](#license) |
||||
|
||||
Installation |
||||
------------ |
||||
|
||||
The recommended way to install enry is to either [download a release](https://github.com/src-d/enry/releases) or |
||||
|
||||
``` |
||||
go get github.com/src-d/enry/cmd/enry |
||||
``` |
||||
|
||||
This project is now part of [source{d} Engine](https://sourced.tech/engine), |
||||
which provides the simplest way to get started with a single command. |
||||
Visit [sourced.tech/engine](https://sourced.tech/engine) for more information. |
||||
|
||||
|
||||
Examples |
||||
------------ |
||||
|
||||
```go |
||||
lang, safe := enry.GetLanguageByExtension("foo.go") |
||||
fmt.Println(lang, safe) |
||||
// result: Go true |
||||
|
||||
lang, safe := enry.GetLanguageByContent("foo.m", []byte("<matlab-code>")) |
||||
fmt.Println(lang, safe) |
||||
// result: Matlab true |
||||
|
||||
lang, safe := enry.GetLanguageByContent("bar.m", []byte("<objective-c-code>")) |
||||
fmt.Println(lang, safe) |
||||
// result: Objective-C true |
||||
|
||||
// all strategies together |
||||
lang := enry.GetLanguage("foo.cpp", []byte("<cpp-code>")) |
||||
// result: C++ |
||||
``` |
||||
|
||||
Note that the returned boolean value `safe` is set either to `true`, if there is only one possible language detected, or to `false` otherwise. |
||||
|
||||
To get a list of possible languages for a given file, you can use the plural version of the detecting functions. |
||||
|
||||
```go |
||||
langs := enry.GetLanguages("foo.h", []byte("<cpp-code>")) |
||||
// result: []string{"C", "C++", "Objective-C"} |
||||
|
||||
langs := enry.GetLanguagesByExtension("foo.asc", []byte("<content>"), nil) |
||||
// result: []string{"AGS Script", "AsciiDoc", "Public Key"} |
||||
|
||||
langs := enry.GetLanguagesByFilename("Gemfile", []byte("<content>"), []string{}) |
||||
// result: []string{"Ruby"} |
||||
``` |
||||
|
||||
|
||||
CLI |
||||
------------ |
||||
|
||||
You can use enry as a command, |
||||
|
||||
```bash |
||||
$ enry --help |
||||
enry v2.0.0 build: 05-08-2019_20_40_35 commit: 6ccf0b6, based on linguist commit: e456098 |
||||
enry, A simple (and faster) implementation of github/linguist |
||||
usage: enry [-mode=(file|line|byte)] [-prog] <path> |
||||
enry [-mode=(file|line|byte)] [-prog] [-json] [-breakdown] <path> |
||||
enry [-mode=(file|line|byte)] [-prog] [-json] [-breakdown] |
||||
enry [-version] |
||||
``` |
||||
|
||||
and on repository root, it'll return an output similar to *linguist*'s output, |
||||
|
||||
```bash |
||||
$ enry |
||||
97.71% Go |
||||
1.60% C |
||||
0.31% Shell |
||||
0.22% Java |
||||
0.07% Ruby |
||||
0.05% Makefile |
||||
0.04% Scala |
||||
0.01% Gnuplot |
||||
``` |
||||
|
||||
but not only the output; its flags are also the same as *linguist*'s ones, |
||||
|
||||
```bash |
||||
$ enry --breakdown |
||||
97.71% Go |
||||
1.60% C |
||||
0.31% Shell |
||||
0.22% Java |
||||
0.07% Ruby |
||||
0.05% Makefile |
||||
0.04% Scala |
||||
0.01% Gnuplot |
||||
|
||||
Scala |
||||
java/build.sbt |
||||
java/project/plugins.sbt |
||||
|
||||
Java |
||||
java/src/main/java/tech/sourced/enry/Enry.java |
||||
java/src/main/java/tech/sourced/enry/GoUtils.java |
||||
java/src/main/java/tech/sourced/enry/Guess.java |
||||
java/src/test/java/tech/sourced/enry/EnryTest.java |
||||
|
||||
Makefile |
||||
Makefile |
||||
java/Makefile |
||||
|
||||
Go |
||||
benchmark_test.go |
||||
``` |
||||
|
||||
even the JSON flag, |
||||
|
||||
```bash |
||||
$ enry --json | jq . |
||||
{ |
||||
"C": [ |
||||
"internal/tokenizer/flex/lex.linguist_yy.c", |
||||
"internal/tokenizer/flex/lex.linguist_yy.h", |
||||
"internal/tokenizer/flex/linguist.h", |
||||
"python/_c_enry.c", |
||||
"python/enry.c" |
||||
], |
||||
"Gnuplot": [ |
||||
"benchmarks/plot-histogram.gp" |
||||
], |
||||
"Go": [ |
||||
"benchmark_test.go", |
||||
``` |
||||
|
||||
Note that enry's CLI **_doesn't need a git repository to work_**, which is intentionally different from the linguist. |
||||
|
||||
## Java bindings |
||||
|
||||
|
||||
Generated Java bindings using a C shared library and JNI are available under [`java`](https://github.com/src-d/enry/blob/master/java) and published on Maven at [tech.sourced:enry-java](https://mvnrepository.com/artifact/tech.sourced/enry-java) for macOS and linux. |
||||
|
||||
|
||||
## Python bindings |
||||
Generated Python bindings using a C shared library and cffi are not available yet and are WIP under [src-d/enry#154](https://github.com/src-d/enry/issues/154). |
||||
|
||||
Divergences from linguist |
||||
------------ |
||||
|
||||
The `enry` library is based on the data from `github/linguist` version **v7.5.1**. |
||||
|
||||
As opposed to linguist, `enry` [CLI tool](#cli) does *not* require a full Git repository in the filesystem in order to report languages. |
||||
|
||||
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from linguist: |
||||
|
||||
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine. |
||||
|
||||
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine. |
||||
|
||||
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193). |
||||
|
||||
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194). |
||||
|
||||
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet. |
||||
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213). |
||||
|
||||
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192). |
||||
|
||||
* Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18). |
||||
|
||||
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as linguist does |
||||
|
||||
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior. |
||||
|
||||
|
||||
Benchmarks |
||||
------------ |
||||
|
||||
Enry's language detection has been compared with Linguist's one. In order to do that, Linguist's project directory [*linguist/samples*](https://github.com/github/linguist/tree/master/samples) was used as a set of files to run benchmarks against. |
||||
|
||||
We got these results: |
||||
|
||||
![histogram](benchmarks/histogram/distribution.png) |
||||
|
||||
The histogram shows the number of files detected (y-axis) per time interval bucket (x-axis). As one can see, most of the files were detected faster by enry. |
||||
|
||||
We found a few cases where enry turns out slower than linguist due to |
||||
Go regexp engine being slower than Ruby's, based on [oniguruma](https://github.com/kkos/oniguruma) library, written in C. |
||||
|
||||
See [instructions](#misc) for running enry with oniguruma. |
||||
|
||||
|
||||
Why Enry? |
||||
------------ |
||||
|
||||
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/?ref_=tt_cl_t2) is one of the main characters. Henry is a linguist and at the very beginning of the movie enjoys guessing the origin of people based on their accent. |
||||
|
||||
"Enry Iggins" is how [Eliza Doolittle](http://www.imdb.com/character/ch0011720/?ref_=tt_cl_t1), [pronounces](https://www.youtube.com/watch?v=pwNKyTktDIE) the name of the Professor during the first half of the movie. |
||||
|
||||
## Development |
||||
|
||||
To build enry's CLI run: |
||||
|
||||
make build |
||||
|
||||
this will generate a binary in the project's root directory called `enry`. |
||||
|
||||
To run the tests: |
||||
|
||||
make test |
||||
|
||||
|
||||
### Sync with github/linguist upstream |
||||
|
||||
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures. |
||||
In order to update to the latest release of linguist do: |
||||
|
||||
```bash |
||||
$ git clone https://github.com/github/linguist.git .linguist |
||||
$ cd .linguist; git checkout <release-tag>; cd .. |
||||
|
||||
# put the new release's commit sha in the generator_test.go (to re-generate .gold test fixtures) |
||||
# https://github.com/src-d/enry/blob/13d3d66d37a87f23a013246a1b0678c9ee3d524b/internal/code-generator/generator/generator_test.go#L18 |
||||
|
||||
$ make code-generate |
||||
``` |
||||
|
||||
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files: |
||||
|
||||
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml) |
||||
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml) |
||||
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml) |
||||
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml) |
||||
|
||||
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time. |
||||
|
||||
When submitting a pull request syncing up to a new release, please make sure it only contains the changes in |
||||
the generated files (in [data](https://github.com/src-d/enry/blob/master/data) subdirectory). |
||||
|
||||
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc). |
||||
|
||||
|
||||
|
||||
## Misc |
||||
|
||||
<details> |
||||
|
||||
### Benchmark |
||||
|
||||
All benchmark scripts are in [*benchmarks*](https://github.com/src-d/enry/blob/master/benchmarks) directory. |
||||
|
||||
|
||||
#### Dependencies |
||||
As benchmarks depend on Ruby and Github-Linguist gem make sure you have: |
||||
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed |
||||
- Docker |
||||
- [native dependencies](https://github.com/github/linguist/#dependencies) installed |
||||
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -` |
||||
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem` |
||||
|
||||
|
||||
#### Quick benchmark |
||||
To run quicker benchmarks you can either: |
||||
|
||||
make benchmarks |
||||
|
||||
to get average times for the main detection function and strategies for the whole samples set or: |
||||
|
||||
make benchmarks-samples |
||||
|
||||
if you want to see measures per sample file. |
||||
|
||||
|
||||
#### Full benchmark |
||||
If you want to reproduce the same benchmarks as reported above: |
||||
- Make sure all [dependencies](#dependencies) are installed |
||||
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram) |
||||
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h) |
||||
|
||||
It will run the benchmarks for enry and linguist, parse the output, create csv files and plot the histogram. |
||||
|
||||
### Faster regexp engine (optional) |
||||
|
||||
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine. |
||||
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping |
||||
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project. |
||||
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library. |
||||
On macOS with [Homebrew](https://brew.sh/), it is: |
||||
|
||||
``` |
||||
brew install oniguruma |
||||
``` |
||||
|
||||
On Ubuntu, it is |
||||
|
||||
``` |
||||
sudo apt install libonig-dev |
||||
``` |
||||
|
||||
To build enry with Oniguruma regexps use the `oniguruma` build tag |
||||
|
||||
``` |
||||
go get -v -t --tags oniguruma ./... |
||||
``` |
||||
|
||||
and then rebuild the project. |
||||
|
||||
</details> |
||||
|
||||
|
||||
License |
||||
------------ |
||||
|
||||
Apache License, Version 2.0. See [LICENSE](LICENSE) |
@ -0,0 +1,107 @@ |
||||
package enry |
||||
|
||||
import ( |
||||
"math" |
||||
"sort" |
||||
|
||||
"github.com/src-d/enry/v2/internal/tokenizer" |
||||
) |
||||
|
||||
// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
	// Classify returns the languages considered possible for content, restricted to candidates.
	Classify(content []byte, candidates map[string]float64) (languages []string)
}
||||
|
||||
// classifier is the Classifier implementation backed by precomputed lookup tables.
type classifier struct {
	// languagesLogProbabilities maps a language name to the base score Classify starts from.
	languagesLogProbabilities map[string]float64
	// tokensLogProbabilities maps language -> token -> score added for each occurrence of the token.
	tokensLogProbabilities map[string]map[string]float64
	// tokensTotal is the divisor of the fallback score log(1/tokensTotal) used for tokens
	// that are missing from tokensLogProbabilities.
	tokensTotal float64
}
||||
|
||||
// scoredLanguage pairs a language name with the score Classify computed for it.
type scoredLanguage struct {
	language string
	score    float64
}
||||
|
||||
// Classify returns a sorted slice of possible languages sorted by decreasing language's probability
|
||||
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string { |
||||
|
||||
var languages map[string]float64 |
||||
if len(candidates) == 0 { |
||||
languages = c.knownLangs() |
||||
} else { |
||||
languages = make(map[string]float64, len(candidates)) |
||||
for candidate, weight := range candidates { |
||||
if lang, ok := GetLanguageByAlias(candidate); ok { |
||||
candidate = lang |
||||
} |
||||
|
||||
languages[candidate] = weight |
||||
} |
||||
} |
||||
|
||||
empty := len(content) == 0 |
||||
scoredLangs := make([]*scoredLanguage, 0, len(languages)) |
||||
|
||||
var tokens []string |
||||
if !empty { |
||||
tokens = tokenizer.Tokenize(content) |
||||
} |
||||
|
||||
for language := range languages { |
||||
score := c.languagesLogProbabilities[language] |
||||
if !empty { |
||||
score += c.tokensLogProbability(tokens, language) |
||||
} |
||||
scoredLangs = append(scoredLangs, &scoredLanguage{ |
||||
language: language, |
||||
score: score, |
||||
}) |
||||
} |
||||
|
||||
return sortLanguagesByScore(scoredLangs) |
||||
} |
||||
|
||||
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string { |
||||
sort.Stable(byScore(scoredLangs)) |
||||
sortedLanguages := make([]string, 0, len(scoredLangs)) |
||||
for _, scoredLang := range scoredLangs { |
||||
sortedLanguages = append(sortedLanguages, scoredLang.language) |
||||
} |
||||
|
||||
return sortedLanguages |
||||
} |
||||
|
||||
func (c *classifier) knownLangs() map[string]float64 { |
||||
langs := make(map[string]float64, len(c.languagesLogProbabilities)) |
||||
for lang := range c.languagesLogProbabilities { |
||||
langs[lang]++ |
||||
} |
||||
|
||||
return langs |
||||
} |
||||
|
||||
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 { |
||||
var sum float64 |
||||
for _, token := range tokens { |
||||
sum += c.tokenProbability(token, language) |
||||
} |
||||
|
||||
return sum |
||||
} |
||||
|
||||
func (c *classifier) tokenProbability(token, language string) float64 { |
||||
tokenProb, ok := c.tokensLogProbabilities[language][token] |
||||
if !ok { |
||||
tokenProb = math.Log(1.000000 / c.tokensTotal) |
||||
} |
||||
|
||||
return tokenProb |
||||
} |
||||
|
||||
type byScore []*scoredLanguage |
||||
|
||||
func (b byScore) Len() int { return len(b) } |
||||
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
||||
func (b byScore) Less(i, j int) bool { return b[j].score < b[i].score } |
@ -0,0 +1,472 @@ |
||||
package enry |
||||
|
||||
import ( |
||||
"bufio" |
||||
"bytes" |
||||
"path/filepath" |
||||
"strings" |
||||
|
||||
"github.com/src-d/enry/v2/data" |
||||
"github.com/src-d/enry/v2/regex" |
||||
) |
||||
|
||||
// OtherLanguage is used as a zero value when a function cannot return a specific language.
const OtherLanguage = ""
||||
|
||||
// Strategy fixes the signature of the functions that can be used as a language detection strategy.
type Strategy func(filename string, content []byte, candidates []string) (languages []string)
||||
|
||||
// DefaultStrategies is the sequence of strategies used by GetLanguages to detect languages.
// The strategies are tried in the listed order; detection stops at the first strategy that
// yields exactly one language.
var DefaultStrategies = []Strategy{
	GetLanguagesByModeline,
	GetLanguagesByFilename,
	GetLanguagesByShebang,
	GetLanguagesByExtension,
	GetLanguagesByContent,
	GetLanguagesByClassifier,
}
||||
|
||||
// DefaultClassifier is a Naive Bayes classifier trained on Linguist samples.
// Its tables come from the generated data package.
var DefaultClassifier Classifier = &classifier{
	languagesLogProbabilities: data.LanguagesLogProbabilities,
	tokensLogProbabilities:    data.TokensLogProbabilities,
	tokensTotal:               data.TokensTotal,
}
||||
|
||||
// GetLanguage applies a sequence of strategies based on the given filename and content
|
||||
// to find out the most probably language to return.
|
||||
func GetLanguage(filename string, content []byte) (language string) { |
||||
languages := GetLanguages(filename, content) |
||||
return firstLanguage(languages) |
||||
} |
||||
|
||||
func firstLanguage(languages []string) string { |
||||
for _, l := range languages { |
||||
if l != "" { |
||||
return l |
||||
} |
||||
} |
||||
return OtherLanguage |
||||
} |
||||
|
||||
// GetLanguageByModeline returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByModeline(content []byte) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByModeline, "", content, nil) |
||||
} |
||||
|
||||
// GetLanguageByEmacsModeline returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByEmacsModeline(content []byte) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByEmacsModeline, "", content, nil) |
||||
} |
||||
|
||||
// GetLanguageByVimModeline returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByVimModeline(content []byte) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByVimModeline, "", content, nil) |
||||
} |
||||
|
||||
// GetLanguageByFilename returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByFilename(filename string) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByFilename, filename, nil, nil) |
||||
} |
||||
|
||||
// GetLanguageByShebang returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByShebang(content []byte) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByShebang, "", content, nil) |
||||
} |
||||
|
||||
// GetLanguageByExtension returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByExtension(filename string) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByExtension, filename, nil, nil) |
||||
} |
||||
|
||||
// GetLanguageByContent returns detected language. If there are more than one possibles languages
|
||||
// it returns the first language by alphabetically order and safe to false.
|
||||
func GetLanguageByContent(filename string, content []byte) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByContent, filename, content, nil) |
||||
} |
||||
|
||||
// GetLanguageByClassifier returns the most probably language detected for the given content. It uses
|
||||
// DefaultClassifier, if no candidates are provided it returns OtherLanguage.
|
||||
func GetLanguageByClassifier(content []byte, candidates []string) (language string, safe bool) { |
||||
return getLanguageByStrategy(GetLanguagesByClassifier, "", content, candidates) |
||||
} |
||||
|
||||
func getLanguageByStrategy(strategy Strategy, filename string, content []byte, candidates []string) (string, bool) { |
||||
languages := strategy(filename, content, candidates) |
||||
return getFirstLanguageAndSafe(languages) |
||||
} |
||||
|
||||
func getFirstLanguageAndSafe(languages []string) (language string, safe bool) { |
||||
language = firstLanguage(languages) |
||||
safe = len(languages) == 1 |
||||
return |
||||
} |
||||
|
||||
// GetLanguageBySpecificClassifier returns the most probably language for the given content using
|
||||
// classifier to detect language.
|
||||
func GetLanguageBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (language string, safe bool) { |
||||
languages := GetLanguagesBySpecificClassifier(content, candidates, classifier) |
||||
return getFirstLanguageAndSafe(languages) |
||||
} |
||||
|
||||
// GetLanguages applies a sequence of strategies based on the given filename and content
|
||||
// to find out the most probably languages to return.
|
||||
// At least one of arguments should be set. If content is missing, language detection will be based on the filename.
|
||||
// The function won't read the file, given an empty content.
|
||||
func GetLanguages(filename string, content []byte) []string { |
||||
if IsBinary(content) { |
||||
return nil |
||||
} |
||||
|
||||
var languages []string |
||||
candidates := []string{} |
||||
for _, strategy := range DefaultStrategies { |
||||
languages = strategy(filename, content, candidates) |
||||
if len(languages) == 1 { |
||||
return languages |
||||
} |
||||
|
||||
if len(languages) > 0 { |
||||
candidates = append(candidates, languages...) |
||||
} |
||||
} |
||||
|
||||
return languages |
||||
} |
||||
|
||||
// GetLanguagesByModeline returns a slice of possible languages for the given content.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByModeline(_ string, content []byte, candidates []string) []string { |
||||
headFoot := getHeaderAndFooter(content) |
||||
var languages []string |
||||
for _, getLang := range modelinesFunc { |
||||
languages = getLang("", headFoot, candidates) |
||||
if len(languages) > 0 { |
||||
break |
||||
} |
||||
} |
||||
|
||||
return languages |
||||
} |
||||
|
||||
var modelinesFunc = []Strategy{ |
||||
GetLanguagesByEmacsModeline, |
||||
GetLanguagesByVimModeline, |
||||
} |
||||
|
||||
func getHeaderAndFooter(content []byte) []byte { |
||||
const searchScope = 5 |
||||
|
||||
if len(content) == 0 { |
||||
return content |
||||
} |
||||
|
||||
if bytes.Count(content, []byte("\n")) < 2*searchScope { |
||||
return content |
||||
} |
||||
|
||||
header := headScope(content, searchScope) |
||||
footer := footScope(content, searchScope) |
||||
headerAndFooter := make([]byte, 0, len(content[:header])+len(content[footer:])) |
||||
headerAndFooter = append(headerAndFooter, content[:header]...) |
||||
headerAndFooter = append(headerAndFooter, content[footer:]...) |
||||
return headerAndFooter |
||||
} |
||||
|
||||
// headScope returns the index in content of the newline that terminates the
// scope-th line. The caller must make sure content holds at least scope
// newlines; no check is done here.
func headScope(content []byte, scope int) (index int) {
	rest := content
	for line := 0; line < scope; line++ {
		nl := bytes.IndexByte(rest, '\n')
		index += nl
		rest = rest[nl+1:]
	}

	// index sums the line lengths without their newlines; the scope-1
	// newlines that precede the final one have to be added back.
	return index + scope - 1
}
||||
|
||||
// footScope returns the index in content at which its last scope lines begin,
// i.e. one past the scope-th newline counted from the end. The caller must
// make sure content holds at least scope newlines; no check is done here.
func footScope(content []byte, scope int) (index int) {
	rest := content
	for line := 0; line < scope; line++ {
		index = bytes.LastIndexByte(rest, '\n')
		rest = rest[:index]
	}

	return index + 1
}
||||
|
||||
var ( |
||||
reEmacsModeline = regex.MustCompile(`.*-\*-\s*(.+?)\s*-\*-.*(?m:$)`) |
||||
reEmacsLang = regex.MustCompile(`.*(?i:mode)\s*:\s*([^\s;]+)\s*;*.*`) |
||||
reVimModeline = regex.MustCompile(`(?:(?m:\s|^)vi(?:m[<=>]?\d+|m)?|[\t\x20]*ex)\s*[:]\s*(.*)(?m:$)`) |
||||
reVimLang = regex.MustCompile(`(?i:filetype|ft|syntax)\s*=(\w+)(?:\s|:|$)`) |
||||
) |
||||
|
||||
// GetLanguagesByEmacsModeline returns a slice of possible languages for the given content.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByEmacsModeline(_ string, content []byte, _ []string) []string { |
||||
matched := reEmacsModeline.FindAllSubmatch(content, -1) |
||||
if matched == nil { |
||||
return nil |
||||
} |
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1] |
||||
matchedAlias := reEmacsLang.FindSubmatch(lastLineMatched) |
||||
var alias string |
||||
if matchedAlias != nil { |
||||
alias = string(matchedAlias[1]) |
||||
} else { |
||||
alias = string(lastLineMatched) |
||||
} |
||||
|
||||
language, ok := GetLanguageByAlias(alias) |
||||
if !ok { |
||||
return nil |
||||
} |
||||
|
||||
return []string{language} |
||||
} |
||||
|
||||
// GetLanguagesByVimModeline returns a slice of possible languages for the given content.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByVimModeline(_ string, content []byte, _ []string) []string { |
||||
matched := reVimModeline.FindAllSubmatch(content, -1) |
||||
if matched == nil { |
||||
return nil |
||||
} |
||||
|
||||
// only take the last matched line, discard previous lines
|
||||
lastLineMatched := matched[len(matched)-1][1] |
||||
matchedAlias := reVimLang.FindAllSubmatch(lastLineMatched, -1) |
||||
if matchedAlias == nil { |
||||
return nil |
||||
} |
||||
|
||||
alias := string(matchedAlias[0][1]) |
||||
if len(matchedAlias) > 1 { |
||||
// cases:
|
||||
// matchedAlias = [["syntax=ruby " "ruby"] ["ft=python " "python"] ["filetype=perl " "perl"]] returns OtherLanguage;
|
||||
// matchedAlias = [["syntax=python " "python"] ["ft=python " "python"] ["filetype=python " "python"]] returns "Python";
|
||||
for _, match := range matchedAlias { |
||||
otherAlias := string(match[1]) |
||||
if otherAlias != alias { |
||||
return nil |
||||
} |
||||
} |
||||
} |
||||
|
||||
language, ok := GetLanguageByAlias(alias) |
||||
if !ok { |
||||
return nil |
||||
} |
||||
|
||||
return []string{language} |
||||
} |
||||
|
||||
// GetLanguagesByFilename returns a slice of possible languages for the given filename.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByFilename(filename string, _ []byte, _ []string) []string { |
||||
if filename == "" { |
||||
return nil |
||||
} |
||||
|
||||
return data.LanguagesByFilename[filepath.Base(filename)] |
||||
} |
||||
|
||||
// GetLanguagesByShebang returns a slice of possible languages for the given content.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByShebang(_ string, content []byte, _ []string) (languages []string) { |
||||
interpreter := getInterpreter(content) |
||||
return data.LanguagesByInterpreter[interpreter] |
||||
} |
||||
|
||||
var ( |
||||
shebangExecHack = regex.MustCompile(`exec (\w+).+\$0.+\$@`) |
||||
pythonVersion = regex.MustCompile(`python\d\.\d+`) |
||||
) |
||||
|
||||
func getInterpreter(data []byte) (interpreter string) { |
||||
line := getFirstLine(data) |
||||
if !hasShebang(line) { |
||||
return "" |
||||
} |
||||
|
||||
// skip shebang
|
||||
line = bytes.TrimSpace(line[2:]) |
||||
splitted := bytes.Fields(line) |
||||
if len(splitted) == 0 { |
||||
return "" |
||||
} |
||||
|
||||
if bytes.Contains(splitted[0], []byte("env")) { |
||||
if len(splitted) > 1 { |
||||
interpreter = string(splitted[1]) |
||||
} |
||||
} else { |
||||
splittedPath := bytes.Split(splitted[0], []byte{'/'}) |
||||
interpreter = string(splittedPath[len(splittedPath)-1]) |
||||
} |
||||
|
||||
if interpreter == "sh" { |
||||
interpreter = lookForMultilineExec(data) |
||||
} |
||||
|
||||
if pythonVersion.MatchString(interpreter) { |
||||
interpreter = interpreter[:strings.Index(interpreter, `.`)] |
||||
} |
||||
|
||||
// If osascript is called with argument -l it could be different language so do not relay on it
|
||||
// To match linguist behaviour, see ref https://github.com/github/linguist/blob/d95bae794576ab0ef2fcb41a39eb61ea5302c5b5/lib/linguist/shebang.rb#L63
|
||||
if interpreter == "osascript" && bytes.Contains(line, []byte("-l")) { |
||||
interpreter = "" |
||||
} |
||||
|
||||
return |
||||
} |
||||
|
||||
// getFirstLine returns the first line of data without its line terminator.
// It returns nil when the scanner reports an error, which happens when the
// line exceeds bufio.Scanner's token limit.
func getFirstLine(data []byte) []byte {
	scanner := bufio.NewScanner(bytes.NewReader(data))
	scanner.Scan()
	if scanner.Err() != nil {
		return nil
	}

	return scanner.Bytes()
}
||||
|
||||
// hasShebang reports whether line starts with the `#!` marker.
func hasShebang(line []byte) bool {
	return bytes.HasPrefix(line, []byte(`#!`))
}
||||
|
||||
func lookForMultilineExec(data []byte) string { |
||||
const magicNumOfLines = 5 |
||||
interpreter := "sh" |
||||
|
||||
buf := bufio.NewScanner(bytes.NewReader(data)) |
||||
for i := 0; i < magicNumOfLines && buf.Scan(); i++ { |
||||
line := buf.Bytes() |
||||
if shebangExecHack.Match(line) { |
||||
interpreter = shebangExecHack.FindStringSubmatch(string(line))[1] |
||||
break |
||||
} |
||||
} |
||||
|
||||
if err := buf.Err(); err != nil { |
||||
return interpreter |
||||
} |
||||
|
||||
return interpreter |
||||
} |
||||
|
||||
// GetLanguagesByExtension returns a slice of possible languages for the given filename.
|
||||
// It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByExtension(filename string, _ []byte, _ []string) []string { |
||||
if !strings.Contains(filename, ".") { |
||||
return nil |
||||
} |
||||
|
||||
filename = strings.ToLower(filename) |
||||
dots := getDotIndexes(filename) |
||||
for _, dot := range dots { |
||||
ext := filename[dot:] |
||||
languages, ok := data.LanguagesByExtension[ext] |
||||
if ok { |
||||
return languages |
||||
} |
||||
} |
||||
|
||||
return nil |
||||
} |
||||
|
||||
// getDotIndexes returns the byte offsets of every '.' in filename, in
// ascending order. The result is empty but non-nil when there is no dot.
func getDotIndexes(filename string) []int {
	indexes := make([]int, 0, 2)
	for pos := 0; pos < len(filename); pos++ {
		// '.' is ASCII, so scanning bytes finds the same offsets as
		// scanning runes.
		if filename[pos] == '.' {
			indexes = append(indexes, pos)
		}
	}

	return indexes
}
||||
|
||||
// GetLanguagesByContent returns a slice of languages for the given content.
|
||||
// It is a Strategy that uses content-based regexp heuristics and a filename extension.
|
||||
func GetLanguagesByContent(filename string, content []byte, _ []string) []string { |
||||
if filename == "" { |
||||
return nil |
||||
} |
||||
|
||||
ext := strings.ToLower(filepath.Ext(filename)) |
||||
|
||||
heuristic, ok := data.ContentHeuristics[ext] |
||||
if !ok { |
||||
return nil |
||||
} |
||||
|
||||
return heuristic.Match(content) |
||||
} |
||||
|
||||
// GetLanguagesByClassifier uses DefaultClassifier as a Classifier and returns a sorted slice of possible languages ordered by
|
||||
// decreasing language's probability. If there are not candidates it returns nil. It complies with the signature to be a Strategy type.
|
||||
func GetLanguagesByClassifier(filename string, content []byte, candidates []string) (languages []string) { |
||||
if len(candidates) == 0 { |
||||
return nil |
||||
} |
||||
|
||||
return GetLanguagesBySpecificClassifier(content, candidates, DefaultClassifier) |
||||
} |
||||
|
||||
// GetLanguagesBySpecificClassifier returns a slice of possible languages. It takes in a Classifier to be used.
|
||||
func GetLanguagesBySpecificClassifier(content []byte, candidates []string, classifier Classifier) (languages []string) { |
||||
mapCandidates := make(map[string]float64) |
||||
for _, candidate := range candidates { |
||||
mapCandidates[candidate]++ |
||||
} |
||||
|
||||
return classifier.Classify(content, mapCandidates) |
||||
} |
||||
|
||||
// GetLanguageExtensions returns the different extensions being used by the language.
|
||||
func GetLanguageExtensions(language string) []string { |
||||
return data.ExtensionsByLanguage[language] |
||||
} |
||||
|
||||
// Type represents the kind of a language: data, programming, markup, prose,
// or unknown.
type Type int

// Values a Type can take. Unknown is the zero value.
const (
	Unknown = Type(iota)
	Data
	Programming
	Markup
	Prose
)
||||
|
||||
// GetLanguageType returns the type of the given language.
|
||||
func GetLanguageType(language string) (langType Type) { |
||||
intType, ok := data.LanguagesType[language] |
||||
langType = Type(intType) |
||||
if !ok { |
||||
langType = Unknown |
||||
} |
||||
return langType |
||||
} |
||||
|
||||
// GetLanguageByAlias returns either the language related to the given alias and ok set to true
|
||||
// or Otherlanguage and ok set to false if the alias is not recognized.
|
||||
func GetLanguageByAlias(alias string) (lang string, ok bool) { |
||||
lang, ok = data.LanguageByAlias(alias) |
||||
if !ok { |
||||
lang = OtherLanguage |
||||
} |
||||
|
||||
return |
||||
} |
@ -0,0 +1,783 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
import "strings" |
||||
|
||||
// LanguageByAliasMap keeps the aliases of the different languages and uses the names of the languages as aliases too.
// All the keys (alias or not) are written in lower case and their whitespace has been replaced by underscores.
|
||||
var LanguageByAliasMap = map[string]string{ |
||||
"1c_enterprise": "1C Enterprise", |
||||
"abap": "ABAP", |
||||
"abl": "OpenEdge ABL", |
||||
"abnf": "ABNF", |
||||
"abuild": "Alpine Abuild", |
||||
"acfm": "Adobe Font Metrics", |
||||
"aconf": "ApacheConf", |
||||
"actionscript": "ActionScript", |
||||
"actionscript3": "ActionScript", |
||||
"actionscript_3": "ActionScript", |
||||
"ada": "Ada", |
||||
"ada2005": "Ada", |
||||
"ada95": "Ada", |
||||
"adobe_composite_font_metrics": "Adobe Font Metrics", |
||||
"adobe_font_metrics": "Adobe Font Metrics", |
||||
"adobe_multiple_font_metrics": "Adobe Font Metrics", |
||||
"advpl": "xBase", |
||||
"afdko": "OpenType Feature File", |
||||
"agda": "Agda", |
||||
"ags": "AGS Script", |
||||
"ags_script": "AGS Script", |
||||
"ahk": "AutoHotkey", |
||||
"alloy": "Alloy", |
||||
"alpine_abuild": "Alpine Abuild", |
||||
"altium": "Altium Designer", |
||||
"altium_designer": "Altium Designer", |
||||
"amfm": "Adobe Font Metrics", |
||||
"ampl": "AMPL", |
||||
"angelscript": "AngelScript", |
||||
"ant_build_system": "Ant Build System", |
||||
"antlr": "ANTLR", |
||||
"apache": "ApacheConf", |
||||
"apacheconf": "ApacheConf", |
||||
"apex": "Apex", |
||||
"api_blueprint": "API Blueprint", |
||||
"apkbuild": "Alpine Abuild", |
||||
"apl": "APL", |
||||
"apollo_guidance_computer": "Apollo Guidance Computer", |
||||
"applescript": "AppleScript", |
||||
"arc": "Arc", |
||||
"arexx": "REXX", |
||||
"as3": "ActionScript", |
||||
"asciidoc": "AsciiDoc", |
||||
"asm": "Assembly", |
||||
"asn.1": "ASN.1", |
||||
"asp": "ASP", |
||||
"aspectj": "AspectJ", |
||||
"aspx": "ASP", |
||||
"aspx-vb": "ASP", |
||||
"assembly": "Assembly", |
||||
"asymptote": "Asymptote", |
||||
"ats": "ATS", |
||||
"ats2": "ATS", |
||||
"au3": "AutoIt", |
||||
"augeas": "Augeas", |
||||
"autoconf": "M4Sugar", |
||||
"autohotkey": "AutoHotkey", |
||||
"autoit": "AutoIt", |
||||
"autoit3": "AutoIt", |
||||
"autoitscript": "AutoIt", |
||||
"awk": "Awk", |
||||
"b3d": "BlitzBasic", |
||||
"ballerina": "Ballerina", |
||||
"bash": "Shell", |
||||
"bash_session": "ShellSession", |
||||
"bat": "Batchfile", |
||||
"batch": "Batchfile", |
||||
"batchfile": "Batchfile", |
||||
"befunge": "Befunge", |
||||
"bison": "Bison", |
||||
"bitbake": "BitBake", |
||||
"blade": "Blade", |
||||
"blitz3d": "BlitzBasic", |
||||
"blitzbasic": "BlitzBasic", |
||||
"blitzmax": "BlitzMax", |
||||
"blitzplus": "BlitzBasic", |
||||
"bluespec": "Bluespec", |
||||
"bmax": "BlitzMax", |
||||
"boo": "Boo", |
||||
"bplus": "BlitzBasic", |
||||
"brainfuck": "Brainfuck", |
||||
"brightscript": "Brightscript", |
||||
"bro": "Zeek", |
||||
"bsdmake": "Makefile", |
||||
"byond": "DM", |
||||
"c": "C", |
||||
"c#": "C#", |
||||
"c++": "C++", |
||||
"c++-objdump": "Cpp-ObjDump", |
||||
"c-objdump": "C-ObjDump", |
||||
"c2hs": "C2hs Haskell", |
||||
"c2hs_haskell": "C2hs Haskell", |
||||
"cabal": "Cabal Config", |
||||
"cabal_config": "Cabal Config", |
||||
"cap'n_proto": "Cap'n Proto", |
||||
"carto": "CartoCSS", |
||||
"cartocss": "CartoCSS", |
||||
"ceylon": "Ceylon", |
||||
"cfc": "ColdFusion CFC", |
||||
"cfm": "ColdFusion", |
||||
"cfml": "ColdFusion", |
||||
"chapel": "Chapel", |
||||
"charity": "Charity", |
||||
"chpl": "Chapel", |
||||
"chuck": "ChucK", |
||||
"cirru": "Cirru", |
||||
"clarion": "Clarion", |
||||
"clean": "Clean", |
||||
"click": "Click", |
||||
"clipper": "xBase", |
||||
"clips": "CLIPS", |
||||
"clojure": "Clojure", |
||||
"closure_templates": "Closure Templates", |
||||
"cloud_firestore_security_rules": "Cloud Firestore Security Rules", |
||||
"cmake": "CMake", |
||||
"cobol": "COBOL", |
||||
"coffee": "CoffeeScript", |
||||
"coffee-script": "CoffeeScript", |
||||
"coffeescript": "CoffeeScript", |
||||
"coldfusion": "ColdFusion", |
||||
"coldfusion_cfc": "ColdFusion CFC", |
||||
"coldfusion_html": "ColdFusion", |
||||
"collada": "COLLADA", |
||||
"common_lisp": "Common Lisp", |
||||
"common_workflow_language": "Common Workflow Language", |
||||
"component_pascal": "Component Pascal", |
||||
"conll": "CoNLL-U", |
||||
"conll-u": "CoNLL-U", |
||||
"conll-x": "CoNLL-U", |
||||
"console": "ShellSession", |
||||
"cool": "Cool", |
||||
"coq": "Coq", |
||||
"cperl": "Perl", |
||||
"cpp": "C++", |
||||
"cpp-objdump": "Cpp-ObjDump", |
||||
"creole": "Creole", |
||||
"crystal": "Crystal", |
||||
"csharp": "C#", |
||||
"cson": "CSON", |
||||
"csound": "Csound", |
||||
"csound-csd": "Csound Document", |
||||
"csound-orc": "Csound", |
||||
"csound-sco": "Csound Score", |
||||
"csound_document": "Csound Document", |
||||
"csound_score": "Csound Score", |
||||
"css": "CSS", |
||||
"csv": "CSV", |
||||
"cucumber": "Gherkin", |
||||
"cuda": "Cuda", |
||||
"cweb": "CWeb", |
||||
"cwl": "Common Workflow Language", |
||||
"cycript": "Cycript", |
||||
"cython": "Cython", |
||||
"d": "D", |
||||
"d-objdump": "D-ObjDump", |
||||
"darcs_patch": "Darcs Patch", |
||||
"dart": "Dart", |
||||
"dataweave": "DataWeave", |
||||
"dcl": "DIGITAL Command Language", |
||||
"delphi": "Component Pascal", |
||||
"desktop": "desktop", |
||||
"dhall": "Dhall", |
||||
"diff": "Diff", |
||||
"digital_command_language": "DIGITAL Command Language", |
||||
"django": "HTML+Django", |
||||
"dm": "DM", |
||||
"dns_zone": "DNS Zone", |
||||
"dockerfile": "Dockerfile", |
||||
"dogescript": "Dogescript", |
||||
"dosbatch": "Batchfile", |
||||
"dosini": "INI", |
||||
"dpatch": "Darcs Patch", |
||||
"dtrace": "DTrace", |
||||
"dtrace-script": "DTrace", |
||||
"dylan": "Dylan", |
||||
"e": "E", |
||||
"eagle": "Eagle", |
||||
"easybuild": "Easybuild", |
||||
"ebnf": "EBNF", |
||||
"ec": "eC", |
||||
"ecere_projects": "Ecere Projects", |
||||
"ecl": "ECL", |
||||
"eclipse": "ECLiPSe", |
||||
"ecr": "HTML+ECR", |
||||
"editor-config": "EditorConfig", |
||||
"editorconfig": "EditorConfig", |
||||
"edje_data_collection": "Edje Data Collection", |
||||
"edn": "edn", |
||||
"eeschema_schematic": "KiCad Schematic", |
||||
"eex": "HTML+EEX", |
||||
"eiffel": "Eiffel", |
||||
"ejs": "EJS", |
||||
"elisp": "Emacs Lisp", |
||||
"elixir": "Elixir", |
||||
"elm": "Elm", |
||||
"emacs": "Emacs Lisp", |
||||
"emacs_lisp": "Emacs Lisp", |
||||
"emberscript": "EmberScript", |
||||
"eml": "EML", |
||||
"eq": "EQ", |
||||
"erb": "HTML+ERB", |
||||
"erlang": "Erlang", |
||||
"f#": "F#", |
||||
"f*": "F*", |
||||
"factor": "Factor", |
||||
"fancy": "Fancy", |
||||
"fantom": "Fantom", |
||||
"figfont": "FIGlet Font", |
||||
"figlet_font": "FIGlet Font", |
||||
"filebench_wml": "Filebench WML", |
||||
"filterscript": "Filterscript", |
||||
"fish": "fish", |
||||
"flex": "Lex", |
||||
"flux": "FLUX", |
||||
"formatted": "Formatted", |
||||
"forth": "Forth", |
||||
"fortran": "Fortran", |
||||
"foxpro": "xBase", |
||||
"freemarker": "FreeMarker", |
||||
"frege": "Frege", |
||||
"fsharp": "F#", |
||||
"fstar": "F*", |
||||
"ftl": "FreeMarker", |
||||
"fundamental": "Text", |
||||
"g-code": "G-code", |
||||
"game_maker_language": "Game Maker Language", |
||||
"gams": "GAMS", |
||||
"gap": "GAP", |
||||
"gcc_machine_description": "GCC Machine Description", |
||||
"gdb": "GDB", |
||||
"gdscript": "GDScript", |
||||
"genie": "Genie", |
||||
"genshi": "Genshi", |
||||
"gentoo_ebuild": "Gentoo Ebuild", |
||||
"gentoo_eclass": "Gentoo Eclass", |
||||
"gerber_image": "Gerber Image", |
||||
"gettext_catalog": "Gettext Catalog", |
||||
"gf": "Grammatical Framework", |
||||
"gherkin": "Gherkin", |
||||
"git-ignore": "Ignore List", |
||||
"git_attributes": "Git Attributes", |
||||
"git_config": "Git Config", |
||||
"gitattributes": "Git Attributes", |
||||
"gitconfig": "Git Config", |
||||
"gitignore": "Ignore List", |
||||
"gitmodules": "Git Config", |
||||
"glsl": "GLSL", |
||||
"glyph": "Glyph", |
||||
"glyph_bitmap_distribution_format": "Glyph Bitmap Distribution Format", |
||||
"gn": "GN", |
||||
"gnuplot": "Gnuplot", |
||||
"go": "Go", |
||||
"golang": "Go", |
||||
"golo": "Golo", |
||||
"gosu": "Gosu", |
||||
"grace": "Grace", |
||||
"gradle": "Gradle", |
||||
"grammatical_framework": "Grammatical Framework", |
||||
"graph_modeling_language": "Graph Modeling Language", |
||||
"graphql": "GraphQL", |
||||
"graphviz_(dot)": "Graphviz (DOT)", |
||||
"groff": "Roff", |
||||
"groovy": "Groovy", |
||||
"groovy_server_pages": "Groovy Server Pages", |
||||
"gsp": "Groovy Server Pages", |
||||
"hack": "Hack", |
||||
"haml": "Haml", |
||||
"handlebars": "Handlebars", |
||||
"haproxy": "HAProxy", |
||||
"harbour": "Harbour", |
||||
"haskell": "Haskell", |
||||
"haxe": "Haxe", |
||||
"hbs": "Handlebars", |
||||
"hcl": "HCL", |
||||
"hiveql": "HiveQL", |
||||
"hlsl": "HLSL", |
||||
"holyc": "HolyC", |
||||
"html": "HTML", |
||||
"html+django": "HTML+Django", |
||||
"html+django/jinja": "HTML+Django", |
||||
"html+ecr": "HTML+ECR", |
||||
"html+eex": "HTML+EEX", |
||||
"html+erb": "HTML+ERB", |
||||
"html+jinja": "HTML+Django", |
||||
"html+php": "HTML+PHP", |
||||
"html+razor": "HTML+Razor", |
||||
"html+ruby": "RHTML", |
||||
"htmlbars": "Handlebars", |
||||
"htmldjango": "HTML+Django", |
||||
"http": "HTTP", |
||||
"hxml": "HXML", |
||||
"hy": "Hy", |
||||
"hylang": "Hy", |
||||
"hyphy": "HyPhy", |
||||
"i7": "Inform 7", |
||||
"idl": "IDL", |
||||
"idris": "Idris", |
||||
"ignore": "Ignore List", |
||||
"ignore_list": "Ignore List", |
||||
"igor": "IGOR Pro", |
||||
"igor_pro": "IGOR Pro", |
||||
"igorpro": "IGOR Pro", |
||||
"inc": "PHP", |
||||
"inform7": "Inform 7", |
||||
"inform_7": "Inform 7", |
||||
"ini": "INI", |
||||
"inno_setup": "Inno Setup", |
||||
"io": "Io", |
||||
"ioke": "Ioke", |
||||
"ipython_notebook": "Jupyter Notebook", |
||||
"irc": "IRC log", |
||||
"irc_log": "IRC log", |
||||
"irc_logs": "IRC log", |
||||
"isabelle": "Isabelle", |
||||
"isabelle_root": "Isabelle ROOT", |
||||
"j": "J", |
||||
"jasmin": "Jasmin", |
||||
"java": "Java", |
||||
"java_properties": "Java Properties", |
||||
"java_server_page": "Groovy Server Pages", |
||||
"java_server_pages": "Java Server Pages", |
||||
"javascript": "JavaScript", |
||||
"javascript+erb": "JavaScript+ERB", |
||||
"jflex": "JFlex", |
||||
"jison": "Jison", |
||||
"jison_lex": "Jison Lex", |
||||
"jolie": "Jolie", |
||||
"jruby": "Ruby", |
||||
"js": "JavaScript", |
||||
"json": "JSON", |
||||
"json5": "JSON5", |
||||
"json_with_comments": "JSON with Comments", |
||||
"jsonc": "JSON with Comments", |
||||
"jsoniq": "JSONiq", |
||||
"jsonld": "JSONLD", |
||||
"jsonnet": "Jsonnet", |
||||
"jsp": "Java Server Pages", |
||||
"jsx": "JSX", |
||||
"julia": "Julia", |
||||
"jupyter_notebook": "Jupyter Notebook", |
||||
"kicad_layout": "KiCad Layout", |
||||
"kicad_legacy_layout": "KiCad Legacy Layout", |
||||
"kicad_schematic": "KiCad Schematic", |
||||
"kit": "Kit", |
||||
"kotlin": "Kotlin", |
||||
"krl": "KRL", |
||||
"labview": "LabVIEW", |
||||
"lasso": "Lasso", |
||||
"lassoscript": "Lasso", |
||||
"latex": "TeX", |
||||
"latte": "Latte", |
||||
"lean": "Lean", |
||||
"less": "Less", |
||||
"lex": "Lex", |
||||
"lfe": "LFE", |
||||
"lhaskell": "Literate Haskell", |
||||
"lhs": "Literate Haskell", |
||||
"lilypond": "LilyPond", |
||||
"limbo": "Limbo", |
||||
"linker_script": "Linker Script", |
||||
"linux_kernel_module": "Linux Kernel Module", |
||||
"liquid": "Liquid", |
||||
"lisp": "Common Lisp", |
||||
"litcoffee": "Literate CoffeeScript", |
||||
"literate_agda": "Literate Agda", |
||||
"literate_coffeescript": "Literate CoffeeScript", |
||||
"literate_haskell": "Literate Haskell", |
||||
"live-script": "LiveScript", |
||||
"livescript": "LiveScript", |
||||
"llvm": "LLVM", |
||||
"logos": "Logos", |
||||
"logtalk": "Logtalk", |
||||
"lolcode": "LOLCODE", |
||||
"lookml": "LookML", |
||||
"loomscript": "LoomScript", |
||||
"ls": "LiveScript", |
||||
"lsl": "LSL", |
||||
"ltspice_symbol": "LTspice Symbol", |
||||
"lua": "Lua", |
||||
"m": "M", |
||||
"m4": "M4", |
||||
"m4sugar": "M4Sugar", |
||||
"macruby": "Ruby", |
||||
"make": "Makefile", |
||||
"makefile": "Makefile", |
||||
"mako": "Mako", |
||||
"man": "Roff", |
||||
"man-page": "Roff", |
||||
"man_page": "Roff", |
||||
"manpage": "Roff", |
||||
"markdown": "Markdown", |
||||
"marko": "Marko", |
||||
"markojs": "Marko", |
||||
"mask": "Mask", |
||||
"mathematica": "Mathematica", |
||||
"matlab": "MATLAB", |
||||
"maven_pom": "Maven POM", |
||||
"max": "Max", |
||||
"max/msp": "Max", |
||||
"maxmsp": "Max", |
||||
"maxscript": "MAXScript", |
||||
"mcfunction": "mcfunction", |
||||
"mdoc": "Roff", |
||||
"mediawiki": "MediaWiki", |
||||
"mercury": "Mercury", |
||||
"meson": "Meson", |
||||
"metal": "Metal", |
||||
"mf": "Makefile", |
||||
"minid": "MiniD", |
||||
"mirah": "Mirah", |
||||
"mma": "Mathematica", |
||||
"modelica": "Modelica", |
||||
"modula-2": "Modula-2", |
||||
"modula-3": "Modula-3", |
||||
"module_management_system": "Module Management System", |
||||
"monkey": "Monkey", |
||||
"moocode": "Moocode", |
||||
"moonscript": "MoonScript", |
||||
"motorola_68k_assembly": "Motorola 68K Assembly", |
||||
"mql4": "MQL4", |
||||
"mql5": "MQL5", |
||||
"mtml": "MTML", |
||||
"muf": "MUF", |
||||
"mumps": "M", |
||||
"mupad": "mupad", |
||||
"myghty": "Myghty", |
||||
"nanorc": "nanorc", |
||||
"nasm": "Assembly", |
||||
"ncl": "NCL", |
||||
"nearley": "Nearley", |
||||
"nemerle": "Nemerle", |
||||
"nesc": "nesC", |
||||
"netlinx": "NetLinx", |
||||
"netlinx+erb": "NetLinx+ERB", |
||||
"netlogo": "NetLogo", |
||||
"newlisp": "NewLisp", |
||||
"nextflow": "Nextflow", |
||||
"nginx": "Nginx", |
||||
"nginx_configuration_file": "Nginx", |
||||
"nim": "Nim", |
||||
"ninja": "Ninja", |
||||
"nit": "Nit", |
||||
"nix": "Nix", |
||||
"nixos": "Nix", |
||||
"njk": "HTML+Django", |
||||
"nl": "NL", |
||||
"node": "JavaScript", |
||||
"nroff": "Roff", |
||||
"nsis": "NSIS", |
||||
"nu": "Nu", |
||||
"numpy": "NumPy", |
||||
"nunjucks": "HTML+Django", |
||||
"nush": "Nu", |
||||
"nvim": "Vim script", |
||||
"obj-c": "Objective-C", |
||||
"obj-c++": "Objective-C++", |
||||
"obj-j": "Objective-J", |
||||
"objc": "Objective-C", |
||||
"objc++": "Objective-C++", |
||||
"objdump": "ObjDump", |
||||
"objective-c": "Objective-C", |
||||
"objective-c++": "Objective-C++", |
||||
"objective-j": "Objective-J", |
||||
"objectivec": "Objective-C", |
||||
"objectivec++": "Objective-C++", |
||||
"objectivej": "Objective-J", |
||||
"objectpascal": "Component Pascal", |
||||
"objectscript": "ObjectScript", |
||||
"objj": "Objective-J", |
||||
"ocaml": "OCaml", |
||||
"octave": "MATLAB", |
||||
"omgrofl": "Omgrofl", |
||||
"oncrpc": "RPC", |
||||
"ooc": "ooc", |
||||
"opa": "Opa", |
||||
"opal": "Opal", |
||||
"opencl": "OpenCL", |
||||
"openedge": "OpenEdge ABL", |
||||
"openedge_abl": "OpenEdge ABL", |
||||
"openrc": "OpenRC runscript", |
||||
"openrc_runscript": "OpenRC runscript", |
||||
"openscad": "OpenSCAD", |
||||
"opentype_feature_file": "OpenType Feature File", |
||||
"org": "Org", |
||||
"osascript": "AppleScript", |
||||
"ox": "Ox", |
||||
"oxygene": "Oxygene", |
||||
"oz": "Oz", |
||||
"p4": "P4", |
||||
"pan": "Pan", |
||||
"pandoc": "Markdown", |
||||
"papyrus": "Papyrus", |
||||
"parrot": "Parrot", |
||||
"parrot_assembly": "Parrot Assembly", |
||||
"parrot_internal_representation": "Parrot Internal Representation", |
||||
"pascal": "Pascal", |
||||
"pasm": "Parrot Assembly", |
||||
"pawn": "Pawn", |
||||
"pcbnew": "KiCad Layout", |
||||
"pep8": "Pep8", |
||||
"perl": "Perl", |
||||
"perl6": "Perl 6", |
||||
"perl_6": "Perl 6", |
||||
"php": "PHP", |
||||
"pic": "Pic", |
||||
"pickle": "Pickle", |
||||
"picolisp": "PicoLisp", |
||||
"piglatin": "PigLatin", |
||||
"pike": "Pike", |
||||
"pir": "Parrot Internal Representation", |
||||
"plpgsql": "PLpgSQL", |
||||
"plsql": "PLSQL", |
||||
"pod": "Pod", |
||||
"pod_6": "Pod 6", |
||||
"pogoscript": "PogoScript", |
||||
"pony": "Pony", |
||||
"posh": "PowerShell", |
||||
"postcss": "PostCSS", |
||||
"postscr": "PostScript", |
||||
"postscript": "PostScript", |
||||
"pot": "Gettext Catalog", |
||||
"pov-ray": "POV-Ray SDL", |
||||
"pov-ray_sdl": "POV-Ray SDL", |
||||
"povray": "POV-Ray SDL", |
||||
"powerbuilder": "PowerBuilder", |
||||
"powershell": "PowerShell", |
||||
"processing": "Processing", |
||||
"progress": "OpenEdge ABL", |
||||
"prolog": "Prolog", |
||||
"propeller_spin": "Propeller Spin", |
||||
"protobuf": "Protocol Buffer", |
||||
"protocol_buffer": "Protocol Buffer", |
||||
"protocol_buffers": "Protocol Buffer", |
||||
"public_key": "Public Key", |
||||
"pug": "Pug", |
||||
"puppet": "Puppet", |
||||
"pure_data": "Pure Data", |
||||
"purebasic": "PureBasic", |
||||
"purescript": "PureScript", |
||||
"pwsh": "PowerShell", |
||||
"pycon": "Python console", |
||||
"pyrex": "Cython", |
||||
"python": "Python", |
||||
"python3": "Python", |
||||
"python_console": "Python console", |
||||
"python_traceback": "Python traceback", |
||||
"q": "q", |
||||
"qmake": "QMake", |
||||
"qml": "QML", |
||||
"quake": "Quake", |
||||
"r": "R", |
||||
"racket": "Racket", |
||||
"ragel": "Ragel", |
||||
"ragel-rb": "Ragel", |
||||
"ragel-ruby": "Ragel", |
||||
"rake": "Ruby", |
||||
"raml": "RAML", |
||||
"rascal": "Rascal", |
||||
"raw": "Raw token data", |
||||
"raw_token_data": "Raw token data", |
||||
"razor": "HTML+Razor", |
||||
"rb": "Ruby", |
||||
"rbx": "Ruby", |
||||
"rdoc": "RDoc", |
||||
"realbasic": "REALbasic", |
||||
"reason": "Reason", |
||||
"rebol": "Rebol", |
||||
"red": "Red", |
||||
"red/system": "Red", |
||||
"redcode": "Redcode", |
||||
"regex": "Regular Expression", |
||||
"regexp": "Regular Expression", |
||||
"regular_expression": "Regular Expression", |
||||
"ren'py": "Ren'Py", |
||||
"renderscript": "RenderScript", |
||||
"renpy": "Ren'Py", |
||||
"restructuredtext": "reStructuredText", |
||||
"rexx": "REXX", |
||||
"rhtml": "RHTML", |
||||
"rich_text_format": "Rich Text Format", |
||||
"ring": "Ring", |
||||
"rmarkdown": "RMarkdown", |
||||
"robotframework": "RobotFramework", |
||||
"roff": "Roff", |
||||
"roff_manpage": "Roff Manpage", |
||||
"rouge": "Rouge", |
||||
"rpc": "RPC", |
||||
"rpcgen": "RPC", |
||||
"rpm_spec": "RPM Spec", |
||||
"rs-274x": "Gerber Image", |
||||
"rscript": "R", |
||||
"rss": "XML", |
||||
"rst": "reStructuredText", |
||||
"ruby": "Ruby", |
||||
"runoff": "RUNOFF", |
||||
"rust": "Rust", |
||||
"rusthon": "Python", |
||||
"sage": "Sage", |
||||
"salt": "SaltStack", |
||||
"saltstack": "SaltStack", |
||||
"saltstate": "SaltStack", |
||||
"sas": "SAS", |
||||
"sass": "Sass", |
||||
"scala": "Scala", |
||||
"scaml": "Scaml", |
||||
"scheme": "Scheme", |
||||
"scilab": "Scilab", |
||||
"scss": "SCSS", |
||||
"sed": "sed", |
||||
"self": "Self", |
||||
"sh": "Shell", |
||||
"shaderlab": "ShaderLab", |
||||
"shell": "Shell", |
||||
"shell-script": "Shell", |
||||
"shellsession": "ShellSession", |
||||
"shen": "Shen", |
||||
"slash": "Slash", |
||||
"slice": "Slice", |
||||
"slim": "Slim", |
||||
"smali": "Smali", |
||||
"smalltalk": "Smalltalk", |
||||
"smarty": "Smarty", |
||||
"sml": "Standard ML", |
||||
"smt": "SMT", |
||||
"snippet": "YASnippet", |
||||
"solidity": "Solidity", |
||||
"sourcemod": "SourcePawn", |
||||
"sourcepawn": "SourcePawn", |
||||
"soy": "Closure Templates", |
||||
"sparql": "SPARQL", |
||||
"specfile": "RPM Spec", |
||||
"spline_font_database": "Spline Font Database", |
||||
"splus": "R", |
||||
"sqf": "SQF", |
||||
"sql": "SQL", |
||||
"sqlpl": "SQLPL", |
||||
"squeak": "Smalltalk", |
||||
"squirrel": "Squirrel", |
||||
"srecode_template": "SRecode Template", |
||||
"ssh_config": "SSH Config", |
||||
"stan": "Stan", |
||||
"standard_ml": "Standard ML", |
||||
"stata": "Stata", |
||||
"ston": "STON", |
||||
"stylus": "Stylus", |
||||
"subrip_text": "SubRip Text", |
||||
"sugarss": "SugarSS", |
||||
"supercollider": "SuperCollider", |
||||
"svelte": "Svelte", |
||||
"svg": "SVG", |
||||
"swift": "Swift", |
||||
"systemverilog": "SystemVerilog", |
||||
"tcl": "Tcl", |
||||
"tcsh": "Tcsh", |
||||
"tea": "Tea", |
||||
"terra": "Terra", |
||||
"terraform": "HCL", |
||||
"tex": "TeX", |
||||
"text": "Text", |
||||
"textile": "Textile", |
||||
"thrift": "Thrift", |
||||
"ti_program": "TI Program", |
||||
"tl": "Type Language", |
||||
"tla": "TLA", |
||||
"toml": "TOML", |
||||
"troff": "Roff", |
||||
"ts": "TypeScript", |
||||
"tsql": "TSQL", |
||||
"tsx": "TSX", |
||||
"turing": "Turing", |
||||
"turtle": "Turtle", |
||||
"twig": "Twig", |
||||
"txl": "TXL", |
||||
"type_language": "Type Language", |
||||
"typescript": "TypeScript", |
||||
"udiff": "Diff", |
||||
"unified_parallel_c": "Unified Parallel C", |
||||
"unity3d_asset": "Unity3D Asset", |
||||
"unix_assembly": "Unix Assembly", |
||||
"uno": "Uno", |
||||
"unrealscript": "UnrealScript", |
||||
"ur": "UrWeb", |
||||
"ur/web": "UrWeb", |
||||
"urweb": "UrWeb", |
||||
"vala": "Vala", |
||||
"vb.net": "Visual Basic", |
||||
"vbnet": "Visual Basic", |
||||
"vcl": "VCL", |
||||
"verilog": "Verilog", |
||||
"vhdl": "VHDL", |
||||
"vim": "Vim script", |
||||
"vim_script": "Vim script", |
||||
"viml": "Vim script", |
||||
"visual_basic": "Visual Basic", |
||||
"volt": "Volt", |
||||
"vue": "Vue", |
||||
"wasm": "WebAssembly", |
||||
"wast": "WebAssembly", |
||||
"wavefront_material": "Wavefront Material", |
||||
"wavefront_object": "Wavefront Object", |
||||
"wdl": "wdl", |
||||
"web_ontology_language": "Web Ontology Language", |
||||
"webassembly": "WebAssembly", |
||||
"webidl": "WebIDL", |
||||
"webvtt": "WebVTT", |
||||
"winbatch": "Batchfile", |
||||
"windows_registry_entries": "Windows Registry Entries", |
||||
"wisp": "wisp", |
||||
"wollok": "Wollok", |
||||
"world_of_warcraft_addon_data": "World of Warcraft Addon Data", |
||||
"wsdl": "XML", |
||||
"x10": "X10", |
||||
"x_bitmap": "X BitMap", |
||||
"x_font_directory_index": "X Font Directory Index", |
||||
"x_pixmap": "X PixMap", |
||||
"xbase": "xBase", |
||||
"xbm": "X BitMap", |
||||
"xc": "XC", |
||||
"xcompose": "XCompose", |
||||
"xdr": "RPC", |
||||
"xhtml": "HTML", |
||||
"xml": "XML", |
||||
"xml+genshi": "Genshi", |
||||
"xml+kid": "Genshi", |
||||
"xojo": "Xojo", |
||||
"xpages": "XPages", |
||||
"xpm": "X PixMap", |
||||
"xproc": "XProc", |
||||
"xquery": "XQuery", |
||||
"xs": "XS", |
||||
"xsd": "XML", |
||||
"xsl": "XSLT", |
||||
"xslt": "XSLT", |
||||
"xten": "X10", |
||||
"xtend": "Xtend", |
||||
"yacc": "Yacc", |
||||
"yaml": "YAML", |
||||
"yang": "YANG", |
||||
"yara": "YARA", |
||||
"yas": "YASnippet", |
||||
"yasnippet": "YASnippet", |
||||
"yml": "YAML", |
||||
"zap": "ZAP", |
||||
"zeek": "Zeek", |
||||
"zenscript": "ZenScript", |
||||
"zephir": "Zephir", |
||||
"zig": "Zig", |
||||
"zil": "ZIL", |
||||
"zimpl": "Zimpl", |
||||
"zsh": "Shell", |
||||
} |
||||
|
||||
// LanguageByAlias looks up the language name by it's alias or name.
|
||||
// It mirrors the logic of github linguist and is needed e.g for heuristcs.yml
|
||||
// that mixes names and aliases in a language field (see XPM example).
|
||||
func LanguageByAlias(langOrAlias string) (lang string, ok bool) { |
||||
k := convertToAliasKey(langOrAlias) |
||||
lang, ok = LanguageByAliasMap[k] |
||||
return |
||||
} |
||||
|
||||
// convertToAliasKey converts a language name to a key in LanguageByAliasMap:
// everything from the first comma onwards is dropped, every space becomes an
// underscore and the result is lower-cased.
// It follows the conventions of
//  - internal.code-generator.generator.convertToAliasKey()
//  - GetLanguageByAlias()
// and lives here to avoid a dependency on the "generate" and "enry" packages.
func convertToAliasKey(langName string) string {
	head := langName
	// Keep only the part in front of the first comma, if there is one.
	if comma := strings.IndexByte(langName, ','); comma >= 0 {
		head = langName[:comma]
	}
	return strings.ToLower(strings.Replace(head, ` `, `_`, -1))
}
@ -0,0 +1,254 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
var LanguagesColor = map[string]string{ |
||||
"1C Enterprise": "#814CCC", |
||||
"ABAP": "#E8274B", |
||||
"AGS Script": "#B9D9FF", |
||||
"AMPL": "#E6EFBB", |
||||
"ANTLR": "#9DC3FF", |
||||
"API Blueprint": "#2ACCA8", |
||||
"APL": "#5A8164", |
||||
"ASP": "#6a40fd", |
||||
"ATS": "#1ac620", |
||||
"ActionScript": "#882B0F", |
||||
"Ada": "#02f88c", |
||||
"Agda": "#315665", |
||||
"Alloy": "#64C800", |
||||
"AngelScript": "#C7D7DC", |
||||
"AppleScript": "#101F1F", |
||||
"Arc": "#aa2afe", |
||||
"AspectJ": "#a957b0", |
||||
"Assembly": "#6E4C13", |
||||
"Asymptote": "#4a0c0c", |
||||
"AutoHotkey": "#6594b9", |
||||
"AutoIt": "#1C3552", |
||||
"Ballerina": "#FF5000", |
||||
"Batchfile": "#C1F12E", |
||||
"BlitzMax": "#cd6400", |
||||
"Boo": "#d4bec1", |
||||
"Brainfuck": "#2F2530", |
||||
"C": "#555555", |
||||
"C#": "#178600", |
||||
"C++": "#f34b7d", |
||||
"CSS": "#563d7c", |
||||
"Ceylon": "#dfa535", |
||||
"Chapel": "#8dc63f", |
||||
"Cirru": "#ccccff", |
||||
"Clarion": "#db901e", |
||||
"Clean": "#3F85AF", |
||||
"Click": "#E4E6F3", |
||||
"Clojure": "#db5855", |
||||
"CoffeeScript": "#244776", |
||||
"ColdFusion": "#ed2cd6", |
||||
"Common Lisp": "#3fb68b", |
||||
"Common Workflow Language": "#B5314C", |
||||
"Component Pascal": "#B0CE4E", |
||||
"Crystal": "#000100", |
||||
"Cuda": "#3A4E3A", |
||||
"D": "#ba595e", |
||||
"DM": "#447265", |
||||
"Dart": "#00B4AB", |
||||
"DataWeave": "#003a52", |
||||
"Dhall": "#dfafff", |
||||
"Dockerfile": "#384d54", |
||||
"Dogescript": "#cca760", |
||||
"Dylan": "#6c616e", |
||||
"E": "#ccce35", |
||||
"ECL": "#8a1267", |
||||
"EQ": "#a78649", |
||||
"Eiffel": "#946d57", |
||||
"Elixir": "#6e4a7e", |
||||
"Elm": "#60B5CC", |
||||
"Emacs Lisp": "#c065db", |
||||
"EmberScript": "#FFF4F3", |
||||
"Erlang": "#B83998", |
||||
"F#": "#b845fc", |
||||
"F*": "#572e30", |
||||
"FLUX": "#88ccff", |
||||
"Factor": "#636746", |
||||
"Fancy": "#7b9db4", |
||||
"Fantom": "#14253c", |
||||
"Forth": "#341708", |
||||
"Fortran": "#4d41b1", |
||||
"FreeMarker": "#0050b2", |
||||
"Frege": "#00cafe", |
||||
"G-code": "#D08CF2", |
||||
"GDScript": "#355570", |
||||
"Game Maker Language": "#71b417", |
||||
"Genie": "#fb855d", |
||||
"Gherkin": "#5B2063", |
||||
"Glyph": "#c1ac7f", |
||||
"Gnuplot": "#f0a9f0", |
||||
"Go": "#00ADD8", |
||||
"Golo": "#88562A", |
||||
"Gosu": "#82937f", |
||||
"Grammatical Framework": "#79aa7a", |
||||
"Groovy": "#e69f56", |
||||
"HTML": "#e34c26", |
||||
"Hack": "#878787", |
||||
"Harbour": "#0e60e3", |
||||
"Haskell": "#5e5086", |
||||
"Haxe": "#df7900", |
||||
"HiveQL": "#dce200", |
||||
"HolyC": "#ffefaf", |
||||
"Hy": "#7790B2", |
||||
"IDL": "#a3522f", |
||||
"Idris": "#b30000", |
||||
"Io": "#a9188d", |
||||
"Ioke": "#078193", |
||||
"Isabelle": "#FEFE00", |
||||
"J": "#9EEDFF", |
||||
"JSONiq": "#40d47e", |
||||
"Java": "#b07219", |
||||
"JavaScript": "#f1e05a", |
||||
"Jolie": "#843179", |
||||
"Jsonnet": "#0064bd", |
||||
"Julia": "#a270ba", |
||||
"Jupyter Notebook": "#DA5B0B", |
||||
"KRL": "#28430A", |
||||
"Kotlin": "#F18E33", |
||||
"LFE": "#4C3023", |
||||
"LLVM": "#185619", |
||||
"LOLCODE": "#cc9900", |
||||
"LSL": "#3d9970", |
||||
"Lasso": "#999999", |
||||
"Lex": "#DBCA00", |
||||
"LiveScript": "#499886", |
||||
"LookML": "#652B81", |
||||
"Lua": "#000080", |
||||
"MATLAB": "#e16737", |
||||
"MAXScript": "#00a6a6", |
||||
"MQL4": "#62A8D6", |
||||
"MQL5": "#4A76B8", |
||||
"MTML": "#b7e1f4", |
||||
"Makefile": "#427819", |
||||
"Mask": "#f97732", |
||||
"Max": "#c4a79c", |
||||
"Mercury": "#ff2b2b", |
||||
"Meson": "#007800", |
||||
"Metal": "#8f14e9", |
||||
"Mirah": "#c7a938", |
||||
"Modula-3": "#223388", |
||||
"NCL": "#28431f", |
||||
"Nearley": "#990000", |
||||
"Nemerle": "#3d3c6e", |
||||
"NetLinx": "#0aa0ff", |
||||
"NetLinx+ERB": "#747faa", |
||||
"NetLogo": "#ff6375", |
||||
"NewLisp": "#87AED7", |
||||
"Nextflow": "#3ac486", |
||||
"Nim": "#37775b", |
||||
"Nit": "#009917", |
||||
"Nix": "#7e7eff", |
||||
"Nu": "#c9df40", |
||||
"OCaml": "#3be133", |
||||
"ObjectScript": "#424893", |
||||
"Objective-C": "#438eff", |
||||
"Objective-C++": "#6866fb", |
||||
"Objective-J": "#ff0c5a", |
||||
"Omgrofl": "#cabbff", |
||||
"Opal": "#f7ede0", |
||||
"Oxygene": "#cdd0e3", |
||||
"Oz": "#fab738", |
||||
"P4": "#7055b5", |
||||
"PHP": "#4F5D95", |
||||
"PLSQL": "#dad8d8", |
||||
"Pan": "#cc0000", |
||||
"Papyrus": "#6600cc", |
||||
"Parrot": "#f3ca0a", |
||||
"Pascal": "#E3F171", |
||||
"Pawn": "#dbb284", |
||||
"Pep8": "#C76F5B", |
||||
"Perl": "#0298c3", |
||||
"Perl 6": "#0000fb", |
||||
"PigLatin": "#fcd7de", |
||||
"Pike": "#005390", |
||||
"PogoScript": "#d80074", |
||||
"PostScript": "#da291c", |
||||
"PowerBuilder": "#8f0f8d", |
||||
"PowerShell": "#012456", |
||||
"Processing": "#0096D8", |
||||
"Prolog": "#74283c", |
||||
"Propeller Spin": "#7fa2a7", |
||||
"Puppet": "#302B6D", |
||||
"PureBasic": "#5a6986", |
||||
"PureScript": "#1D222D", |
||||
"Python": "#3572A5", |
||||
"QML": "#44a51c", |
||||
"Quake": "#882233", |
||||
"R": "#198CE7", |
||||
"RAML": "#77d9fb", |
||||
"RUNOFF": "#665a4e", |
||||
"Racket": "#3c5caa", |
||||
"Ragel": "#9d5200", |
||||
"Rascal": "#fffaa0", |
||||
"Rebol": "#358a5b", |
||||
"Red": "#f50000", |
||||
"Ren'Py": "#ff7f7f", |
||||
"Ring": "#2D54CB", |
||||
"Roff": "#ecdebe", |
||||
"Rouge": "#cc0088", |
||||
"Ruby": "#701516", |
||||
"Rust": "#dea584", |
||||
"SAS": "#B34936", |
||||
"SQF": "#3F3F3F", |
||||
"SRecode Template": "#348a34", |
||||
"SaltStack": "#646464", |
||||
"Scala": "#c22d40", |
||||
"Scheme": "#1e4aec", |
||||
"Self": "#0579aa", |
||||
"Shell": "#89e051", |
||||
"Shen": "#120F14", |
||||
"Slash": "#007eff", |
||||
"Slice": "#003fa2", |
||||
"Smalltalk": "#596706", |
||||
"Solidity": "#AA6746", |
||||
"SourcePawn": "#5c7611", |
||||
"Squirrel": "#800000", |
||||
"Stan": "#b2011d", |
||||
"Standard ML": "#dc566d", |
||||
"SuperCollider": "#46390b", |
||||
"Swift": "#ffac45", |
||||
"SystemVerilog": "#DAE1C2", |
||||
"TI Program": "#A0AA87", |
||||
"Tcl": "#e4cc98", |
||||
"TeX": "#3D6117", |
||||
"Terra": "#00004c", |
||||
"Turing": "#cf142b", |
||||
"TypeScript": "#2b7489", |
||||
"UnrealScript": "#a54c4d", |
||||
"VCL": "#148AA8", |
||||
"VHDL": "#adb2cb", |
||||
"Vala": "#fbe5cd", |
||||
"Verilog": "#b2b7f8", |
||||
"Vim script": "#199f4b", |
||||
"Visual Basic": "#945db7", |
||||
"Volt": "#1F1F1F", |
||||
"Vue": "#2c3e50", |
||||
"WebAssembly": "#04133b", |
||||
"Wollok": "#a23738", |
||||
"X10": "#4B6BEF", |
||||
"XC": "#99DA07", |
||||
"XQuery": "#5232e7", |
||||
"XSLT": "#EB8CEB", |
||||
"YARA": "#220000", |
||||
"YASnippet": "#32AB90", |
||||
"Yacc": "#4B6C4B", |
||||
"ZAP": "#0d665e", |
||||
"ZIL": "#dc75e5", |
||||
"ZenScript": "#00BCD1", |
||||
"Zephir": "#118f9e", |
||||
"Zig": "#ec915c", |
||||
"eC": "#913960", |
||||
"mcfunction": "#E22837", |
||||
"nesC": "#94B0C7", |
||||
"ooc": "#b0b77e", |
||||
"q": "#0040cd", |
||||
"sed": "#64b970", |
||||
"wdl": "#42f1f4", |
||||
"wisp": "#7582D1", |
||||
"xBase": "#403a40", |
||||
} |
@ -0,0 +1,7 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
// linguist's commit from which files were generated.
|
||||
var LinguistCommit = "3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d" |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,3 @@ |
||||
// Package data contains only auto-generated data-structures for all the language
|
||||
// identification strategies from the Linguist project sources.
|
||||
package data |
@ -0,0 +1,26 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
import "gopkg.in/toqueteos/substring.v1" |
||||
|
||||
var DocumentationMatchers = substring.Or( |
||||
substring.Regexp(`^[Dd]ocs?/`), |
||||
substring.Regexp(`(^|/)[Dd]ocumentation/`), |
||||
substring.Regexp(`(^|/)[Gg]roovydoc/`), |
||||
substring.Regexp(`(^|/)[Jj]avadoc/`), |
||||
substring.Regexp(`^[Mm]an/`), |
||||
substring.Regexp(`^[Ee]xamples/`), |
||||
substring.Regexp(`^[Dd]emos?/`), |
||||
substring.Regexp(`(^|/)inst/doc/`), |
||||
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`), |
||||
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`), |
||||
substring.Regexp(`(^|/)COPYING(\.|$)`), |
||||
substring.Regexp(`(^|/)INSTALL(\.|$)`), |
||||
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`), |
||||
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`), |
||||
substring.Regexp(`(^|/)README(\.|$)`), |
||||
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`), |
||||
substring.Regexp(`^[Ss]amples?/`), |
||||
) |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,241 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
var LanguagesByFilename = map[string][]string{ |
||||
".Rprofile": {"R"}, |
||||
".XCompose": {"XCompose"}, |
||||
".abbrev_defs": {"Emacs Lisp"}, |
||||
".arcconfig": {"JSON"}, |
||||
".atomignore": {"Ignore List"}, |
||||
".babelignore": {"Ignore List"}, |
||||
".babelrc": {"JSON with Comments"}, |
||||
".bash_aliases": {"Shell"}, |
||||
".bash_history": {"Shell"}, |
||||
".bash_logout": {"Shell"}, |
||||
".bash_profile": {"Shell"}, |
||||
".bashrc": {"Shell"}, |
||||
".bzrignore": {"Ignore List"}, |
||||
".clang-format": {"YAML"}, |
||||
".clang-tidy": {"YAML"}, |
||||
".classpath": {"XML"}, |
||||
".coffeelintignore": {"Ignore List"}, |
||||
".cproject": {"XML"}, |
||||
".cshrc": {"Shell"}, |
||||
".cvsignore": {"Ignore List"}, |
||||
".dockerignore": {"Ignore List"}, |
||||
".editorconfig": {"EditorConfig"}, |
||||
".emacs": {"Emacs Lisp"}, |
||||
".emacs.desktop": {"Emacs Lisp"}, |
||||
".eslintignore": {"Ignore List"}, |
||||
".eslintrc.json": {"JSON with Comments"}, |
||||
".factor-boot-rc": {"Factor"}, |
||||
".factor-rc": {"Factor"}, |
||||
".gclient": {"Python"}, |
||||
".gemrc": {"YAML"}, |
||||
".gitattributes": {"Git Attributes"}, |
||||
".gitconfig": {"Git Config"}, |
||||
".gitignore": {"Ignore List"}, |
||||
".gitmodules": {"Git Config"}, |
||||
".gn": {"GN"}, |
||||
".gnus": {"Emacs Lisp"}, |
||||
".gvimrc": {"Vim script"}, |
||||
".htaccess": {"ApacheConf"}, |
||||
".htmlhintrc": {"JSON"}, |
||||
".irbrc": {"Ruby"}, |
||||
".jscsrc": {"JSON with Comments"}, |
||||
".jshintrc": {"JSON with Comments"}, |
||||
".jslintrc": {"JSON with Comments"}, |
||||
".login": {"Shell"}, |
||||
".nanorc": {"nanorc"}, |
||||
".nodemonignore": {"Ignore List"}, |
||||
".npmignore": {"Ignore List"}, |
||||
".nvimrc": {"Vim script"}, |
||||
".php": {"PHP"}, |
||||
".php_cs": {"PHP"}, |
||||
".php_cs.dist": {"PHP"}, |
||||
".prettierignore": {"Ignore List"}, |
||||
".profile": {"Shell"}, |
||||
".project": {"XML"}, |
||||
".pryrc": {"Ruby"}, |
||||
".spacemacs": {"Emacs Lisp"}, |
||||
".stylelintignore": {"Ignore List"}, |
||||
".tern-config": {"JSON"}, |
||||
".tern-project": {"JSON"}, |
||||
".vimrc": {"Vim script"}, |
||||
".viper": {"Emacs Lisp"}, |
||||
".vscodeignore": {"Ignore List"}, |
||||
".watchmanconfig": {"JSON"}, |
||||
".zlogin": {"Shell"}, |
||||
".zlogout": {"Shell"}, |
||||
".zprofile": {"Shell"}, |
||||
".zshenv": {"Shell"}, |
||||
".zshrc": {"Shell"}, |
||||
"9fs": {"Shell"}, |
||||
"APKBUILD": {"Alpine Abuild"}, |
||||
"App.config": {"XML"}, |
||||
"Appraisals": {"Ruby"}, |
||||
"BSDmakefile": {"Makefile"}, |
||||
"BUCK": {"Python"}, |
||||
"BUILD": {"Python"}, |
||||
"BUILD.bazel": {"Python"}, |
||||
"Berksfile": {"Ruby"}, |
||||
"Brewfile": {"Ruby"}, |
||||
"Buildfile": {"Ruby"}, |
||||
"CMakeLists.txt": {"CMake"}, |
||||
"COPYING": {"Text"}, |
||||
"COPYING.regex": {"Text"}, |
||||
"COPYRIGHT.regex": {"Text"}, |
||||
"Cakefile": {"CoffeeScript"}, |
||||
"Capfile": {"Ruby"}, |
||||
"Cargo.lock": {"TOML"}, |
||||
"Cask": {"Emacs Lisp"}, |
||||
"Dangerfile": {"Ruby"}, |
||||
"Deliverfile": {"Ruby"}, |
||||
"Dockerfile": {"Dockerfile"}, |
||||
"Emakefile": {"Erlang"}, |
||||
"FONTLOG": {"Text"}, |
||||
"Fakefile": {"Fancy"}, |
||||
"Fastfile": {"Ruby"}, |
||||
"GNUmakefile": {"Makefile"}, |
||||
"Gemfile": {"Ruby"}, |
||||
"Gemfile.lock": {"Ruby"}, |
||||
"Gopkg.lock": {"TOML"}, |
||||
"Guardfile": {"Ruby"}, |
||||
"INSTALL": {"Text"}, |
||||
"INSTALL.mysql": {"Text"}, |
||||
"Jakefile": {"JavaScript"}, |
||||
"Jarfile": {"Ruby"}, |
||||
"Jenkinsfile": {"Groovy"}, |
||||
"Kbuild": {"Makefile"}, |
||||
"LICENSE": {"Text"}, |
||||
"LICENSE.mysql": {"Text"}, |
||||
"Makefile": {"Makefile"}, |
||||
"Makefile.PL": {"Perl"}, |
||||
"Makefile.am": {"Makefile"}, |
||||
"Makefile.boot": {"Makefile"}, |
||||
"Makefile.frag": {"Makefile"}, |
||||
"Makefile.in": {"Makefile"}, |
||||
"Makefile.inc": {"Makefile"}, |
||||
"Makefile.wat": {"Makefile"}, |
||||
"Mavenfile": {"Ruby"}, |
||||
"Modulefile": {"Puppet"}, |
||||
"NEWS": {"Text"}, |
||||
"Notebook": {"Jupyter Notebook"}, |
||||
"NuGet.config": {"XML"}, |
||||
"Nukefile": {"Nu"}, |
||||
"PKGBUILD": {"Shell"}, |
||||
"Phakefile": {"PHP"}, |
||||
"Podfile": {"Ruby"}, |
||||
"Project.ede": {"Emacs Lisp"}, |
||||
"Puppetfile": {"Ruby"}, |
||||
"README.1ST": {"Text"}, |
||||
"README.me": {"Text"}, |
||||
"README.mysql": {"Text"}, |
||||
"ROOT": {"Isabelle ROOT"}, |
||||
"Rakefile": {"Ruby"}, |
||||
"Rexfile": {"Perl"}, |
||||
"SConscript": {"Python"}, |
||||
"SConstruct": {"Python"}, |
||||
"Settings.StyleCop": {"XML"}, |
||||
"Slakefile": {"LiveScript"}, |
||||
"Snakefile": {"Python"}, |
||||
"Snapfile": {"Ruby"}, |
||||
"Thorfile": {"Ruby"}, |
||||
"Vagrantfile": {"Ruby"}, |
||||
"WORKSPACE": {"Python"}, |
||||
"Web.Debug.config": {"XML"}, |
||||
"Web.Release.config": {"XML"}, |
||||
"Web.config": {"XML"}, |
||||
"XCompose": {"XCompose"}, |
||||
"_emacs": {"Emacs Lisp"}, |
||||
"_vimrc": {"Vim script"}, |
||||
"abbrev_defs": {"Emacs Lisp"}, |
||||
"ack": {"Perl"}, |
||||
"ant.xml": {"Ant Build System"}, |
||||
"apache2.conf": {"ApacheConf"}, |
||||
"bash_aliases": {"Shell"}, |
||||
"bash_logout": {"Shell"}, |
||||
"bash_profile": {"Shell"}, |
||||
"bashrc": {"Shell"}, |
||||
"build.xml": {"Ant Build System"}, |
||||
"buildfile": {"Ruby"}, |
||||
"buildozer.spec": {"INI"}, |
||||
"cabal.config": {"Cabal Config"}, |
||||
"cabal.project": {"Cabal Config"}, |
||||
"click.me": {"Text"}, |
||||
"composer.lock": {"JSON"}, |
||||
"configure.ac": {"M4Sugar"}, |
||||
"contents.lr": {"Markdown"}, |
||||
"cpanfile": {"Perl"}, |
||||
"cshrc": {"Shell"}, |
||||
"delete.me": {"Text"}, |
||||
"descrip.mmk": {"Module Management System"}, |
||||
"descrip.mms": {"Module Management System"}, |
||||
"encodings.dir": {"X Font Directory Index"}, |
||||
"expr-dist": {"R"}, |
||||
"firestore.rules": {"Cloud Firestore Security Rules"}, |
||||
"fonts.alias": {"X Font Directory Index"}, |
||||
"fonts.dir": {"X Font Directory Index"}, |
||||
"fonts.scale": {"X Font Directory Index"}, |
||||
"fp-lib-table": {"KiCad Layout"}, |
||||
"gitignore-global": {"Ignore List"}, |
||||
"gitignore_global": {"Ignore List"}, |
||||
"glide.lock": {"YAML"}, |
||||
"go.mod": {"Text"}, |
||||
"go.sum": {"Text"}, |
||||
"gradlew": {"Shell"}, |
||||
"gvimrc": {"Vim script"}, |
||||
"haproxy.cfg": {"HAProxy"}, |
||||
"httpd.conf": {"ApacheConf"}, |
||||
"jsconfig.json": {"JSON with Comments"}, |
||||
"keep.me": {"Text"}, |
||||
"language-configuration.json": {"JSON with Comments"}, |
||||
"ld.script": {"Linker Script"}, |
||||
"login": {"Shell"}, |
||||
"m3makefile": {"Quake"}, |
||||
"m3overrides": {"Quake"}, |
||||
"makefile": {"Makefile"}, |
||||
"makefile.sco": {"Makefile"}, |
||||
"man": {"Shell"}, |
||||
"mcmod.info": {"JSON"}, |
||||
"meson.build": {"Meson"}, |
||||
"meson_options.txt": {"Meson"}, |
||||
"mix.lock": {"Elixir"}, |
||||
"mkfile": {"Makefile"}, |
||||
"mmn": {"Roff"}, |
||||
"mmt": {"Roff"}, |
||||
"nanorc": {"nanorc"}, |
||||
"nextflow.config": {"Nextflow"}, |
||||
"nginx.conf": {"Nginx"}, |
||||
"nim.cfg": {"Nim"}, |
||||
"nvimrc": {"Vim script"}, |
||||
"owh": {"Tcl"}, |
||||
"packages.config": {"XML"}, |
||||
"pom.xml": {"Maven POM"}, |
||||
"profile": {"Shell"}, |
||||
"read.me": {"Text"}, |
||||
"readme.1st": {"Text"}, |
||||
"rebar.config": {"Erlang"}, |
||||
"rebar.config.lock": {"Erlang"}, |
||||
"rebar.lock": {"Erlang"}, |
||||
"riemann.config": {"Clojure"}, |
||||
"ssh-config": {"SSH Config"}, |
||||
"ssh_config": {"SSH Config"}, |
||||
"sshconfig": {"SSH Config"}, |
||||
"sshconfig.snip": {"SSH Config"}, |
||||
"sshd-config": {"SSH Config"}, |
||||
"sshd_config": {"SSH Config"}, |
||||
"starfield": {"Tcl"}, |
||||
"test.me": {"Text"}, |
||||
"tsconfig.json": {"JSON with Comments"}, |
||||
"vimrc": {"Vim script"}, |
||||
"wscript": {"Python"}, |
||||
"xcompose": {"XCompose"}, |
||||
"zlogin": {"Shell"}, |
||||
"zlogout": {"Shell"}, |
||||
"zprofile": {"Shell"}, |
||||
"zshenv": {"Shell"}, |
||||
"zshrc": {"Shell"}, |
||||
} |
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,35 @@ |
||||
package data |
||||
|
||||
import "github.com/src-d/enry/v2/data/rule" |
||||
|
||||
// Heuristics implements a rule-based content matching engine.
|
||||
|
||||
// Heuristics is a number of sequntially applied rule.Heuristic where a
|
||||
// matching one disambiguages language(s) for a single file extension.
|
||||
type Heuristics []rule.Heuristic |
||||
|
||||
// Match returns languages identified by the matching rule of the heuristic.
|
||||
func (hs Heuristics) Match(data []byte) []string { |
||||
var matchedLangs []string |
||||
for _, heuristic := range hs { |
||||
if heuristic.Match(data) { |
||||
for _, langOrAlias := range heuristic.Languages() { |
||||
lang, ok := LanguageByAlias(langOrAlias) |
||||
if !ok { // should never happen
|
||||
// reaching here means language name/alias in heuristics.yml
|
||||
// is not consistent with languages.yml
|
||||
// but we do not surface any such error at the API
|
||||
continue |
||||
} |
||||
matchedLangs = append(matchedLangs, lang) |
||||
} |
||||
break |
||||
} |
||||
} |
||||
return matchedLangs |
||||
} |
||||
|
||||
// matchString is a convenience used only in tests.
|
||||
func (hs *Heuristics) matchString(data string) []string { |
||||
return hs.Match([]byte(data)) |
||||
} |
@ -0,0 +1,124 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
var LanguagesByInterpreter = map[string][]string{ |
||||
"Rscript": {"R"}, |
||||
"apl": {"APL"}, |
||||
"aplx": {"APL"}, |
||||
"ash": {"Shell"}, |
||||
"asy": {"Asymptote"}, |
||||
"awk": {"Awk"}, |
||||
"bash": {"Shell"}, |
||||
"bigloo": {"Scheme"}, |
||||
"boolector": {"SMT"}, |
||||
"ccl": {"Common Lisp"}, |
||||
"chakra": {"JavaScript"}, |
||||
"chicken": {"Scheme"}, |
||||
"clisp": {"Common Lisp"}, |
||||
"coffee": {"CoffeeScript"}, |
||||
"cperl": {"Perl"}, |
||||
"crystal": {"Crystal"}, |
||||
"csi": {"Scheme"}, |
||||
"cvc4": {"SMT"}, |
||||
"cwl-runner": {"Common Workflow Language"}, |
||||
"d8": {"JavaScript"}, |
||||
"dart": {"Dart"}, |
||||
"dash": {"Shell"}, |
||||
"deno": {"TypeScript"}, |
||||
"dtrace": {"DTrace"}, |
||||
"dyalog": {"APL"}, |
||||
"ecl": {"Common Lisp"}, |
||||
"elixir": {"Elixir"}, |
||||
"escript": {"Erlang"}, |
||||
"fish": {"fish"}, |
||||
"gawk": {"Awk"}, |
||||
"gerbv": {"Gerber Image"}, |
||||
"gerbview": {"Gerber Image"}, |
||||
"gn": {"GN"}, |
||||
"gnuplot": {"Gnuplot"}, |
||||
"gosh": {"Scheme"}, |
||||
"groovy": {"Groovy"}, |
||||
"gsed": {"sed"}, |
||||
"guile": {"Scheme"}, |
||||
"hy": {"Hy"}, |
||||
"instantfpc": {"Pascal"}, |
||||
"io": {"Io"}, |
||||
"ioke": {"Ioke"}, |
||||
"jconsole": {"J"}, |
||||
"jolie": {"Jolie"}, |
||||
"jruby": {"Ruby"}, |
||||
"js": {"JavaScript"}, |
||||
"julia": {"Julia"}, |
||||
"ksh": {"Shell"}, |
||||
"lisp": {"Common Lisp"}, |
||||
"lsl": {"LSL"}, |
||||
"lua": {"Lua", "Terra"}, |
||||
"macruby": {"Ruby"}, |
||||
"make": {"Makefile"}, |
||||
"mathsat5": {"SMT"}, |
||||
"mawk": {"Awk"}, |
||||
"minised": {"sed"}, |
||||
"mksh": {"Shell"}, |
||||
"mmi": {"Mercury"}, |
||||
"moon": {"MoonScript"}, |
||||
"nawk": {"Awk"}, |
||||
"newlisp": {"NewLisp"}, |
||||
"nextflow": {"Nextflow"}, |
||||
"node": {"JavaScript"}, |
||||
"nush": {"Nu"}, |
||||
"ocaml": {"OCaml", "Reason"}, |
||||
"ocamlrun": {"OCaml"}, |
||||
"ocamlscript": {"OCaml"}, |
||||
"openrc-run": {"OpenRC runscript"}, |
||||
"opensmt": {"SMT"}, |
||||
"osascript": {"AppleScript"}, |
||||
"parrot": {"Parrot Assembly", "Parrot Internal Representation"}, |
||||
"pdksh": {"Shell"}, |
||||
"perl": {"Perl", "Pod"}, |
||||
"perl6": {"Perl 6", "Pod 6"}, |
||||
"php": {"PHP"}, |
||||
"picolisp": {"PicoLisp"}, |
||||
"pike": {"Pike"}, |
||||
"pil": {"PicoLisp"}, |
||||
"pwsh": {"PowerShell"}, |
||||
"python": {"Python"}, |
||||
"python2": {"Python"}, |
||||
"python3": {"Python"}, |
||||
"qmake": {"QMake"}, |
||||
"r6rs": {"Scheme"}, |
||||
"racket": {"Racket"}, |
||||
"rake": {"Ruby"}, |
||||
"rbx": {"Ruby"}, |
||||
"rc": {"Shell"}, |
||||
"regina": {"REXX"}, |
||||
"rexx": {"REXX"}, |
||||
"rhino": {"JavaScript"}, |
||||
"ruby": {"Ruby"}, |
||||
"rune": {"E"}, |
||||
"runhaskell": {"Haskell"}, |
||||
"sbcl": {"Common Lisp"}, |
||||
"scala": {"Scala"}, |
||||
"scheme": {"Scheme"}, |
||||
"sclang": {"SuperCollider"}, |
||||
"scsynth": {"SuperCollider"}, |
||||
"sed": {"sed"}, |
||||
"sh": {"Shell"}, |
||||
"smt-rat": {"SMT"}, |
||||
"smtinterpol": {"SMT"}, |
||||
"ssed": {"sed"}, |
||||
"stp": {"SMT"}, |
||||
"swipl": {"Prolog"}, |
||||
"tcc": {"C"}, |
||||
"tclsh": {"Tcl"}, |
||||
"ts-node": {"TypeScript"}, |
||||
"v8": {"JavaScript"}, |
||||
"v8-shell": {"JavaScript"}, |
||||
"verit": {"SMT"}, |
||||
"wish": {"Tcl"}, |
||||
"yap": {"Prolog"}, |
||||
"yices2": {"SMT"}, |
||||
"z3": {"SMT"}, |
||||
"zsh": {"Shell"}, |
||||
} |
@ -0,0 +1,226 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
var LanguagesMime = map[string]string{ |
||||
"AGS Script": "text/x-c++src", |
||||
"APL": "text/apl", |
||||
"ASN.1": "text/x-ttcn-asn", |
||||
"ASP": "application/x-aspx", |
||||
"Alpine Abuild": "text/x-sh", |
||||
"AngelScript": "text/x-c++src", |
||||
"Ant Build System": "application/xml", |
||||
"Apex": "text/x-java", |
||||
"Asymptote": "text/x-kotlin", |
||||
"Brainfuck": "text/x-brainfuck", |
||||
"C": "text/x-csrc", |
||||
"C#": "text/x-csharp", |
||||
"C++": "text/x-c++src", |
||||
"C2hs Haskell": "text/x-haskell", |
||||
"CMake": "text/x-cmake", |
||||
"COBOL": "text/x-cobol", |
||||
"COLLADA": "text/xml", |
||||
"CSON": "text/x-coffeescript", |
||||
"CSS": "text/css", |
||||
"Cabal Config": "text/x-haskell", |
||||
"ChucK": "text/x-java", |
||||
"Clojure": "text/x-clojure", |
||||
"Closure Templates": "text/x-soy", |
||||
"Cloud Firestore Security Rules": "text/css", |
||||
"CoffeeScript": "text/x-coffeescript", |
||||
"Common Lisp": "text/x-common-lisp", |
||||
"Common Workflow Language": "text/x-yaml", |
||||
"Component Pascal": "text/x-pascal", |
||||
"Crystal": "text/x-crystal", |
||||
"Cuda": "text/x-c++src", |
||||
"Cycript": "text/javascript", |
||||
"Cython": "text/x-cython", |
||||
"D": "text/x-d", |
||||
"DTrace": "text/x-csrc", |
||||
"Dart": "application/dart", |
||||
"Dhall": "text/x-haskell", |
||||
"Diff": "text/x-diff", |
||||
"Dockerfile": "text/x-dockerfile", |
||||
"Dylan": "text/x-dylan", |
||||
"EBNF": "text/x-ebnf", |
||||
"ECL": "text/x-ecl", |
||||
"EQ": "text/x-csharp", |
||||
"Eagle": "text/xml", |
||||
"Easybuild": "text/x-python", |
||||
"Ecere Projects": "application/json", |
||||
"EditorConfig": "text/x-properties", |
||||
"Edje Data Collection": "application/json", |
||||
"Eiffel": "text/x-eiffel", |
||||
"Elm": "text/x-elm", |
||||
"Emacs Lisp": "text/x-common-lisp", |
||||
"EmberScript": "text/x-coffeescript", |
||||
"Erlang": "text/x-erlang", |
||||
"F#": "text/x-fsharp", |
||||
"Factor": "text/x-factor", |
||||
"Forth": "text/x-forth", |
||||
"Fortran": "text/x-fortran", |
||||
"GCC Machine Description": "text/x-common-lisp", |
||||
"GN": "text/x-python", |
||||
"Game Maker Language": "text/x-c++src", |
||||
"Genshi": "text/xml", |
||||
"Gentoo Ebuild": "text/x-sh", |
||||
"Gentoo Eclass": "text/x-sh", |
||||
"Git Attributes": "text/x-sh", |
||||
"Git Config": "text/x-properties", |
||||
"Glyph": "text/x-tcl", |
||||
"Go": "text/x-go", |
||||
"Grammatical Framework": "text/x-haskell", |
||||
"Groovy": "text/x-groovy", |
||||
"Groovy Server Pages": "application/x-jsp", |
||||
"HCL": "text/x-ruby", |
||||
"HTML": "text/html", |
||||
"HTML+Django": "text/x-django", |
||||
"HTML+ECR": "text/html", |
||||
"HTML+EEX": "text/html", |
||||
"HTML+ERB": "application/x-erb", |
||||
"HTML+PHP": "application/x-httpd-php", |
||||
"HTML+Razor": "text/html", |
||||
"HTTP": "message/http", |
||||
"Hack": "application/x-httpd-php", |
||||
"Haml": "text/x-haml", |
||||
"Haskell": "text/x-haskell", |
||||
"Haxe": "text/x-haxe", |
||||
"HolyC": "text/x-csrc", |
||||
"IDL": "text/x-idl", |
||||
"INI": "text/x-properties", |
||||
"IRC log": "text/mirc", |
||||
"Ignore List": "text/x-sh", |
||||
"JSON": "application/json", |
||||
"JSON with Comments": "text/javascript", |
||||
"JSON5": "application/json", |
||||
"JSONLD": "application/json", |
||||
"JSONiq": "application/json", |
||||
"JSX": "text/jsx", |
||||
"Java": "text/x-java", |
||||
"Java Properties": "text/x-properties", |
||||
"Java Server Pages": "application/x-jsp", |
||||
"JavaScript": "text/javascript", |
||||
"JavaScript+ERB": "application/javascript", |
||||
"Julia": "text/x-julia", |
||||
"Jupyter Notebook": "application/json", |
||||
"KiCad Layout": "text/x-common-lisp", |
||||
"Kit": "text/html", |
||||
"Kotlin": "text/x-kotlin", |
||||
"LFE": "text/x-common-lisp", |
||||
"LTspice Symbol": "text/x-spreadsheet", |
||||
"LabVIEW": "text/xml", |
||||
"Latte": "text/x-smarty", |
||||
"Less": "text/css", |
||||
"Literate Haskell": "text/x-literate-haskell", |
||||
"LiveScript": "text/x-livescript", |
||||
"LookML": "text/x-yaml", |
||||
"Lua": "text/x-lua", |
||||
"M": "text/x-mumps", |
||||
"MATLAB": "text/x-octave", |
||||
"MTML": "text/html", |
||||
"MUF": "text/x-forth", |
||||
"Makefile": "text/x-cmake", |
||||
"Markdown": "text/x-gfm", |
||||
"Marko": "text/html", |
||||
"Mathematica": "text/x-mathematica", |
||||
"Maven POM": "text/xml", |
||||
"Max": "application/json", |
||||
"Metal": "text/x-c++src", |
||||
"Mirah": "text/x-ruby", |
||||
"Modelica": "text/x-modelica", |
||||
"NSIS": "text/x-nsis", |
||||
"NetLogo": "text/x-common-lisp", |
||||
"NewLisp": "text/x-common-lisp", |
||||
"Nginx": "text/x-nginx-conf", |
||||
"Nu": "text/x-scheme", |
||||
"NumPy": "text/x-python", |
||||
"OCaml": "text/x-ocaml", |
||||
"Objective-C": "text/x-objectivec", |
||||
"Objective-C++": "text/x-objectivec", |
||||
"OpenCL": "text/x-csrc", |
||||
"OpenRC runscript": "text/x-sh", |
||||
"Oz": "text/x-oz", |
||||
"PHP": "application/x-httpd-php", |
||||
"PLSQL": "text/x-plsql", |
||||
"PLpgSQL": "text/x-sql", |
||||
"Pascal": "text/x-pascal", |
||||
"Perl": "text/x-perl", |
||||
"Perl 6": "text/x-perl", |
||||
"Pic": "text/troff", |
||||
"Pod": "text/x-perl", |
||||
"PowerShell": "application/x-powershell", |
||||
"Protocol Buffer": "text/x-protobuf", |
||||
"Public Key": "application/pgp", |
||||
"Pug": "text/x-pug", |
||||
"Puppet": "text/x-puppet", |
||||
"PureScript": "text/x-haskell", |
||||
"Python": "text/x-python", |
||||
"R": "text/x-rsrc", |
||||
"RAML": "text/x-yaml", |
||||
"RHTML": "application/x-erb", |
||||
"RMarkdown": "text/x-gfm", |
||||
"RPM Spec": "text/x-rpm-spec", |
||||
"Reason": "text/x-rustsrc", |
||||
"Roff": "text/troff", |
||||
"Roff Manpage": "text/troff", |
||||
"Rouge": "text/x-clojure", |
||||
"Ruby": "text/x-ruby", |
||||
"Rust": "text/x-rustsrc", |
||||
"SAS": "text/x-sas", |
||||
"SCSS": "text/x-scss", |
||||
"SPARQL": "application/sparql-query", |
||||
"SQL": "text/x-sql", |
||||
"SQLPL": "text/x-sql", |
||||
"SRecode Template": "text/x-common-lisp", |
||||
"SVG": "text/xml", |
||||
"Sage": "text/x-python", |
||||
"SaltStack": "text/x-yaml", |
||||
"Sass": "text/x-sass", |
||||
"Scala": "text/x-scala", |
||||
"Scheme": "text/x-scheme", |
||||
"Shell": "text/x-sh", |
||||
"ShellSession": "text/x-sh", |
||||
"Slim": "text/x-slim", |
||||
"Smalltalk": "text/x-stsrc", |
||||
"Smarty": "text/x-smarty", |
||||
"Squirrel": "text/x-c++src", |
||||
"Standard ML": "text/x-ocaml", |
||||
"Svelte": "text/html", |
||||
"Swift": "text/x-swift", |
||||
"SystemVerilog": "text/x-systemverilog", |
||||
"TOML": "text/x-toml", |
||||
"TSX": "text/jsx", |
||||
"Tcl": "text/x-tcl", |
||||
"Tcsh": "text/x-sh", |
||||
"TeX": "text/x-stex", |
||||
"Terra": "text/x-lua", |
||||
"Textile": "text/x-textile", |
||||
"Turtle": "text/turtle", |
||||
"Twig": "text/x-twig", |
||||
"TypeScript": "application/typescript", |
||||
"Unified Parallel C": "text/x-csrc", |
||||
"Unity3D Asset": "text/x-yaml", |
||||
"Uno": "text/x-csharp", |
||||
"UnrealScript": "text/x-java", |
||||
"VHDL": "text/x-vhdl", |
||||
"Verilog": "text/x-verilog", |
||||
"Visual Basic": "text/x-vb", |
||||
"Volt": "text/x-d", |
||||
"WebAssembly": "text/x-common-lisp", |
||||
"WebIDL": "text/x-webidl", |
||||
"Windows Registry Entries": "text/x-properties", |
||||
"X BitMap": "text/x-csrc", |
||||
"X PixMap": "text/x-csrc", |
||||
"XC": "text/x-csrc", |
||||
"XML": "text/xml", |
||||
"XPages": "text/xml", |
||||
"XProc": "text/xml", |
||||
"XQuery": "application/xquery", |
||||
"XS": "text/x-csrc", |
||||
"XSLT": "text/xml", |
||||
"YAML": "text/x-yaml", |
||||
"edn": "text/x-clojure", |
||||
"reStructuredText": "text/x-rst", |
||||
"wisp": "text/x-clojure", |
||||
} |
@ -0,0 +1,109 @@ |
||||
// Package rule contains rule-based heuristic implementations.
|
||||
// It is used in the generated code in content.go for disambiguation of languages
|
||||
// with colliding extensions, based on regexps from Linguist data.
|
||||
package rule |
||||
|
||||
// Heuristic consists of (a number of) rules where each, if it matches,
|
||||
// identifies content as belonging to a programming language(s).
|
||||
type Heuristic interface { |
||||
Matcher |
||||
Languages() []string |
||||
} |
||||
|
||||
// Matcher checks if the data matches (a number of) patterns.
|
||||
// Every heuristic rule below implements this interface.
|
||||
// A regexp.Regexp satisfies this interface and can be used instead.
|
||||
type Matcher interface { |
||||
Match(data []byte) bool |
||||
} |
||||
|
||||
// languages struct incapsulate data common to every Matcher: all languages
|
||||
// that it identifies.
|
||||
type languages struct { |
||||
langs []string |
||||
} |
||||
|
||||
// Languages returns all languages, identified by this Matcher.
|
||||
func (l languages) Languages() []string { |
||||
return l.langs |
||||
} |
||||
|
||||
// MatchingLanguages is a helper to create new languages.
|
||||
func MatchingLanguages(langs ...string) languages { |
||||
return languages{langs} |
||||
} |
||||
|
||||
// Implements a Heuristic.
|
||||
type or struct { |
||||
languages |
||||
pattern Matcher |
||||
} |
||||
|
||||
// Or rule matches, if a single matching pattern exists.
|
||||
// It recives only one pattern as it relies on compile-time optimization that
|
||||
// represtes union with | inside a single regexp.
|
||||
func Or(l languages, r Matcher) Heuristic { |
||||
return or{l, r} |
||||
} |
||||
|
||||
// Match implements rule.Matcher.
|
||||
func (r or) Match(data []byte) bool { |
||||
return r.pattern.Match(data) |
||||
} |
||||
|
||||
// Implements a Heuristic.
|
||||
type and struct { |
||||
languages |
||||
patterns []Matcher |
||||
} |
||||
|
||||
// And rule matches, if each of the patterns does match.
|
||||
func And(l languages, m ...Matcher) Heuristic { |
||||
return and{l, m} |
||||
} |
||||
|
||||
// Match implements data.Matcher.
|
||||
func (r and) Match(data []byte) bool { |
||||
for _, p := range r.patterns { |
||||
if !p.Match(data) { |
||||
return false |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// Implements a Heuristic.
|
||||
type not struct { |
||||
languages |
||||
Patterns []Matcher |
||||
} |
||||
|
||||
// Not rule matches if none of the patterns match.
|
||||
func Not(l languages, r ...Matcher) Heuristic { |
||||
return not{l, r} |
||||
} |
||||
|
||||
// Match implements data.Matcher.
|
||||
func (r not) Match(data []byte) bool { |
||||
for _, p := range r.Patterns { |
||||
if p.Match(data) { |
||||
return false |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// Implements a Heuristic.
|
||||
type always struct { |
||||
languages |
||||
} |
||||
|
||||
// Always rule always matches. Often is used as a default fallback.
|
||||
func Always(l languages) Heuristic { |
||||
return always{l} |
||||
} |
||||
|
||||
// Match implements Matcher.
|
||||
func (r always) Match(data []byte) bool { |
||||
return true |
||||
} |
@ -0,0 +1,526 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
// LanguagesType maps a language name to an integer type code.
// NOTE(review): the codes presumably follow Linguist's language types
// (1 = data, 2 = programming, 3 = markup, 4 = prose) — confirm against the
// Type constants of the enry package. This file is generated, so a permanent
// comment belongs in the code generator's template.
var LanguagesType = map[string]int{
	"1C Enterprise": 2,
	"ABAP": 2,
	"ABNF": 1,
	"AGS Script": 2,
	"AMPL": 2,
	"ANTLR": 2,
	"API Blueprint": 3,
	"APL": 2,
	"ASN.1": 1,
	"ASP": 2,
	"ATS": 2,
	"ActionScript": 2,
	"Ada": 2,
	"Adobe Font Metrics": 1,
	"Agda": 2,
	"Alloy": 2,
	"Alpine Abuild": 2,
	"Altium Designer": 1,
	"AngelScript": 2,
	"Ant Build System": 1,
	"ApacheConf": 1,
	"Apex": 2,
	"Apollo Guidance Computer": 2,
	"AppleScript": 2,
	"Arc": 2,
	"AsciiDoc": 4,
	"AspectJ": 2,
	"Assembly": 2,
	"Asymptote": 2,
	"Augeas": 2,
	"AutoHotkey": 2,
	"AutoIt": 2,
	"Awk": 2,
	"Ballerina": 2,
	"Batchfile": 2,
	"Befunge": 2,
	"Bison": 2,
	"BitBake": 2,
	"Blade": 3,
	"BlitzBasic": 2,
	"BlitzMax": 2,
	"Bluespec": 2,
	"Boo": 2,
	"Brainfuck": 2,
	"Brightscript": 2,
	"C": 2,
	"C#": 2,
	"C++": 2,
	"C-ObjDump": 1,
	"C2hs Haskell": 2,
	"CLIPS": 2,
	"CMake": 2,
	"COBOL": 2,
	"COLLADA": 1,
	"CSON": 1,
	"CSS": 3,
	"CSV": 1,
	"CWeb": 2,
	"Cabal Config": 1,
	"Cap'n Proto": 2,
	"CartoCSS": 2,
	"Ceylon": 2,
	"Chapel": 2,
	"Charity": 2,
	"ChucK": 2,
	"Cirru": 2,
	"Clarion": 2,
	"Clean": 2,
	"Click": 2,
	"Clojure": 2,
	"Closure Templates": 3,
	"Cloud Firestore Security Rules": 1,
	"CoNLL-U": 1,
	"CoffeeScript": 2,
	"ColdFusion": 2,
	"ColdFusion CFC": 2,
	"Common Lisp": 2,
	"Common Workflow Language": 2,
	"Component Pascal": 2,
	"Cool": 2,
	"Coq": 2,
	"Cpp-ObjDump": 1,
	"Creole": 4,
	"Crystal": 2,
	"Csound": 2,
	"Csound Document": 2,
	"Csound Score": 2,
	"Cuda": 2,
	"Cycript": 2,
	"Cython": 2,
	"D": 2,
	"D-ObjDump": 1,
	"DIGITAL Command Language": 2,
	"DM": 2,
	"DNS Zone": 1,
	"DTrace": 2,
	"Darcs Patch": 1,
	"Dart": 2,
	"DataWeave": 2,
	"Dhall": 2,
	"Diff": 1,
	"Dockerfile": 2,
	"Dogescript": 2,
	"Dylan": 2,
	"E": 2,
	"EBNF": 1,
	"ECL": 2,
	"ECLiPSe": 2,
	"EJS": 3,
	"EML": 1,
	"EQ": 2,
	"Eagle": 1,
	"Easybuild": 1,
	"Ecere Projects": 1,
	"EditorConfig": 1,
	"Edje Data Collection": 1,
	"Eiffel": 2,
	"Elixir": 2,
	"Elm": 2,
	"Emacs Lisp": 2,
	"EmberScript": 2,
	"Erlang": 2,
	"F#": 2,
	"F*": 2,
	"FIGlet Font": 1,
	"FLUX": 2,
	"Factor": 2,
	"Fancy": 2,
	"Fantom": 2,
	"Filebench WML": 2,
	"Filterscript": 2,
	"Formatted": 1,
	"Forth": 2,
	"Fortran": 2,
	"FreeMarker": 2,
	"Frege": 2,
	"G-code": 2,
	"GAMS": 2,
	"GAP": 2,
	"GCC Machine Description": 2,
	"GDB": 2,
	"GDScript": 2,
	"GLSL": 2,
	"GN": 1,
	"Game Maker Language": 2,
	"Genie": 2,
	"Genshi": 2,
	"Gentoo Ebuild": 2,
	"Gentoo Eclass": 2,
	"Gerber Image": 1,
	"Gettext Catalog": 4,
	"Gherkin": 2,
	"Git Attributes": 1,
	"Git Config": 1,
	"Glyph": 2,
	"Glyph Bitmap Distribution Format": 1,
	"Gnuplot": 2,
	"Go": 2,
	"Golo": 2,
	"Gosu": 2,
	"Grace": 2,
	"Gradle": 1,
	"Grammatical Framework": 2,
	"Graph Modeling Language": 1,
	"GraphQL": 1,
	"Graphviz (DOT)": 1,
	"Groovy": 2,
	"Groovy Server Pages": 2,
	"HAProxy": 1,
	"HCL": 2,
	"HLSL": 2,
	"HTML": 3,
	"HTML+Django": 3,
	"HTML+ECR": 3,
	"HTML+EEX": 3,
	"HTML+ERB": 3,
	"HTML+PHP": 3,
	"HTML+Razor": 3,
	"HTTP": 1,
	"HXML": 1,
	"Hack": 2,
	"Haml": 3,
	"Handlebars": 3,
	"Harbour": 2,
	"Haskell": 2,
	"Haxe": 2,
	"HiveQL": 2,
	"HolyC": 2,
	"Hy": 2,
	"HyPhy": 2,
	"IDL": 2,
	"IGOR Pro": 2,
	"INI": 1,
	"IRC log": 1,
	"Idris": 2,
	"Ignore List": 1,
	"Inform 7": 2,
	"Inno Setup": 2,
	"Io": 2,
	"Ioke": 2,
	"Isabelle": 2,
	"Isabelle ROOT": 2,
	"J": 2,
	"JFlex": 2,
	"JSON": 1,
	"JSON with Comments": 1,
	"JSON5": 1,
	"JSONLD": 1,
	"JSONiq": 2,
	"JSX": 2,
	"Jasmin": 2,
	"Java": 2,
	"Java Properties": 1,
	"Java Server Pages": 2,
	"JavaScript": 2,
	"JavaScript+ERB": 2,
	"Jison": 2,
	"Jison Lex": 2,
	"Jolie": 2,
	"Jsonnet": 2,
	"Julia": 2,
	"Jupyter Notebook": 3,
	"KRL": 2,
	"KiCad Layout": 1,
	"KiCad Legacy Layout": 1,
	"KiCad Schematic": 1,
	"Kit": 3,
	"Kotlin": 2,
	"LFE": 2,
	"LLVM": 2,
	"LOLCODE": 2,
	"LSL": 2,
	"LTspice Symbol": 1,
	"LabVIEW": 2,
	"Lasso": 2,
	"Latte": 3,
	"Lean": 2,
	"Less": 3,
	"Lex": 2,
	"LilyPond": 2,
	"Limbo": 2,
	"Linker Script": 1,
	"Linux Kernel Module": 1,
	"Liquid": 3,
	"Literate Agda": 2,
	"Literate CoffeeScript": 2,
	"Literate Haskell": 2,
	"LiveScript": 2,
	"Logos": 2,
	"Logtalk": 2,
	"LookML": 2,
	"LoomScript": 2,
	"Lua": 2,
	"M": 2,
	"M4": 2,
	"M4Sugar": 2,
	"MATLAB": 2,
	"MAXScript": 2,
	"MQL4": 2,
	"MQL5": 2,
	"MTML": 3,
	"MUF": 2,
	"Makefile": 2,
	"Mako": 2,
	"Markdown": 4,
	"Marko": 3,
	"Mask": 3,
	"Mathematica": 2,
	"Maven POM": 1,
	"Max": 2,
	"MediaWiki": 4,
	"Mercury": 2,
	"Meson": 2,
	"Metal": 2,
	"MiniD": 2,
	"Mirah": 2,
	"Modelica": 2,
	"Modula-2": 2,
	"Modula-3": 2,
	"Module Management System": 2,
	"Monkey": 2,
	"Moocode": 2,
	"MoonScript": 2,
	"Motorola 68K Assembly": 2,
	"Myghty": 2,
	"NCL": 2,
	"NL": 1,
	"NSIS": 2,
	"Nearley": 2,
	"Nemerle": 2,
	"NetLinx": 2,
	"NetLinx+ERB": 2,
	"NetLogo": 2,
	"NewLisp": 2,
	"Nextflow": 2,
	"Nginx": 1,
	"Nim": 2,
	"Ninja": 1,
	"Nit": 2,
	"Nix": 2,
	"Nu": 2,
	"NumPy": 2,
	"OCaml": 2,
	"ObjDump": 1,
	"ObjectScript": 2,
	"Objective-C": 2,
	"Objective-C++": 2,
	"Objective-J": 2,
	"Omgrofl": 2,
	"Opa": 2,
	"Opal": 2,
	"OpenCL": 2,
	"OpenEdge ABL": 2,
	"OpenRC runscript": 2,
	"OpenSCAD": 2,
	"OpenType Feature File": 1,
	"Org": 4,
	"Ox": 2,
	"Oxygene": 2,
	"Oz": 2,
	"P4": 2,
	"PHP": 2,
	"PLSQL": 2,
	"PLpgSQL": 2,
	"POV-Ray SDL": 2,
	"Pan": 2,
	"Papyrus": 2,
	"Parrot": 2,
	"Parrot Assembly": 2,
	"Parrot Internal Representation": 2,
	"Pascal": 2,
	"Pawn": 2,
	"Pep8": 2,
	"Perl": 2,
	"Perl 6": 2,
	"Pic": 3,
	"Pickle": 1,
	"PicoLisp": 2,
	"PigLatin": 2,
	"Pike": 2,
	"Pod": 4,
	"Pod 6": 4,
	"PogoScript": 2,
	"Pony": 2,
	"PostCSS": 3,
	"PostScript": 3,
	"PowerBuilder": 2,
	"PowerShell": 2,
	"Processing": 2,
	"Prolog": 2,
	"Propeller Spin": 2,
	"Protocol Buffer": 1,
	"Public Key": 1,
	"Pug": 3,
	"Puppet": 2,
	"Pure Data": 1,
	"PureBasic": 2,
	"PureScript": 2,
	"Python": 2,
	"Python console": 2,
	"Python traceback": 1,
	"QML": 2,
	"QMake": 2,
	"Quake": 2,
	"R": 2,
	"RAML": 3,
	"RDoc": 4,
	"REALbasic": 2,
	"REXX": 2,
	"RHTML": 3,
	"RMarkdown": 4,
	"RPC": 2,
	"RPM Spec": 1,
	"RUNOFF": 3,
	"Racket": 2,
	"Ragel": 2,
	"Rascal": 2,
	"Raw token data": 1,
	"Reason": 2,
	"Rebol": 2,
	"Red": 2,
	"Redcode": 2,
	"Regular Expression": 1,
	"Ren'Py": 2,
	"RenderScript": 2,
	"Rich Text Format": 3,
	"Ring": 2,
	"RobotFramework": 2,
	"Roff": 3,
	"Roff Manpage": 3,
	"Rouge": 2,
	"Ruby": 2,
	"Rust": 2,
	"SAS": 2,
	"SCSS": 3,
	"SMT": 2,
	"SPARQL": 1,
	"SQF": 2,
	"SQL": 1,
	"SQLPL": 2,
	"SRecode Template": 3,
	"SSH Config": 1,
	"STON": 1,
	"SVG": 1,
	"Sage": 2,
	"SaltStack": 2,
	"Sass": 3,
	"Scala": 2,
	"Scaml": 3,
	"Scheme": 2,
	"Scilab": 2,
	"Self": 2,
	"ShaderLab": 2,
	"Shell": 2,
	"ShellSession": 2,
	"Shen": 2,
	"Slash": 2,
	"Slice": 2,
	"Slim": 3,
	"Smali": 2,
	"Smalltalk": 2,
	"Smarty": 2,
	"Solidity": 2,
	"SourcePawn": 2,
	"Spline Font Database": 1,
	"Squirrel": 2,
	"Stan": 2,
	"Standard ML": 2,
	"Stata": 2,
	"Stylus": 3,
	"SubRip Text": 1,
	"SugarSS": 3,
	"SuperCollider": 2,
	"Svelte": 3,
	"Swift": 2,
	"SystemVerilog": 2,
	"TI Program": 2,
	"TLA": 2,
	"TOML": 1,
	"TSQL": 2,
	"TSX": 2,
	"TXL": 2,
	"Tcl": 2,
	"Tcsh": 2,
	"TeX": 3,
	"Tea": 3,
	"Terra": 2,
	"Text": 4,
	"Textile": 4,
	"Thrift": 2,
	"Turing": 2,
	"Turtle": 1,
	"Twig": 3,
	"Type Language": 1,
	"TypeScript": 2,
	"Unified Parallel C": 2,
	"Unity3D Asset": 1,
	"Unix Assembly": 2,
	"Uno": 2,
	"UnrealScript": 2,
	"UrWeb": 2,
	"VCL": 2,
	"VHDL": 2,
	"Vala": 2,
	"Verilog": 2,
	"Vim script": 2,
	"Visual Basic": 2,
	"Volt": 2,
	"Vue": 3,
	"Wavefront Material": 1,
	"Wavefront Object": 1,
	"Web Ontology Language": 1,
	"WebAssembly": 2,
	"WebIDL": 2,
	"WebVTT": 1,
	"Windows Registry Entries": 1,
	"Wollok": 2,
	"World of Warcraft Addon Data": 1,
	"X BitMap": 1,
	"X Font Directory Index": 1,
	"X PixMap": 1,
	"X10": 2,
	"XC": 2,
	"XCompose": 1,
	"XML": 1,
	"XPages": 1,
	"XProc": 2,
	"XQuery": 2,
	"XS": 2,
	"XSLT": 2,
	"Xojo": 2,
	"Xtend": 2,
	"YAML": 1,
	"YANG": 1,
	"YARA": 2,
	"YASnippet": 3,
	"Yacc": 2,
	"ZAP": 2,
	"ZIL": 2,
	"Zeek": 2,
	"ZenScript": 2,
	"Zephir": 2,
	"Zig": 2,
	"Zimpl": 2,
	"desktop": 1,
	"eC": 2,
	"edn": 1,
	"fish": 2,
	"mcfunction": 2,
	"mupad": 2,
	"nanorc": 1,
	"nesC": 2,
	"ooc": 2,
	"q": 2,
	"reStructuredText": 4,
	"sed": 2,
	"wdl": 2,
	"wisp": 2,
	"xBase": 2,
}
@ -0,0 +1,166 @@ |
||||
// Code generated by github.com/src-d/enry/v2/internal/code-generator DO NOT EDIT.
|
||||
// Extracted from github/linguist commit: 3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d
|
||||
|
||||
package data |
||||
|
||||
import "gopkg.in/toqueteos/substring.v1" |
||||
|
||||
// VendorMatchers combines, with substring.Or, one regexp matcher per pattern
// listed below.
// NOTE(review): presumably applied to repository-relative file paths to
// recognise vendored or third-party files — confirm at the call site. Several
// patterns contain unescaped dots (e.g. `.vscode`), which match any character;
// they are kept verbatim because this file is generated from Linguist data.
var VendorMatchers = substring.Or(
	substring.Regexp(`(^|/)cache/`),
	substring.Regexp(`^[Dd]ependencies/`),
	substring.Regexp(`(^|/)dist/`),
	substring.Regexp(`^deps/`),
	substring.Regexp(`(^|/)configure$`),
	substring.Regexp(`(^|/)config.guess$`),
	substring.Regexp(`(^|/)config.sub$`),
	substring.Regexp(`(^|/)aclocal.m4`),
	substring.Regexp(`(^|/)libtool.m4`),
	substring.Regexp(`(^|/)ltoptions.m4`),
	substring.Regexp(`(^|/)ltsugar.m4`),
	substring.Regexp(`(^|/)ltversion.m4`),
	substring.Regexp(`(^|/)lt~obsolete.m4`),
	substring.Regexp(`cpplint.py`),
	substring.Regexp(`node_modules/`),
	substring.Regexp(`bower_components/`),
	substring.Regexp(`^rebar$`),
	substring.Regexp(`erlang.mk`),
	substring.Regexp(`Godeps/_workspace/`),
	substring.Regexp(`(^|/)testdata/`),
	substring.Regexp(`.indent.pro`),
	substring.Regexp(`(\.|-)min\.(js|css)$`),
	substring.Regexp(`([^\s]*)import\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
	substring.Regexp(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
	substring.Regexp(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)foundation\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)normalize\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)skeleton\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)animate\.(css|less|scss|styl)$`),
	substring.Regexp(`(^|/)materialize\.(css|less|scss|styl|js)$`),
	substring.Regexp(`(^|/)select2/.*\.(css|scss|js)$`),
	substring.Regexp(`(^|/)bulma\.(css|sass|scss)$`),
	substring.Regexp(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
	substring.Regexp(`vendors?/`),
	substring.Regexp(`extern(al)?/`),
	substring.Regexp(`(^|/)[Vv]+endor/`),
	substring.Regexp(`^debian/`),
	substring.Regexp(`run.n$`),
	substring.Regexp(`bootstrap-datepicker/`),
	substring.Regexp(`(^|/)jquery([^.]*)\.js$`),
	substring.Regexp(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
	substring.Regexp(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
	substring.Regexp(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
	substring.Regexp(`jquery.fn.gantt.js`),
	substring.Regexp(`jquery.fancybox.(js|css)`),
	substring.Regexp(`fuelux.js`),
	substring.Regexp(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
	substring.Regexp(`jquery.dataTables.js`),
	substring.Regexp(`bootbox.js`),
	substring.Regexp(`pdf.worker.js`),
	substring.Regexp(`(^|/)slick\.\w+.js$`),
	substring.Regexp(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
	substring.Regexp(`leaflet.draw-src.js`),
	substring.Regexp(`leaflet.draw.css`),
	substring.Regexp(`Control.FullScreen.css`),
	substring.Regexp(`Control.FullScreen.js`),
	substring.Regexp(`leaflet.spin.js`),
	substring.Regexp(`wicket-leaflet.js`),
	substring.Regexp(`.sublime-project`),
	substring.Regexp(`.sublime-workspace`),
	substring.Regexp(`.vscode`),
	substring.Regexp(`(^|/)prototype(.*)\.js$`),
	substring.Regexp(`(^|/)effects\.js$`),
	substring.Regexp(`(^|/)controls\.js$`),
	substring.Regexp(`(^|/)dragdrop\.js$`),
	substring.Regexp(`(.*?)\.d\.ts$`),
	substring.Regexp(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
	substring.Regexp(`(^|/)dojo\.js$`),
	substring.Regexp(`(^|/)MochiKit\.js$`),
	substring.Regexp(`(^|/)yahoo-([^.]*)\.js$`),
	substring.Regexp(`(^|/)yui([^.]*)\.js$`),
	substring.Regexp(`(^|/)ckeditor\.js$`),
	substring.Regexp(`(^|/)tiny_mce([^.]*)\.js$`),
	substring.Regexp(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
	substring.Regexp(`(^|/)ace-builds/`),
	substring.Regexp(`(^|/)fontello(.*?)\.css$`),
	substring.Regexp(`(^|/)MathJax/`),
	substring.Regexp(`(^|/)Chart\.js$`),
	substring.Regexp(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
	substring.Regexp(`(^|/)shBrush([^.]*)\.js$`),
	substring.Regexp(`(^|/)shCore\.js$`),
	substring.Regexp(`(^|/)shLegacy\.js$`),
	substring.Regexp(`(^|/)angular([^.]*)\.js$`),
	substring.Regexp(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
	substring.Regexp(`(^|/)react(-[^.]*)?\.js$`),
	substring.Regexp(`(^|/)flow-typed/.*\.js$`),
	substring.Regexp(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
	substring.Regexp(`(^|/)modernizr\.custom\.\d+\.js$`),
	substring.Regexp(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
	substring.Regexp(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
	substring.Regexp(`(^|/)admin_media/`),
	substring.Regexp(`(^|/)env/`),
	substring.Regexp(`^fabfile\.py$`),
	substring.Regexp(`^waf$`),
	substring.Regexp(`^.osx$`),
	substring.Regexp(`\.xctemplate/`),
	substring.Regexp(`\.imageset/`),
	substring.Regexp(`(^|/)Carthage/`),
	substring.Regexp(`(^|/)Sparkle/`),
	substring.Regexp(`Crashlytics.framework/`),
	substring.Regexp(`Fabric.framework/`),
	substring.Regexp(`BuddyBuildSDK.framework/`),
	substring.Regexp(`Realm.framework`),
	substring.Regexp(`RealmSwift.framework`),
	substring.Regexp(`gitattributes$`),
	substring.Regexp(`gitignore$`),
	substring.Regexp(`gitmodules$`),
	substring.Regexp(`(^|/)gradlew$`),
	substring.Regexp(`(^|/)gradlew\.bat$`),
	substring.Regexp(`(^|/)gradle/wrapper/`),
	substring.Regexp(`(^|/)mvnw$`),
	substring.Regexp(`(^|/)mvnw\.cmd$`),
	substring.Regexp(`(^|/)\.mvn/wrapper/`),
	substring.Regexp(`-vsdoc\.js$`),
	substring.Regexp(`\.intellisense\.js$`),
	substring.Regexp(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
	substring.Regexp(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
	substring.Regexp(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
	substring.Regexp(`^[Pp]ackages\/.+\.\d+\/`),
	substring.Regexp(`(^|/)extjs/.*?\.js$`),
	substring.Regexp(`(^|/)extjs/.*?\.xml$`),
	substring.Regexp(`(^|/)extjs/.*?\.txt$`),
	substring.Regexp(`(^|/)extjs/.*?\.html$`),
	substring.Regexp(`(^|/)extjs/.*?\.properties$`),
	substring.Regexp(`(^|/)extjs/.sencha/`),
	substring.Regexp(`(^|/)extjs/docs/`),
	substring.Regexp(`(^|/)extjs/builds/`),
	substring.Regexp(`(^|/)extjs/cmd/`),
	substring.Regexp(`(^|/)extjs/examples/`),
	substring.Regexp(`(^|/)extjs/locale/`),
	substring.Regexp(`(^|/)extjs/packages/`),
	substring.Regexp(`(^|/)extjs/plugins/`),
	substring.Regexp(`(^|/)extjs/resources/`),
	substring.Regexp(`(^|/)extjs/src/`),
	substring.Regexp(`(^|/)extjs/welcome/`),
	substring.Regexp(`(^|/)html5shiv\.js$`),
	substring.Regexp(`^[Tt]ests?/fixtures/`),
	substring.Regexp(`^[Ss]pecs?/fixtures/`),
	substring.Regexp(`(^|/)cordova([^.]*)\.js$`),
	substring.Regexp(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
	substring.Regexp(`foundation(\..*)?\.js$`),
	substring.Regexp(`^Vagrantfile$`),
	substring.Regexp(`.[Dd][Ss]_[Ss]tore$`),
	substring.Regexp(`^vignettes/`),
	substring.Regexp(`^inst/extdata/`),
	substring.Regexp(`octicons.css`),
	substring.Regexp(`sprockets-octicons.scss`),
	substring.Regexp(`(^|/)activator$`),
	substring.Regexp(`(^|/)activator\.bat$`),
	substring.Regexp(`proguard.pro`),
	substring.Regexp(`proguard-rules.pro`),
	substring.Regexp(`^puphpet/`),
	substring.Regexp(`(^|/)\.google_apis/`),
	substring.Regexp(`^Jenkinsfile$`),
)
@ -0,0 +1,16 @@ |
||||
/* |
||||
Package enry implements multiple strategies for programming language identification. |
||||
|
||||
Identification is made based on file name and file content using a series |
||||
of strategies to narrow down possible options. |
||||
Each strategy is available as a separate API call, as well as a main entry point |
||||
|
||||
GetLanguage(filename string, content []byte) (language string) |
||||
|
||||
It is a port of the https://github.com/github/linguist from Ruby.
|
||||
Upstream Linguist YAML files are used to generate data structures for data |
||||
package. |
||||
*/ |
||||
package enry // import "github.com/src-d/enry/v2"
|
||||
|
||||
//go:generate make code-generate
|
@ -0,0 +1,11 @@ |
||||
module github.com/src-d/enry/v2 |
||||
|
||||
go 1.12 |
||||
|
||||
require ( |
||||
github.com/src-d/go-oniguruma v1.1.0 |
||||
github.com/stretchr/testify v1.3.0 |
||||
github.com/toqueteos/trie v1.0.0 // indirect |
||||
gopkg.in/toqueteos/substring.v1 v1.0.2 |
||||
gopkg.in/yaml.v2 v2.2.2 |
||||
) |
@ -0,0 +1,17 @@ |
||||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= |
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= |
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= |
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= |
||||
github.com/src-d/go-oniguruma v1.1.0 h1:EG+Nm5n2JqWUaCjtM0NtutPxU7ZN5Tp50GWrrV8bTww= |
||||
github.com/src-d/go-oniguruma v1.1.0/go.mod h1:chVbff8kcVtmrhxtZ3yBVLLquXbzCS6DrxQaAK/CeqM= |
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= |
||||
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= |
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= |
||||
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk= |
||||
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM= |
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= |
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= |
||||
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE= |
||||
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew= |
||||
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= |
||||
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= |
@ -0,0 +1,7 @@ |
||||
// Package tokenizer implements file tokenization used by the enry content
|
||||
// classifier. This package is an implementation detail of enry and should not
|
||||
// be imported by other packages.
|
||||
package tokenizer |
||||
|
||||
// ByteLimit is the length of the longest input prefix that gets tokenized;
// it is the untyped constant 100000.
const ByteLimit = 100 * 1000
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,336 @@ |
||||
#ifndef linguist_yyHEADER_H |
||||
#define linguist_yyHEADER_H 1 |
||||
#define linguist_yyIN_HEADER 1 |
||||
|
||||
#line 6 "lex.linguist_yy.h" |
||||
|
||||
#define YY_INT_ALIGNED short int |
||||
|
||||
/* A lexical scanner generated by flex */ |
||||
|
||||
#define FLEX_SCANNER |
||||
#define YY_FLEX_MAJOR_VERSION 2 |
||||
#define YY_FLEX_MINOR_VERSION 5 |
||||
#define YY_FLEX_SUBMINOR_VERSION 35 |
||||
#if YY_FLEX_SUBMINOR_VERSION > 0 |
||||
#define FLEX_BETA |
||||
#endif |
||||
|
||||
/* First, we deal with platform-specific or compiler-specific issues. */ |
||||
|
||||
/* begin standard C headers. */ |
||||
#include <stdio.h> |
||||
#include <string.h> |
||||
#include <errno.h> |
||||
#include <stdlib.h> |
||||
|
||||
/* end standard C headers. */ |
||||
|
||||
/* flex integer type definitions */ |
||||
|
||||
#ifndef FLEXINT_H |
||||
#define FLEXINT_H |
||||
|
||||
/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */ |
||||
|
||||
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L |
||||
|
||||
/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h,
|
||||
* if you want the limit (max/min) macros for int types.
|
||||
*/ |
||||
#ifndef __STDC_LIMIT_MACROS |
||||
#define __STDC_LIMIT_MACROS 1 |
||||
#endif |
||||
|
||||
#include <inttypes.h> |
||||
typedef int8_t flex_int8_t; |
||||
typedef uint8_t flex_uint8_t; |
||||
typedef int16_t flex_int16_t; |
||||
typedef uint16_t flex_uint16_t; |
||||
typedef int32_t flex_int32_t; |
||||
typedef uint32_t flex_uint32_t; |
||||
typedef uint64_t flex_uint64_t; |
||||
#else |
||||
typedef signed char flex_int8_t; |
||||
typedef short int flex_int16_t; |
||||
typedef int flex_int32_t; |
||||
typedef unsigned char flex_uint8_t;
|
||||
typedef unsigned short int flex_uint16_t; |
||||
typedef unsigned int flex_uint32_t; |
||||
#endif /* ! C99 */ |
||||
|
||||
/* Limits of integral types. */ |
||||
#ifndef INT8_MIN |
||||
#define INT8_MIN (-128) |
||||
#endif |
||||
#ifndef INT16_MIN |
||||
#define INT16_MIN (-32767-1) |
||||
#endif |
||||
#ifndef INT32_MIN |
||||
#define INT32_MIN (-2147483647-1) |
||||
#endif |
||||
#ifndef INT8_MAX |
||||
#define INT8_MAX (127) |
||||
#endif |
||||
#ifndef INT16_MAX |
||||
#define INT16_MAX (32767) |
||||
#endif |
||||
#ifndef INT32_MAX |
||||
#define INT32_MAX (2147483647) |
||||
#endif |
||||
#ifndef UINT8_MAX |
||||
#define UINT8_MAX (255U) |
||||
#endif |
||||
#ifndef UINT16_MAX |
||||
#define UINT16_MAX (65535U) |
||||
#endif |
||||
#ifndef UINT32_MAX |
||||
#define UINT32_MAX (4294967295U) |
||||
#endif |
||||
|
||||
#endif /* ! FLEXINT_H */ |
||||
|
||||
#ifdef __cplusplus |
||||
|
||||
/* The "const" storage-class-modifier is valid. */ |
||||
#define YY_USE_CONST |
||||
|
||||
#else /* ! __cplusplus */ |
||||
|
||||
/* C99 requires __STDC__ to be defined as 1. */ |
||||
#if defined (__STDC__) |
||||
|
||||
#define YY_USE_CONST |
||||
|
||||
#endif /* defined (__STDC__) */ |
||||
#endif /* ! __cplusplus */ |
||||
|
||||
#ifdef YY_USE_CONST |
||||
#define yyconst const |
||||
#else |
||||
#define yyconst |
||||
#endif |
||||
|
||||
/* An opaque pointer. */ |
||||
#ifndef YY_TYPEDEF_YY_SCANNER_T |
||||
#define YY_TYPEDEF_YY_SCANNER_T |
||||
typedef void* yyscan_t; |
||||
#endif |
||||
|
||||
/* For convenience, these vars (plus the bison vars far below)
|
||||
are macros in the reentrant scanner. */ |
||||
#define yyin yyg->yyin_r |
||||
#define yyout yyg->yyout_r |
||||
#define yyextra yyg->yyextra_r |
||||
#define yyleng yyg->yyleng_r |
||||
#define yytext yyg->yytext_r |
||||
#define yylineno (YY_CURRENT_BUFFER_LVALUE->yy_bs_lineno) |
||||
#define yycolumn (YY_CURRENT_BUFFER_LVALUE->yy_bs_column) |
||||
#define yy_flex_debug yyg->yy_flex_debug_r |
||||
|
||||
/* Size of default input buffer. */ |
||||
#ifndef YY_BUF_SIZE |
||||
#define YY_BUF_SIZE 16384 |
||||
#endif |
||||
|
||||
#ifndef YY_TYPEDEF_YY_BUFFER_STATE |
||||
#define YY_TYPEDEF_YY_BUFFER_STATE |
||||
typedef struct yy_buffer_state *YY_BUFFER_STATE; |
||||
#endif |
||||
|
||||
#ifndef YY_TYPEDEF_YY_SIZE_T |
||||
#define YY_TYPEDEF_YY_SIZE_T |
||||
typedef size_t yy_size_t; |
||||
#endif |
||||
|
||||
#ifndef YY_STRUCT_YY_BUFFER_STATE |
||||
#define YY_STRUCT_YY_BUFFER_STATE |
||||
/* State kept for one scanner input buffer: the stdio stream it is tied to,
 * the character buffer with its size and fill level, and the flags and
 * line/column counters described on the fields below.
 */
struct yy_buffer_state
	{
	/* NOTE(review): presumably the stream this buffer reads from — confirm
	 * in the generated scanner source.
	 */
	FILE *yy_input_file;

	char *yy_ch_buf;		/* input buffer */
	char *yy_buf_pos;		/* current position in input buffer */

	/* Size of input buffer in bytes, not including room for EOB
	 * characters.
	 */
	yy_size_t yy_buf_size;

	/* Number of characters read into yy_ch_buf, not including EOB
	 * characters.
	 */
	yy_size_t yy_n_chars;

	/* Whether we "own" the buffer - i.e., we know we created it,
	 * and can realloc() it to grow it, and should free() it to
	 * delete it.
	 */
	int yy_is_our_buffer;

	/* Whether this is an "interactive" input source; if so, and
	 * if we're using stdio for input, then we want to use getc()
	 * instead of fread(), to make sure we stop fetching input after
	 * each newline.
	 */
	int yy_is_interactive;

	/* Whether we're considered to be at the beginning of a line.
	 * If so, '^' rules will be active on the next match, otherwise
	 * not.
	 */
	int yy_at_bol;

    int yy_bs_lineno; /**< The line count. */
    int yy_bs_column; /**< The column count. */

	/* Whether to try to fill the input buffer when we reach the
	 * end of it.
	 */
	int yy_fill_buffer;

	/* NOTE(review): status code of the buffer; its possible values are
	 * not defined in this header — see the generated scanner source.
	 */
	int yy_buffer_status;

	};
||||
#endif /* !YY_STRUCT_YY_BUFFER_STATE */ |
||||
|
||||
void linguist_yyrestart (FILE *input_file ,yyscan_t yyscanner ); |
||||
void linguist_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); |
||||
YY_BUFFER_STATE linguist_yy_create_buffer (FILE *file,int size ,yyscan_t yyscanner ); |
||||
void linguist_yy_delete_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); |
||||
void linguist_yy_flush_buffer (YY_BUFFER_STATE b ,yyscan_t yyscanner ); |
||||
void linguist_yypush_buffer_state (YY_BUFFER_STATE new_buffer ,yyscan_t yyscanner ); |
||||
void linguist_yypop_buffer_state (yyscan_t yyscanner ); |
||||
|
||||
YY_BUFFER_STATE linguist_yy_scan_buffer (char *base,yy_size_t size ,yyscan_t yyscanner ); |
||||
YY_BUFFER_STATE linguist_yy_scan_string (yyconst char *yy_str ,yyscan_t yyscanner ); |
||||
YY_BUFFER_STATE linguist_yy_scan_bytes (yyconst char *bytes,yy_size_t len ,yyscan_t yyscanner ); |
||||
|
||||
void *linguist_yyalloc (yy_size_t ,yyscan_t yyscanner ); |
||||
void *linguist_yyrealloc (void *,yy_size_t ,yyscan_t yyscanner ); |
||||
void linguist_yyfree (void * ,yyscan_t yyscanner ); |
||||
|
||||
/* Begin user sect3 */ |
||||
|
||||
#define yytext_ptr yytext_r |
||||
|
||||
#ifdef YY_HEADER_EXPORT_START_CONDITIONS |
||||
#define INITIAL 0 |
||||
#define sgml 1 |
||||
#define c_comment 2 |
||||
#define xml_comment 3 |
||||
#define haskell_comment 4 |
||||
#define ocaml_comment 5 |
||||
#define python_dcomment 6 |
||||
#define python_scomment 7 |
||||
|
||||
#endif |
||||
|
||||
#ifndef YY_NO_UNISTD_H |
||||
/* Special case for "unistd.h", since it is non-ANSI. We include it way
|
||||
* down here because we want the user's section 1 to have been scanned first. |
||||
* The user has a chance to override it with an option. |
||||
*/ |
||||
#include <unistd.h> |
||||
#endif |
||||
|
||||
#define YY_EXTRA_TYPE struct tokenizer_extra * |
||||
|
||||
int linguist_yylex_init (yyscan_t* scanner); |
||||
|
||||
int linguist_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); |
||||
|
||||
/* Accessor methods to globals.
|
||||
These are made visible to non-reentrant scanners for convenience. */ |
||||
|
||||
int linguist_yylex_destroy (yyscan_t yyscanner ); |
||||
|
||||
int linguist_yyget_debug (yyscan_t yyscanner ); |
||||
|
||||
void linguist_yyset_debug (int debug_flag ,yyscan_t yyscanner ); |
||||
|
||||
YY_EXTRA_TYPE linguist_yyget_extra (yyscan_t yyscanner ); |
||||
|
||||
void linguist_yyset_extra (YY_EXTRA_TYPE user_defined ,yyscan_t yyscanner ); |
||||
|
||||
FILE *linguist_yyget_in (yyscan_t yyscanner ); |
||||
|
||||
void linguist_yyset_in (FILE * in_str ,yyscan_t yyscanner ); |
||||
|
||||
FILE *linguist_yyget_out (yyscan_t yyscanner ); |
||||
|
||||
void linguist_yyset_out (FILE * out_str ,yyscan_t yyscanner ); |
||||
|
||||
yy_size_t linguist_yyget_leng (yyscan_t yyscanner ); |
||||
|
||||
char *linguist_yyget_text (yyscan_t yyscanner ); |
||||
|
||||
int linguist_yyget_lineno (yyscan_t yyscanner ); |
||||
|
||||
void linguist_yyset_lineno (int line_number ,yyscan_t yyscanner ); |
||||
|
||||
/* Macros after this point can all be overridden by user definitions in
|
||||
* section 1. |
||||
*/ |
||||
|
||||
#ifndef YY_SKIP_YYWRAP |
||||
#ifdef __cplusplus |
||||
extern "C" int linguist_yywrap (yyscan_t yyscanner ); |
||||
#else |
||||
extern int linguist_yywrap (yyscan_t yyscanner ); |
||||
#endif |
||||
#endif |
||||
|
||||
#ifndef yytext_ptr |
||||
static void yy_flex_strncpy (char *,yyconst char *,int ,yyscan_t yyscanner); |
||||
#endif |
||||
|
||||
#ifdef YY_NEED_STRLEN |
||||
static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); |
||||
#endif |
||||
|
||||
#ifndef YY_NO_INPUT |
||||
|
||||
#endif |
||||
|
||||
/* Amount of stuff to slurp up with each read. */ |
||||
#ifndef YY_READ_BUF_SIZE |
||||
#define YY_READ_BUF_SIZE 8192 |
||||
#endif |
||||
|
||||
/* Number of entries by which start-condition stack grows. */ |
||||
#ifndef YY_START_STACK_INCR |
||||
#define YY_START_STACK_INCR 25 |
||||
#endif |
||||
|
||||
/* Default declaration of generated scanner - a define so the user can
|
||||
* easily add parameters. |
||||
*/ |
||||
#ifndef YY_DECL |
||||
#define YY_DECL_IS_OURS 1 |
||||
|
||||
extern int linguist_yylex (yyscan_t yyscanner); |
||||
|
||||
#define YY_DECL int linguist_yylex (yyscan_t yyscanner) |
||||
#endif /* !YY_DECL */ |
||||
|
||||
/* yy_get_previous_state - get the state just before the EOB char was reached */ |
||||
|
||||
#undef YY_NEW_FILE |
||||
#undef YY_FLUSH_BUFFER |
||||
#undef yy_set_bol |
||||
#undef yy_new_buffer |
||||
#undef yy_set_interactive |
||||
#undef YY_DO_BEFORE_ACTION |
||||
|
||||
#ifdef YY_DECL_IS_OURS |
||||
#undef YY_DECL_IS_OURS |
||||
#undef YY_DECL |
||||
#endif |
||||
|
||||
#line 118 "tokenizer.l" |
||||
|
||||
|
||||
#line 335 "lex.linguist_yy.h" |
||||
#undef linguist_yyIN_HEADER |
||||
#endif /* linguist_yyHEADER_H */ |
@ -0,0 +1,15 @@ |
||||
// https://github.com/github/linguist/blob/f72f2a21dfe80ebd16af3bc6216da75cd983a4f6/ext/linguist/linguist.h#L1
|
||||
/* Kind of token the scanner reported for the last linguist_yylex() call. */
enum tokenizer_type { |
||||
/* No token was produced; the Go caller resets the type to this before each call. */
NO_ACTION, |
||||
/* Plain token; the Go caller appends it unchanged. */
REGULAR_TOKEN, |
||||
/* Interpreter name from a shebang line; the Go caller prefixes it with "SHEBANG#!". */
SHEBANG_TOKEN, |
||||
/* SGML/XML tag token; the Go caller appends ">" to it. */
SGML_TOKEN, |
||||
}; |
||||
|
||||
/*
 * Per-scanner user data (YY_EXTRA_TYPE) through which the scanner hands one
 * token at a time back to its caller.
 */
struct tokenizer_extra { |
||||
/* NUL-terminated token text; the Go caller measures it with strlen() and free()s it. */
char *token; |
||||
/* What kind of token `token` holds, or NO_ACTION when there is none. */
enum tokenizer_type type; |
||||
}; |
||||
|
||||
// TODO(bzz) port Win support from
|
||||
// https://github.com/github/linguist/commit/8e912b4d8bf2aef7948de59eba48b75cfcbc97e0
|
@ -0,0 +1,71 @@ |
||||
package flex |
||||
|
||||
// #include <stdlib.h>
|
||||
// #include "linguist.h"
|
||||
// #include "lex.linguist_yy.h"
|
||||
// int linguist_yywrap(yyscan_t yyscanner) {
|
||||
// return 1;
|
||||
// }
|
||||
import "C" |
||||
import "unsafe" |
||||
|
||||
const maxTokenLen = 32 // bytes
|
||||
|
||||
// TokenizeFlex implements tokenizer by calling Flex generated code from linguist in C
|
||||
// This is a transliteration from C https://github.com/github/linguist/blob/master/ext/linguist/linguist.c#L12
|
||||
func TokenizeFlex(content []byte) []string { |
||||
var buf C.YY_BUFFER_STATE |
||||
var scanner C.yyscan_t |
||||
var extra C.struct_tokenizer_extra |
||||
var _len C.ulong |
||||
var r C.int |
||||
|
||||
_len = C.ulong(len(content)) |
||||
cs := C.CBytes(content) |
||||
defer C.free(unsafe.Pointer(cs)) |
||||
|
||||
C.linguist_yylex_init_extra(&extra, &scanner) |
||||
buf = C.linguist_yy_scan_bytes((*C.char)(cs), _len, scanner) |
||||
|
||||
ary := []string{} |
||||
for { |
||||
extra._type = C.NO_ACTION |
||||
extra.token = nil |
||||
r = C.linguist_yylex(scanner) |
||||
switch extra._type { |
||||
case C.NO_ACTION: |
||||
break |
||||
case C.REGULAR_TOKEN: |
||||
_len = C.strlen(extra.token) |
||||
if _len <= maxTokenLen { |
||||
ary = append(ary, C.GoStringN(extra.token, (C.int)(_len))) |
||||
} |
||||
C.free(unsafe.Pointer(extra.token)) |
||||
break |
||||
case C.SHEBANG_TOKEN: |
||||
_len = C.strlen(extra.token) |
||||
if _len <= maxTokenLen { |
||||
s := "SHEBANG#!" + C.GoStringN(extra.token, (C.int)(_len)) |
||||
ary = append(ary, s) |
||||
} |
||||
C.free(unsafe.Pointer(extra.token)) |
||||
break |
||||
case C.SGML_TOKEN: |
||||
_len = C.strlen(extra.token) |
||||
if _len <= maxTokenLen { |
||||
s := C.GoStringN(extra.token, (C.int)(_len)) + ">" |
||||
ary = append(ary, s) |
||||
} |
||||
C.free(unsafe.Pointer(extra.token)) |
||||
break |
||||
} |
||||
if r == 0 { |
||||
break |
||||
} |
||||
} |
||||
|
||||
C.linguist_yy_delete_buffer(buf, scanner) |
||||
C.linguist_yylex_destroy(scanner) |
||||
|
||||
return ary |
||||
} |
@ -0,0 +1,214 @@ |
||||
// +build !flex
|
||||
|
||||
package tokenizer |
||||
|
||||
import ( |
||||
"bytes" |
||||
|
||||
"github.com/src-d/enry/v2/regex" |
||||
) |
||||
|
||||
// Tokenize returns lexical tokens from content. The tokens returned match what
|
||||
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
|
||||
//
|
||||
// BUG: Until https://github.com/src-d/enry/issues/193 is resolved, there are some
|
||||
// differences between this function and the Linguist output.
|
||||
func Tokenize(content []byte) []string { |
||||
if len(content) > ByteLimit { |
||||
content = content[:ByteLimit] |
||||
} |
||||
|
||||
// Copy the input so that changes wrought by the tokenization steps do not
|
||||
// modify the caller's copy of the input. See #196.
|
||||
content = append([]byte(nil), content...) |
||||
|
||||
tokens := make([][]byte, 0, 50) |
||||
for _, extract := range extractTokens { |
||||
var extractedTokens [][]byte |
||||
content, extractedTokens = extract(content) |
||||
tokens = append(tokens, extractedTokens...) |
||||
} |
||||
|
||||
return toString(tokens) |
||||
} |
||||
|
||||
// toString converts every byte-slice token into a string, keeping the order.
// The result is never nil, even for an empty input.
func toString(tokens [][]byte) []string {
	out := make([]string, len(tokens))
	for i := range tokens {
		out[i] = string(tokens[i])
	}
	return out
}
||||
|
||||
var ( |
||||
extractTokens = []func(content []byte) (replacedContent []byte, tokens [][]byte){ |
||||
// The order must be this
|
||||
extractAndReplaceShebang, |
||||
extractAndReplaceSGML, |
||||
skipCommentsAndLiterals, |
||||
extractAndReplacePunctuation, |
||||
extractAndReplaceRegular, |
||||
extractAndReplaceOperator, |
||||
extractRemainders, |
||||
} |
||||
|
||||
// Differences between golang regexp and oniguruma:
|
||||
// 1. no (?s) in oniguruma - makes dot match \n
|
||||
// 2. no (?U) in oniguruma - ungreedy *
|
||||
// 3. (?m) implies dot matches \n in oniguruma
|
||||
// 4. oniguruma handles \w differently - impossible, but true
|
||||
//
|
||||
// Workarounds:
|
||||
// 1. (.|\n)
|
||||
// 2. replace * with *?
|
||||
// 3. replace . with [^\n]
|
||||
// 4. replace \w with [0-9A-Za-z_]
|
||||
//
|
||||
// Original golang regexps:
|
||||
//
|
||||
// reLiteralStringQuotes = regexp.MustCompile(`(?sU)(".*"|'.*')`)
|
||||
// reSingleLineComment = regexp.MustCompile(`(?m)(//|--|#|%|")\s(.*$)`)
|
||||
// reMultilineComment = regexp.MustCompile(`(?sU)(/\*.*\*/|<!--.*-->|\{-.*-\}|\(\*.*\*\)|""".*"""|'''.*''')`)
|
||||
// reLiteralNumber = regexp.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`)
|
||||
// reShebang = regexp.MustCompile(`(?m)^#!(?:/\w+)*/(?:(\w+)|\w+(?:\s*\w+=\w+\s*)*\s*(\w+))(?:\s*-\w+\s*)*$`)
|
||||
// rePunctuation = regexp.MustCompile(`;|\{|\}|\(|\)|\[|\]`)
|
||||
// reSGML = regexp.MustCompile(`(?sU)(<\/?[^\s<>=\d"']+)(?:\s.*\/?>|>)`)
|
||||
// reSGMLComment = regexp.MustCompile(`(?sU)(<!--.*-->)`)
|
||||
// reSGMLAttributes = regexp.MustCompile(`\s+(\w+=)|\s+([^\s>]+)`)
|
||||
// reSGMLLoneAttribute = regexp.MustCompile(`(\w+)`)
|
||||
// reRegularToken = regexp.MustCompile(`[\w\.@#\/\*]+`)
|
||||
// reOperators = regexp.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`)
|
||||
//
|
||||
// These regexps were converted to work in the same way for both engines:
|
||||
//
|
||||
reLiteralStringQuotes = regex.MustCompile(`("(.|\n)*?"|'(.|\n)*?')`) |
||||
reSingleLineComment = regex.MustCompile(`(?m)(//|--|#|%|")\s([^\n]*$)`) |
||||
reMultilineComment = regex.MustCompile(`(/\*(.|\n)*?\*/|<!--(.|\n)*?-->|\{-(.|\n)*?-\}|\(\*(.|\n)*?\*\)|"""(.|\n)*?"""|'''(.|\n)*?''')`) |
||||
reLiteralNumber = regex.MustCompile(`(0x[0-9A-Fa-f]([0-9A-Fa-f]|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)`) |
||||
reShebang = regex.MustCompile(`(?m)^#!(?:/[0-9A-Za-z_]+)*/(?:([0-9A-Za-z_]+)|[0-9A-Za-z_]+(?:\s*[0-9A-Za-z_]+=[0-9A-Za-z_]+\s*)*\s*([0-9A-Za-z_]+))(?:\s*-[0-9A-Za-z_]+\s*)*$`) |
||||
rePunctuation = regex.MustCompile(`;|\{|\}|\(|\)|\[|\]`) |
||||
reSGML = regex.MustCompile(`(<\/?[^\s<>=\d"']+)(?:\s(.|\n)*?\/?>|>)`) |
||||
reSGMLComment = regex.MustCompile(`(<!--(.|\n)*?-->)`) |
||||
reSGMLAttributes = regex.MustCompile(`\s+([0-9A-Za-z_]+=)|\s+([^\s>]+)`) |
||||
reSGMLLoneAttribute = regex.MustCompile(`([0-9A-Za-z_]+)`) |
||||
reRegularToken = regex.MustCompile(`[0-9A-Za-z_\.@#\/\*]+`) |
||||
reOperators = regex.MustCompile(`<<?|\+|\-|\*|\/|%|&&?|\|\|?`) |
||||
|
||||
regexToSkip = []regex.EnryRegexp{ |
||||
// The order must be this
|
||||
reLiteralStringQuotes, |
||||
reMultilineComment, |
||||
reSingleLineComment, |
||||
reLiteralNumber, |
||||
} |
||||
) |
||||
|
||||
func extractAndReplaceShebang(content []byte) ([]byte, [][]byte) { |
||||
var shebangTokens [][]byte |
||||
matches := reShebang.FindAllSubmatch(content, -1) |
||||
if matches != nil { |
||||
shebangTokens = make([][]byte, 0, 2) |
||||
for _, match := range matches { |
||||
shebangToken := getShebangToken(match) |
||||
shebangTokens = append(shebangTokens, shebangToken) |
||||
} |
||||
|
||||
reShebang.ReplaceAll(content, []byte(` `)) |
||||
} |
||||
|
||||
return content, shebangTokens |
||||
} |
||||
|
||||
// getShebangToken builds the "SHEBANG#!<name>" token for one shebang match.
// matchedShebang is a submatch list whose element 0 is the whole match; the
// first non-empty capture group after it is used as the name. If every group
// is empty the bare prefix is returned.
func getShebangToken(matchedShebang [][]byte) []byte {
	result := []byte(`SHEBANG#!`)
	if len(matchedShebang) < 2 {
		return result
	}

	for _, group := range matchedShebang[1:] {
		if len(group) > 0 {
			return append(result, group...)
		}
	}
	return result
}
||||
|
||||
func commonExtractAndReplace(content []byte, re regex.EnryRegexp) ([]byte, [][]byte) { |
||||
tokens := re.FindAll(content, -1) |
||||
content = re.ReplaceAll(content, []byte(` `)) |
||||
return content, tokens |
||||
} |
||||
|
||||
// extractAndReplacePunctuation returns every rePunctuation match in content
// (one of ; { } ( ) [ ]) as a token, together with content where each match
// has been replaced by a space.
func extractAndReplacePunctuation(content []byte) ([]byte, [][]byte) { |
||||
return commonExtractAndReplace(content, rePunctuation) |
||||
} |
||||
|
||||
// extractAndReplaceRegular returns every reRegularToken match in content
// (runs of ASCII letters, digits and the characters _ . @ # / *) as a token,
// together with content where each match has been replaced by a space.
func extractAndReplaceRegular(content []byte) ([]byte, [][]byte) { |
||||
return commonExtractAndReplace(content, reRegularToken) |
||||
} |
||||
|
||||
// extractAndReplaceOperator returns every reOperators match in content as a
// token, together with content where each match has been replaced by a space.
func extractAndReplaceOperator(content []byte) ([]byte, [][]byte) { |
||||
return commonExtractAndReplace(content, reOperators) |
||||
} |
||||
|
||||
func extractAndReplaceSGML(content []byte) ([]byte, [][]byte) { |
||||
var SGMLTokens [][]byte |
||||
matches := reSGML.FindAllSubmatch(content, -1) |
||||
if matches != nil { |
||||
SGMLTokens = make([][]byte, 0, 2) |
||||
for _, match := range matches { |
||||
if reSGMLComment.Match(match[0]) { |
||||
continue |
||||
} |
||||
|
||||
token := append(match[1], '>') |
||||
SGMLTokens = append(SGMLTokens, token) |
||||
attributes := getSGMLAttributes(match[0]) |
||||
SGMLTokens = append(SGMLTokens, attributes...) |
||||
} |
||||
|
||||
content = reSGML.ReplaceAll(content, []byte(` `)) |
||||
} |
||||
|
||||
return content, SGMLTokens |
||||
} |
||||
|
||||
func getSGMLAttributes(SGMLTag []byte) [][]byte { |
||||
var attributes [][]byte |
||||
matches := reSGMLAttributes.FindAllSubmatch(SGMLTag, -1) |
||||
if matches != nil { |
||||
attributes = make([][]byte, 0, 5) |
||||
for _, match := range matches { |
||||
if len(match[1]) != 0 { |
||||
attributes = append(attributes, match[1]) |
||||
} |
||||
|
||||
if len(match[2]) != 0 { |
||||
loneAttributes := reSGMLLoneAttribute.FindAll(match[2], -1) |
||||
attributes = append(attributes, loneAttributes...) |
||||
} |
||||
} |
||||
} |
||||
|
||||
return attributes |
||||
} |
||||
|
||||
func skipCommentsAndLiterals(content []byte) ([]byte, [][]byte) { |
||||
for _, skip := range regexToSkip { |
||||
content = skip.ReplaceAll(content, []byte(` `)) |
||||
} |
||||
|
||||
return content, nil |
||||
} |
||||
|
||||
// extractRemainders turns whatever is left in content into tokens: content is
// split on whitespace and every resulting field is further split into its
// individual UTF-8 sequences. content itself is returned unmodified.
func extractRemainders(content []byte) ([]byte, [][]byte) {
	fields := bytes.Fields(content)
	tokens := make([][]byte, 0, 3*len(fields))
	for i := range fields {
		tokens = append(tokens, bytes.Split(fields[i], nil)...)
	}
	return content, tokens
}
@ -0,0 +1,15 @@ |
||||
// +build flex
|
||||
|
||||
package tokenizer |
||||
|
||||
import "github.com/src-d/enry/v2/internal/tokenizer/flex" |
||||
|
||||
// Tokenize returns lexical tokens from content. The tokens returned match what
|
||||
// the Linguist library returns. At most the first ByteLimit bytes of content are tokenized.
|
||||
func Tokenize(content []byte) []string { |
||||
if len(content) > ByteLimit { |
||||
content = content[:ByteLimit] |
||||
} |
||||
|
||||
return flex.TokenizeFlex(content) |
||||
} |
@ -0,0 +1,17 @@ |
||||
// +build oniguruma
|
||||
|
||||
package regex |
||||
|
||||
import ( |
||||
rubex "github.com/src-d/go-oniguruma" |
||||
) |
||||
|
||||
// EnryRegexp is the regular expression type used by this package. In builds
// with the "oniguruma" tag it is an alias for *rubex.Regexp.
type EnryRegexp = *rubex.Regexp |
||||
|
||||
// MustCompile compiles str with rubex.MustCompileASCII and returns the result.
// NOTE(review): presumably panics on an invalid pattern, as the Must prefix
// suggests — confirm against go-oniguruma.
func MustCompile(str string) EnryRegexp { |
||||
return rubex.MustCompileASCII(str) |
||||
} |
||||
|
||||
// QuoteMeta returns s with regular expression metacharacters escaped, as done
// by rubex.QuoteMeta.
func QuoteMeta(s string) string { |
||||
return rubex.QuoteMeta(s) |
||||
} |
@ -0,0 +1,17 @@ |
||||
// +build !oniguruma
|
||||
|
||||
package regex |
||||
|
||||
import ( |
||||
"regexp" |
||||
) |
||||
|
||||
// EnryRegexp is the regular expression type used by this package. In builds
// without the "oniguruma" tag it is an alias for the standard library's
// *regexp.Regexp.
type EnryRegexp = *regexp.Regexp |
||||
|
||||
// MustCompile compiles str with regexp.MustCompile, which panics if the
// expression cannot be parsed.
func MustCompile(str string) EnryRegexp { |
||||
return regexp.MustCompile(str) |
||||
} |
||||
|
||||
// QuoteMeta returns s with all regular expression metacharacters escaped, as
// done by regexp.QuoteMeta.
func QuoteMeta(s string) string { |
||||
return regexp.QuoteMeta(s) |
||||
} |
@ -0,0 +1,84 @@ |
||||
package enry |
||||
|
||||
import ( |
||||
"bytes" |
||||
"path/filepath" |
||||
"strings" |
||||
|
||||
"github.com/src-d/enry/v2/data" |
||||
) |
||||
|
||||
const binSniffLen = 8000 |
||||
|
||||
var configurationLanguages = map[string]bool{ |
||||
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true, |
||||
} |
||||
|
||||
// IsConfiguration tells if filename is in one of the configuration languages.
|
||||
func IsConfiguration(path string) bool { |
||||
language, _ := GetLanguageByExtension(path) |
||||
_, is := configurationLanguages[language] |
||||
return is |
||||
} |
||||
|
||||
// IsImage reports whether path names an image file (PNG, JPEG or GIF format),
// judging only by its extension. The comparison is case-sensitive.
func IsImage(path string) bool {
	switch filepath.Ext(path) {
	case ".png", ".jpg", ".jpeg", ".gif":
		return true
	default:
		return false
	}
}
||||
|
||||
// GetMIMEType returns a MIME type of a given file based on its languages.
|
||||
func GetMIMEType(path string, language string) string { |
||||
if mime, ok := data.LanguagesMime[language]; ok { |
||||
return mime |
||||
} |
||||
|
||||
if IsImage(path) { |
||||
return "image/" + filepath.Ext(path)[1:] |
||||
} |
||||
|
||||
return "text/plain" |
||||
} |
||||
|
||||
// IsDocumentation returns whether or not path is a documentation path.
// The decision is delegated to data.DocumentationMatchers.
|
||||
func IsDocumentation(path string) bool { |
||||
return data.DocumentationMatchers.Match(path) |
||||
} |
||||
|
||||
// IsDotFile reports whether the last element of path starts with a dot.
// The path is cleaned first, and a last element that is exactly "." does not
// count as a dot file.
func IsDotFile(path string) bool {
	name := filepath.Base(filepath.Clean(path))
	if name == "." {
		return false
	}
	return strings.HasPrefix(name, ".")
}
||||
|
||||
// IsVendor returns whether or not path is a vendor path.
// The decision is delegated to data.VendorMatchers.
|
||||
func IsVendor(path string) bool { |
||||
return data.VendorMatchers.Match(path) |
||||
} |
||||
|
||||
// IsBinary detects if data is a binary value based on:
|
||||
// http://git.kernel.org/cgit/git/git.git/tree/xdiff-interface.c?id=HEAD#n198
|
||||
func IsBinary(data []byte) bool { |
||||
if len(data) > binSniffLen { |
||||
data = data[:binSniffLen] |
||||
} |
||||
|
||||
if bytes.IndexByte(data, byte(0)) == -1 { |
||||
return false |
||||
} |
||||
|
||||
return true |
||||
} |
||||
|
||||
// GetColor returns a HTML color code of a given language.
|
||||
func GetColor(language string) string { |
||||
if color, ok := data.LanguagesColor[language]; ok { |
||||
return color |
||||
} |
||||
|
||||
return "#cccccc" |
||||
} |
@ -0,0 +1,20 @@ |
||||
dist: trusty |
||||
language: go |
||||
go: |
||||
- '1.11.x' |
||||
- '1.12.x' |
||||
|
||||
env: |
||||
global: |
||||
- LD_LIBRARY_PATH="/usr/local/lib":${LD_LIBRARY_PATH} |
||||
- GO111MODULE=on |
||||
- ONIGURUMA_VERSION='6.9.1' |
||||
|
||||
before_install: # install oniguruma manually as trusty has only ancient 5.x |
||||
- sudo apt-get install -y dpkg # dpkg >= 1.17.5ubuntu5.8 fixes https://bugs.launchpad.net/ubuntu/+source/dpkg/+bug/1730627 |
||||
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
- sudo dpkg -i "libonig5_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
- wget "http://archive.ubuntu.com/ubuntu/pool/universe/libo/libonig/libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
- sudo dpkg -i "libonig-dev_${ONIGURUMA_VERSION}-1_amd64.deb" |
||||
script: |
||||
- go test -v --cover -race |
@ -0,0 +1,19 @@ |
||||
Copyright (C) 2011 by Zhigang Chen |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in |
||||
all copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
||||
THE SOFTWARE. |
@ -0,0 +1,20 @@ |
||||
## go-oniguruma |
||||
<a href="https://travis-ci.org/src-d/go-oniguruma"><img alt="Build Status" src="https://travis-ci.org/src-d/go-oniguruma.svg?branch=master" /></a> |
||||
|
||||
This repository is a fork of [moovweb/rubex](https://github.com/moovweb/rubex/tree/go1) - a simple regular expression library (based on [oniguruma](https://github.com/kkos/oniguruma)) that supports Ruby's regex syntax. |
||||
|
||||
The _rubex_ was originally created by Zhigang Chen (zhigang.chen@moovweb.com or zhigangc@gmail.com). It implements all the public functions of Go's Regexp package, except LiteralPrefix. |
||||
|
||||
According to the benchmark tests in regexp, the library is 40% to 10X faster than Regexp on all but one test. Unlike Go's regexp, this library supports named capture groups and also allows `"\\1"` and `"\\k<name>"` in replacement strings. |
||||
The library calls the _oniguruma_ regex library for regex pattern searching. All replacement code is done in Go. |
||||
|
||||
### Install all (_oniguruma_ and _rubex_): |
||||
```sh |
||||
# linux (debian/ubuntu/...) |
||||
sudo apt-get install libonig-dev |
||||
|
||||
# osx (homebrew) |
||||
brew install oniguruma |
||||
|
||||
go install -i . |
||||
``` |
@ -0,0 +1,184 @@ |
||||
#include <stdlib.h> |
||||
#include <stdio.h> |
||||
#include <string.h> |
||||
#ifdef BENCHMARK_CHELP |
||||
#include <sys/time.h> |
||||
#endif |
||||
#include "chelper.h" |
||||
|
||||
int NewOnigRegex( char *pattern, int pattern_length, int option, |
||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) { |
||||
int ret = ONIG_NORMAL; |
||||
int error_msg_len = 0; |
||||
|
||||
OnigUChar *pattern_start = (OnigUChar *) pattern; |
||||
OnigUChar *pattern_end = (OnigUChar *) (pattern + pattern_length); |
||||
|
||||
*error_info = (OnigErrorInfo *) malloc(sizeof(OnigErrorInfo)); |
||||
memset(*error_info, 0, sizeof(OnigErrorInfo)); |
||||
|
||||
onig_initialize_encoding(*encoding); |
||||
|
||||
*error_buffer = (char*) malloc(ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); |
||||
|
||||
memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char)); |
||||
|
||||
*region = onig_region_new(); |
||||
|
||||
ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info); |
||||
|
||||
if (ret != ONIG_NORMAL) { |
||||
error_msg_len = onig_error_code_to_str((unsigned char*)(*error_buffer), ret, *error_info); |
||||
if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) { |
||||
error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1; |
||||
} |
||||
(*error_buffer)[error_msg_len] = '\0'; |
||||
} |
||||
return ret; |
||||
} |
||||
|
||||
int SearchOnigRegex( void *str, int str_length, int offset, int option, |
||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) { |
||||
int ret = ONIG_MISMATCH; |
||||
int error_msg_len = 0; |
||||
#ifdef BENCHMARK_CHELP |
||||
struct timeval tim1, tim2; |
||||
long t; |
||||
#endif |
||||
|
||||
OnigUChar *str_start = (OnigUChar *) str; |
||||
OnigUChar *str_end = (OnigUChar *) (str_start + str_length); |
||||
OnigUChar *search_start = (OnigUChar *)(str_start + offset); |
||||
OnigUChar *search_end = str_end; |
||||
|
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim1, NULL); |
||||
#endif |
||||
|
||||
ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option); |
||||
if (ret < 0 && error_buffer != NULL) { |
||||
error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info); |
||||
if (error_msg_len >= ONIG_MAX_ERROR_MESSAGE_LEN) { |
||||
error_msg_len = ONIG_MAX_ERROR_MESSAGE_LEN - 1; |
||||
} |
||||
error_buffer[error_msg_len] = '\0'; |
||||
} |
||||
else if (captures != NULL) { |
||||
int i; |
||||
int count = 0; |
||||
for (i = 0; i < region->num_regs; i++) { |
||||
captures[2*count] = region->beg[i]; |
||||
captures[2*count+1] = region->end[i]; |
||||
count ++; |
||||
} |
||||
*numCaptures = count; |
||||
} |
||||
|
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim2, NULL); |
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; |
||||
printf("%ld microseconds elapsed\n", t); |
||||
#endif |
||||
return ret; |
||||
} |
||||
|
||||
int MatchOnigRegex(void *str, int str_length, int offset, int option, |
||||
OnigRegex regex, OnigRegion *region) { |
||||
int ret = ONIG_MISMATCH; |
||||
int error_msg_len = 0; |
||||
#ifdef BENCHMARK_CHELP |
||||
struct timeval tim1, tim2; |
||||
long t; |
||||
#endif |
||||
|
||||
OnigUChar *str_start = (OnigUChar *) str; |
||||
OnigUChar *str_end = (OnigUChar *) (str_start + str_length); |
||||
OnigUChar *search_start = (OnigUChar *)(str_start + offset); |
||||
|
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim1, NULL); |
||||
#endif |
||||
ret = onig_match(regex, str_start, str_end, search_start, region, option); |
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim2, NULL); |
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; |
||||
printf("%ld microseconds elapsed\n", t); |
||||
#endif |
||||
return ret; |
||||
} |
||||
|
||||
int LookupOnigCaptureByName(char *name, int name_length, |
||||
OnigRegex regex, OnigRegion *region) { |
||||
int ret = ONIGERR_UNDEFINED_NAME_REFERENCE; |
||||
#ifdef BENCHMARK_CHELP |
||||
struct timeval tim1, tim2; |
||||
long t; |
||||
#endif |
||||
OnigUChar *name_start = (OnigUChar *) name; |
||||
OnigUChar *name_end = (OnigUChar *) (name_start + name_length); |
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim1, NULL); |
||||
#endif |
||||
ret = onig_name_to_backref_number(regex, name_start, name_end, region); |
||||
#ifdef BENCHMARK_CHELP |
||||
gettimeofday(&tim2, NULL); |
||||
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec; |
||||
printf("%ld microseconds elapsed\n", t); |
||||
#endif |
||||
return ret; |
||||
} |
||||
|
||||
/* State threaded through name_callback() while the names of a regex are enumerated. */
typedef struct { |
||||
/* Destination for the group names, written one after another separated by ';'. */
char *nameBuffer; |
||||
/* Bytes accounted for so far; keeps growing even when a name did not fit. */
int bufferOffset; |
||||
/* Capacity of nameBuffer in bytes. */
int bufferSize; |
||||
/* Output array receiving one group number (or -1) per enumerated name. */
int *numbers; |
||||
/* Index of the next free slot in numbers. */
int numIndex; |
||||
} group_info_t; |
||||
|
||||
int name_callback(const UChar* name, const UChar* name_end, |
||||
int ngroup_num, int* group_nums, |
||||
regex_t* reg, void* arg) |
||||
{ |
||||
int nameLen, offset, newOffset; |
||||
group_info_t *groupInfo; |
||||
|
||||
groupInfo = (group_info_t*) arg; |
||||
offset = groupInfo->bufferOffset; |
||||
nameLen = name_end - name; |
||||
newOffset = offset + nameLen; |
||||
|
||||
//if there are already names, add a ";"
|
||||
if (offset > 0) { |
||||
newOffset += 1; |
||||
} |
||||
|
||||
if (newOffset <= groupInfo->bufferSize) { |
||||
if (offset > 0) { |
||||
groupInfo->nameBuffer[offset] = ';'; |
||||
offset += 1; |
||||
} |
||||
memcpy(&groupInfo->nameBuffer[offset], name, nameLen); |
||||
} |
||||
groupInfo->bufferOffset = newOffset; |
||||
if (ngroup_num > 0) { |
||||
groupInfo->numbers[groupInfo->numIndex] = group_nums[ngroup_num-1]; |
||||
} else { |
||||
groupInfo->numbers[groupInfo->numIndex] = -1; |
||||
} |
||||
groupInfo->numIndex += 1; |
||||
return 0; /* 0: continue */ |
||||
} |
||||
|
||||
/*
 * GetCaptureNames enumerates the named groups of reg. The names are written
 * to buffer (bufferSize bytes) separated by ';' and one group number per name
 * is written to groupNumbers. Returns the offset accumulated by
 * name_callback(), which also counts names that did not fit into buffer.
 */
int GetCaptureNames(OnigRegex reg, void *buffer, int bufferSize, int* groupNumbers) {
    group_info_t groupInfo;
    groupInfo.nameBuffer = (char*)buffer;
    groupInfo.bufferOffset = 0;
    groupInfo.bufferSize = bufferSize;
    groupInfo.numbers = groupNumbers;
    groupInfo.numIndex = 0;
    onig_foreach_name(reg, name_callback, (void* )&groupInfo);
    return groupInfo.bufferOffset;
}
||||
|
@ -0,0 +1,14 @@ |
||||
#include <oniguruma.h> |
||||
|
||||
extern int NewOnigRegex( char *pattern, int pattern_length, int option, |
||||
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer); |
||||
|
||||
extern int SearchOnigRegex( void *str, int str_length, int offset, int option, |
||||
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures); |
||||
|
||||
extern int MatchOnigRegex( void *str, int str_length, int offset, int option, |
||||
OnigRegex regex, OnigRegion *region); |
||||
|
||||
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region); |
||||
|
||||
extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers); |
@ -0,0 +1,27 @@ |
||||
package rubex |
||||
|
||||
// Option flags and result codes used by this package. Each option flag below
// is defined as the previous one shifted left by one bit, starting from
// ONIG_OPTION_IGNORECASE = 1.
// NOTE(review): presumably these mirror the macros of the same name in
// oniguruma.h — confirm when updating the library.
const ( |
||||
ONIG_OPTION_DEFAULT = ONIG_OPTION_NONE |
||||
/* options */ |
||||
ONIG_OPTION_NONE = 0 |
||||
ONIG_OPTION_IGNORECASE = 1 |
||||
ONIG_OPTION_EXTEND = (ONIG_OPTION_IGNORECASE << 1) |
||||
ONIG_OPTION_MULTILINE = (ONIG_OPTION_EXTEND << 1) |
||||
ONIG_OPTION_SINGLELINE = (ONIG_OPTION_MULTILINE << 1) |
||||
ONIG_OPTION_FIND_LONGEST = (ONIG_OPTION_SINGLELINE << 1) |
||||
ONIG_OPTION_FIND_NOT_EMPTY = (ONIG_OPTION_FIND_LONGEST << 1) |
||||
ONIG_OPTION_NEGATE_SINGLELINE = (ONIG_OPTION_FIND_NOT_EMPTY << 1) |
||||
ONIG_OPTION_DONT_CAPTURE_GROUP = (ONIG_OPTION_NEGATE_SINGLELINE << 1) |
||||
ONIG_OPTION_CAPTURE_GROUP = (ONIG_OPTION_DONT_CAPTURE_GROUP << 1) |
||||
/* options (search time) */ |
||||
ONIG_OPTION_NOTBOL = (ONIG_OPTION_CAPTURE_GROUP << 1) |
||||
ONIG_OPTION_NOTEOL = (ONIG_OPTION_NOTBOL << 1) |
||||
ONIG_OPTION_POSIX_REGION = (ONIG_OPTION_NOTEOL << 1) |
||||
ONIG_OPTION_MAXBIT = ONIG_OPTION_POSIX_REGION /* limit */ |
||||
|
||||
// Result codes.
ONIG_NORMAL = 0 |
||||
ONIG_MISMATCH = -1 |
||||
|
||||
ONIG_MISMATCH_STR = "mismatch" |
||||
ONIGERR_UNDEFINED_NAME_REFERENCE = -217 |
||||
) |
@ -0,0 +1 @@ |
||||
module github.com/src-d/go-oniguruma |
@ -0,0 +1,36 @@ |
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package regexp implements a simple regular expression library.
|
||||
|
||||
// QuoteMeta func is copied here to avoid linking the entire Regexp library.
|
||||
|
||||
package rubex |
||||
|
||||
// special reports whether c is one of the regular expression metacharacters
// \ . + * ? ( ) | [ ] ^ $ that QuoteMeta has to escape.
func special(c int) bool {
	switch c {
	case '\\', '.', '+', '*', '?', '(', ')', '|', '[', ']', '^', '$':
		return true
	}
	return false
}
||||
|
||||
// QuoteMeta returns a string that quotes all regular expression metacharacters
|
||||
// inside the argument text; the returned string is a regular expression matching
|
||||
// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
|
||||
func QuoteMeta(s string) string { |
||||
b := make([]byte, 2*len(s)) |
||||
|
||||
// A byte loop is correct because all metacharacters are ASCII.
|
||||
j := 0 |
||||
for i := 0; i < len(s); i++ { |
||||
if special(int(s[i])) { |
||||
b[j] = '\\' |
||||
j++ |
||||
} |
||||
b[j] = s[i] |
||||
j++ |
||||
} |
||||
return string(b[0:j]) |
||||
} |
@ -0,0 +1,668 @@ |
||||
package rubex |
||||
|
||||
/* |
||||
#cgo CFLAGS: -I/usr/local/include |
||||
#cgo LDFLAGS: -L/usr/local/lib -lonig |
||||
#include <stdlib.h> |
||||
#include <oniguruma.h> |
||||
#include "chelper.h" |
||||
*/ |
||||
import "C" |
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
"log" |
||||
"runtime" |
||||
"strconv" |
||||
"sync" |
||||
"unicode/utf8" |
||||
"unsafe" |
||||
) |
||||
|
||||
// strRange is a slice of integer offsets.
type strRange []int

const (
	// numMatchStartSize is the number of match slots allocated up front for
	// a freshly compiled Regexp.
	numMatchStartSize = 4
	// numReadBufferStartSize is the initial size, in bytes, of the buffer
	// used to drain an io.RuneReader.
	numReadBufferStartSize = 256
)

// mutex serialises compiling and freeing of Oniguruma regex objects.
var mutex sync.Mutex
||||
|
||||
// MatchData is the scratch storage a Regexp uses while searching.
type MatchData struct {
	// count is the number of matches recorded so far; it also selects the
	// slot of indexes that the next search writes into.
	count int
	// indexes holds one slot per match; each slot stores the begin/end
	// offsets of the whole match followed by those of every capture group.
	indexes [][]int32
}
||||
|
||||
// NamedGroupInfo maps the name of a capture group to its group number.
type NamedGroupInfo map[string]int
||||
|
||||
type Regexp struct { |
||||
pattern string |
||||
regex C.OnigRegex |
||||
region *C.OnigRegion |
||||
encoding C.OnigEncoding |
||||
errorInfo *C.OnigErrorInfo |
||||
errorBuf *C.char |
||||
matchData *MatchData |
||||
namedGroupInfo NamedGroupInfo |
||||
} |
||||
|
||||
// NewRegexp creates and initializes a new Regexp with the given pattern and option.
|
||||
func NewRegexp(pattern string, option int) (re *Regexp, err error) { |
||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_UTF8}, option) |
||||
} |
||||
|
||||
// NewRegexpASCII is equivalent to NewRegexp, but with the encoding restricted to ASCII.
|
||||
func NewRegexpASCII(pattern string, option int) (re *Regexp, err error) { |
||||
return initRegexp(&Regexp{pattern: pattern, encoding: C.ONIG_ENCODING_ASCII}, option) |
||||
} |
||||
|
||||
func initRegexp(re *Regexp, option int) (*Regexp, error) { |
||||
var err error |
||||
patternCharPtr := C.CString(re.pattern) |
||||
defer C.free(unsafe.Pointer(patternCharPtr)) |
||||
mutex.Lock() |
||||
defer mutex.Unlock() |
||||
errorCode := C.NewOnigRegex(patternCharPtr, C.int(len(re.pattern)), C.int(option), &re.regex, &re.region, &re.encoding, &re.errorInfo, &re.errorBuf) |
||||
if errorCode != C.ONIG_NORMAL { |
||||
err = errors.New(C.GoString(re.errorBuf)) |
||||
} else { |
||||
err = nil |
||||
numCapturesInPattern := int(C.onig_number_of_captures(re.regex)) + 1 |
||||
re.matchData = &MatchData{} |
||||
re.matchData.indexes = make([][]int32, numMatchStartSize) |
||||
for i := 0; i < numMatchStartSize; i++ { |
||||
re.matchData.indexes[i] = make([]int32, numCapturesInPattern*2) |
||||
} |
||||
re.namedGroupInfo = re.getNamedGroupInfo() |
||||
runtime.SetFinalizer(re, (*Regexp).Free) |
||||
} |
||||
return re, err |
||||
} |
||||
|
||||
func Compile(str string) (*Regexp, error) { |
||||
return NewRegexp(str, ONIG_OPTION_DEFAULT) |
||||
} |
||||
|
||||
func MustCompile(str string) *Regexp { |
||||
regexp, error := NewRegexp(str, ONIG_OPTION_DEFAULT) |
||||
if error != nil { |
||||
panic("regexp: compiling " + str + ": " + error.Error()) |
||||
} |
||||
return regexp |
||||
} |
||||
|
||||
func CompileWithOption(str string, option int) (*Regexp, error) { |
||||
return NewRegexp(str, option) |
||||
} |
||||
|
||||
func MustCompileWithOption(str string, option int) *Regexp { |
||||
regexp, error := NewRegexp(str, option) |
||||
if error != nil { |
||||
panic("regexp: compiling " + str + ": " + error.Error()) |
||||
} |
||||
return regexp |
||||
} |
||||
|
||||
// MustCompileASCII is equivalent to MustCompile, but with the encoding restricted to ASCII.
|
||||
func MustCompileASCII(str string) *Regexp { |
||||
regexp, error := NewRegexpASCII(str, ONIG_OPTION_DEFAULT) |
||||
if error != nil { |
||||
panic("regexp: compiling " + str + ": " + error.Error()) |
||||
} |
||||
return regexp |
||||
} |
||||
|
||||
func (re *Regexp) Free() { |
||||
mutex.Lock() |
||||
if re.regex != nil { |
||||
C.onig_free(re.regex) |
||||
re.regex = nil |
||||
} |
||||
if re.region != nil { |
||||
C.onig_region_free(re.region, 1) |
||||
re.region = nil |
||||
} |
||||
mutex.Unlock() |
||||
if re.errorInfo != nil { |
||||
C.free(unsafe.Pointer(re.errorInfo)) |
||||
re.errorInfo = nil |
||||
} |
||||
if re.errorBuf != nil { |
||||
C.free(unsafe.Pointer(re.errorBuf)) |
||||
re.errorBuf = nil |
||||
} |
||||
} |
||||
|
||||
func (re *Regexp) getNamedGroupInfo() (namedGroupInfo NamedGroupInfo) { |
||||
numNamedGroups := int(C.onig_number_of_names(re.regex)) |
||||
//when any named capture exisits, there is no numbered capture even if there are unnamed captures
|
||||
if numNamedGroups > 0 { |
||||
namedGroupInfo = make(map[string]int) |
||||
//try to get the names
|
||||
bufferSize := len(re.pattern) * 2 |
||||
nameBuffer := make([]byte, bufferSize) |
||||
groupNumbers := make([]int32, numNamedGroups) |
||||
bufferPtr := unsafe.Pointer(&nameBuffer[0]) |
||||
numbersPtr := unsafe.Pointer(&groupNumbers[0]) |
||||
length := int(C.GetCaptureNames(re.regex, bufferPtr, (C.int)(bufferSize), (*C.int)(numbersPtr))) |
||||
if length > 0 { |
||||
namesAsBytes := bytes.Split(nameBuffer[:length], ([]byte)(";")) |
||||
if len(namesAsBytes) != numNamedGroups { |
||||
log.Fatalf("the number of named groups (%d) does not match the number names found (%d)\n", numNamedGroups, len(namesAsBytes)) |
||||
} |
||||
for i, nameAsBytes := range namesAsBytes { |
||||
name := string(nameAsBytes) |
||||
namedGroupInfo[name] = int(groupNumbers[i]) |
||||
} |
||||
} else { |
||||
log.Fatalf("could not get the capture group names from %q", re.String()) |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
func (re *Regexp) groupNameToId(name string) (id int) { |
||||
if re.namedGroupInfo == nil { |
||||
id = ONIGERR_UNDEFINED_NAME_REFERENCE |
||||
} else { |
||||
id = re.namedGroupInfo[name] |
||||
} |
||||
return |
||||
} |
||||
|
||||
func (re *Regexp) processMatch(numCaptures int) (match []int32) { |
||||
if numCaptures <= 0 { |
||||
panic("cannot have 0 captures when processing a match") |
||||
} |
||||
matchData := re.matchData |
||||
return matchData.indexes[matchData.count][:numCaptures*2] |
||||
} |
||||
|
||||
func (re *Regexp) ClearMatchData() { |
||||
matchData := re.matchData |
||||
matchData.count = 0 |
||||
} |
||||
|
||||
func (re *Regexp) find(b []byte, n int, offset int) (match []int) { |
||||
if n == 0 { |
||||
b = []byte{0} |
||||
} |
||||
ptr := unsafe.Pointer(&b[0]) |
||||
matchData := re.matchData |
||||
capturesPtr := unsafe.Pointer(&(matchData.indexes[matchData.count][0])) |
||||
numCaptures := int32(0) |
||||
numCapturesPtr := unsafe.Pointer(&numCaptures) |
||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(capturesPtr), (*C.int)(numCapturesPtr))) |
||||
if pos >= 0 { |
||||
if numCaptures <= 0 { |
||||
panic("cannot have 0 captures when processing a match") |
||||
} |
||||
match2 := matchData.indexes[matchData.count][:numCaptures*2] |
||||
match = make([]int, len(match2)) |
||||
for i := range match2 { |
||||
match[i] = int(match2[i]) |
||||
} |
||||
numCapturesInPattern := int32(C.onig_number_of_captures(re.regex)) + 1 |
||||
if numCapturesInPattern != numCaptures { |
||||
log.Fatalf("expected %d captures but got %d\n", numCapturesInPattern, numCaptures) |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
// getCapture returns b[beg:end], or nil when either offset is negative,
// which is how a capture group that did not take part in the match is
// reported.
func getCapture(b []byte, beg int, end int) []byte {
	if beg >= 0 && end >= 0 {
		return b[beg:end]
	}
	return nil
}
||||
|
||||
func (re *Regexp) match(b []byte, n int, offset int) bool { |
||||
re.ClearMatchData() |
||||
if n == 0 { |
||||
b = []byte{0} |
||||
} |
||||
ptr := unsafe.Pointer(&b[0]) |
||||
pos := int(C.SearchOnigRegex((ptr), C.int(n), C.int(offset), C.int(ONIG_OPTION_DEFAULT), re.regex, re.region, re.errorInfo, (*C.char)(nil), (*C.int)(nil), (*C.int)(nil))) |
||||
return pos >= 0 |
||||
} |
||||
|
||||
func (re *Regexp) findAll(b []byte, n int) (matches [][]int) { |
||||
re.ClearMatchData() |
||||
|
||||
if n < 0 { |
||||
n = len(b) |
||||
} |
||||
matchData := re.matchData |
||||
offset := 0 |
||||
for offset <= n { |
||||
if matchData.count >= len(matchData.indexes) { |
||||
length := len(matchData.indexes[0]) |
||||
matchData.indexes = append(matchData.indexes, make([]int32, length)) |
||||
} |
||||
if match := re.find(b, n, offset); len(match) > 0 { |
||||
matchData.count += 1 |
||||
//move offset to the ending index of the current match and prepare to find the next non-overlapping match
|
||||
offset = match[1] |
||||
//if match[0] == match[1], it means the current match does not advance the search. we need to exit the loop to avoid getting stuck here.
|
||||
if match[0] == match[1] { |
||||
if offset < n && offset >= 0 { |
||||
//there are more bytes, so move offset by a word
|
||||
_, width := utf8.DecodeRune(b[offset:]) |
||||
offset += width |
||||
} else { |
||||
//search is over, exit loop
|
||||
break |
||||
} |
||||
} |
||||
} else { |
||||
break |
||||
} |
||||
} |
||||
matches2 := matchData.indexes[:matchData.count] |
||||
matches = make([][]int, len(matches2)) |
||||
for i, v := range matches2 { |
||||
matches[i] = make([]int, len(v)) |
||||
for j, v2 := range v { |
||||
matches[i][j] = int(v2) |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
func (re *Regexp) FindIndex(b []byte) []int { |
||||
re.ClearMatchData() |
||||
match := re.find(b, len(b), 0) |
||||
if len(match) == 0 { |
||||
return nil |
||||
} |
||||
return match[:2] |
||||
} |
||||
|
||||
func (re *Regexp) Find(b []byte) []byte { |
||||
loc := re.FindIndex(b) |
||||
if loc == nil { |
||||
return nil |
||||
} |
||||
return getCapture(b, loc[0], loc[1]) |
||||
} |
||||
|
||||
func (re *Regexp) FindString(s string) string { |
||||
b := []byte(s) |
||||
mb := re.Find(b) |
||||
if mb == nil { |
||||
return "" |
||||
} |
||||
return string(mb) |
||||
} |
||||
|
||||
func (re *Regexp) FindStringIndex(s string) []int { |
||||
b := []byte(s) |
||||
return re.FindIndex(b) |
||||
} |
||||
|
||||
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { |
||||
matches := re.findAll(b, n) |
||||
if len(matches) == 0 { |
||||
return nil |
||||
} |
||||
return matches |
||||
} |
||||
|
||||
func (re *Regexp) FindAll(b []byte, n int) [][]byte { |
||||
matches := re.FindAllIndex(b, n) |
||||
if matches == nil { |
||||
return nil |
||||
} |
||||
matchBytes := make([][]byte, 0, len(matches)) |
||||
for _, match := range matches { |
||||
matchBytes = append(matchBytes, getCapture(b, match[0], match[1])) |
||||
} |
||||
return matchBytes |
||||
} |
||||
|
||||
func (re *Regexp) FindAllString(s string, n int) []string { |
||||
b := []byte(s) |
||||
matches := re.FindAllIndex(b, n) |
||||
if matches == nil { |
||||
return nil |
||||
} |
||||
matchStrings := make([]string, 0, len(matches)) |
||||
for _, match := range matches { |
||||
m := getCapture(b, match[0], match[1]) |
||||
if m == nil { |
||||
matchStrings = append(matchStrings, "") |
||||
} else { |
||||
matchStrings = append(matchStrings, string(m)) |
||||
} |
||||
} |
||||
return matchStrings |
||||
|
||||
} |
||||
|
||||
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { |
||||
b := []byte(s) |
||||
return re.FindAllIndex(b, n) |
||||
} |
||||
|
||||
func (re *Regexp) findSubmatchIndex(b []byte) (match []int) { |
||||
re.ClearMatchData() |
||||
match = re.find(b, len(b), 0) |
||||
return |
||||
} |
||||
|
||||
func (re *Regexp) FindSubmatchIndex(b []byte) []int { |
||||
match := re.findSubmatchIndex(b) |
||||
if len(match) == 0 { |
||||
return nil |
||||
} |
||||
return match |
||||
} |
||||
|
||||
func (re *Regexp) FindSubmatch(b []byte) [][]byte { |
||||
match := re.findSubmatchIndex(b) |
||||
if match == nil { |
||||
return nil |
||||
} |
||||
length := len(match) / 2 |
||||
if length == 0 { |
||||
return nil |
||||
} |
||||
results := make([][]byte, 0, length) |
||||
for i := 0; i < length; i++ { |
||||
results = append(results, getCapture(b, match[2*i], match[2*i+1])) |
||||
} |
||||
return results |
||||
} |
||||
|
||||
func (re *Regexp) FindStringSubmatch(s string) []string { |
||||
b := []byte(s) |
||||
match := re.findSubmatchIndex(b) |
||||
if match == nil { |
||||
return nil |
||||
} |
||||
length := len(match) / 2 |
||||
if length == 0 { |
||||
return nil |
||||
} |
||||
|
||||
results := make([]string, 0, length) |
||||
for i := 0; i < length; i++ { |
||||
cap := getCapture(b, match[2*i], match[2*i+1]) |
||||
if cap == nil { |
||||
results = append(results, "") |
||||
} else { |
||||
results = append(results, string(cap)) |
||||
} |
||||
} |
||||
return results |
||||
} |
||||
|
||||
func (re *Regexp) FindStringSubmatchIndex(s string) []int { |
||||
b := []byte(s) |
||||
return re.FindSubmatchIndex(b) |
||||
} |
||||
|
||||
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { |
||||
matches := re.findAll(b, n) |
||||
if len(matches) == 0 { |
||||
return nil |
||||
} |
||||
return matches |
||||
} |
||||
|
||||
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { |
||||
matches := re.findAll(b, n) |
||||
if len(matches) == 0 { |
||||
return nil |
||||
} |
||||
allCapturedBytes := make([][][]byte, 0, len(matches)) |
||||
for _, match := range matches { |
||||
length := len(match) / 2 |
||||
capturedBytes := make([][]byte, 0, length) |
||||
for i := 0; i < length; i++ { |
||||
capturedBytes = append(capturedBytes, getCapture(b, match[2*i], match[2*i+1])) |
||||
} |
||||
allCapturedBytes = append(allCapturedBytes, capturedBytes) |
||||
} |
||||
|
||||
return allCapturedBytes |
||||
} |
||||
|
||||
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { |
||||
b := []byte(s) |
||||
matches := re.findAll(b, n) |
||||
if len(matches) == 0 { |
||||
return nil |
||||
} |
||||
allCapturedStrings := make([][]string, 0, len(matches)) |
||||
for _, match := range matches { |
||||
length := len(match) / 2 |
||||
capturedStrings := make([]string, 0, length) |
||||
for i := 0; i < length; i++ { |
||||
cap := getCapture(b, match[2*i], match[2*i+1]) |
||||
if cap == nil { |
||||
capturedStrings = append(capturedStrings, "") |
||||
} else { |
||||
capturedStrings = append(capturedStrings, string(cap)) |
||||
} |
||||
} |
||||
allCapturedStrings = append(allCapturedStrings, capturedStrings) |
||||
} |
||||
return allCapturedStrings |
||||
} |
||||
|
||||
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { |
||||
b := []byte(s) |
||||
return re.FindAllSubmatchIndex(b, n) |
||||
} |
||||
|
||||
func (re *Regexp) Match(b []byte) bool { |
||||
return re.match(b, len(b), 0) |
||||
} |
||||
|
||||
func (re *Regexp) MatchString(s string) bool { |
||||
b := []byte(s) |
||||
return re.Match(b) |
||||
} |
||||
|
||||
func (re *Regexp) NumSubexp() int { |
||||
return (int)(C.onig_number_of_captures(re.regex)) |
||||
} |
||||
|
||||
func (re *Regexp) getNamedCapture(name []byte, capturedBytes [][]byte) []byte { |
||||
nameStr := string(name) |
||||
capNum := re.groupNameToId(nameStr) |
||||
if capNum < 0 || capNum >= len(capturedBytes) { |
||||
panic(fmt.Sprintf("capture group name (%q) has error\n", nameStr)) |
||||
} |
||||
return capturedBytes[capNum] |
||||
} |
||||
|
||||
func (re *Regexp) getNumberedCapture(num int, capturedBytes [][]byte) []byte { |
||||
//when named capture groups exist, numbered capture groups returns ""
|
||||
if re.namedGroupInfo == nil && num <= (len(capturedBytes)-1) && num >= 0 { |
||||
return capturedBytes[num] |
||||
} |
||||
return ([]byte)("") |
||||
} |
||||
|
||||
// fillCapturedValues expands the capture references in the replacement
// template repl using the values in capturedBytes. The second parameter (the
// matched text) is unused.
//
// Recognised references:
//
//	\1 .. \9    replaced by capturedBytes["1"] .. capturedBytes["9"]
//	\k<name>    replaced by capturedBytes["name"]
//
// A reference whose key is missing expands to nothing. Any other escaped
// byte is copied together with its backslash, and a backslash at the very
// end of repl is dropped.
func fillCapturedValues(repl []byte, _ []byte, capturedBytes map[string][]byte) []byte {
	n := len(repl)
	out := make([]byte, 0, n*3)
	name := make([]byte, 0, n)
	escaped := false // the previous byte was an unconsumed backslash
	inName := false  // currently between the '<' and '>' of \k<name>

	for i := 0; i < n; i++ {
		ch := repl[i]
		switch {
		case inName && ch == '<':
			// A '<' inside a group name is ignored.
		case inName && ch == '>':
			inName = false
			out = append(out, capturedBytes[string(name)]...)
			name = name[:0] // reset for the next reference
		case inName:
			name = append(name, ch)
		case escaped && '1' <= ch && ch <= '9':
			out = append(out, capturedBytes[string(ch)]...)
		case escaped && ch == 'k' && i+1 < n && repl[i+1] == '<':
			inName = true
			escaped = false
			i++ // skip the '<' that opens the name
		case escaped:
			out = append(out, '\\', ch)
		case ch != '\\':
			out = append(out, ch)
		}
		// A backslash opens an escape; whatever follows closes it. ch is
		// still the byte read at the top of this iteration.
		if ch == '\\' || escaped {
			escaped = !escaped
		}
	}
	return out
}
||||
|
||||
func (re *Regexp) replaceAll(src, repl []byte, replFunc func([]byte, []byte, map[string][]byte) []byte) []byte { |
||||
srcLen := len(src) |
||||
matches := re.findAll(src, srcLen) |
||||
if len(matches) == 0 { |
||||
return src |
||||
} |
||||
dest := make([]byte, 0, srcLen) |
||||
for i, match := range matches { |
||||
length := len(match) / 2 |
||||
capturedBytes := make(map[string][]byte) |
||||
if re.namedGroupInfo == nil { |
||||
for j := 0; j < length; j++ { |
||||
capturedBytes[strconv.Itoa(j)] = getCapture(src, match[2*j], match[2*j+1]) |
||||
} |
||||
} else { |
||||
for name, j := range re.namedGroupInfo { |
||||
capturedBytes[name] = getCapture(src, match[2*j], match[2*j+1]) |
||||
} |
||||
} |
||||
matchBytes := getCapture(src, match[0], match[1]) |
||||
newRepl := replFunc(repl, matchBytes, capturedBytes) |
||||
prevEnd := 0 |
||||
if i > 0 { |
||||
prevMatch := matches[i-1][:2] |
||||
prevEnd = prevMatch[1] |
||||
} |
||||
if match[0] > prevEnd && prevEnd >= 0 && match[0] <= srcLen { |
||||
dest = append(dest, src[prevEnd:match[0]]...) |
||||
} |
||||
dest = append(dest, newRepl...) |
||||
} |
||||
lastEnd := matches[len(matches)-1][1] |
||||
if lastEnd < srcLen && lastEnd >= 0 { |
||||
dest = append(dest, src[lastEnd:]...) |
||||
} |
||||
return dest |
||||
} |
||||
|
||||
func (re *Regexp) ReplaceAll(src, repl []byte) []byte { |
||||
return re.replaceAll(src, repl, fillCapturedValues) |
||||
} |
||||
|
||||
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { |
||||
return re.replaceAll(src, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { |
||||
return repl(matchBytes) |
||||
}) |
||||
} |
||||
|
||||
func (re *Regexp) ReplaceAllString(src, repl string) string { |
||||
return string(re.ReplaceAll([]byte(src), []byte(repl))) |
||||
} |
||||
|
||||
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { |
||||
srcB := []byte(src) |
||||
destB := re.replaceAll(srcB, []byte(""), func(_ []byte, matchBytes []byte, _ map[string][]byte) []byte { |
||||
return []byte(repl(string(matchBytes))) |
||||
}) |
||||
return string(destB) |
||||
} |
||||
|
||||
func (re *Regexp) String() string { |
||||
return re.pattern |
||||
} |
||||
|
||||
// grow_buffer makes sure n more bytes can be stored in b starting at offset.
// When offset+n fits within the capacity of b, b is returned unchanged.
// Otherwise a new buffer of 2*cap(b)+n bytes is returned with the first
// offset bytes of b copied into it.
func grow_buffer(b []byte, offset int, n int) []byte {
	if offset+n <= cap(b) {
		return b
	}
	grown := make([]byte, 2*cap(b)+n)
	copy(grown, b[:offset])
	return grown
}
||||
|
||||
func fromReader(r io.RuneReader) []byte { |
||||
b := make([]byte, numReadBufferStartSize) |
||||
offset := 0 |
||||
var err error = nil |
||||
for err == nil { |
||||
rune, runeWidth, err := r.ReadRune() |
||||
if err == nil { |
||||
b = grow_buffer(b, offset, runeWidth) |
||||
writeWidth := utf8.EncodeRune(b[offset:], rune) |
||||
if runeWidth != writeWidth { |
||||
panic("reading rune width not equal to the written rune width") |
||||
} |
||||
offset += writeWidth |
||||
} else { |
||||
break |
||||
} |
||||
} |
||||
return b[:offset] |
||||
} |
||||
|
||||
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { |
||||
b := fromReader(r) |
||||
return re.FindIndex(b) |
||||
} |
||||
|
||||
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { |
||||
b := fromReader(r) |
||||
return re.FindSubmatchIndex(b) |
||||
} |
||||
|
||||
func (re *Regexp) MatchReader(r io.RuneReader) bool { |
||||
b := fromReader(r) |
||||
return re.Match(b) |
||||
} |
||||
|
||||
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { |
||||
//no easy way to implement this
|
||||
return "", false |
||||
} |
||||
|
||||
func MatchString(pattern string, s string) (matched bool, error error) { |
||||
re, err := Compile(pattern) |
||||
if err != nil { |
||||
return false, err |
||||
} |
||||
return re.MatchString(s), nil |
||||
} |
||||
|
||||
func (re *Regexp) Gsub(src, repl string) string { |
||||
srcBytes := ([]byte)(src) |
||||
replBytes := ([]byte)(repl) |
||||
replaced := re.replaceAll(srcBytes, replBytes, fillCapturedValues) |
||||
return string(replaced) |
||||
} |
||||
|
||||
func (re *Regexp) GsubFunc(src string, replFunc func(string, map[string]string) string) string { |
||||
srcBytes := ([]byte)(src) |
||||
replaced := re.replaceAll(srcBytes, nil, func(_ []byte, matchBytes []byte, capturedBytes map[string][]byte) []byte { |
||||
capturedStrings := make(map[string]string) |
||||
for name, capBytes := range capturedBytes { |
||||
capturedStrings[name] = string(capBytes) |
||||
} |
||||
matchString := string(matchBytes) |
||||
return ([]byte)(replFunc(matchString, capturedStrings)) |
||||
}) |
||||
return string(replaced) |
||||
} |
@ -0,0 +1,22 @@ |
||||
Copyright (c) 2013 Caleb Spare |
||||
|
||||
MIT License |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining |
||||
a copy of this software and associated documentation files (the |
||||
"Software"), to deal in the Software without restriction, including |
||||
without limitation the rights to use, copy, modify, merge, publish, |
||||
distribute, sublicense, and/or sell copies of the Software, and to |
||||
permit persons to whom the Software is furnished to do so, subject to |
||||
the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be |
||||
included in all copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE |
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION |
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION |
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
@ -0,0 +1,7 @@ |
||||
# Trie |
||||
|
||||
[![GoDoc](http://godoc.org/github.com/toqueteos/trie?status.png)](http://godoc.org/github.com/toqueteos/trie) |
||||
|
||||
This is a fork of https://github.com/cespare/go-trie that adds the `PrefixIndex` method. |
||||
|
||||
It's required for https://github.com/toqueteos/substring. |
@ -0,0 +1 @@ |
||||
module github.com/toqueteos/trie |
@ -0,0 +1,102 @@ |
||||
// Package trie is an implementation of a trie (prefix tree) data structure over byte slices. It provides a
|
||||
// small and simple API for usage as a set as well as a 'Node' API for walking the trie.
|
||||
package trie |
||||
|
||||
// A Trie is a prefix tree over byte slices.
type Trie struct {
	root *Node
}

// New constructs a new, empty Trie ready for use.
func New() *Trie {
	return &Trie{root: new(Node)}
}

// Insert puts b into the Trie. It returns true if the element was not previously in t.
func (t *Trie) Insert(b []byte) bool {
	cur := t.root
	for _, c := range b {
		child := cur.branches[c]
		if child == nil {
			// No edge for c yet: create the node and link it in.
			child = new(Node)
			cur.branches[c] = child
			cur.hasChildren = true
		}
		cur = child
	}
	added := !cur.terminal
	cur.terminal = true
	return added
}

// Contains checks t for membership of b.
func (t *Trie) Contains(b []byte) bool {
	cur := t.root
	for _, c := range b {
		cur = cur.branches[c]
		if cur == nil {
			return false
		}
	}
	return cur.terminal
}

// PrefixIndex walks through b until a prefix is found (terminal node) or b is exhausted.
//
// It returns the index of the byte of b that completes the first element of
// the trie found along the way, or -1 if the walk leaves the trie or b ends
// before an element is completed. The root node is only consulted when b is
// empty: the result is then 0 if the empty slice is in the trie, -1 otherwise.
func (t *Trie) PrefixIndex(b []byte) int {
	cur := t.root
	for i, c := range b {
		cur = cur.branches[c]
		if cur == nil {
			return -1
		}
		if cur.terminal {
			return i
		}
	}
	// Reaching this point means no node along b was terminal, so cur can
	// only be terminal when b is empty and cur is still the root.
	if cur.terminal {
		return len(b)
	}
	return -1
}

// Root returns the root node of a Trie. A valid Trie (i.e., constructed with New) always has a non-nil root
// node.
func (t *Trie) Root() *Node {
	return t.root
}

// A Node represents a logical vertex in the trie structure.
type Node struct {
	// branches holds the child reached by each possible next byte, or nil.
	branches [256]*Node
	// terminal is true when the path from the root to this node is an element.
	terminal bool
	// hasChildren is true once any entry of branches has been set.
	hasChildren bool
}

// Walk returns the node reached along edge c, if one exists. The ok value indicates whether such a node
// exists.
func (n *Node) Walk(c byte) (next *Node, ok bool) {
	next = n.branches[c]
	ok = next != nil
	return next, ok
}

// Terminal indicates whether n is terminal in the trie (that is, whether the path from the root to n
// represents an element in the set). For instance, if the root node is terminal, then []byte{} is in the
// trie.
func (n *Node) Terminal() bool {
	return n.terminal
}

// Leaf indicates whether n is a leaf node in the trie (that is, whether it has no children). A leaf node must
// be terminal (else it would not exist). Logically, if n is a leaf node then the []byte represented by the
// path from the root to n is not a proper prefix of any element of the trie.
func (n *Node) Leaf() bool {
	return !n.hasChildren
}
@ -0,0 +1,24 @@ |
||||
# Compiled Object files, Static and Dynamic libs (Shared Objects) |
||||
*.o |
||||
*.a |
||||
*.so |
||||
|
||||
# Folders |
||||
_obj |
||||
_test |
||||
|
||||
# Architecture specific extensions/prefixes |
||||
*.[568vq] |
||||
[568vq].out |
||||
|
||||
*.cgo1.go |
||||
*.cgo2.c |
||||
_cgo_defun.c |
||||
_cgo_gotypes.go |
||||
_cgo_export.* |
||||
|
||||
_testmain.go |
||||
|
||||
*.exe |
||||
*.test |
||||
*.prof |
@ -0,0 +1,11 @@ |
||||
language: go |
||||
|
||||
go: |
||||
- 1.2 |
||||
- 1.3 |
||||
- 1.4 |
||||
- tip |
||||
|
||||
script: |
||||
- go get launchpad.net/gocheck |
||||
- go test |
@ -0,0 +1,22 @@ |
||||
The MIT License (MIT) |
||||
|
||||
Copyright (c) 2015 Carlos Cobo |
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy |
||||
of this software and associated documentation files (the "Software"), to deal |
||||
in the Software without restriction, including without limitation the rights |
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
||||
copies of the Software, and to permit persons to whom the Software is |
||||
furnished to do so, subject to the following conditions: |
||||
|
||||
The above copyright notice and this permission notice shall be included in all |
||||
copies or substantial portions of the Software. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||||
SOFTWARE. |
||||
|
@ -0,0 +1,80 @@ |
||||
# substring [![Build Status](https://travis-ci.org/toqueteos/substring.png?branch=master)](https://travis-ci.org/toqueteos/substring) [![GoDoc](http://godoc.org/github.com/toqueteos/substring?status.png)](http://godoc.org/github.com/toqueteos/substring) [![GitHub release](https://img.shields.io/github/release/toqueteos/substring.svg)](https://github.com/toqueteos/substring/releases) |
||||
|
||||
Simple and composable alternative to [regexp](http://golang.org/pkg/regexp/) package for fast substring searches. |
||||
|
||||
## Installation |
||||
|
||||
The recommended way to install substring:
||||
|
||||
``` |
||||
go get -t gopkg.in/toqueteos/substring.v1 |
||||
``` |
||||
|
||||
The `-t` flag is for fetching [gocheck](https://gopkg.in/check.v1), required for tests and benchmarks. |
||||
|
||||
## Examples |
||||
|
||||
A basic example with two matchers: |
||||
|
||||
```go |
||||
package main |
||||
|
||||
import ( |
||||
"fmt" |
||||
"regexp" |
||||
|
||||
"gopkg.in/toqueteos/substring.v1" |
||||
) |
||||
|
||||
func main() { |
||||
m1 := substring.After("assets/", substring.Or( |
||||
substring.Has("jquery"), |
||||
substring.Has("angular"), |
||||
substring.Suffixes(".js", ".css", ".html"), |
||||
)) |
||||
fmt.Println(m1.Match("assets/angular/foo/bar")) //Prints: true |
||||
fmt.Println(m1.Match("assets/js/file.js")) //Prints: true |
||||
fmt.Println(m1.Match("assets/style/bar.css")) //Prints: true |
||||
fmt.Println(m1.Match("assets/foo/bar.html")) //Prints: false |
||||
fmt.Println(m1.Match("assets/js/qux.json")) //Prints: false |
||||
fmt.Println(m1.Match("core/file.html")) //Prints: false |
||||
fmt.Println(m1.Match("foobar/that.jsx")) //Prints: false |
||||
|
||||
m2 := substring.After("vendor/", substring.Suffixes(".css", ".js", ".less")) |
||||
|
||||
fmt.Println(m2.Match("foo/vendor/bar/qux.css")) //Prints: true |
||||
fmt.Println(m2.Match("foo/var/qux.less")) //Prints: false |
||||
|
||||
re := regexp.MustCompile(`vendor\/.*\.(css|js|less)$`) |
||||
fmt.Println(re.MatchString("foo/vendor/bar/qux.css")) //Prints: true |
||||
fmt.Println(re.MatchString("foo/var/qux.less")) //Prints: false |
||||
} |
||||
``` |
||||
|
||||
## How fast? |
||||
|
||||
Results vary with your use case, but being one to two orders of magnitude faster than `regexp` is common.
||||
|
||||
Test it out for yourself by running `go test -check.b`! |
||||
|
||||
``` |
||||
$ go test -check.b |
||||
PASS: lib_test.go:18: LibSuite.BenchmarkExample1 10000000 221 ns/op |
||||
PASS: lib_test.go:23: LibSuite.BenchmarkExample2 10000000 229 ns/op |
||||
PASS: lib_test.go:28: LibSuite.BenchmarkExample3 10000000 216 ns/op |
||||
PASS: lib_test.go:33: LibSuite.BenchmarkExample4 10000000 208 ns/op |
||||
PASS: lib_test.go:38: LibSuite.BenchmarkExample5 20000000 82.1 ns/op |
||||
PASS: lib_test.go:48: LibSuite.BenchmarkExampleRe1 500000 4136 ns/op |
||||
PASS: lib_test.go:53: LibSuite.BenchmarkExampleRe2 500000 5222 ns/op |
||||
PASS: lib_test.go:58: LibSuite.BenchmarkExampleRe3 500000 5116 ns/op |
||||
PASS: lib_test.go:63: LibSuite.BenchmarkExampleRe4 500000 4020 ns/op |
||||
PASS: lib_test.go:68: LibSuite.BenchmarkExampleRe5 10000000 226 ns/op |
||||
OK: 10 passed |
||||
PASS |
||||
ok gopkg.in/toqueteos/substring.v1 23.471s |
||||
``` |
||||
|
||||
License |
||||
------- |
||||
|
||||
MIT, see [LICENSE](LICENSE) |
@ -0,0 +1,229 @@ |
||||
package substring |
||||
|
||||
import ( |
||||
"bytes" |
||||
"regexp" |
||||
|
||||
"github.com/toqueteos/trie" |
||||
) |
||||
|
||||
// BytesMatcher is implemented by every byte-slice matcher in this package.
type BytesMatcher interface {
	// Match reports whether b satisfies the matcher.
	Match(b []byte) bool
	// MatchIndex returns a non-negative index describing where the match
	// was found in b, or -1 when b does not match.
	MatchIndex(b []byte) int
}
||||
|
||||
// regexp

// regexpBytes matches byte slices against a compiled regular expression.
type regexpBytes struct{ re *regexp.Regexp }

// BytesRegexp returns a matcher for the regular expression pat. It panics if
// pat does not compile, because it uses regexp.MustCompile.
func BytesRegexp(pat string) *regexpBytes { return &regexpBytes{regexp.MustCompile(pat)} }

// Match reports whether b contains any match of the regular expression.
func (m *regexpBytes) Match(b []byte) bool { return m.re.Match(b) }

// MatchIndex returns the offset just past the leftmost match in b, or -1 if
// b does not match.
func (m *regexpBytes) MatchIndex(b []byte) int {
	found := m.re.FindIndex(b)
	if found != nil {
		return found[1]
	}
	return -1
}
||||
|
||||
// exact

// exactBytes matches byte slices that are equal to pat.
type exactBytes struct{ pat []byte }

// BytesExact returns a matcher that accepts only input equal to pat.
func BytesExact(pat string) *exactBytes { return &exactBytes{[]byte(pat)} }

// Match reports whether b has the same length and content as the pattern.
func (m *exactBytes) Match(b []byte) bool {
	return bytes.Equal(b, m.pat)
}

// MatchIndex returns len(b) when b equals the pattern and -1 otherwise.
func (m *exactBytes) MatchIndex(b []byte) int {
	if !m.Match(b) {
		return -1
	}
	return len(b)
}
||||
|
||||
// any, search `s` in `.Match(pat)`
|
||||
// anyBytes matches inputs that occur somewhere inside pat, i.e. the input is
// the needle and the stored pattern is the haystack.
type anyBytes struct {
	pat []byte
}

// BytesAny returns a matcher that accepts any input contained in pat.
func BytesAny(pat string) *anyBytes { return &anyBytes{pat: []byte(pat)} }

// Match reports whether b occurs inside the stored pattern.
func (m *anyBytes) Match(b []byte) bool { return bytes.Contains(m.pat, b) }

// MatchIndex returns the offset within the stored pattern just past the
// first occurrence of b, or -1 when b does not occur in it.
func (m *anyBytes) MatchIndex(b []byte) int {
	pos := bytes.Index(m.pat, b)
	if pos < 0 {
		return -1
	}
	return pos + len(b)
}
||||
|
||||
// has, search `pat` in `.Match(s)`
|
||||
// hasBytes matches inputs that contain pat somewhere inside them.
type hasBytes struct {
	pat []byte
}

// BytesHas returns a matcher that accepts any input containing pat.
func BytesHas(pat string) *hasBytes { return &hasBytes{pat: []byte(pat)} }

// Match reports whether the pattern occurs inside b.
func (m *hasBytes) Match(b []byte) bool { return bytes.Contains(b, m.pat) }

// MatchIndex returns the offset in b just past the first occurrence of the
// pattern, or -1 when b does not contain it.
func (m *hasBytes) MatchIndex(b []byte) int {
	pos := bytes.Index(b, m.pat)
	if pos < 0 {
		return -1
	}
	return pos + len(m.pat)
}
||||
|
||||
// prefix
|
||||
// prefixBytes matches inputs that begin with pat.
type prefixBytes struct{ pat []byte }

// BytesPrefix returns a matcher that accepts inputs starting with pat.
func BytesPrefix(pat string) *prefixBytes { return &prefixBytes{pat: []byte(pat)} }

// Match reports whether b starts with the pattern.
func (m *prefixBytes) Match(b []byte) bool { return bytes.HasPrefix(b, m.pat) }

// MatchIndex returns the pattern length when b starts with the pattern and
// -1 otherwise.
func (m *prefixBytes) MatchIndex(b []byte) int {
	if !bytes.HasPrefix(b, m.pat) {
		return -1
	}
	return len(m.pat)
}
||||
|
||||
// prefixes
|
||||
type prefixesBytes struct { |
||||
t *trie.Trie |
||||
} |
||||
|
||||
func BytesPrefixes(pats ...string) *prefixesBytes { |
||||
t := trie.New() |
||||
for _, pat := range pats { |
||||
t.Insert([]byte(pat)) |
||||
} |
||||
return &prefixesBytes{t} |
||||
} |
||||
func (m *prefixesBytes) Match(b []byte) bool { return m.t.PrefixIndex(b) >= 0 } |
||||
func (m *prefixesBytes) MatchIndex(b []byte) int { |
||||
if idx := m.t.PrefixIndex(b); idx >= 0 { |
||||
return idx |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// suffix
|
||||
// suffixBytes matches inputs that end with pat.
type suffixBytes struct{ pat []byte }

// BytesSuffix returns a matcher that accepts inputs ending with pat.
func BytesSuffix(pat string) *suffixBytes { return &suffixBytes{pat: []byte(pat)} }

// Match reports whether b ends with the pattern.
func (m *suffixBytes) Match(b []byte) bool { return bytes.HasSuffix(b, m.pat) }

// MatchIndex returns the pattern length (not an offset into b) when b ends
// with the pattern, and -1 otherwise.
func (m *suffixBytes) MatchIndex(b []byte) int {
	if !bytes.HasSuffix(b, m.pat) {
		return -1
	}
	return len(m.pat)
}
||||
|
||||
// suffixes
|
||||
type suffixesBytes struct { |
||||
t *trie.Trie |
||||
} |
||||
|
||||
func BytesSuffixes(pats ...string) *suffixesBytes { |
||||
t := trie.New() |
||||
for _, pat := range pats { |
||||
t.Insert(reverse([]byte(pat))) |
||||
} |
||||
return &suffixesBytes{t} |
||||
} |
||||
func (m *suffixesBytes) Match(b []byte) bool { |
||||
return m.t.PrefixIndex(reverse(b)) >= 0 |
||||
} |
||||
func (m *suffixesBytes) MatchIndex(b []byte) int { |
||||
if idx := m.t.PrefixIndex(reverse(b)); idx >= 0 { |
||||
return idx |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// after
|
||||
type afterBytes struct { |
||||
first []byte |
||||
matcher BytesMatcher |
||||
} |
||||
|
||||
func BytesAfter(first string, m BytesMatcher) *afterBytes { return &afterBytes{[]byte(first), m} } |
||||
func (a *afterBytes) Match(b []byte) bool { |
||||
if idx := bytes.Index(b, a.first); idx >= 0 { |
||||
return a.matcher.Match(b[idx+len(a.first):]) |
||||
} |
||||
return false |
||||
} |
||||
func (a *afterBytes) MatchIndex(b []byte) int { |
||||
if idx := bytes.Index(b, a.first); idx >= 0 { |
||||
return idx + a.matcher.MatchIndex(b[idx:]) |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// and, returns true iff all matchers return true
|
||||
type andBytes struct{ matchers []BytesMatcher } |
||||
|
||||
func BytesAnd(m ...BytesMatcher) *andBytes { return &andBytes{m} } |
||||
func (a *andBytes) Match(b []byte) bool { |
||||
for _, m := range a.matchers { |
||||
if !m.Match(b) { |
||||
return false |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
func (a *andBytes) MatchIndex(b []byte) int { |
||||
longest := 0 |
||||
for _, m := range a.matchers { |
||||
if idx := m.MatchIndex(b); idx < 0 { |
||||
return -1 |
||||
} else if idx > longest { |
||||
longest = idx |
||||
} |
||||
} |
||||
return longest |
||||
} |
||||
|
||||
// or, returns true iff any matcher returns true
|
||||
type orBytes struct{ matchers []BytesMatcher } |
||||
|
||||
func BytesOr(m ...BytesMatcher) *orBytes { return &orBytes{m} } |
||||
func (o *orBytes) Match(b []byte) bool { |
||||
for _, m := range o.matchers { |
||||
if m.Match(b) { |
||||
return true |
||||
} |
||||
} |
||||
return false |
||||
} |
||||
func (o *orBytes) MatchIndex(b []byte) int { |
||||
for _, m := range o.matchers { |
||||
if idx := m.MatchIndex(b); idx >= 0 { |
||||
return idx |
||||
} |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
type suffixGroupBytes struct { |
||||
suffix BytesMatcher |
||||
matchers []BytesMatcher |
||||
} |
||||
|
||||
func BytesSuffixGroup(s string, m ...BytesMatcher) *suffixGroupBytes { |
||||
return &suffixGroupBytes{BytesSuffix(s), m} |
||||
} |
||||
func (sg *suffixGroupBytes) Match(b []byte) bool { |
||||
if sg.suffix.Match(b) { |
||||
return BytesOr(sg.matchers...).Match(b) |
||||
} |
||||
return false |
||||
} |
||||
func (sg *suffixGroupBytes) MatchIndex(b []byte) int { |
||||
if sg.suffix.MatchIndex(b) >= 0 { |
||||
return BytesOr(sg.matchers...).MatchIndex(b) |
||||
} |
||||
return -1 |
||||
} |
@ -0,0 +1,10 @@ |
||||
package substring |
||||
|
||||
// reverse is a helper fn for Suffixes. It returns a new slice holding the
// bytes of b in reverse order and leaves b itself untouched, so it is safe
// to call on slices owned by a caller.
func reverse(b []byte) []byte {
	n := len(b)
	out := make([]byte, n)
	for i, c := range b {
		out[n-1-i] = c
	}
	return out
}
@ -0,0 +1,216 @@ |
||||
package substring |
||||
|
||||
import ( |
||||
"regexp" |
||||
"strings" |
||||
|
||||
"github.com/toqueteos/trie" |
||||
) |
||||
|
||||
// StringsMatcher is the interface implemented by every string matcher in
// this package.
type StringsMatcher interface {
	// Match reports whether s satisfies the matcher.
	Match(s string) bool
	// MatchIndex returns an index associated with the match in s (for most
	// matchers the offset just past the matched text), or -1 when s does
	// not match.
	MatchIndex(s string) int
}
||||
|
||||
// regexp
|
||||
// regexpString matches strings against a compiled regular expression.
type regexpString struct{ re *regexp.Regexp }

// Regexp returns a matcher for the regular expression pat.
// It panics if pat does not compile, because it uses regexp.MustCompile.
func Regexp(pat string) *regexpString { return &regexpString{regexp.MustCompile(pat)} }

// Match reports whether s contains a match of the regular expression.
func (m *regexpString) Match(s string) bool { return m.re.MatchString(s) }

// MatchIndex returns the end offset of the leftmost match in s, or -1 when
// the regular expression does not match.
func (m *regexpString) MatchIndex(s string) int {
	if loc := m.re.FindStringIndex(s); loc != nil {
		return loc[1]
	}
	return -1
}
||||
|
||||
// exact
|
||||
// exactString matches strings that are equal to pat.
type exactString struct{ pat string }

// Exact returns a matcher that accepts only strings equal to pat.
func Exact(pat string) *exactString { return &exactString{pat: pat} }

// Match reports whether s equals the pattern.
func (m *exactString) Match(s string) bool { return s == m.pat }

// MatchIndex returns len(s) when s equals the pattern and -1 otherwise.
func (m *exactString) MatchIndex(s string) int {
	if s != m.pat {
		return -1
	}
	return len(s)
}
||||
|
||||
// any, search `s` in `.Match(pat)`
|
||||
// anyString matches strings that occur somewhere inside pat, i.e. the input
// is the needle and the stored pattern is the haystack.
type anyString struct{ pat string }

// Any returns a matcher that accepts any string contained in pat.
func Any(pat string) *anyString { return &anyString{pat: pat} }

// Match reports whether s occurs inside the stored pattern.
func (m *anyString) Match(s string) bool {
	return strings.Contains(m.pat, s)
}

// MatchIndex returns the offset within the stored pattern just past the
// first occurrence of s, or -1 when s does not occur in it.
func (m *anyString) MatchIndex(s string) int {
	pos := strings.Index(m.pat, s)
	if pos < 0 {
		return -1
	}
	return pos + len(s)
}
||||
|
||||
// has, search `pat` in `.Match(s)`
|
||||
// hasString matches strings that contain pat somewhere inside them.
type hasString struct{ pat string }

// Has returns a matcher that accepts any string containing pat.
func Has(pat string) *hasString { return &hasString{pat: pat} }

// Match reports whether the pattern occurs inside s.
func (m *hasString) Match(s string) bool {
	return strings.Contains(s, m.pat)
}

// MatchIndex returns the offset in s just past the first occurrence of the
// pattern, or -1 when s does not contain it.
func (m *hasString) MatchIndex(s string) int {
	pos := strings.Index(s, m.pat)
	if pos < 0 {
		return -1
	}
	return pos + len(m.pat)
}
||||
|
||||
// prefix
|
||||
// prefixString matches strings that begin with pat.
type prefixString struct{ pat string }

// Prefix returns a matcher that accepts strings starting with pat.
func Prefix(pat string) *prefixString { return &prefixString{pat: pat} }

// Match reports whether s starts with the pattern.
func (m *prefixString) Match(s string) bool { return strings.HasPrefix(s, m.pat) }

// MatchIndex returns the pattern length when s starts with the pattern and
// -1 otherwise.
func (m *prefixString) MatchIndex(s string) int {
	if !strings.HasPrefix(s, m.pat) {
		return -1
	}
	return len(m.pat)
}
||||
|
||||
// prefixes
|
||||
type prefixesString struct{ t *trie.Trie } |
||||
|
||||
func Prefixes(pats ...string) *prefixesString { |
||||
t := trie.New() |
||||
for _, pat := range pats { |
||||
t.Insert([]byte(pat)) |
||||
} |
||||
return &prefixesString{t} |
||||
} |
||||
func (m *prefixesString) Match(s string) bool { return m.t.PrefixIndex([]byte(s)) >= 0 } |
||||
func (m *prefixesString) MatchIndex(s string) int { |
||||
if idx := m.t.PrefixIndex([]byte(s)); idx >= 0 { |
||||
return idx |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// suffix
|
||||
// suffixString matches strings that end with pat.
type suffixString struct{ pat string }

// Suffix returns a matcher that accepts strings ending with pat.
func Suffix(pat string) *suffixString { return &suffixString{pat: pat} }

// Match reports whether s ends with the pattern.
func (m *suffixString) Match(s string) bool { return strings.HasSuffix(s, m.pat) }

// MatchIndex returns the pattern length (not an offset into s) when s ends
// with the pattern, and -1 otherwise.
func (m *suffixString) MatchIndex(s string) int {
	if !strings.HasSuffix(s, m.pat) {
		return -1
	}
	return len(m.pat)
}
||||
|
||||
// suffixes
|
||||
type suffixesString struct{ t *trie.Trie } |
||||
|
||||
func Suffixes(pats ...string) *suffixesString { |
||||
t := trie.New() |
||||
for _, pat := range pats { |
||||
t.Insert(reverse([]byte(pat))) |
||||
} |
||||
return &suffixesString{t} |
||||
} |
||||
func (m *suffixesString) Match(s string) bool { |
||||
return m.t.PrefixIndex(reverse([]byte(s))) >= 0 |
||||
} |
||||
func (m *suffixesString) MatchIndex(s string) int { |
||||
if idx := m.t.PrefixIndex(reverse([]byte(s))); idx >= 0 { |
||||
return idx |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// after
|
||||
type afterString struct { |
||||
first string |
||||
matcher StringsMatcher |
||||
} |
||||
|
||||
func After(first string, m StringsMatcher) *afterString { |
||||
return &afterString{first, m} |
||||
} |
||||
func (a *afterString) Match(s string) bool { |
||||
if idx := strings.Index(s, a.first); idx >= 0 { |
||||
return a.matcher.Match(s[idx+len(a.first):]) |
||||
} |
||||
return false |
||||
} |
||||
func (a *afterString) MatchIndex(s string) int { |
||||
if idx := strings.Index(s, a.first); idx >= 0 { |
||||
return idx + a.matcher.MatchIndex(s[idx+len(a.first):]) |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// and, returns true iff all matchers return true
|
||||
type andString struct{ matchers []StringsMatcher } |
||||
|
||||
func And(m ...StringsMatcher) *andString { return &andString{m} } |
||||
func (a *andString) Match(s string) bool { |
||||
for _, m := range a.matchers { |
||||
if !m.Match(s) { |
||||
return false |
||||
} |
||||
} |
||||
return true |
||||
} |
||||
func (a *andString) MatchIndex(s string) int { |
||||
longest := 0 |
||||
for _, m := range a.matchers { |
||||
if idx := m.MatchIndex(s); idx < 0 { |
||||
return -1 |
||||
} else if idx > longest { |
||||
longest = idx |
||||
} |
||||
} |
||||
return longest |
||||
} |
||||
|
||||
// or, returns true iff any matcher returns true
|
||||
type orString struct{ matchers []StringsMatcher } |
||||
|
||||
func Or(m ...StringsMatcher) *orString { return &orString{m} } |
||||
func (o *orString) Match(s string) bool { |
||||
for _, m := range o.matchers { |
||||
if m.Match(s) { |
||||
return true |
||||
} |
||||
} |
||||
return false |
||||
} |
||||
func (o *orString) MatchIndex(s string) int { |
||||
for _, m := range o.matchers { |
||||
if idx := m.MatchIndex(s); idx >= 0 { |
||||
return idx |
||||
} |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
type suffixGroupString struct { |
||||
suffix StringsMatcher |
||||
matchers []StringsMatcher |
||||
} |
||||
|
||||
func SuffixGroup(s string, m ...StringsMatcher) *suffixGroupString { |
||||
return &suffixGroupString{Suffix(s), m} |
||||
} |
||||
func (sg *suffixGroupString) Match(s string) bool { |
||||
if sg.suffix.Match(s) { |
||||
return Or(sg.matchers...).Match(s) |
||||
} |
||||
return false |
||||
} |
||||
func (sg *suffixGroupString) MatchIndex(s string) int { |
||||
if sg.suffix.MatchIndex(s) >= 0 { |
||||
return Or(sg.matchers...).MatchIndex(s) |
||||
} |
||||
return -1 |
||||
} |
Loading…
Reference in new issue