Updated tokenizer to improve matching when searching for code snippets (#32261)

This PR improves the accuracy of Gitea's code search.

Currently, Gitea does not consider statements such as
`console.log("hello")` as hits when the user searches for `log`. The
culprit is how both ES and Bleve tokenize the file contents (in both
cases, `console.log` ends up as a single token).

In ES' case, we changed the tokenizer to
[simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html),
so tokens are the runs of letters and digits between any other
characters. In Bleve's case, it now employs a
[letter](https://blevesearch.com/docs/Tokenizers/) tokenizer, which
emits maximal runs of letters.
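
To make the difference concrete, here is a minimal Go sketch comparing the two Bleve tokenizers. It builds the letter tokenizer via Bleve v2's `character` tokenizer over `unicode.IsLetter`, which is what the `letter` tokenizer registers under the hood; treat that constructor choice as an assumption of this sketch rather than part of the change itself:

```go
package main

import (
	"fmt"
	gounicode "unicode"

	"github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)

func main() {
	input := []byte(`console.log("hello")`)

	// Old behavior: the unicode tokenizer follows UAX #29 word boundaries,
	// which treat the dot as a word joiner, so `console.log` stays a single
	// token and a search for `log` can never match it.
	for _, tok := range unicode.NewUnicodeTokenizer().Tokenize(input) {
		fmt.Printf("unicode: %s\n", tok.Term) // console.log, hello
	}

	// New behavior: a letter-based tokenizer emits maximal runs of letters,
	// so `console` and `log` become separate tokens.
	for _, tok := range character.NewCharacterTokenizer(gounicode.IsLetter).Tokenize(input) {
		fmt.Printf("letter:  %s\n", tok.Term) // console, log, hello
	}
}
```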

Resolves #32220

---------

Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
pull/32151/head^2
Bruno Sofiato committed by GitHub
parent b573512312
commit f64fbd9b74
Changed files (lines changed; BIN = binary):

  1. modules/indexer/code/bleve/bleve.go (5)
  2. modules/indexer/code/elasticsearch/elasticsearch.go (13)
  3. modules/indexer/code/indexer_test.go (49)
  4. modules/indexer/internal/bleve/util.go (9)
  5. modules/indexer/internal/bleve/util_test.go (8)
  6. tests/gitea-repositories-meta/org42/search-by-path.git/description (5)
  7. tests/gitea-repositories-meta/org42/search-by-path.git/info/refs (2)
  8. tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph (BIN)
  9. tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs (2)
  10. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap (BIN)
  11. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx (BIN)
  12. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev (BIN)
  13. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap (BIN)
  14. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx (BIN)
  15. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack (BIN)
  16. tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.rev (BIN)
  17. tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs (2)

modules/indexer/code/bleve/bleve.go

@@ -31,6 +31,7 @@ import (
 	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
 	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
 	"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
 	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
 	"github.com/blevesearch/bleve/v2/mapping"
 	"github.com/blevesearch/bleve/v2/search/query"
@@ -69,7 +70,7 @@ const (
 	filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"
 	filenameIndexerTokenizer = "filenameIndexerTokenizer"
 	repoIndexerDocType       = "repoIndexerDocType"
-	repoIndexerLatestVersion = 7
+	repoIndexerLatestVersion = 8
 )

 // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -105,7 +106,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
 		"type":          analyzer_custom.Name,
 		"char_filters":  []string{},
-		"tokenizer":     unicode.Name,
+		"tokenizer":     letter.Name,
 		"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
 	}); err != nil {
 		return nil, err
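
A minimal sketch of what the new analyzer chain produces, mirroring the `AddCustomAnalyzer` call above. It omits the `unicodenorm` filter, which in the real code needs its own `AddCustomTokenFilter` registration, and it relies on Bleve v2's `AnalyzeText` helper on the index mapping; the analyzer name here is just illustrative:

```go
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
	analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
)

func main() {
	m := bleve.NewIndexMapping()
	// Register a custom analyzer: letter tokenizer, then camelcase and
	// lowercase token filters (unicodenorm omitted for brevity).
	if err := m.AddCustomAnalyzer("repoIndexerAnalyzer", map[string]any{
		"type":          analyzer_custom.Name,
		"char_filters":  []string{},
		"tokenizer":     letter.Name,
		"token_filters": []string{camelcase.Name, lowercase.Name},
	}); err != nil {
		panic(err)
	}

	// Run the named analyzer over a snippet; with the letter tokenizer,
	// `console.log("Hello, World!")` yields console, log, hello, world.
	tokens, err := m.AnalyzeText("repoIndexerAnalyzer", []byte(`console.log("Hello, World!")`))
	if err != nil {
		panic(err)
	}
	for _, tok := range tokens {
		fmt.Println(string(tok.Term))
	}
}
```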

modules/indexer/code/elasticsearch/elasticsearch.go

@@ -30,7 +30,7 @@ import (
 )

 const (
-	esRepoIndexerLatestVersion = 2
+	esRepoIndexerLatestVersion = 3
 	// multi-match-types, currently only 2 types are used
 	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
 	esMultiMatchTypeBestFields = "best_fields"
@@ -60,6 +60,10 @@ const (
 		"settings": {
 			"analysis": {
 				"analyzer": {
+					"content_analyzer": {
+						"tokenizer": "content_tokenizer",
+						"filter": ["lowercase"]
+					},
 					"filename_path_analyzer": {
 						"tokenizer": "path_tokenizer"
 					},
@@ -68,6 +72,10 @@ const (
 					}
 				},
 				"tokenizer": {
+					"content_tokenizer": {
+						"type": "simple_pattern_split",
+						"pattern": "[^a-zA-Z0-9]"
+					},
 					"path_tokenizer": {
 						"type": "path_hierarchy",
 						"delimiter": "/"
@@ -104,7 +112,8 @@ const (
 			"content": {
 				"type": "text",
 				"term_vector": "with_positions_offsets",
-				"index": true
+				"index": true,
+				"analyzer": "content_analyzer"
 			},
 			"commit_id": {
 				"type": "keyword",

modules/indexer/code/indexer_test.go

@@ -181,6 +181,55 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
 				},
 			},
 		},
+		// Search for matches on the contents of files regardless of case.
+		{
+			RepoIDs: nil,
+			Keyword: "dESCRIPTION",
+			Langs:   1,
+			Results: []codeSearchResult{
+				{
+					Filename: "README.md",
+					Content:  "# repo1\n\nDescription for repo1",
+				},
+			},
+		},
+		// Search for an exact match on the filename within the repo '62' (case insensitive).
+		// This scenario yields a single result (the file avocado.md on the repo '62')
+		{
+			RepoIDs: []int64{62},
+			Keyword: "AVOCADO.MD",
+			Langs:   1,
+			Results: []codeSearchResult{
+				{
+					Filename: "avocado.md",
+					Content:  "# repo1\n\npineaple pie of cucumber juice",
+				},
+			},
+		},
+		// Search for matches on the contents of files when the criteria is an expression.
+		{
+			RepoIDs: []int64{62},
+			Keyword: "console.log",
+			Langs:   1,
+			Results: []codeSearchResult{
+				{
+					Filename: "example-file.js",
+					Content:  "console.log(\"Hello, World!\")",
+				},
+			},
+		},
+		// Search for matches on the contents of files when the criteria is part of an expression.
+		{
+			RepoIDs: []int64{62},
+			Keyword: "log",
+			Langs:   1,
+			Results: []codeSearchResult{
+				{
+					Filename: "example-file.js",
+					Content:  "console.log(\"Hello, World!\")",
+				},
+			},
+		},
 	}

 	for _, kw := range keywords {

modules/indexer/internal/bleve/util.go

@@ -6,12 +6,13 @@ package bleve
 import (
 	"errors"
 	"os"
+	"unicode"

 	"code.gitea.io/gitea/modules/log"
 	"code.gitea.io/gitea/modules/util"

 	"github.com/blevesearch/bleve/v2"
-	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+	unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
 	"github.com/blevesearch/bleve/v2/index/upsidedown"
 	"github.com/ethantkoenig/rupture"
 )
@@ -57,7 +58,7 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
 // may be different on two strings and they can still be considered equivalent.
 // Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
 func GuessFuzzinessByKeyword(s string) int {
-	tokenizer := unicode.NewUnicodeTokenizer()
+	tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
 	tokens := tokenizer.Tokenize([]byte(s))
 	if len(tokens) > 0 {
@@ -77,8 +78,10 @@ func guessFuzzinessByKeyword(s string) int {
 	// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
 	// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
 	// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
+	// Likewise, queries whose terms contain characters that are *not* letters should not use fuzziness
 	for _, r := range s {
-		if r >= 128 {
+		if r >= 128 || !unicode.IsLetter(r) {
 			return 0
 		}
 	}
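
The effect of the new guard: any keyword containing a non-letter rune (for example `repo1` or `avocado.md`) now gets fuzziness 0, which is exactly what the new test cases below assert. A standalone sketch of the rule; the cap of 2 and the `/4` divisor are paraphrased from the comments in the hunk above, so treat the length heuristic as an approximation of the real function rather than its exact body:

```go
package main

import (
	"fmt"
	"unicode"
)

// guessFuzziness paraphrases guessFuzzinessByKeyword: non-ASCII or
// non-letter runes disable fuzziness entirely; otherwise the allowed
// Levenshtein distance grows with keyword length (one per 4 characters),
// capped at bleve's supported maximum of 2. Requires Go 1.21+ for min.
func guessFuzziness(s string) int {
	for _, r := range s {
		if r >= 128 || !unicode.IsLetter(r) {
			return 0
		}
	}
	return min(2, len(s)/4)
}

func main() {
	fmt.Println(guessFuzziness("console"))    // 1
	fmt.Println(guessFuzziness("repo1"))      // 0 (digit)
	fmt.Println(guessFuzziness("avocado.md")) // 0 (dot)
	fmt.Println(guessFuzziness("갃갃갃"))        // 0 (non-ASCII)
}
```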

modules/indexer/internal/bleve/util_test.go

@@ -35,6 +35,14 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
 			Input:     "갃갃갃",
 			Fuzziness: 0,
 		},
+		{
+			Input:     "repo1",
+			Fuzziness: 0,
+		},
+		{
+			Input:     "avocado.md",
+			Fuzziness: 0,
+		},
 	}

 	for _, scenario := range scenarios {

tests/gitea-repositories-meta/org42/search-by-path.git/description

@@ -4,5 +4,6 @@ This repository will be used to test code search. The snippet below shows its directory structure
 ├── avocado.md
 ├── cucumber.md
 ├── ham.md
-└── potato
-    └── ham.md
+├── potato
+|   └── ham.md
+└── example-file.js

tests/gitea-repositories-meta/org42/search-by-path.git/info/refs

@@ -3,7 +3,7 @@
 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
 78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
-3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
+9f894b61946fd2f7b8b9d8e370e4d62f915522f5 refs/heads/master
 62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
 4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
 3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits

tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs

@@ -1,2 +1,2 @@
-P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack
+P pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack

tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs

@@ -4,7 +4,7 @@
 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
 78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
-3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
+9f894b61946fd2f7b8b9d8e370e4d62f915522f5 refs/heads/master
 62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
 4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
 3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits
