diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 90e5e62bcb4..772317fa594 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -31,6 +31,7 @@ import ( "github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/letter" "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search/query" @@ -69,7 +70,7 @@ const ( filenameIndexerAnalyzer = "filenameIndexerAnalyzer" filenameIndexerTokenizer = "filenameIndexerTokenizer" repoIndexerDocType = "repoIndexerDocType" - repoIndexerLatestVersion = 7 + repoIndexerLatestVersion = 8 ) // generateBleveIndexMapping generates a bleve index mapping for the repo indexer @@ -105,7 +106,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ "type": analyzer_custom.Name, "char_filters": []string{}, - "tokenizer": unicode.Name, + "tokenizer": letter.Name, "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, }); err != nil { return nil, err diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 669a1bafcc9..1c4dd39eff0 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -30,7 +30,7 @@ import ( ) const ( - esRepoIndexerLatestVersion = 2 + esRepoIndexerLatestVersion = 3 // multi-match-types, currently only 2 types are used // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types esMultiMatchTypeBestFields = "best_fields" @@ -60,6 +60,10 @@ const ( "settings": { "analysis": { "analyzer": { + "content_analyzer": { + "tokenizer": "content_tokenizer", + "filter" : ["lowercase"] + }, "filename_path_analyzer": { "tokenizer": "path_tokenizer" }, @@ -68,6 +72,10 @@ const ( } }, "tokenizer": { + "content_tokenizer": { + "type": "simple_pattern_split", + "pattern": "[^a-zA-Z0-9]" + }, "path_tokenizer": { "type": "path_hierarchy", "delimiter": "/" @@ -104,7 +112,8 @@ const ( "content": { "type": "text", "term_vector": "with_positions_offsets", - "index": true + "index": true, + "analyzer": "content_analyzer" }, "commit_id": { "type": "keyword", diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 5b33528dcde..020ccc72f81 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -181,6 +181,55 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, + // Search for matches on the contents of files regardless of case. + { + RepoIDs: nil, + Keyword: "dESCRIPTION", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "README.md", + Content: "# repo1\n\nDescription for repo1", + }, + }, + }, + // Search for an exact match on the filename within the repo '62' (case insenstive). + // This scenario yields a single result (the file avocado.md on the repo '62') + { + RepoIDs: []int64{62}, + Keyword: "AVOCADO.MD", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "avocado.md", + Content: "# repo1\n\npineaple pie of cucumber juice", + }, + }, + }, + // Search for matches on the contents of files when the criteria is a expression. + { + RepoIDs: []int64{62}, + Keyword: "console.log", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "example-file.js", + Content: "console.log(\"Hello, World!\")", + }, + }, + }, + // Search for matches on the contents of files when the criteria is part of a expression. + { + RepoIDs: []int64{62}, + Keyword: "log", + Langs: 1, + Results: []codeSearchResult{ + { + Filename: "example-file.js", + Content: "console.log(\"Hello, World!\")", + }, + }, + }, } for _, kw := range keywords { diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index b426b39bc20..a0c3dc4ad45 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -6,12 +6,13 @@ package bleve import ( "errors" "os" + "unicode" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/util" "github.com/blevesearch/bleve/v2" - "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" + unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" "github.com/blevesearch/bleve/v2/index/upsidedown" "github.com/ethantkoenig/rupture" ) @@ -57,7 +58,7 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { // may be different on two string and they still be considered equivalent. // Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero. func GuessFuzzinessByKeyword(s string) int { - tokenizer := unicode.NewUnicodeTokenizer() + tokenizer := unicode_tokenizer.NewUnicodeTokenizer() tokens := tokenizer.Tokenize([]byte(s)) if len(tokens) > 0 { @@ -77,8 +78,10 @@ func guessFuzzinessByKeyword(s string) int { // according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2 // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. + // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness + for _, r := range s { - if r >= 128 { + if r >= 128 || !unicode.IsLetter(r) { return 0 } } diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go index ae0b12c08d4..8f7844464e7 100644 --- a/modules/indexer/internal/bleve/util_test.go +++ b/modules/indexer/internal/bleve/util_test.go @@ -35,6 +35,14 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) { Input: "갃갃갃", Fuzziness: 0, }, + { + Input: "repo1", + Fuzziness: 0, + }, + { + Input: "avocado.md", + Fuzziness: 0, + }, } for _, scenario := range scenarios { diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/description b/tests/gitea-repositories-meta/org42/search-by-path.git/description index 382e2d7f101..ffc40a9c48a 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/description +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/description @@ -4,5 +4,6 @@ This repository will be used to test code search. The snippet below shows its di ├── avocado.md ├── cucumber.md ├── ham.md -└── potato - └── ham.md +├── potato +| └── ham.md +└── example-file.js \ No newline at end of file diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs index 6b948c96a83..4adf83dda31 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/info/refs @@ -3,7 +3,7 @@ 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check -3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master +9f894b61946fd2f7b8b9d8e370e4d62f915522f5 refs/heads/master 62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update 4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check 3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph deleted file mode 100644 index b38715bb92b..00000000000 Binary files a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/commit-graph and /dev/null differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs index b2af8c8378a..9774923d2ea 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/info/packs @@ -1,2 +1,2 @@ -P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack +P pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap deleted file mode 100644 index 1fdef225e83..00000000000 Binary files a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.bitmap and /dev/null differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx deleted file mode 100644 index 0d930e7499f..00000000000 Binary files a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.idx and /dev/null differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev deleted file mode 100644 index 869860ba611..00000000000 Binary files a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.rev and /dev/null differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap new file mode 100644 index 00000000000..39c02c29877 Binary files /dev/null and b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.bitmap differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx new file mode 100644 index 00000000000..38d0e6b7226 Binary files /dev/null and b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.idx differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack similarity index 74% rename from tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack rename to tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack index f1aac1e7404..06c0a899f36 100644 Binary files a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack and b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.pack differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.rev b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.rev new file mode 100644 index 00000000000..b06ecca825a Binary files /dev/null and b/tests/gitea-repositories-meta/org42/search-by-path.git/objects/pack/pack-a7bef76cf6e2b46bc816936ab69306fb10aea571.rev differ diff --git a/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs b/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs index 70e69af1e10..2334e3da489 100644 --- a/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs +++ b/tests/gitea-repositories-meta/org42/search-by-path.git/packed-refs @@ -4,7 +4,7 @@ 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop 65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check -3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master +9f894b61946fd2f7b8b9d8e370e4d62f915522f5 refs/heads/master 62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update 4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check 3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits