mirror of https://github.com/go-gitea/gitea
Allow code search by filename (#32210)
This is a large and complex PR, so let me explain in detail its changes. First, I had to create new index mappings for Bleve and Elasticsearch as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also did an overhaul of the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. In matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com> pull/32239/head^2
parent
0fe5e2b08c
commit
900ac62251
@ -0,0 +1,101 @@ |
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package path |
||||
|
||||
import ( |
||||
"slices" |
||||
"strings" |
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis" |
||||
"github.com/blevesearch/bleve/v2/registry" |
||||
) |
||||
|
||||
// Name is the identifier under which this token filter is registered in the
// Bleve registry.
const Name = "gitea/path"
||||
|
||||
// TokenFilter is a stateless token filter that turns the tokens of a file path
// into hierarchical path terms.
type TokenFilter struct{}
||||
|
||||
func NewTokenFilter() *TokenFilter { |
||||
return &TokenFilter{} |
||||
} |
||||
|
||||
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) { |
||||
return NewTokenFilter(), nil |
||||
} |
||||
|
||||
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream { |
||||
if len(input) == 1 { |
||||
// if there is only one token, we dont need to generate the reversed chain
|
||||
return generatePathTokens(input, false) |
||||
} |
||||
|
||||
normal := generatePathTokens(input, false) |
||||
reversed := generatePathTokens(input, true) |
||||
|
||||
return append(normal, reversed...) |
||||
} |
||||
|
||||
// Generates path tokens from the input tokens.
|
||||
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
|
||||
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
|
||||
//
|
||||
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
|
||||
// to efficiently search for filenames without supplying the fullpath.
|
||||
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream { |
||||
terms := make([]string, 0, len(input)) |
||||
longestTerm := 0 |
||||
|
||||
if reversed { |
||||
slices.Reverse(input) |
||||
} |
||||
|
||||
for i := 0; i < len(input); i++ { |
||||
var sb strings.Builder |
||||
sb.WriteString(string(input[0].Term)) |
||||
|
||||
for j := 1; j < i; j++ { |
||||
sb.WriteString("/") |
||||
sb.WriteString(string(input[j].Term)) |
||||
} |
||||
|
||||
term := sb.String() |
||||
|
||||
if longestTerm < len(term) { |
||||
longestTerm = len(term) |
||||
} |
||||
|
||||
terms = append(terms, term) |
||||
} |
||||
|
||||
output := make(analysis.TokenStream, 0, len(terms)) |
||||
|
||||
for _, term := range terms { |
||||
var start, end int |
||||
|
||||
if reversed { |
||||
start = 0 |
||||
end = len(term) |
||||
} else { |
||||
start = longestTerm - len(term) |
||||
end = longestTerm |
||||
} |
||||
|
||||
token := analysis.Token{ |
||||
Position: 1, |
||||
Start: start, |
||||
End: end, |
||||
Type: analysis.AlphaNumeric, |
||||
Term: []byte(term), |
||||
} |
||||
|
||||
output = append(output, &token) |
||||
} |
||||
|
||||
return output |
||||
} |
||||
|
||||
func init() { |
||||
registry.RegisterTokenFilter(Name, TokenFilterConstructor) |
||||
} |
@ -0,0 +1,76 @@ |
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package path |
||||
|
||||
import ( |
||||
"fmt" |
||||
"testing" |
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis" |
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" |
||||
"github.com/stretchr/testify/assert" |
||||
) |
||||
|
||||
// Scenario holds an input string together with a list of tokens.
//
// NOTE(review): nothing in the visible part of this file references this type;
// confirm whether it is still needed.
type Scenario struct {
	Input  string
	Tokens []string
}
||||
|
||||
func TestTokenFilter(t *testing.T) { |
||||
scenarios := []struct { |
||||
Input string |
||||
Terms []string |
||||
}{ |
||||
{ |
||||
Input: "Dockerfile", |
||||
Terms: []string{"Dockerfile"}, |
||||
}, |
||||
{ |
||||
Input: "Dockerfile.rootless", |
||||
Terms: []string{"Dockerfile.rootless"}, |
||||
}, |
||||
{ |
||||
Input: "a/b/c/Dockerfile.rootless", |
||||
Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"}, |
||||
}, |
||||
{ |
||||
Input: "", |
||||
Terms: []string{}, |
||||
}, |
||||
} |
||||
|
||||
for _, scenario := range scenarios { |
||||
t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) { |
||||
terms := extractTerms(scenario.Input) |
||||
|
||||
assert.Len(t, terms, len(scenario.Terms)) |
||||
|
||||
for _, term := range terms { |
||||
assert.Contains(t, scenario.Terms, term) |
||||
} |
||||
}) |
||||
} |
||||
} |
||||
|
||||
func extractTerms(input string) []string { |
||||
tokens := tokenize(input) |
||||
filteredTokens := filter(tokens) |
||||
terms := make([]string, 0, len(filteredTokens)) |
||||
|
||||
for _, token := range filteredTokens { |
||||
terms = append(terms, string(token.Term)) |
||||
} |
||||
|
||||
return terms |
||||
} |
||||
|
||||
func filter(input analysis.TokenStream) analysis.TokenStream { |
||||
filter := NewTokenFilter() |
||||
return filter.Filter(input) |
||||
} |
||||
|
||||
func tokenize(input string) analysis.TokenStream { |
||||
tokenizer := unicode.NewUnicodeTokenizer() |
||||
return tokenizer.Tokenize([]byte(input)) |
||||
} |
@ -0,0 +1,45 @@ |
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve |
||||
|
||||
import ( |
||||
"fmt" |
||||
"testing" |
||||
|
||||
"github.com/stretchr/testify/assert" |
||||
) |
||||
|
||||
func TestBleveGuessFuzzinessByKeyword(t *testing.T) { |
||||
scenarios := []struct { |
||||
Input string |
||||
Fuzziness int // See util.go for the definition of fuzziness in this particular context
|
||||
}{ |
||||
{ |
||||
Input: "", |
||||
Fuzziness: 0, |
||||
}, |
||||
{ |
||||
Input: "Avocado", |
||||
Fuzziness: 1, |
||||
}, |
||||
{ |
||||
Input: "Geschwindigkeit", |
||||
Fuzziness: 2, |
||||
}, |
||||
{ |
||||
Input: "non-exist", |
||||
Fuzziness: 0, |
||||
}, |
||||
{ |
||||
Input: "갃갃갃", |
||||
Fuzziness: 0, |
||||
}, |
||||
} |
||||
|
||||
for _, scenario := range scenarios { |
||||
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) { |
||||
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input)) |
||||
}) |
||||
} |
||||
} |
@ -0,0 +1 @@ |
||||
|
@ -0,0 +1 @@ |
||||
ref: refs/heads/master |
@ -0,0 +1,4 @@ |
||||
[core] |
||||
repositoryformatversion = 0 |
||||
filemode = true |
||||
bare = true |
@ -0,0 +1,8 @@ |
||||
This repository will be used to test code search. The snippet below shows its directory structure:
||||
|
||||
. |
||||
├── avocado.md |
||||
├── cucumber.md |
||||
├── ham.md |
||||
└── potato |
||||
    └── ham.md
@ -0,0 +1,7 @@ |
||||
#!/usr/bin/env bash
# Runs, via sh, every entry listed in the post-receive.d directory located next
# to this script, from the directory the hook was started in.
ORI_DIR=$(pwd)
SHELL_FOLDER=$(cd "$(dirname "$0")"; pwd)
cd "$ORI_DIR"
for hook in $(ls "$SHELL_FOLDER/post-receive.d"); do
    sh "$SHELL_FOLDER/post-receive.d/$hook"
done
@ -0,0 +1,2 @@ |
||||
#!/usr/bin/env bash
# Invokes the gitea binary's "hook post-receive" subcommand with the configured config file.
"${GITEA_ROOT}/gitea" hook --config="${GITEA_ROOT}/${GITEA_CONF}" post-receive
@ -0,0 +1,7 @@ |
||||
#!/usr/bin/env bash
# Runs, via sh, every entry listed in the pre-receive.d directory located next
# to this script, from the directory the hook was started in.
ORI_DIR=$(pwd)
SHELL_FOLDER=$(cd "$(dirname "$0")"; pwd)
cd "$ORI_DIR"
for hook in $(ls "$SHELL_FOLDER/pre-receive.d"); do
    sh "$SHELL_FOLDER/pre-receive.d/$hook"
done
@ -0,0 +1,2 @@ |
||||
#!/usr/bin/env bash
# Invokes the gitea binary's "hook pre-receive" subcommand with the configured config file.
"${GITEA_ROOT}/gitea" hook --config="${GITEA_ROOT}/${GITEA_CONF}" pre-receive
@ -0,0 +1,7 @@ |
||||
#!/usr/bin/env bash
# Runs, via sh, every entry listed in the proc-receive.d directory located next
# to this script, from the directory the hook was started in.
ORI_DIR=$(pwd)
SHELL_FOLDER=$(cd "$(dirname "$0")"; pwd)
cd "$ORI_DIR"
for hook in $(ls "$SHELL_FOLDER/proc-receive.d"); do
    sh "$SHELL_FOLDER/proc-receive.d/$hook"
done
@ -0,0 +1,2 @@ |
||||
#!/usr/bin/env bash
# Invokes the gitea binary's "hook proc-receive" subcommand with the configured config file.
"${GITEA_ROOT}/gitea" hook --config="${GITEA_ROOT}/${GITEA_CONF}" proc-receive
@ -0,0 +1,7 @@ |
||||
#!/usr/bin/env bash
# Runs, via sh, every entry listed in the update.d directory located next to
# this script, forwarding the first three arguments of this hook to each one.
ORI_DIR=$(pwd)
SHELL_FOLDER=$(cd "$(dirname "$0")"; pwd)
cd "$ORI_DIR"
for hook in $(ls "$SHELL_FOLDER/update.d"); do
    sh "$SHELL_FOLDER/update.d/$hook" $1 $2 $3
done
@ -0,0 +1,2 @@ |
||||
#!/usr/bin/env bash
# Invokes the gitea binary's "hook update" subcommand with the configured config
# file, forwarding the first three arguments.
"${GITEA_ROOT}/gitea" hook --config="${GITEA_ROOT}/${GITEA_CONF}" update $1 $2 $3
@ -0,0 +1,6 @@ |
||||
# git ls-files --others --exclude-from=.git/info/exclude |
||||
# Lines that start with '#' are comments. |
||||
# For a project mostly in C, the following would be a good set of |
||||
# exclude patterns (uncomment them if you want to use them): |
||||
# *.[oa] |
||||
# *~ |
@ -0,0 +1,13 @@ |
||||
90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch |
||||
985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2 |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 |
||||
78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check |
||||
3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master |
||||
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update |
||||
4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check |
||||
3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits |
||||
4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head |
||||
5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head |
||||
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1 |
Binary file not shown.
@ -0,0 +1,2 @@ |
||||
P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack |
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,14 @@ |
||||
# pack-refs with: peeled fully-peeled sorted |
||||
90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch |
||||
985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2 |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1 |
||||
78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check |
||||
3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master |
||||
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update |
||||
4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check |
||||
3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits |
||||
4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head |
||||
5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head |
||||
62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head |
||||
65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1 |
Loading…
Reference in new issue