diff --git a/modules/git/languagestats/language_stats_gogit.go b/modules/git/languagestats/language_stats_gogit.go
index 418c05b15789f..504148b628004 100644
--- a/modules/git/languagestats/language_stats_gogit.go
+++ b/modules/git/languagestats/language_stats_gogit.go
@@ -7,6 +7,7 @@ package languagestats
 
 import (
 	"bytes"
+	"context"
 	"io"
 
 	"code.gitea.io/gitea/modules/analyze"
@@ -20,8 +21,8 @@ import (
 	"github.com/go-git/go-git/v5/plumbing/object"
 )
 
-// GetLanguageStats calculates language stats for git repository at specified commit
-func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) {
+// CalcLanguageStats calculates language stats for git repository at specified commit
+func CalcLanguageStats(ctx context.Context, repo *git_module.Repository, commitID string) (map[string]int64, error) {
 	r, err := git.PlainOpen(repo.Path)
 	if err != nil {
 		return nil, err
@@ -58,6 +59,13 @@ func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]
 	firstExcludedLanguageSize := int64(0)
 
 	err = tree.Files().ForEach(func(f *object.File) error {
+		// Abort the walk once ctx is cancelled; otherwise fall through and analyze f.
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
 		if f.Size == 0 {
 			return nil
 		}
diff --git a/modules/git/languagestats/language_stats_nogogit.go b/modules/git/languagestats/language_stats_nogogit.go
index 94cf9fff8c129..f8c446277ce97 100644
--- a/modules/git/languagestats/language_stats_nogogit.go
+++ b/modules/git/languagestats/language_stats_nogogit.go
@@ -7,6 +7,7 @@ package languagestats
 
 import (
 	"bytes"
+	"context"
 	"io"
 
 	"code.gitea.io/gitea/modules/analyze"
@@ -18,8 +19,8 @@ import (
 	"github.com/go-enry/go-enry/v2"
 )
 
-// GetLanguageStats calculates language stats for git repository at specified commit
-func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) {
+// CalcLanguageStats calculates language stats for git repository at specified commit
+func CalcLanguageStats(ctx context.Context, repo *git.Repository, commitID string) (map[string]int64, error) {
 	// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
 	// so let's create a batch stdin and stdout
 	batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
@@ -59,11 +60,6 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 
 	tree := commit.Tree
 
-	entries, err := tree.ListEntriesRecursiveWithSize()
-	if err != nil {
-		return nil, err
-	}
-
 	checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
 	if err != nil {
 		return nil, err
@@ -82,18 +78,12 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 	firstExcludedLanguage := ""
 	firstExcludedLanguageSize := int64(0)
 
-	for _, f := range entries {
-		select {
-		case <-repo.Ctx.Done():
-			return sizes, repo.Ctx.Err()
-		default:
-		}
-
+	if err := tree.IterateEntriesRecursive(ctx, func(ctx context.Context, f *git.TreeEntry) error {
 		contentBuf.Reset()
 		content = contentBuf.Bytes()
 
 		if f.Size() == 0 {
-			continue
+			return nil
 		}
 
 		isVendored := optional.None[bool]()
@@ -104,19 +94,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 		attrLinguistGenerated := optional.None[bool]()
 		if err == nil {
 			if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) {
-				continue
+				return nil
 			}
 
 			if attrLinguistGenerated = attrs.GetGenerated(); attrLinguistGenerated.ValueOrDefault(false) {
-				continue
+				return nil
 			}
 
 			if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) {
-				continue
+				return nil
 			}
 
 			if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) {
-				continue
+				return nil
 			}
 
 			if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" {
@@ -130,7 +120,7 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 
 				// this language will always be added to the size
 				sizes[language] += f.Size()
-				continue
+				return nil
 			}
 		}
 
@@ -138,19 +128,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 			enry.IsDotFile(f.Name()) ||
 			(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
 			enry.IsConfiguration(f.Name()) {
-			continue
+			return nil
 		}
 
 		// If content can not be read or file is too big just do detection by filename
 
 		if f.Size() <= bigFileSize {
 			if err := writeID(f.ID.String()); err != nil {
-				return nil, err
+				return err
 			}
 			_, _, size, err := git.ReadBatchLine(batchReader)
 			if err != nil {
 				log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
-				return nil, err
+				return err
 			}
 
 			sizeToRead := size
@@ -162,11 +152,11 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 
 			_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
 			if err != nil {
-				return nil, err
+				return err
 			}
 			content = contentBuf.Bytes()
 			if err := git.DiscardFull(batchReader, discard); err != nil {
-				return nil, err
+				return err
 			}
 		}
 
@@ -178,14 +168,14 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 			isGenerated = enry.IsGenerated(f.Name(), content)
 		}
 		if isGenerated {
-			continue
+			return nil
 		}
 
 		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
 		// - eg. do the all the detection tests using filename first before reading content.
 		language := analyze.GetCodeLanguage(f.Name(), content)
 		if language == "" {
-			continue
+			return nil
 		}
 
 		// group languages, such as Pug -> HTML; SCSS -> CSS
@@ -206,6 +196,9 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
 			firstExcludedLanguage = language
 			firstExcludedLanguageSize += f.Size()
 		}
+		return nil
+	}, git.TrustedCmdArgs{"--long"}); err != nil {
+		return sizes, err
 	}
 
 	// If there are no included languages add the first excluded language
diff --git a/modules/git/languagestats/language_stats_test.go b/modules/git/languagestats/language_stats_test.go
index b908ae6413d72..197be59c98b82 100644
--- a/modules/git/languagestats/language_stats_test.go
+++ b/modules/git/languagestats/language_stats_test.go
@@ -22,7 +22,7 @@ func TestRepository_GetLanguageStats(t *testing.T) {
 	require.NoError(t, err)
 	defer gitRepo.Close()
 
-	stats, err := GetLanguageStats(gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
+	stats, err := CalcLanguageStats(t.Context(), gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
 	require.NoError(t, err)
 
 	assert.Equal(t, map[string]int64{
diff --git a/modules/git/parse_nogogit.go b/modules/git/parse_nogogit.go
index 78a016288986a..d4c699dad7feb 100644
--- a/modules/git/parse_nogogit.go
+++ b/modules/git/parse_nogogit.go
@@ -22,6 +22,21 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
 // parseTreeEntries FIXME this function's design is not right, it should not make the caller read all data into memory
 func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
 	entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)
+	// Finish the iteration before building the return values: within a single return statement
+	// the order between reading "entries" and calling a function that appends to it is unspecified.
+	err := iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
+		entries = append(entries, entry)
+		return nil
+	})
+	if err != nil {
+		return nil, err
+	}
+	return entries, nil
+}
+
+// iterateTreeEntries parses the "git ls-tree" output in data line by line and calls f for each entry.
+// It stops at the first parse error or the first error returned by f and returns that error.
+func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
 	for pos := 0; pos < len(data); {
 		posEnd := bytes.IndexByte(data[pos:], '\n')
 		if posEnd == -1 {
@@ -33,7 +48,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
 		line := data[pos:posEnd]
 		lsTreeLine, err := parseLsTreeLine(line)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		entry := &TreeEntry{
 			ptree:     ptree,
@@ -44,9 +59,11 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
 			sized:     lsTreeLine.Size.Has(),
 		}
 		pos = posEnd + 1
-		entries = append(entries, entry)
+		if err := f(entry); err != nil {
+			return err
+		}
 	}
-	return entries, nil
+	return nil
 }
 
 func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {
diff --git a/modules/git/tree_nogogit.go b/modules/git/tree_nogogit.go
index f88788418e27d..92357b9171e26 100644
--- a/modules/git/tree_nogogit.go
+++ b/modules/git/tree_nogogit.go
@@ -6,6 +6,8 @@
 package git
 
 import (
+	"bufio"
+	"context"
 	"io"
 	"strings"
 )
@@ -122,3 +124,48 @@ func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
 func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
 	return t.listEntriesRecursive(TrustedCmdArgs{"--long"})
 }
+
+// IterateEntriesRecursive runs "git ls-tree -t -r" on the current tree and calls f for every entry,
+// including the entries of all subtrees, while the command output is still being streamed.
+// extraArgs are appended to the ls-tree arguments, e.g. "--long" (or "-l") to get the size, which is slower.
+// It returns the first error from f, ctx.Err() once ctx is done, or the error of the git command.
+func (t *Tree) IterateEntriesRecursive(ctx context.Context, f func(ctx context.Context, entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
+	reader, writer := io.Pipe()
+	// Closing the read end makes pending writes of the command fail, so the goroutine below
+	// always finishes, even when the iteration is aborted early.
+	defer reader.Close()
+
+	// Buffered so that the goroutine can always deliver its result and exit.
+	done := make(chan error, 1)
+	go func() {
+		runErr := NewCommand("ls-tree", "-t", "-r").
+			AddArguments(extraArgs...).
+			AddDynamicArguments(t.ID.String()).
+			Run(ctx, &RunOpts{
+				Dir:    t.repo.Path,
+				Stdout: writer,
+			})
+		// Wake up the scanner: a nil runErr is seen as io.EOF, anything else as a read error.
+		_ = writer.CloseWithError(runErr)
+		done <- runErr
+	}()
+
+	scanner := bufio.NewScanner(reader)
+	for scanner.Scan() {
+		if err := iterateTreeEntries(scanner.Bytes(), t, func(entry *TreeEntry) error {
+			if err := f(ctx, entry); err != nil {
+				return err
+			}
+			// Stop between entries as soon as ctx is done; nil otherwise.
+			return ctx.Err()
+		}); err != nil {
+			return err
+		}
+	}
+	// Scan returns false on EOF and on failure alike, so the error is only meaningful here.
+	if err := scanner.Err(); err != nil {
+		return err
+	}
+	// All output has been consumed; report how the git command itself ended.
+	return <-done
+}
diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go
index 199d493e97d21..f72c6c1150b86 100644
--- a/modules/indexer/stats/db.go
+++ b/modules/indexer/stats/db.go
@@ -63,7 +63,7 @@ func (db *DBIndexer) Index(id int64) error {
 	}
 
 	// Calculate and save language statistics to database
-	stats, err := languagestats.GetLanguageStats(gitRepo, commitID)
+	stats, err := languagestats.CalcLanguageStats(ctx, gitRepo, commitID)
 	if err != nil {
 		if !setting.IsInTesting {
 			log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)