Skip to content

Reduce memory usage when calculating repository languages #34605

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions modules/git/languagestats/language_stats_gogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package languagestats

import (
"bytes"
"context"
"io"

"code.gitea.io/gitea/modules/analyze"
Expand All @@ -20,8 +21,8 @@ import (
"github.com/go-git/go-git/v5/plumbing/object"
)

// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) {
// CalcLanguageStats calculates language stats for git repository at specified commit
func CalcLanguageStats(ctx context.Context, repo *git_module.Repository, commitID string) (map[string]int64, error) {
r, err := git.PlainOpen(repo.Path)
if err != nil {
return nil, err
Expand Down Expand Up @@ -58,6 +59,13 @@ func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]
firstExcludedLanguageSize := int64(0)

err = tree.Files().ForEach(func(f *object.File) error {
// Abort the walk as soon as the caller's context is canceled. The default
// branch must stay empty: returning nil here would end the callback for
// every file and leave the analysis below unreachable.
select {
case <-ctx.Done():
	return ctx.Err()
default:
}

if f.Size == 0 {
return nil
}
Expand Down
47 changes: 20 additions & 27 deletions modules/git/languagestats/language_stats_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package languagestats

import (
"bytes"
"context"
"io"

"code.gitea.io/gitea/modules/analyze"
Expand All @@ -18,8 +19,8 @@ import (
"github.com/go-enry/go-enry/v2"
)

// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) {
// CalcLanguageStats calculates language stats for git repository at specified commit
func CalcLanguageStats(ctx context.Context, repo *git.Repository, commitID string) (map[string]int64, error) {
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
// so let's create a batch stdin and stdout
batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
Expand Down Expand Up @@ -59,11 +60,6 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,

tree := commit.Tree

entries, err := tree.ListEntriesRecursiveWithSize()
if err != nil {
return nil, err
}

checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
if err != nil {
return nil, err
Expand All @@ -82,18 +78,12 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0)

for _, f := range entries {
select {
case <-repo.Ctx.Done():
return sizes, repo.Ctx.Err()
default:
}

if err := tree.IterateEntriesRecursive(ctx, func(ctx context.Context, f *git.TreeEntry) error {
contentBuf.Reset()
content = contentBuf.Bytes()

if f.Size() == 0 {
continue
return nil
}

isVendored := optional.None[bool]()
Expand All @@ -104,19 +94,19 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
attrLinguistGenerated := optional.None[bool]()
if err == nil {
if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) {
continue
return nil
}

if attrLinguistGenerated = attrs.GetGenerated(); attrLinguistGenerated.ValueOrDefault(false) {
continue
return nil
}

if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) {
continue
return nil
}

if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) {
continue
return nil
}

if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" {
Expand All @@ -130,27 +120,27 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,

// this language will always be added to the size
sizes[language] += f.Size()
continue
return nil
}
}

if (!isVendored.Has() && analyze.IsVendor(f.Name())) ||
enry.IsDotFile(f.Name()) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
enry.IsConfiguration(f.Name()) {
continue
return nil
}

// If content can not be read or file is too big just do detection by filename

if f.Size() <= bigFileSize {
if err := writeID(f.ID.String()); err != nil {
return nil, err
return err
}
_, _, size, err := git.ReadBatchLine(batchReader)
if err != nil {
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err
return err
}

sizeToRead := size
Expand All @@ -162,11 +152,11 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,

_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil {
return nil, err
return err
}
content = contentBuf.Bytes()
if err := git.DiscardFull(batchReader, discard); err != nil {
return nil, err
return err
}
}

Expand All @@ -178,14 +168,14 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
isGenerated = enry.IsGenerated(f.Name(), content)
}
if isGenerated {
continue
return nil
}

// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
if language == "" {
continue
return nil
}

// group languages, such as Pug -> HTML; SCSS -> CSS
Expand All @@ -206,6 +196,9 @@ func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64,
firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size()
}
return nil
}, git.TrustedCmdArgs{"--long"}); err != nil {
return sizes, err
}

// If there are no included languages add the first excluded language
Expand Down
2 changes: 1 addition & 1 deletion modules/git/languagestats/language_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestRepository_GetLanguageStats(t *testing.T) {
require.NoError(t, err)
defer gitRepo.Close()

stats, err := GetLanguageStats(gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
stats, err := CalcLanguageStats(t.Context(), gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
require.NoError(t, err)

assert.Equal(t, map[string]int64{
Expand Down
15 changes: 12 additions & 3 deletions modules/git/parse_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ func ParseTreeEntries(data []byte) ([]*TreeEntry, error) {
// parseTreeEntries parses the ls-tree lines held in data and returns the
// resulting entries, each attached to ptree. On a parse error it returns nil
// entries together with that error.
// FIXME this function's design is not right, it should not make the caller read all data into memory
func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
	entries := make([]*TreeEntry, 0, bytes.Count(data, []byte{'\n'})+1)
	// Run the iteration as its own statement. The callback reassigns entries,
	// and Go does not specify whether a variable read or a function call is
	// evaluated first inside a single return expression list, so combining
	// them could return a stale (empty) slice header.
	err := iterateTreeEntries(data, ptree, func(entry *TreeEntry) error {
		entries = append(entries, entry)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return entries, nil
}

func iterateTreeEntries(data []byte, ptree *Tree, f func(entry *TreeEntry) error) error {
for pos := 0; pos < len(data); {
posEnd := bytes.IndexByte(data[pos:], '\n')
if posEnd == -1 {
Expand All @@ -33,7 +40,7 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
line := data[pos:posEnd]
lsTreeLine, err := parseLsTreeLine(line)
if err != nil {
return nil, err
return err
}
entry := &TreeEntry{
ptree: ptree,
Expand All @@ -44,9 +51,11 @@ func parseTreeEntries(data []byte, ptree *Tree) ([]*TreeEntry, error) {
sized: lsTreeLine.Size.Has(),
}
pos = posEnd + 1
entries = append(entries, entry)
if err := f(entry); err != nil {
return err
}
}
return entries, nil
return nil
}

func catBatchParseTreeEntries(objectFormat ObjectFormat, ptree *Tree, rd *bufio.Reader, sz int64) ([]*TreeEntry, error) {
Expand Down
49 changes: 49 additions & 0 deletions modules/git/tree_nogogit.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
package git

import (
"bufio"
"context"
"io"
"strings"
)
Expand Down Expand Up @@ -122,3 +124,50 @@ func (t *Tree) ListEntriesRecursiveFast() (Entries, error) {
func (t *Tree) ListEntriesRecursiveWithSize() (Entries, error) {
	// "--long" is forwarded as an extra argument to the recursive listing.
	longFormat := TrustedCmdArgs{"--long"}
	return t.listEntriesRecursive(longFormat)
}

// IterateEntriesRecursive streams the entries of the current tree, including
// all subtrees, to f one at a time instead of collecting them in memory.
// extraArgs are appended to "git ls-tree -t -r"; passing "-l"/"--long" makes
// git report sizes as well, which is slower.
// Iteration stops at the first error returned by f, when ctx is canceled, or
// when the git command fails, and that error is returned.
func (t *Tree) IterateEntriesRecursive(ctx context.Context, f func(ctx context.Context, entry *TreeEntry) error, extraArgs TrustedCmdArgs) error {
	reader, writer := io.Pipe()
	// Buffered so the goroutine can always hand over its result and exit,
	// even when this function has already returned early.
	done := make(chan error, 1)

	go func() {
		runErr := NewCommand("ls-tree", "-t", "-r").
			AddArguments(extraArgs...).
			AddDynamicArguments(t.ID.String()).
			Run(ctx, &RunOpts{
				Dir:    t.repo.Path,
				Stdout: writer,
			})

		// Surface a command failure to the reading side; with a nil error
		// this is a plain Close and the reader sees EOF.
		_ = writer.CloseWithError(runErr)

		done <- runErr
	}()

	// Closing the read side makes any pending or future write to the pipe
	// fail, so the command is not left blocked on its stdout when we return
	// before consuming all of its output.
	defer reader.Close()

	scanner := bufio.NewScanner(reader)
	for scanner.Scan() {
		if err := ctx.Err(); err != nil {
			return err
		}

		// Each scanned token is one ls-tree line, i.e. a single entry.
		if err := iterateTreeEntries(scanner.Bytes(), t, func(entry *TreeEntry) error {
			return f(ctx, entry)
		}); err != nil {
			return err
		}
	}

	// Scan only reports errors once it has returned false, so the check
	// belongs after the loop. This also yields the command error passed to
	// CloseWithError above.
	if err := scanner.Err(); err != nil {
		return err
	}

	// All output was consumed; wait for the command's final status.
	return <-done
}
2 changes: 1 addition & 1 deletion modules/indexer/stats/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func (db *DBIndexer) Index(id int64) error {
}

// Calculate and save language statistics to database
stats, err := languagestats.GetLanguageStats(gitRepo, commitID)
stats, err := languagestats.CalcLanguageStats(ctx, gitRepo, commitID)
if err != nil {
if !setting.IsInTesting {
log.Error("Unable to get language stats for ID %s for default branch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.FullName(), err)
Expand Down
Loading