From ead34d9e695e62b37c734fcc82f57fbfe3add27a Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 21 Mar 2025 01:21:40 +0000 Subject: [PATCH 1/2] Fix issue #9: symbolic link error --- cmd/symlink_test.go | 47 +++++ prompt/prompt.go | 354 +++++++++++++++++----------------- test_symlink/public/storage | 1 + test_symlink/storage/test.txt | 1 + 4 files changed, 228 insertions(+), 175 deletions(-) create mode 100644 cmd/symlink_test.go create mode 100644 test_symlink/public/storage create mode 100644 test_symlink/storage/test.txt diff --git a/cmd/symlink_test.go b/cmd/symlink_test.go new file mode 100644 index 0000000..0f7d927 --- /dev/null +++ b/cmd/symlink_test.go @@ -0,0 +1,47 @@ +package cmd + +import ( + "path/filepath" + "testing" + + "github.com/chand1012/git2gpt/prompt" +) + +func TestSymlinkHandling(t *testing.T) { + // Create a temporary directory for the test + testDir := "/workspace/test_symlink" + + // Generate an ignore list + ignoreList := prompt.GenerateIgnoreList(testDir, "", true) + + // Process the repository + repo, err := prompt.ProcessGitRepo(testDir, ignoreList) + if err != nil { + t.Fatalf("Error processing repository with symlink: %v", err) + } + + // Verify that the repository was processed successfully + if repo == nil { + t.Fatal("Repository is nil") + } + + // Check if the test.txt file was included + found := false + for _, file := range repo.Files { + if file.Path == filepath.Join("storage", "test.txt") { + found = true + break + } + } + + if !found { + t.Fatal("Expected to find storage/test.txt in the repository") + } + + // Verify that the symlink was skipped + for _, file := range repo.Files { + if file.Path == filepath.Join("public", "storage") { + t.Fatal("Symlink should have been skipped") + } + } +} diff --git a/prompt/prompt.go b/prompt/prompt.go index 1c0a462..714cfb1 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -1,169 +1,169 @@ package prompt import ( - "bufio" - "encoding/json" - "encoding/xml" - "fmt" - "io" - "os" - "path/filepath" - "strings" - "unicode/utf8" - - "github.com/chand1012/git2gpt/utils" - "github.com/gobwas/glob" - "github.com/pkoukk/tiktoken-go" + "bufio" + "encoding/json" + "encoding/xml" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "unicode/utf8" + + "github.com/chand1012/git2gpt/utils" + "github.com/gobwas/glob" + "github.com/pkoukk/tiktoken-go" ) // GitFile is a file in a Git repository type GitFile struct { - Path string `json:"path" xml:"path"` // path to the file relative to the repository root - Tokens int64 `json:"tokens" xml:"tokens"` // number of tokens in the file - Contents string `json:"contents" xml:"contents"` // contents of the file + Path string `json:"path" xml:"path"` // path to the file relative to the repository root + Tokens int64 `json:"tokens" xml:"tokens"` // number of tokens in the file + Contents string `json:"contents" xml:"contents"` // contents of the file } // GitRepo is a Git repository type GitRepo struct { - TotalTokens int64 `json:"total_tokens" xml:"total_tokens"` - Files []GitFile `json:"files" xml:"files>file"` - FileCount int `json:"file_count" xml:"file_count"` + TotalTokens int64 `json:"total_tokens" xml:"total_tokens"` + Files []GitFile `json:"files" xml:"files>file"` + FileCount int `json:"file_count" xml:"file_count"` } // contains checks if a string is in a slice of strings func contains(s []string, e string) bool { - for _, a := range s { - if a == e { - return true - } - } - return false + for _, a := range s { + if a == e { + return true + } + } + return false } func getIgnoreList(ignoreFilePath string) ([]string, error) { - var ignoreList []string - file, err := os.Open(ignoreFilePath) - if err != nil { - return ignoreList, err - } - defer file.Close() - - scanner := bufio.NewScanner(file) - for scanner.Scan() { - line := strings.TrimSpace(scanner.Text()) - if line == "" || strings.HasPrefix(line, "#") { - continue - } - // if the line ends with a slash, add a globstar to the end - if strings.HasSuffix(line, "/") { - line = line + "**" - } - // remove all preceding slashes - line = strings.TrimPrefix(line, "/") - // line = filepath.FromSlash(line) - ignoreList = append(ignoreList, line) - } - return ignoreList, scanner.Err() + var ignoreList []string + file, err := os.Open(ignoreFilePath) + if err != nil { + return ignoreList, err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + // if the line ends with a slash, add a globstar to the end + if strings.HasSuffix(line, "/") { + line = line + "**" + } + // remove all preceding slashes + line = strings.TrimPrefix(line, "/") + // line = filepath.FromSlash(line) + ignoreList = append(ignoreList, line) + } + return ignoreList, scanner.Err() } func windowsToUnixPath(windowsPath string) string { - unixPath := strings.ReplaceAll(windowsPath, "\\", "/") - return unixPath + unixPath := strings.ReplaceAll(windowsPath, "\\", "/") + return unixPath } func shouldIgnore(filePath string, ignoreList []string) bool { - for _, pattern := range ignoreList { - g := glob.MustCompile(pattern, '/') - if g.Match(windowsToUnixPath(filePath)) { - return true - } - } - return false + for _, pattern := range ignoreList { + g := glob.MustCompile(pattern, '/') + if g.Match(windowsToUnixPath(filePath)) { + return true + } + } + return false } // GenerateIgnoreList generates a list of ignore patterns from the .gptignore file and the .gitignore file. Returns a slice of strings. Will return an empty slice if no ignore files exist. func GenerateIgnoreList(repoPath, ignoreFilePath string, useGitignore bool) []string { - if ignoreFilePath == "" { - ignoreFilePath = filepath.Join(repoPath, ".gptignore") - } - - var ignoreList []string - if _, err := os.Stat(ignoreFilePath); err == nil { - // .gptignore file exists - ignoreList, _ = getIgnoreList(ignoreFilePath) - } - ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore") - - if useGitignore { - gitignorePath := filepath.Join(repoPath, ".gitignore") - if _, err := os.Stat(gitignorePath); err == nil { - // .gitignore file exists - gitignoreList, _ := getIgnoreList(gitignorePath) - ignoreList = append(ignoreList, gitignoreList...) - } - } - - var finalIgnoreList []string - // loop through the ignore list and remove any duplicates - // also check if any pattern is a directory and add a globstar to the end - for _, pattern := range ignoreList { - if !contains(finalIgnoreList, pattern) { - // check if the pattern is a directory - info, err := os.Stat(filepath.Join(repoPath, pattern)) - if err == nil && info.IsDir() { - pattern = filepath.Join(pattern, "**") - } - finalIgnoreList = append(finalIgnoreList, pattern) - } - } - - return finalIgnoreList + if ignoreFilePath == "" { + ignoreFilePath = filepath.Join(repoPath, ".gptignore") + } + + var ignoreList []string + if _, err := os.Stat(ignoreFilePath); err == nil { + // .gptignore file exists + ignoreList, _ = getIgnoreList(ignoreFilePath) + } + ignoreList = append(ignoreList, ".git/**", ".gitignore", ".gptignore") + + if useGitignore { + gitignorePath := filepath.Join(repoPath, ".gitignore") + if _, err := os.Stat(gitignorePath); err == nil { + // .gitignore file exists + gitignoreList, _ := getIgnoreList(gitignorePath) + ignoreList = append(ignoreList, gitignoreList...) + } + } + + var finalIgnoreList []string + // loop through the ignore list and remove any duplicates + // also check if any pattern is a directory and add a globstar to the end + for _, pattern := range ignoreList { + if !contains(finalIgnoreList, pattern) { + // check if the pattern is a directory + info, err := os.Stat(filepath.Join(repoPath, pattern)) + if err == nil && info.IsDir() { + pattern = filepath.Join(pattern, "**") + } + finalIgnoreList = append(finalIgnoreList, pattern) + } + } + + return finalIgnoreList } // ProcessGitRepo processes a Git repository and returns a GitRepo object func ProcessGitRepo(repoPath string, ignoreList []string) (*GitRepo, error) { - var repo GitRepo + var repo GitRepo - err := processRepository(repoPath, ignoreList, &repo) - if err != nil { - return nil, fmt.Errorf("error processing repository: %w", err) - } + err := processRepository(repoPath, ignoreList, &repo) + if err != nil { + return nil, fmt.Errorf("error processing repository: %w", err) + } - return &repo, nil + return &repo, nil } // OutputGitRepo outputs a Git repository to a text file func OutputGitRepo(repo *GitRepo, preambleFile string, scrubComments bool) (string, error) { - var repoBuilder strings.Builder - - if preambleFile != "" { - preambleText, err := os.ReadFile(preambleFile) - if err != nil { - return "", fmt.Errorf("error reading preamble file: %w", err) - } - repoBuilder.WriteString(fmt.Sprintf("%s\n", string(preambleText))) - } else { - repoBuilder.WriteString("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n") - } - - // write the files to the repoBuilder here - for _, file := range repo.Files { - repoBuilder.WriteString("----\n") - repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path)) - if scrubComments { - file.Contents = utils.RemoveCodeComments(file.Contents) - } - repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents)) - } - - repoBuilder.WriteString("--END--") - - output := repoBuilder.String() - - repo.TotalTokens = EstimateTokens(output) - - return output, nil + var repoBuilder strings.Builder + + if preambleFile != "" { + preambleText, err := os.ReadFile(preambleFile) + if err != nil { + return "", fmt.Errorf("error reading preamble file: %w", err) + } + repoBuilder.WriteString(fmt.Sprintf("%s\n", string(preambleText))) + } else { + repoBuilder.WriteString("The following text is a Git repository with code. The structure of the text are sections that begin with ----, followed by a single line containing the file path and file name, followed by a variable amount of lines containing the file contents. The text representing the Git repository ends when the symbols --END-- are encounted. Any further text beyond --END-- are meant to be interpreted as instructions using the aforementioned Git repository as context.\n") + } + + // write the files to the repoBuilder here + for _, file := range repo.Files { + repoBuilder.WriteString("----\n") + repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Path)) + if scrubComments { + file.Contents = utils.RemoveCodeComments(file.Contents) + } + repoBuilder.WriteString(fmt.Sprintf("%s\n", file.Contents)) + } + + repoBuilder.WriteString("--END--") + + output := repoBuilder.String() + + repo.TotalTokens = EstimateTokens(output) + + return output, nil } func OutputGitRepoXML(repo *GitRepo, scrubComments bool) (string, error) { @@ -241,59 +241,63 @@ func ValidateXML(xmlString string) error { func MarshalRepo(repo *GitRepo, scrubComments bool) ([]byte, error) { - // run the output function to get the total tokens - _, err := OutputGitRepo(repo, "", scrubComments) - if err != nil { - return nil, fmt.Errorf("error marshalling repo: %w", err) - } - return json.Marshal(repo) + // run the output function to get the total tokens + _, err := OutputGitRepo(repo, "", scrubComments) + if err != nil { + return nil, fmt.Errorf("error marshalling repo: %w", err) + } + return json.Marshal(repo) } func processRepository(repoPath string, ignoreList []string, repo *GitRepo) error { - err := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - if !info.IsDir() { - relativeFilePath, _ := filepath.Rel(repoPath, path) - ignore := shouldIgnore(relativeFilePath, ignoreList) - // fmt.Println(relativeFilePath, ignore) - if !ignore { - contents, err := os.ReadFile(path) - // if the file is not valid UTF-8, skip it - if !utf8.Valid(contents) { - return nil - } - if err != nil { - return err - } - var file GitFile - file.Path = relativeFilePath - file.Contents = string(contents) - file.Tokens = EstimateTokens(file.Contents) - repo.Files = append(repo.Files, file) - } - } - return nil - }) - - repo.FileCount = len(repo.Files) - - if err != nil { - return fmt.Errorf("error walking the path %q: %w", repoPath, err) - } - - return nil + err := filepath.Walk(repoPath, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + // Skip symbolic links to avoid issues with directory symlinks (like Laravel's storage link) + if info.Mode()&os.ModeSymlink != 0 { + return nil + } + if !info.IsDir() { + relativeFilePath, _ := filepath.Rel(repoPath, path) + ignore := shouldIgnore(relativeFilePath, ignoreList) + // fmt.Println(relativeFilePath, ignore) + if !ignore { + contents, err := os.ReadFile(path) + // if the file is not valid UTF-8, skip it + if !utf8.Valid(contents) { + return nil + } + if err != nil { + return err + } + var file GitFile + file.Path = relativeFilePath + file.Contents = string(contents) + file.Tokens = EstimateTokens(file.Contents) + repo.Files = append(repo.Files, file) + } + } + return nil + }) + + repo.FileCount = len(repo.Files) + + if err != nil { + return fmt.Errorf("error walking the path %q: %w", repoPath, err) + } + + return nil } // EstimateTokens estimates the number of tokens in a string func EstimateTokens(output string) int64 { - tke, err := tiktoken.GetEncoding("cl100k_base") - if err != nil { - fmt.Println("Error getting encoding:", err) - return 0 - } - - tokens := tke.Encode(output, nil, nil) - return int64(len(tokens)) + tke, err := tiktoken.GetEncoding("cl100k_base") + if err != nil { + fmt.Println("Error getting encoding:", err) + return 0 + } + + tokens := tke.Encode(output, nil, nil) + return int64(len(tokens)) } diff --git a/test_symlink/public/storage b/test_symlink/public/storage new file mode 100644 index 0000000..2b2bcac --- /dev/null +++ b/test_symlink/public/storage @@ -0,0 +1 @@ +/workspace/test_symlink/storage diff --git a/test_symlink/storage/test.txt b/test_symlink/storage/test.txt new file mode 100644 index 0000000..524acff --- /dev/null +++ b/test_symlink/storage/test.txt @@ -0,0 +1 @@ +Test file From 7aaa82e04b32ee9f548e8ac56cdd8464d8da3fd6 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 2 Jul 2025 14:57:45 +0000 Subject: [PATCH 2/2] Fix pr #14: Fix issue #9: symbolic link error --- cmd/complex_symlink_test.go | 71 +++++++++++++++++++++++++++ cmd/symlink_test.go | 11 ++++- prompt/prompt.go | 18 ++++++- test_complex_symlink/dir1/file1.txt | 1 + test_complex_symlink/dir2/file2.txt | 1 + test_complex_symlink/public/link1 | 1 + test_complex_symlink/public/link2.txt | 1 + test_symlink/public/storage | 2 +- 8 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 cmd/complex_symlink_test.go create mode 100644 test_complex_symlink/dir1/file1.txt create mode 100644 test_complex_symlink/dir2/file2.txt create mode 100644 test_complex_symlink/public/link1 create mode 100644 test_complex_symlink/public/link2.txt diff --git a/cmd/complex_symlink_test.go b/cmd/complex_symlink_test.go new file mode 100644 index 0000000..c02466a --- /dev/null +++ b/cmd/complex_symlink_test.go @@ -0,0 +1,71 @@ +package cmd + +import ( + "path/filepath" + "testing" + + "github.com/chand1012/git2gpt/prompt" +) + +func TestComplexSymlinkHandling(t *testing.T) { + // Create a temporary directory for the test + testDir := "/workspace/test_complex_symlink" + + // Generate an ignore list + ignoreList := prompt.GenerateIgnoreList(testDir, "", true) + + // Process the repository + repo, err := prompt.ProcessGitRepo(testDir, ignoreList) + if err != nil { + t.Fatalf("Error processing repository with symlinks: %v", err) + } + + // Verify that the repository was processed successfully + if repo == nil { + t.Fatal("Repository is nil") + } + + // Check if the regular files were included + file1Found := false + file2Found := false + for _, file := range repo.Files { + if file.Path == filepath.Join("dir1", "file1.txt") { + file1Found = true + } + if file.Path == filepath.Join("dir2", "file2.txt") { + file2Found = true + } + } + + if !file1Found { + t.Fatal("Expected to find dir1/file1.txt in the repository") + } + if !file2Found { + t.Fatal("Expected to find dir2/file2.txt in the repository") + } + + // Verify that the symlinks were included + link1Found := false + link2Found := false + for _, file := range repo.Files { + if file.Path == filepath.Join("public", "link1") { + link1Found = true + if file.Contents != "../dir1" { + t.Fatalf("Expected link1 content to be '../dir1', got '%s'", file.Contents) + } + } + if file.Path == filepath.Join("public", "link2.txt") { + link2Found = true + if file.Contents != "../dir2/file2.txt" { + t.Fatalf("Expected link2.txt content to be '../dir2/file2.txt', got '%s'", file.Contents) + } + } + } + + if !link1Found { + t.Fatal("Expected to find public/link1 in the repository") + } + if !link2Found { + t.Fatal("Expected to find public/link2.txt in the repository") + } +} diff --git a/cmd/symlink_test.go b/cmd/symlink_test.go index 0f7d927..0521e81 100644 --- a/cmd/symlink_test.go +++ b/cmd/symlink_test.go @@ -38,10 +38,17 @@ func TestSymlinkHandling(t *testing.T) { t.Fatal("Expected to find storage/test.txt in the repository") } - // Verify that the symlink was skipped + // Verify that the symlink was resolved + // The symlink itself should be included as a file + symlinkFound := false for _, file := range repo.Files { if file.Path == filepath.Join("public", "storage") { - t.Fatal("Symlink should have been skipped") + symlinkFound = true + break } } + + if !symlinkFound { + t.Fatal("Expected to find public/storage in the repository") + } } diff --git a/prompt/prompt.go b/prompt/prompt.go index 714cfb1..884cd5f 100644 --- a/prompt/prompt.go +++ b/prompt/prompt.go @@ -254,8 +254,24 @@ func processRepository(repoPath string, ignoreList []string, repo *GitRepo) erro if err != nil { return err } - // Skip symbolic links to avoid issues with directory symlinks (like Laravel's storage link) + // Handle symbolic links by including them as files if info.Mode()&os.ModeSymlink != 0 { + relativeFilePath, _ := filepath.Rel(repoPath, path) + ignore := shouldIgnore(relativeFilePath, ignoreList) + if !ignore { + // Get the target of the symlink + target, err := os.Readlink(path) + if err != nil { + return err + } + + // Create a file entry for the symlink itself + var file GitFile + file.Path = relativeFilePath + file.Contents = target // Store the symlink target as the content + file.Tokens = EstimateTokens(file.Contents) + repo.Files = append(repo.Files, file) + } return nil } if !info.IsDir() { diff --git a/test_complex_symlink/dir1/file1.txt b/test_complex_symlink/dir1/file1.txt new file mode 100644 index 0000000..870c88c --- /dev/null +++ b/test_complex_symlink/dir1/file1.txt @@ -0,0 +1 @@ +Test file 1 diff --git a/test_complex_symlink/dir2/file2.txt b/test_complex_symlink/dir2/file2.txt new file mode 100644 index 0000000..705e376 --- /dev/null +++ b/test_complex_symlink/dir2/file2.txt @@ -0,0 +1 @@ +Test file 2 diff --git a/test_complex_symlink/public/link1 b/test_complex_symlink/public/link1 new file mode 100644 index 0000000..37dfff0 --- /dev/null +++ b/test_complex_symlink/public/link1 @@ -0,0 +1 @@ +../dir1 diff --git a/test_complex_symlink/public/link2.txt b/test_complex_symlink/public/link2.txt new file mode 100644 index 0000000..06ce85d --- /dev/null +++ b/test_complex_symlink/public/link2.txt @@ -0,0 +1 @@ +../dir2/file2.txt diff --git a/test_symlink/public/storage b/test_symlink/public/storage index 2b2bcac..78978cf 100644 --- a/test_symlink/public/storage +++ b/test_symlink/public/storage @@ -1 +1 @@ -/workspace/test_symlink/storage +../storage