Skip to content

Commit 022a3b0

Browse files
authored
feat: add --compress for tree-sitter (#57)
1 parent 029cdc8 commit 022a3b0

25 files changed

+2898
-85
lines changed

.github/workflows/build-release-publish.yml

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ on:
1111
- 'README.md'
1212

1313
env:
14-
GO_VERSION: '1.22'
14+
GO_VERSION: '1.24.3' # Updated to match toolchain
1515
BINARY_NAME: 'ingest'
1616

1717
permissions:
@@ -22,16 +22,22 @@ jobs:
2222
build:
2323
if: ${{ ! contains(github.event.head_commit.message, '[skip ci]') && ! contains(github.event.pull_request.title, '[skip ci]')}}
2424
name: Build
25-
runs-on: ubuntu-latest
2625
strategy:
2726
matrix:
28-
include:
27+
target:
2928
- os: darwin
3029
arch: arm64
30+
runner: macos-14
31+
c_compiler_package: ""
3132
- os: linux
3233
arch: amd64
33-
- os: linux
34-
arch: arm64
34+
runner: ubuntu-latest
35+
c_compiler_package: "build-essential"
36+
# - os: linux
37+
# arch: arm64
38+
# runner: ubuntu-latest-arm64 # Use native ARM64 runner
39+
# c_compiler_package: "build-essential" # Native compiler
40+
runs-on: ${{ matrix.target.runner }}
3541

3642
outputs:
3743
version: ${{ steps.set_version.outputs.new_tag }}
@@ -58,25 +64,32 @@ jobs:
5864
- name: Get dependencies
5965
run: go mod download
6066

67+
- name: Set up C compiler
68+
if: startsWith(matrix.target.runner, 'ubuntu') && matrix.target.c_compiler_package != ''
69+
run: |
70+
sudo apt-get update
71+
sudo apt-get install -y ${{ matrix.target.c_compiler_package }}
72+
6173
- name: Run tests
6274
run: go test -v ./...
6375

6476
- name: Build
6577
env:
66-
GOOS: ${{ matrix.os }}
67-
GOARCH: ${{ matrix.arch }}
78+
CGO_ENABLED: "1" # Explicitly enable CGo
79+
GOOS: ${{ matrix.target.os }}
80+
GOARCH: ${{ matrix.target.arch }}
6881
VERSION: ${{ steps.set_version.outputs.new_tag }}
6982

7083
run: |
71-
go build -v -ldflags "-X main.Version=$version" -o build/${{ env.BINARY_NAME }}-${{ matrix.os }}-${{ matrix.arch }} .
84+
go build -v -ldflags "-X main.Version=$VERSION" -o build/${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }} .
7285
ls -ltarh build/
7386
7487
- name: Upload artifact
7588
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4
7689
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
7790
with:
78-
name: ${{ env.BINARY_NAME }}-${{ matrix.os }}-${{ matrix.arch }}
79-
path: build/${{ env.BINARY_NAME }}-${{ matrix.os }}-${{ matrix.arch }}
91+
name: ${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }}
92+
path: build/${{ env.BINARY_NAME }}-${{ matrix.target.os }}-${{ matrix.target.arch }}
8093
retention-days: 90
8194

8295
release:

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ clean:
2929
$(GOCLEAN)
3030
rm -f $(BINARY_NAME)
3131

32+
lint:
33+
gofmt -s -w .
34+
3235
test:
3336
$(GOTEST) -v ./...
3437

README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ And ingest web URLs.
2020

2121
- Traverse directory structures and generate a tree view
2222
- Include/exclude files based on glob patterns
23+
- Compress code using Tree-sitter to extract key structural information while omitting implementation details
2324
- Estimate vRAM requirements and check model compatibility using another package I've created called [quantest](https://github.com/sammcj/quantest)
2425
- Pass output directly to LLMs such as Ollama or any OpenAI-compatible API for processing
2526
- Generate and include git diffs and logs
@@ -209,6 +210,44 @@ You can provide a prompt suffix to append to the generated prompt:
209210
ingest --llm -p "explain this code" /path/to/project
210211
```
211212

213+
## Code Compression with Tree-sitter
214+
215+
**Experimental**
216+
217+
Ingest can compress source code files by extracting key structural information while omitting implementation details. This is useful for reducing token usage while preserving the important parts of the code structure.
218+
219+
```shell
220+
ingest --compress /path/to/project
221+
```
222+
223+
The compression extracts:
224+
- Package/module declarations
225+
- Import statements
226+
- Function/method signatures (without bodies)
227+
- Class definitions (without method bodies)
228+
- Type definitions
229+
- Comments
230+
231+
Currently supported languages:
232+
- Go
233+
- Python
234+
- JavaScript (including arrow functions and ES6 module syntax)
235+
- Bash
236+
- C
237+
- CSS
238+
239+
Example of compressed JavaScript:
240+
241+
```
242+
// This is a JavaScript comment
243+
import { something } from 'module';
244+
export class MyJSClass { ... } // Body removed
245+
constructor(name) { ... } // Body removed
246+
greet(message) { ... } // Body removed
247+
export function myJSFunction(x, y) { ... } // Body removed
248+
const myArrowFunc = (a, b) => { ... } // Body removed
249+
```
250+
212251
## Web Crawling & Ingestion
213252

214253
Crawl with explicit web mode
@@ -272,6 +311,7 @@ These directories will be created automatically on first run, along with README
272311

273312
### Flags
274313

314+
- `--compress`: **New** Enable code compression using Tree-sitter to extract key structural information while omitting implementation details
275315
- `--config`: Opens the config file in the default editor
276316
- `--context`: Specify the context length for VRAM estimation
277317
- `--exclude-from-tree`: Exclude files/folders from the source tree based on exclude patterns

config/config.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ type OllamaConfig struct {
1818
}
1919

2020
type Config struct {
21-
Ollama []OllamaConfig `json:"ollama"`
22-
LLM LLMConfig `json:"llm"`
23-
AutoSave bool `json:"auto_save"`
21+
Ollama []OllamaConfig `json:"ollama"`
22+
LLM LLMConfig `json:"llm"`
23+
AutoSave bool `json:"auto_save"`
2424
}
2525

2626
type LLMConfig struct {

filesystem/filesystem.go

Lines changed: 60 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/fatih/color"
1717
"github.com/mitchellh/go-homedir"
1818
ignore "github.com/sabhiram/go-gitignore"
19+
"github.com/sammcj/ingest/internal/compressor"
1920
"github.com/sammcj/ingest/pdf"
2021
"github.com/sammcj/ingest/utils"
2122
)
@@ -28,10 +29,10 @@ type FileInfo struct {
2829

2930
// ExcludedInfo tracks excluded files and directories.
3031
type ExcludedInfo struct {
31-
Directories map[string]int // Directory path -> count of excluded files
32-
Extensions map[string]int // File extension -> count of excluded files
33-
TotalFiles int // Total number of excluded files
34-
Files []string // List of excluded files (if total ≤ 20)
32+
Directories map[string]int // Directory path -> count of excluded files
33+
Extensions map[string]int // File extension -> count of excluded files
34+
TotalFiles int // Total number of excluded files
35+
Files []string // List of excluded files (if total ≤ 20)
3536
}
3637

3738
type treeNode struct {
@@ -67,9 +68,9 @@ func ReadExcludePatterns(patternExclude string, noDefaultExcludes bool) ([]strin
6768
// If user has a default.glob, it overrides the default patterns
6869
if _, err := os.Stat(userDefaultGlob); err == nil {
6970
return readGlobFile(userDefaultGlob)
70-
}
71+
}
7172

72-
// Read other user-defined patterns
73+
// Read other user-defined patterns
7374
userPatterns, _ := readGlobFilesFromDir(userPatternsDir)
7475

7576
// Combine user patterns with default patterns (if not disabled)
@@ -102,7 +103,6 @@ func trackExcludedFile(excluded *ExcludedInfo, path string, mu *sync.Mutex) {
102103
}
103104
}
104105

105-
106106
func readGlobFile(filename string) ([]string, error) {
107107
file, err := os.Open(filename)
108108
if err != nil {
@@ -150,15 +150,15 @@ func trackExcludedDirectory(excluded *ExcludedInfo, path string, mu *sync.Mutex)
150150
excluded.Directories[path] = 0 // Initialize directory count
151151
}
152152

153-
func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, patternExclude string, includePriority, lineNumber, relativePaths, excludeFromTree, noCodeblock, noDefaultExcludes bool) (string, []FileInfo, *ExcludedInfo, error) {
153+
func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, patternExclude string, includePriority, lineNumber, relativePaths, excludeFromTree, noCodeblock, noDefaultExcludes bool, comp *compressor.GenericCompressor) (string, []FileInfo, *ExcludedInfo, error) {
154154
var files []FileInfo
155155
var mu sync.Mutex
156156
var wg sync.WaitGroup
157157

158158
excluded := &ExcludedInfo{
159159
Directories: make(map[string]int),
160160
Extensions: make(map[string]int),
161-
Files: make([]string, 0),
161+
Files: make([]string, 0),
162162
}
163163

164164
// Read exclude patterns
@@ -216,7 +216,7 @@ func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, p
216216
wg.Add(1)
217217
go func() {
218218
defer wg.Done()
219-
processFile(rootPath, relPath, filepath.Dir(rootPath), lineNumber, relativePaths, noCodeblock, &mu, &files)
219+
processFile(rootPath, relPath, filepath.Dir(rootPath), lineNumber, relativePaths, noCodeblock, &mu, &files, comp)
220220
}()
221221
} else {
222222
trackExcludedFile(excluded, rootPath, &mu)
@@ -259,7 +259,7 @@ func WalkDirectory(rootPath string, includePatterns, excludePatterns []string, p
259259
wg.Add(1)
260260
go func(path, relPath string, info os.FileInfo) {
261261
defer wg.Done()
262-
processFile(path, relPath, rootPath, lineNumber, relativePaths, noCodeblock, &mu, &files)
262+
processFile(path, relPath, rootPath, lineNumber, relativePaths, noCodeblock, &mu, &files, comp)
263263
}(path, relPath, info)
264264
}
265265

@@ -281,7 +281,7 @@ func shouldExcludePath(path string, excludePatterns []string, gitignore *ignore.
281281
for _, pattern := range excludePatterns {
282282
if match, _ := doublestar.Match(pattern, path); match {
283283
return true
284-
}
284+
}
285285
}
286286
return gitignore != nil && gitignore.MatchesPath(path)
287287
}
@@ -353,7 +353,7 @@ func isBinaryFile(filePath string) (bool, error) {
353353
n, err := file.Read(buffer)
354354
if err != nil && err != io.EOF {
355355
return false, err
356-
}
356+
}
357357

358358
// Use http.DetectContentType to determine the content type
359359
contentType := http.DetectContentType(buffer[:n])
@@ -371,7 +371,7 @@ func PrintDefaultExcludes() {
371371
fmt.Println(strings.Join(excludes, "\n"))
372372
}
373373

374-
func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo) {
374+
func processFile(path, relPath string, rootPath string, lineNumber, relativePaths, noCodeblock bool, mu *sync.Mutex, files *[]FileInfo, comp *compressor.GenericCompressor) {
375375
// Check if it's the root path being processed (explicitly provided file)
376376
isExplicitFile := path == rootPath
377377

@@ -428,13 +428,36 @@ func processFile(path, relPath string, rootPath string, lineNumber, relativePath
428428
}
429429

430430
code := string(content)
431+
432+
// Attempt compression if compressor is provided and it's not a PDF
433+
if comp != nil && !isPDF {
434+
langID, err := compressor.IdentifyLanguage(path)
435+
if err == nil { // Language identified
436+
compressedCode, err := comp.Compress(content, langID)
437+
if err == nil {
438+
code = compressedCode
439+
// If compressed, we might not want to add line numbers or wrap in a generic code block
440+
// as the compressor might handle formatting. For now, let's assume compressed output
441+
// is final for this file's content.
442+
// We'll skip line numbering and code block wrapping for compressed content.
443+
goto skipFormatting
444+
} else {
445+
utils.PrintColouredMessage("⚠️", fmt.Sprintf("Compression failed for %s: %v. Using original content.", path, err), color.FgYellow)
446+
}
447+
} else {
448+
// Language not identified for compression, use original content
449+
utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Language not identified for compression for %s. Using original content.", path), color.FgBlue)
450+
}
451+
}
452+
431453
if lineNumber {
432454
code = addLineNumbers(code)
433455
}
434456
if !noCodeblock {
435457
code = wrapCodeBlock(code, filepath.Ext(path))
436458
}
437459

460+
skipFormatting:
438461
filePath := path
439462
if relativePaths {
440463
filePath = filepath.Join(filepath.Base(rootPath), relPath)
@@ -487,9 +510,9 @@ func generateTreeString(rootPath string, excludePatterns []string) (string, erro
487510
}
488511
if !found {
489512
newNode := &treeNode{
490-
name: part,
491-
isDir: true,
492-
excluded: true,
513+
name: part,
514+
isDir: true,
515+
excluded: true,
493516
}
494517
current.children = append(current.children, newNode)
495518
current = newNode
@@ -608,7 +631,7 @@ func isExcluded(path string, patterns []string) bool {
608631
return false
609632
}
610633

611-
func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock bool) (FileInfo, error) {
634+
func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock bool, comp *compressor.GenericCompressor) (FileInfo, error) {
612635
// Check if it's a PDF first
613636
isPDF, err := pdf.IsPDF(path)
614637
if err != nil {
@@ -635,13 +658,32 @@ func ProcessSingleFile(path string, lineNumber, relativePaths, noCodeblock bool)
635658
}
636659

637660
code := string(content)
661+
662+
// Attempt compression if compressor is provided and it's not a PDF
663+
if comp != nil && !isPDF {
664+
langID, err := compressor.IdentifyLanguage(path)
665+
if err == nil { // Language identified
666+
compressedCode, err := comp.Compress(content, langID)
667+
if err == nil {
668+
code = compressedCode
669+
// Skip standard formatting for compressed content
670+
goto skipSingleFileFormatting
671+
} else {
672+
utils.PrintColouredMessage("⚠️", fmt.Sprintf("Compression failed for %s: %v. Using original content.", path, err), color.FgYellow)
673+
}
674+
} else {
675+
utils.PrintColouredMessage("ℹ️", fmt.Sprintf("Language not identified for compression for %s. Using original content.", path), color.FgBlue)
676+
}
677+
}
678+
638679
if lineNumber {
639680
code = addLineNumbers(code)
640681
}
641682
if !noCodeblock {
642683
code = wrapCodeBlock(code, filepath.Ext(path))
643684
}
644685

686+
skipSingleFileFormatting:
645687
filePath := path
646688
if relativePaths {
647689
filePath = filepath.Base(path)

0 commit comments

Comments
 (0)