Skip to content

Commit 8706ff8

Browse files
committed
chore: more idiomatic copy/reader usage
Signed-off-by: Christopher Phillips <[email protected]>
1 parent e58e631 commit 8706ff8

File tree

3 files changed

+22
-39
lines changed

3 files changed

+22
-39
lines changed

syft/pkg/cataloger/ai/cataloger_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func TestGGUFCataloger(t *testing.T) {
7676
GGUFVersion: 3,
7777
TensorCount: 0,
7878
MetadataKeyValuesHash: "6e3d368066455ce4",
79-
Header: map[string]interface{}{
79+
RemainingKeyValues: map[string]interface{}{
8080
"general.some_random_kv": "foobar",
8181
},
8282
},
@@ -113,7 +113,7 @@ func TestGGUFCataloger(t *testing.T) {
113113
GGUFVersion: 3,
114114
TensorCount: 0,
115115
MetadataKeyValuesHash: "9dc6f23591062a27",
116-
Header: map[string]interface{}{
116+
RemainingKeyValues: map[string]interface{}{
117117
"gpt2.context_length": "1024",
118118
"gpt2.embedding_length": uint32(768),
119119
},

syft/pkg/cataloger/ai/parse_gguf.go

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,46 +14,35 @@ const (
1414
maxHeaderSize = 50 * 1024 * 1024 // 50MB for large tokenizer vocabularies
1515
)
1616

17-
// readHeader reads only the GGUF header (metadata) without reading tensor data
18-
// This is much more efficient than reading the entire file
19-
// The reader should be wrapped with io.LimitedReader to prevent OOM issues
20-
func readHeader(r io.Reader) ([]byte, error) {
21-
// Read initial chunk to determine header size
17+
// copyHeader copies the GGUF header from the reader to the writer.
18+
// It validates the magic number first, then copies the rest of the data.
19+
// The reader should be wrapped with io.LimitedReader to prevent OOM issues.
20+
func copyHeader(w io.Writer, r io.Reader) error {
21+
// Read initial chunk to validate magic number
2222
// GGUF format: magic(4) + version(4) + tensor_count(8) + metadata_kv_count(8) + metadata_kvs + tensors_info
2323
initialBuf := make([]byte, 24) // Enough for magic, version, tensor count, and kv count
2424
if _, err := io.ReadFull(r, initialBuf); err != nil {
25-
return nil, fmt.Errorf("failed to read GGUF header prefix: %w", err)
25+
return fmt.Errorf("failed to read GGUF header prefix: %w", err)
2626
}
2727

2828
// Verify magic number
2929
magic := binary.LittleEndian.Uint32(initialBuf[0:4])
3030
if magic != ggufMagicNumber {
31-
return nil, fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
31+
return fmt.Errorf("invalid GGUF magic number: 0x%08X", magic)
3232
}
3333

34-
// We need to read the metadata KV pairs to know the full header size
35-
// The io.LimitedReader wrapping this reader ensures we don't read more than maxHeaderSize
36-
headerData := make([]byte, 0, 1024*1024) // Start with 1MB capacity
37-
headerData = append(headerData, initialBuf...)
34+
// Write the initial buffer to the writer
35+
if _, err := w.Write(initialBuf); err != nil {
36+
return fmt.Errorf("failed to write GGUF header prefix: %w", err)
37+
}
3838

39-
// Read the rest of the header in larger chunks for efficiency
39+
// Copy the rest of the header from reader to writer
4040
// The LimitedReader will return EOF once maxHeaderSize is reached
41-
buf := make([]byte, 64*1024) // 64KB chunks
42-
for {
43-
n, err := r.Read(buf)
44-
if n > 0 {
45-
headerData = append(headerData, buf[:n]...)
46-
}
47-
if err == io.EOF {
48-
// Reached end of file or limit, we have all available data
49-
break
50-
}
51-
if err != nil {
52-
return nil, fmt.Errorf("failed to read GGUF header: %w", err)
53-
}
41+
if _, err := io.Copy(w, r); err != nil {
42+
return fmt.Errorf("failed to copy GGUF header: %w", err)
5443
}
5544

56-
return headerData, nil
45+
return nil
5746
}
5847

5948
// Helper to convert gguf_parser metadata to simpler types

syft/pkg/cataloger/ai/parse_gguf_model.go

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,6 @@ import (
2727
func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment, reader file.LocationReadCloser) ([]pkg.Package, []artifact.Relationship, error) {
2828
defer internal.CloseAndLogError(reader, reader.Path())
2929

30-
// Read and validate the GGUF file header using LimitedReader to prevent OOM
31-
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
32-
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
33-
headerData, err := readHeader(limitedReader)
34-
if err != nil {
35-
return nil, nil, fmt.Errorf("failed to read GGUF header: %w", err)
36-
}
37-
3830
// Create a temporary file for the library to parse
3931
// The library requires a file path, so we create a temp file
4032
tempFile, err := os.CreateTemp("", "syft-gguf-*.gguf")
@@ -44,10 +36,12 @@ func parseGGUFModel(_ context.Context, _ file.Resolver, _ *generic.Environment,
4436
tempPath := tempFile.Name()
4537
defer os.Remove(tempPath)
4638

47-
// Write the validated header data to temp file
48-
if _, err := tempFile.Write(headerData); err != nil {
39+
// Copy and validate the GGUF file header using LimitedReader to prevent OOM
40+
// We use LimitedReader to cap reads at maxHeaderSize (50MB)
41+
limitedReader := &io.LimitedReader{R: reader, N: maxHeaderSize}
42+
if err := copyHeader(tempFile, limitedReader); err != nil {
4943
tempFile.Close()
50-
return nil, nil, fmt.Errorf("failed to write to temp file: %w", err)
44+
return nil, nil, fmt.Errorf("failed to copy GGUF header: %w", err)
5145
}
5246
tempFile.Close()
5347

0 commit comments

Comments
 (0)