Skip to content

Commit c809ec5

Browse files
committed
feat(llama.cpp): estimate vram usage
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent bace651 commit c809ec5

File tree

7 files changed

+126
-21
lines changed

7 files changed

+126
-21
lines changed

core/cli/util.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@ import (
77

88
"github.com/rs/zerolog/log"
99

10+
gguf "github.com/gpustack/gguf-parser-go"
1011
cliContext "github.com/mudler/LocalAI/core/cli/context"
1112
"github.com/mudler/LocalAI/core/config"
1213
"github.com/mudler/LocalAI/core/gallery"
1314
"github.com/mudler/LocalAI/pkg/downloader"
14-
gguf "github.com/thxcode/gguf-parser-go"
1515
)
1616

1717
type UtilCMD struct {
@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
5151
log.Info().
5252
Any("eosTokenID", f.Tokenizer().EOSTokenID).
5353
Any("bosTokenID", f.Tokenizer().BOSTokenID).
54-
Any("modelName", f.Model().Name).
54+
Any("modelName", f.Metadata().Name).
5555
Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
5656

5757
log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")

core/config/gguf.go

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@ package config
33
import (
44
"strings"
55

6+
"github.com/mudler/LocalAI/pkg/xsysinfo"
67
"github.com/rs/zerolog/log"
78

8-
gguf "github.com/thxcode/gguf-parser-go"
9+
gguf "github.com/gpustack/gguf-parser-go"
910
)
1011

1112
type familyType uint8
@@ -23,6 +24,7 @@ const (
2324

2425
const (
2526
// defaultContextSize is the fallback context size.
// NOTE(review): its use site is outside the visible hunks — confirm where it is applied.
defaultContextSize = 1024
27+
// defaultNGPULayers is a deliberately huge layer count assigned to
// cfg.NGPULayers when no usable per-layer estimate is available, on the
// assumption that all layers should be offloaded.
defaultNGPULayers = 99999999
2628
)
2729

2830
type settingsConfig struct {
@@ -147,7 +149,7 @@ var knownTemplates = map[string]familyType{
147149
func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
148150

149151
if defaultCtx == 0 && cfg.ContextSize == nil {
150-
ctxSize := f.EstimateLLaMACppUsage().ContextSize
152+
ctxSize := f.EstimateLLaMACppRun().ContextSize
151153
if ctxSize > 0 {
152154
cSize := int(ctxSize)
153155
cfg.ContextSize = &cSize
@@ -157,6 +159,41 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
157159
}
158160
}
159161

162+
// GPU options
163+
if cfg.Options == nil {
164+
if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
165+
cfg.Options = []string{"gpu"}
166+
}
167+
}
168+
169+
// vram estimation
170+
vram, err := xsysinfo.TotalAvailableVRAM()
171+
if err != nil {
172+
log.Error().Msgf("guessDefaultsFromFile: %s", err)
173+
}
174+
175+
estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
176+
if err != nil {
177+
log.Error().Msgf("guessDefaultsFromFile: %s", err)
178+
}
179+
180+
if estimate.IsFullOffload {
181+
log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
182+
}
183+
184+
if estimate.EstimatedVRAM > vram {
185+
log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
186+
}
187+
188+
if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
189+
cfg.NGPULayers = &estimate.EstimatedLayers
190+
} else {
191+
// we assume we want to offload all layers
192+
defaultHigh := defaultNGPULayers
193+
cfg.NGPULayers = &defaultHigh
194+
}
195+
196+
// template estimations
160197
if cfg.HasTemplate() {
161198
// nothing to guess here
162199
log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
@@ -166,12 +203,12 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
166203
log.Debug().
167204
Any("eosTokenID", f.Tokenizer().EOSTokenID).
168205
Any("bosTokenID", f.Tokenizer().BOSTokenID).
169-
Any("modelName", f.Model().Name).
206+
Any("modelName", f.Metadata().Name).
170207
Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
171208

172209
// guess the name
173210
if cfg.Name == "" {
174-
cfg.Name = f.Model().Name
211+
cfg.Name = f.Metadata().Name
175212
}
176213

177214
family := identifyFamily(f)
@@ -207,6 +244,7 @@ func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
207244
cfg.TemplateConfig.JinjaTemplate = true
208245
cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
209246
}
247+
210248
}
211249

212250
func identifyFamily(f *gguf.GGUFFile) familyType {
@@ -231,7 +269,7 @@ func identifyFamily(f *gguf.GGUFFile) familyType {
231269
commandR := arch == "command-r" && eosTokenID == 255001
232270
qwen2 := arch == "qwen2"
233271
phi3 := arch == "phi-3"
234-
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")
272+
gemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")
235273
deepseek2 := arch == "deepseek2"
236274

237275
switch {

core/config/guesser.go

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ import (
44
"os"
55
"path/filepath"
66

7-
"github.com/mudler/LocalAI/pkg/xsysinfo"
7+
gguf "github.com/gpustack/gguf-parser-go"
88
"github.com/rs/zerolog/log"
9-
gguf "github.com/thxcode/gguf-parser-go"
109
)
1110

1211
func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
@@ -36,10 +35,4 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
3635
}
3736
cfg.ContextSize = &defaultCtx
3837
}
39-
40-
if cfg.Options == nil {
41-
if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
42-
cfg.Options = []string{"gpu"}
43-
}
44-
}
4538
}

go.mod

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ require (
2727
github.com/golang/protobuf v1.5.4
2828
github.com/google/go-containerregistry v0.19.2
2929
github.com/google/uuid v1.6.0
30+
github.com/gpustack/gguf-parser-go v0.17.0
3031
github.com/grpc-ecosystem/grpc-gateway v1.5.0
3132
github.com/hpcloud/tail v1.0.0
3233
github.com/ipfs/go-log v1.0.5
@@ -110,6 +111,7 @@ require (
110111
github.com/pion/turn/v2 v2.1.6 // indirect
111112
github.com/pion/turn/v4 v4.0.0 // indirect
112113
github.com/pion/webrtc/v4 v4.0.9 // indirect
114+
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 // indirect
113115
github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee // indirect
114116
github.com/shirou/gopsutil/v4 v4.24.7 // indirect
115117
github.com/wlynxg/anet v0.0.5 // indirect
@@ -188,7 +190,7 @@ require (
188190
github.com/hashicorp/go-multierror v1.1.1 // indirect
189191
github.com/hashicorp/golang-lru v1.0.2 // indirect
190192
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
191-
github.com/henvic/httpretty v0.1.3 // indirect
193+
github.com/henvic/httpretty v0.1.4 // indirect
192194
github.com/huandu/xstrings v1.5.0 // indirect
193195
github.com/huin/goupnp v1.3.0 // indirect
194196
github.com/ipfs/boxo v0.27.4 // indirect
@@ -278,7 +280,7 @@ require (
278280
github.com/shoenig/go-m1cpu v0.1.6 // indirect
279281
github.com/shopspring/decimal v1.4.0 // indirect
280282
github.com/sirupsen/logrus v1.9.3 // indirect
281-
github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b // indirect
283+
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
282284
github.com/songgao/packets v0.0.0-20160404182456-549a10cd4091 // indirect
283285
github.com/spaolacci/murmur3 v1.1.0 // indirect
284286
github.com/spf13/cast v1.7.0 // indirect

go.sum

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@ github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
295295
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
296296
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
297297
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
298+
github.com/gpustack/gguf-parser-go v0.17.0 h1:DkSziWLsiQM0pqqkr/zMcaBn94KY7iQTi4zmaHixDus=
299+
github.com/gpustack/gguf-parser-go v0.17.0/go.mod h1:GvHh1Kvvq5ojCOsJ5UpwiJJmIjFw3Qk5cW7R+CZ3IJo=
298300
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
299301
github.com/grpc-ecosystem/grpc-gateway v1.5.0 h1:WcmKMm43DR7RdtlkEXQJyo5ws8iTp98CyhCCbOHMvNI=
300302
github.com/grpc-ecosystem/grpc-gateway v1.5.0/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw=
@@ -307,8 +309,8 @@ github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iP
307309
github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4=
308310
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
309311
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
310-
github.com/henvic/httpretty v0.1.3 h1:4A6vigjz6Q/+yAfTD4wqipCv+Px69C7Th/NhT0ApuU8=
311-
github.com/henvic/httpretty v0.1.3/go.mod h1:UUEv7c2kHZ5SPQ51uS3wBpzPDibg2U3Y+IaXyHy5GBg=
312+
github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU=
313+
github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM=
312314
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
313315
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
314316
github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI=
@@ -660,6 +662,8 @@ github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUc
660662
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
661663
github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII=
662664
github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o=
665+
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529 h1:18kd+8ZUlt/ARXhljq+14TwAoKa61q6dX8jtwOf6DH8=
666+
github.com/rs/dnscache v0.0.0-20230804202142-fc85eb664529/go.mod h1:qe5TWALJ8/a1Lqznoc5BDHpYX/8HU60Hm2AwRmqzxqA=
663667
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
664668
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
665669
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
@@ -712,8 +716,8 @@ github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic
712716
github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
713717
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
714718
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
715-
github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b h1:e9eeuSYSLmUKxy7ALzKcxo7ggTceQaVcBhjDIcewa9c=
716-
github.com/smallnest/ringbuffer v0.0.0-20240423223918-bab516b2000b/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
719+
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY=
720+
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
717721
github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
718722
github.com/smartystreets/assertions v1.13.0 h1:Dx1kYM01xsSqKPno3aqLnrwac2LetPvN23diwyr69Qs=
719723
github.com/smartystreets/assertions v1.13.0/go.mod h1:wDmR7qL282YbGsPy6H/yAsesrxfxaaSlJazyFLYVFx8=

pkg/xsysinfo/gguf.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package xsysinfo
2+
3+
import (
4+
"errors"
5+
6+
gguf "github.com/gpustack/gguf-parser-go"
7+
)
8+
9+
type VRAMEstimate struct {
10+
TotalVRAM uint64
11+
AvailableVRAM uint64
12+
ModelSize uint64
13+
EstimatedLayers int
14+
EstimatedVRAM uint64
15+
IsFullOffload bool
16+
}
17+
18+
func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
19+
// Get model metadata
20+
m := f.Metadata()
21+
a := f.Architecture()
22+
23+
// Calculate base model size
24+
modelSize := uint64(m.Size)
25+
26+
if a.BlockCount == 0 {
27+
return nil, errors.New("block count is 0")
28+
}
29+
30+
// Estimate number of layers that can fit in VRAM
31+
// Each layer typically requires about 1/32 of the model size
32+
layerSize := modelSize / uint64(a.BlockCount)
33+
estimatedLayers := int(availableVRAM / layerSize)
34+
35+
// If we can't fit even one layer, we need to do full offload
36+
isFullOffload := estimatedLayers <= 0
37+
if isFullOffload {
38+
estimatedLayers = 0
39+
}
40+
41+
// Calculate estimated VRAM usage
42+
estimatedVRAM := uint64(estimatedLayers) * layerSize
43+
44+
return &VRAMEstimate{
45+
TotalVRAM: availableVRAM,
46+
AvailableVRAM: availableVRAM,
47+
ModelSize: modelSize,
48+
EstimatedLayers: estimatedLayers,
49+
EstimatedVRAM: estimatedVRAM,
50+
IsFullOffload: isFullOffload,
51+
}, nil
52+
}

pkg/xsysinfo/gpu.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,22 @@ func GPUs() ([]*gpu.GraphicsCard, error) {
1616
return gpu.GraphicsCards, nil
1717
}
1818

19+
func TotalAvailableVRAM() (uint64, error) {
20+
gpus, err := GPUs()
21+
if err != nil {
22+
return 0, err
23+
}
24+
25+
var totalVRAM uint64
26+
for _, gpu := range gpus {
27+
if gpu.Node.Memory.TotalUsableBytes > 0 {
28+
totalVRAM += uint64(gpu.Node.Memory.TotalUsableBytes)
29+
}
30+
}
31+
32+
return totalVRAM, nil
33+
}
34+
1935
func HasGPU(vendor string) bool {
2036
gpus, err := GPUs()
2137
if err != nil {

0 commit comments

Comments
 (0)