llm: Avoid integer underflow on llama engine memory layout
On the llama engine, when we compute the memory layout, we reserve a buffer to allow some flexibility for incorrect estimates. This buffer is subtracted from each GPU's free memory, and on GPUs with limited memory the subtraction may underflow.

Fixes #13494
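For context, a minimal standalone sketch of the failure mode. The field name FreeMemory mirrors the diff below; the values and the main function are illustrative:

package main

import "fmt"

func main() {
	// Free memory is tracked as an unsigned quantity (uint64). Subtracting
	// a buffer larger than the remaining free memory does not go negative;
	// it wraps around to a near-maximal value.
	free := uint64(512 << 20) // a small GPU: 512 MiB free
	buffer := uint64(1 << 30) // layer-sized buffer: 1 GiB

	free -= buffer // underflow: wraps to roughly 2^64 bytes (16 EiB)
	fmt.Printf("free after subtraction: %d bytes (%.1f EiB)\n",
		free, float64(free)/(1<<60))
	// A scheduler reading this value would conclude the GPU has an
	// enormous amount of free memory and over-commit layers to it.
}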
@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 	// Use the size of one layer as a buffer
 	layers := s.ggml.Tensors().GroupLayers()
 	if blk0, ok := layers["blk.0"]; ok {
+		buffer := blk0.Size() + kv[0]
 		for i := range gpus {
-			gpus[i].FreeMemory -= blk0.Size() + kv[0]
+			if gpus[i].FreeMemory > buffer {
+				gpus[i].FreeMemory -= buffer
+			} else {
+				gpus[i].FreeMemory = 0
+			}
 		}
 	} else {
 		slog.Warn("model missing blk.0 layer size")
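The commit writes the clamp inline at each site. Extracted into a hypothetical helper (not part of this change), the pattern looks like this:

// subtractClamped returns free minus amount, clamping at zero instead
// of wrapping around when amount exceeds free. Hypothetical name; the
// commit writes this if/else inline at each call site.
func subtractClamped(free, amount uint64) uint64 {
	if free > amount {
		return free - amount
	}
	return 0
}

With such a helper, the loop body above would reduce to gpus[i].FreeMemory = subtractClamped(gpus[i].FreeMemory, buffer).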
@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
 			projectorGPU = firstIntegrated
 		}
 
-		gpus[projectorGPU].FreeMemory -= projectorWeights
+		if gpus[projectorGPU].FreeMemory > projectorWeights {
+			gpus[projectorGPU].FreeMemory -= projectorWeights
+		} else {
+			gpus[projectorGPU].FreeMemory = 0
+		}
 	}
 
 	var kvTotal uint64
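Note the strict comparison (>): when free memory exactly equals the amount being subtracted, control falls into the else branch and free memory becomes zero, which is the same result exact subtraction would give, so nothing is lost at the boundary. A small table-driven test sketch (hypothetical, reusing the subtractClamped helper above; the package name is assumed):

package llm // assumed package; placed wherever subtractClamped lives

import "testing"

func TestSubtractClamped(t *testing.T) {
	cases := []struct {
		free, amount, want uint64
	}{
		{free: 100, amount: 40, want: 60}, // plenty of headroom
		{free: 100, amount: 100, want: 0}, // boundary: else branch, same result
		{free: 100, amount: 200, want: 0}, // would underflow: clamped to zero
	}
	for _, c := range cases {
		if got := subtractClamped(c.free, c.amount); got != c.want {
			t.Errorf("subtractClamped(%d, %d) = %d, want %d",
				c.free, c.amount, got, c.want)
		}
	}
}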