Scaling Strategies
Note: Code examples in this guide are illustrative pseudocode showing recommended patterns. For working examples using the actual LLM4S API, see modules/samples.
Production patterns for handling high throughput, reducing latency, and distributing load. This guide covers caching, rate limiting, batch processing, distributed execution, queue-based processing, and inference optimization.
Caching Strategies
Request-Level Caching
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/** In-process request cache with a per-entry TTL, a size bound and hit/miss statistics. */
object RequestCaching {
  import scala.collection.mutable
  import scala.concurrent.duration._

  /** A cached value together with the data needed to decide when it goes stale.
    *
    * @param result    the cached payload
    * @param timestamp creation time in epoch milliseconds
    * @param ttl       how long the entry stays valid after `timestamp`
    */
  case class CachedResponse(
    result: String,
    timestamp: Long,
    ttl: Duration
  ) {
    /** True once more than `ttl` has elapsed since `timestamp`.
      *
      * A non-finite TTL (e.g. `Duration.Inf`) is treated as "never expires";
      * calling `toMillis` on it would throw.
      */
    def isExpired: Boolean = ttl match {
      case finite: FiniteDuration =>
        System.currentTimeMillis() - timestamp > finite.toMillis
      case _ => false
    }
  }

  /** Bounded cache that evicts the oldest-inserted entry when full.
    *
    * This class does no synchronisation of its own; wrap calls in a lock if it
    * is shared between threads.
    *
    * @param maxSize maximum number of entries kept at once; must be positive
    */
  class RequestCache(maxSize: Int = 1000) {
    require(maxSize > 0, s"maxSize must be positive, got $maxSize")

    private val cache = mutable.LinkedHashMap[String, CachedResponse]()
    private var hitCount = 0L
    private var missCount = 0L

    /** Looks up `key`, counting the lookup as a hit or a miss.
      *
      * An expired entry is removed and reported as a miss.
      *
      * @return the cached value, or `None` if absent or expired
      */
    def get(key: String): Option[String] =
      cache.get(key) match {
        case Some(entry) if !entry.isExpired =>
          hitCount += 1
          Some(entry.result)
        case Some(_) =>
          cache.remove(key)
          missCount += 1
          None
        case None =>
          missCount += 1
          None
      }

    /** Stores `value` under `key`, valid for `ttl` from now.
      *
      * When the cache is full, the oldest-inserted entry is evicted first.
      */
    def put(key: String, value: String, ttl: Duration): Unit = {
      // Drop any previous entry first: an overwrite must not evict an unrelated
      // key, and the refreshed entry should count as the newest one.
      cache.remove(key)
      if (cache.size >= maxSize) {
        cache.headOption.foreach { case (oldestKey, _) => cache.remove(oldestKey) }
      }
      cache(key) = CachedResponse(value, System.currentTimeMillis(), ttl)
    }

    /** Snapshot of the current size and lookup counters.
      *
      * `hitRate` is 0.0 (not NaN) when no lookup has happened yet.
      */
    def stats(): CacheStats = {
      val lookups = hitCount + missCount
      CacheStats(
        size = cache.size,
        hits = hitCount,
        misses = missCount,
        hitRate = if (lookups == 0L) 0.0 else hitCount.toDouble / lookups
      )
    }
  }

  /** Cache statistics.
    *
    * @param size    number of entries currently stored
    * @param hits    lookups that returned a value
    * @param misses  lookups that found nothing or an expired entry
    * @param hitRate hits / (hits + misses), or 0.0 when there were no lookups
    */
  case class CacheStats(size: Int, hits: Long, misses: Long, hitRate: Double)
}
Distributed Cache (Redis)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/** Redis-backed cache shared between processes, plus an example of caching agent answers. */
object DistributedCache {
  import java.nio.charset.StandardCharsets
  import java.security.MessageDigest
  import redis.clients.jedis.Jedis

  /** Thin wrapper around a single Jedis connection.
    *
    * NOTE(review): this holds one connection for the lifetime of the object;
    * confirm against the Jedis documentation whether a pooled client is needed
    * before sharing an instance between threads.
    *
    * @param host Redis host name
    * @param port Redis port
    */
  class RedisCache(host: String = "localhost", port: Int = 6379) {
    private val jedis = new Jedis(host, port)

    /** Returns the value stored under `key`, or `None` when the client returns null. */
    def get(key: String): Option[String] =
      Option(jedis.get(key))

    /** Stores `value` under `key` with an expiry of `ttlSeconds`. */
    def put(key: String, value: String, ttlSeconds: Int): Unit = {
      jedis.setex(key, ttlSeconds, value)
      ()
    }

    /** Deletes every key matching `keyPattern`.
      *
      * NOTE(review): this uses the Redis KEYS command, which scans the whole
      * keyspace; consider SCAN for large production datasets.
      *
      * @return the number of keys deleted (0 when nothing matched)
      */
    def invalidate(keyPattern: String): Long = {
      val keys = jedis.keys(keyPattern)
      if (keys.isEmpty) 0L
      // The typed toArray overload yields Array[String], which is what the
      // String varargs of `del` requires.
      else jedis.del(keys.toArray(Array.empty[String]): _*)
    }

    /** Closes the underlying connection; the cache must not be used afterwards. */
    def close(): Unit = jedis.close()
  }

  /** Builds the cache key for a query from its SHA-256 digest.
    *
    * A 32-bit `hashCode` collides easily, and a collision would return another
    * query's cached answer.
    */
  private def cacheKeyFor(query: String): String = {
    val digest = MessageDigest
      .getInstance("SHA-256")
      .digest(query.getBytes(StandardCharsets.UTF_8))
    digest.map(b => "%02x".format(b)).mkString("query:", "", "")
  }

  // Usage with LLM4S
  /** Answers `query` from the cache when possible, otherwise asks the agent and caches the reply.
    *
    * Only successful agent responses are cached, with a TTL of one hour.
    *
    * @param query the user query
    * @param agent agent used on a cache miss
    * @param cache cache consulted before, and populated after, the agent call
    * @return the cached or freshly generated answer
    */
  def answerWithCaching(
    query: String,
    agent: Agent,
    cache: RedisCache
  ): Result[String] = {
    val cacheKey = cacheKeyFor(query)
    cache.get(cacheKey) match {
      case Some(cachedAnswer) =>
        println(s"Cache hit for: $query")
        Result.success(cachedAnswer)
      case None =>
        agent.run(query).map { response =>
          cache.put(cacheKey, response.message, ttlSeconds = 3600)
          response.message
        }
    }
  }
}
Rate Limiting
Token Bucket Algorithm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/** Token-bucket rate limiting and an example of guarding an agent call with it. */
object RateLimiting {
  import scala.concurrent.duration._

  /** Token bucket that starts full and refills continuously over time.
    *
    * This class does no synchronisation of its own.
    *
    * @param capacity       maximum number of tokens the bucket can hold
    * @param refillRate     tokens added per `refillInterval`
    * @param refillInterval length of one refill period (one second by default)
    */
  class TokenBucket(
    capacity: Int,
    refillRate: Int,
    refillInterval: Duration = 1.second
  ) {
    private var tokens = capacity.toDouble
    private var lastRefillTime = System.currentTimeMillis()

    /** Takes `amount` tokens if that many are available.
      *
      * @return true when the tokens were taken, false when the bucket is too empty
      */
    def tryConsume(amount: Int = 1): Boolean = {
      refill()
      if (tokens >= amount) {
        tokens -= amount
        true
      } else {
        false
      }
    }

    /** Whole tokens available as of the most recent refill. */
    def availableTokens: Int = tokens.toInt

    /** Credits tokens for the time elapsed since the previous refill. */
    private def refill(): Unit = {
      val now = System.currentTimeMillis()
      val elapsedMillis = now - lastRefillTime
      if (elapsedMillis > 0) {
        // Fractional division is essential: with integer division any call made
        // less than one interval after the previous one adds 0 tokens while
        // still advancing lastRefillTime, so a busy bucket would never refill.
        val tokensToAdd = elapsedMillis.toDouble / refillInterval.toMillis * refillRate
        tokens = Math.min(capacity.toDouble, tokens + tokensToAdd)
        lastRefillTime = now
      }
    }
  }

  // Usage
  val rateLimiter = new TokenBucket(
    capacity = 1000,
    refillRate = 100 // 100 tokens per second = 100 requests/sec
  )

  /** Runs `query` through `agent` only when the shared limiter grants a token.
    *
    * @return the agent's message, or a failure describing the exhausted limit
    */
  def limitedLLMCall(
    query: String,
    agent: Agent
  ): Result[String] = {
    if (rateLimiter.tryConsume()) {
      // Project the response to its message so the value matches Result[String].
      agent.run(query).map(_.message)
    } else {
      Result.failure(s"Rate limit exceeded. Available tokens: ${rateLimiter.availableTokens}")
    }
  }
}
Per-User Rate Limiting
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/** Rate limiting with an independent token bucket per user. */
object PerUserRateLimiting {
  import scala.collection.mutable

  /** Keeps one `TokenBucket` per user id, created on first use.
    *
    * NOTE(review): `TokenBucket` is not imported in this sample; presumably it
    * is the class from the token-bucket example — confirm and import it.
    * The bucket map is not synchronised.
    *
    * @param capacity   bucket capacity given to each new user
    * @param refillRate refill rate given to each new user
    */
  class UserRateLimiter(capacity: Int = 100, refillRate: Int = 10) {
    private val buckets = mutable.Map[String, TokenBucket]()

    /** Returns the bucket for `userId`, creating it on first access. */
    def getLimiter(userId: String): TokenBucket =
      buckets.getOrElseUpdate(userId, new TokenBucket(capacity, refillRate))

    /** Tries to take one token from the user's bucket. */
    def tryConsume(userId: String): Boolean =
      getLimiter(userId).tryConsume()

    /** Human-readable token count for `userId`.
      *
      * A user without a bucket is reported at full capacity without allocating
      * one, so status probes for unknown ids do not grow the map.
      */
    def getStatus(userId: String): String = {
      val available = buckets.get(userId).map(_.availableTokens).getOrElse(capacity)
      s"User $userId has $available tokens available"
    }
  }
}
Batch Processing
Embedding Batching
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/** Groups individual embedding requests into batched client calls. */
object EmbeddingBatching {
  import scala.concurrent.{Future, Promise}
  import scala.util.control.NonFatal

  /** One queued embedding request.
    *
    * @param text     text to embed
    * @param callback invoked with the embedding once the batch has been processed
    */
  case class EmbeddingJob(
    text: String,
    callback: Vector[Double] => Unit
  )

  /** Buffers requests and embeds them together.
    *
    * A batch is processed by the thread calling `embed` (or `flush`) when the
    * buffer reaches `batchSize` or more than `batchTimeoutMs` has passed since
    * the previous batch. The timeout is only checked on those calls, so call
    * `flush()` (for example from a scheduler) to drain a partially filled buffer
    * when no further requests arrive.
    *
    * @param embeddingClient client whose `embedBatch` embeds a list of texts
    * @param batchSize       buffer size that triggers processing
    * @param batchTimeoutMs  age in milliseconds that triggers processing
    */
  class BatchEmbeddingProcessor(
    embeddingClient: EmbeddingClient,
    batchSize: Int = 50,
    batchTimeoutMs: Int = 1000
  ) {
    // Each job is kept with its promise so a failed batch can fail the caller's future.
    private final case class Pending(job: EmbeddingJob, promise: Promise[Vector[Double]])

    private val batch = scala.collection.mutable.ListBuffer[Pending]()
    private var lastBatchTime = System.currentTimeMillis()

    /** Queues `text` and returns a future completed when its batch is processed.
      *
      * The future fails if the batch call throws or returns the wrong number of
      * embeddings.
      */
    def embed(text: String): Future[Vector[Double]] = {
      val promise = Promise[Vector[Double]]()
      val job = EmbeddingJob(text, embedding => {
        promise.trySuccess(embedding)
        ()
      })
      synchronized {
        batch += Pending(job, promise)
        if (batch.size >= batchSize ||
            System.currentTimeMillis() - lastBatchTime > batchTimeoutMs) {
          processBatch()
        }
      }
      promise.future
    }

    /** Processes whatever is currently buffered, regardless of size or age. */
    def flush(): Unit = synchronized {
      processBatch()
    }

    /** Embeds the buffered jobs in one call and completes their futures.
      *
      * Must be called while holding this object's lock.
      */
    private def processBatch(): Unit = {
      if (batch.nonEmpty) {
        // Take ownership of the jobs first so a failing batch is not retried
        // with the next request.
        val pending = batch.toList
        batch.clear()
        try {
          // Embed all at once
          val embeddings = embeddingClient.embedBatch(pending.map(_.job.text))
          // zip would silently drop jobs on a short result and leave their
          // futures pending forever, so treat a size mismatch as a failure.
          if (embeddings.size != pending.size) {
            throw new IllegalStateException(
              s"embedBatch returned ${embeddings.size} embeddings for ${pending.size} texts"
            )
          }
          // Call callbacks
          embeddings.zip(pending).foreach { case (emb, entry) =>
            entry.job.callback(emb)
          }
        } catch {
          case NonFatal(e) =>
            // tryFailure is a no-op for promises that were already completed.
            pending.foreach(_.promise.tryFailure(e))
        } finally {
          lastBatchTime = System.currentTimeMillis()
        }
      }
    }
  }
}
Distributed Execution
Worker Pool Pattern
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/** Fixed-size worker pool for running agent calls in parallel. */
object WorkerPool {
  import scala.concurrent.{ExecutionContext, Future}
  import java.util.concurrent.{ExecutorService, Executors, ThreadPoolExecutor}

  /** Runs tasks on a dedicated fixed thread pool.
    *
    * The pool owns its threads; call `shutdown()` when it is no longer needed.
    *
    * @param numWorkers number of threads in the pool
    */
  class AgentWorkerPool(numWorkers: Int) {
    private val executor: ExecutorService = Executors.newFixedThreadPool(numWorkers)
    private implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(executor)

    /** Applies `processor` to every task on the pool.
      *
      * @param tasks     inputs to process
      * @param processor function run once per task on a pool thread
      * @return a future of the per-task results, in the order of `tasks`
      */
    def processInParallel[T](
      tasks: List[String],
      processor: String => Result[T]
    ): Future[List[Result[T]]] = {
      val futures = tasks.map(task =>
        Future {
          processor(task)
        }
      )
      Future.sequence(futures)
    }

    /** Stops accepting new work and lets already submitted tasks finish. */
    def shutdown(): Unit = executor.shutdown()
  }

  // Usage: Process multiple queries in parallel
  /** Runs every query through `agent` on a temporary four-thread pool.
    *
    * The pool is shut down once the combined future completes, so repeated
    * calls do not accumulate idle threads.
    *
    * @return a future of each query's message (or failure), in input order
    */
  def processMultipleQueries(
    queries: List[String],
    agent: Agent
  ): Future[List[Result[String]]] = {
    val pool = new AgentWorkerPool(numWorkers = 4)
    // processInParallel takes a single parameter list, so the function is
    // passed as its second argument.
    val results = pool.processInParallel(
      queries,
      (query: String) => agent.run(query).map(_.message)
    )
    results.onComplete(_ => pool.shutdown())(ExecutionContext.global)
    results
  }
}
Load Balancing Across Models
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/** Least-loaded routing of requests across several model clients. */
object LoadBalancing {

  /** Load snapshot for one model.
    *
    * @param modelName   model identifier
    * @param currentLoad number of requests counted against the model
    * @param capacity    load the model is considered able to carry
    */
  case class ModelLoad(
    modelName: String,
    currentLoad: Int,
    capacity: Int
  ) {
    /** `currentLoad` divided by `capacity`, as a floating-point ratio. */
    def utilization: Double = currentLoad.toDouble / capacity
  }

  /** Routes each request to the model with the fewest in-flight requests.
    *
    * The counters are guarded by a lock because in-flight counts are only
    * non-zero while several `execute` calls overlap.
    *
    * @param models candidate clients; must not be empty
    */
  class LoadBalancer(models: List[LlmClient]) {
    require(models.nonEmpty, "LoadBalancer needs at least one model")

    private val loads = scala.collection.mutable.Map[String, Int]()
    models.foreach(m => loads(m.modelName) = 0)

    /** Returns the client with the lowest recorded load. */
    def selectLeastLoaded(): LlmClient = loads.synchronized {
      models.minBy(m => loads(m.modelName))
    }

    /** Counts one more in-flight request for `modelName`. */
    def recordRequest(modelName: String): Unit = loads.synchronized {
      loads(modelName) = loads.getOrElse(modelName, 0) + 1
    }

    /** Counts one fewer in-flight request for `modelName`, never going below zero. */
    def recordCompletion(modelName: String): Unit = loads.synchronized {
      loads(modelName) = Math.max(0, loads.getOrElse(modelName, 1) - 1)
    }

    /** Sends `query` to the least-loaded model and returns its result.
      *
      * The load counter is decremented even when the model call throws.
      */
    def execute(query: String): Result[String] = {
      // Select and record under one lock so concurrent callers do not all pick
      // the same model before any of them has registered its request.
      val model = loads.synchronized {
        val chosen = selectLeastLoaded()
        recordRequest(chosen.modelName)
        chosen
      }
      try {
        model.generate(query)
      } finally {
        recordCompletion(model.modelName)
      }
    }
  }
}
Queue-Based Processing
Task Queue
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/** Background task queue that runs agent queries and tracks their status. */
object TaskQueue {
  import scala.concurrent.{ExecutionContext, Future, Promise, blocking}
  import scala.collection.mutable
  import scala.collection.concurrent.TrieMap
  import scala.util.control.NonFatal
  import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}

  /** Lifecycle state of a task. */
  sealed trait TaskStatus
  case object Pending extends TaskStatus
  case object Processing extends TaskStatus
  case object Completed extends TaskStatus
  case object Failed extends TaskStatus

  /** A submitted query and its current state.
    *
    * @param id     unique task identifier
    * @param query  query passed to the agent
    * @param status current lifecycle state
    * @param result the agent's message on success, or an error description on failure
    */
  case class Task(
    id: String,
    query: String,
    status: TaskStatus = Pending,
    result: Option[String] = None
  )

  /** Accepts tasks from any thread and processes them on background workers.
    *
    * NOTE(review): with `numWorkers > 1` the agent is called from several
    * threads at once — confirm `Agent` supports that, or pass `numWorkers = 1`.
    *
    * @param agent      agent that executes each task's query
    * @param numWorkers number of concurrent worker loops started by `start()`
    */
  class TaskProcessor(agent: Agent, numWorkers: Int = 3) {
    // Thread-safe structures: submitters and workers run on different threads.
    private val queue = new LinkedBlockingQueue[Task]()
    private val tasks = TrieMap[String, Task]()
    @volatile private var running = true

    /** Queues `query` for processing.
      *
      * @return the id to use with `getStatus` and `getResult`
      */
    def submitTask(query: String): String = {
      val taskId = java.util.UUID.randomUUID().toString
      val task = Task(taskId, query)
      // Register before enqueueing so a fast worker's status update cannot be
      // overwritten by this initial Pending entry.
      tasks(taskId) = task
      queue.put(task)
      taskId
    }

    /** Current status of `taskId`, or `None` for an unknown id. */
    def getStatus(taskId: String): Option[TaskStatus] = {
      tasks.get(taskId).map(_.status)
    }

    /** Result text of `taskId`, or `None` while unfinished or for an unknown id. */
    def getResult(taskId: String): Option[String] = {
      tasks.get(taskId).flatMap(_.result)
    }

    /** Starts the worker loops on the global execution context.
      *
      * @return a future that completes after `stop()` has been called and every
      *         worker has exited
      */
    def start(): Future[Unit] = {
      implicit val ec: ExecutionContext = ExecutionContext.global
      val workers = List.fill(math.max(1, numWorkers)) {
        Future(blocking(workerLoop()))
      }
      Future.sequence(workers).map(_ => ())
    }

    /** Asks the workers to exit; each one finishes its current task first. */
    def stop(): Unit = {
      running = false
    }

    /** Takes tasks until stopped, waiting up to 100ms at a time when idle. */
    private def workerLoop(): Unit = {
      while (running) {
        // poll only waits when the queue is empty, so a backlog is drained
        // without a fixed delay between tasks.
        Option(queue.poll(100, TimeUnit.MILLISECONDS)).foreach(processTask)
      }
    }

    /** Runs one task and records its final state. */
    private def processTask(task: Task): Unit = {
      tasks(task.id) = task.copy(status = Processing)
      val finished =
        try {
          agent.run(task.query) match {
            case Result.Success(response) =>
              task.copy(
                status = Completed,
                result = Some(response.message)
              )
            case Result.Failure(error) =>
              task.copy(
                status = Failed,
                result = Some(s"Error: $error")
              )
          }
        } catch {
          // An exception must not kill the worker or leave the task in Processing.
          case NonFatal(e) =>
            task.copy(
              status = Failed,
              result = Some(s"Error: ${e.getMessage}")
            )
        }
      tasks(task.id) = finished
    }
  }
}
Inference Optimization
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/** Latency-oriented patterns: model selection, bounded waiting and draft-then-verify.
  *
  * NOTE(review): `fastModel`, `powerfulModel` and `draftAgent` are not defined
  * in this sample; they are presumably supplied by the surrounding application.
  */
object InferenceOptimization {

  // Use smaller models for simple tasks
  /** Picks a model from the query length: under 100 characters uses `fastModel`,
    * otherwise `powerfulModel`.
    */
  def selectModelByComplexity(query: String): LlmClient =
    if (query.length < 100) fastModel // faster, cheaper
    else powerfulModel // slower, more capable

  // Bounded waiting: give up on slow generations
  /** Runs `query` and waits at most `timeoutMs` for the answer.
    *
    * A timeout is reported as a failed `Result` rather than a thrown
    * `TimeoutException`. The agent call itself is not cancelled and may keep
    * running in the background.
    *
    * @param agent     agent that executes the query
    * @param query     the user query
    * @param timeoutMs maximum time to wait, in milliseconds
    * @return the agent's message, the agent's failure, or a timeout failure
    */
  def generateWithTimeout(
    agent: Agent,
    query: String,
    timeoutMs: Long = 5000
  ): Result[String] = {
    val future = scala.concurrent.Future {
      // Project to the message so the value matches Result[String].
      agent.run(query).map(_.message)
    }(scala.concurrent.ExecutionContext.global)
    try {
      scala.concurrent.Await.result(
        future,
        scala.concurrent.duration.Duration(timeoutMs, "ms")
      )
    } catch {
      case _: java.util.concurrent.TimeoutException =>
        Result.failure(s"Generation timed out after ${timeoutMs}ms")
    }
  }

  // Draft-then-verify: a draft model answers first, the main model checks it
  /** Generates a draft with `draftAgent`, then passes the draft's message to `agent`.
    *
    * A failed draft short-circuits: the main agent is not called.
    *
    * @return the verifying agent's message, or the first failure encountered
    */
  def speculativeDecoding(
    agent: Agent,
    query: String
  ): Result[String] =
    for {
      // Generate with draft model
      draft <- draftAgent.run(query)
      // Verify with main model
      verified <- agent.run(draft.message)
    } yield verified.message
}
Scaling Checklist
✅ Before scaling
✅ Scaling implementation
See Also
Last Updated: February 2026
Status: Production Ready