πŸ“ Tutorials
· 2 min read

Rate Limiting AI API Requests: Protect Your Budget and Stay Under Limits (2026)


Without rate limiting, one user can burn your entire AI API budget in an hour. One automated script, one curious developer, one bot — and your monthly bill goes from $50 to $5,000.

Token bucket rate limiter

import time
import asyncio

class TokenBucket:
    """Async token-bucket rate limiter.

    Tokens accrue continuously at ``rate`` per second, up to ``capacity``.
    Each ``acquire()`` consumes one token and waits when none is available.
    """

    def __init__(self, rate, capacity):
        """Create a bucket that starts full.

        Args:
            rate: Tokens added per second; must be greater than 0.
            capacity: Maximum number of stored tokens (burst size);
                must be at least 1, otherwise a token could never be taken.

        Raises:
            ValueError: If ``rate`` or ``capacity`` is out of range.
        """
        if rate <= 0:
            raise ValueError("rate must be greater than 0")
        if capacity < 1:
            raise ValueError("capacity must be at least 1")

        self.rate = rate          # tokens per second
        self.capacity = capacity  # max burst
        self.tokens = capacity
        # Monotonic clock: wall-clock adjustments (NTP, DST, manual changes)
        # must never produce a negative elapsed time and drain the bucket.
        self.last_refill = time.monotonic()

    async def acquire(self):
        """Wait until one token is available, consume it, and return True."""
        while True:
            now = time.monotonic()
            elapsed = now - self.last_refill
            # Refill for the time that has passed, capped at the burst size.
            self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
            self.last_refill = now

            if self.tokens >= 1:
                self.tokens -= 1
                return True

            # Sleep just long enough for the missing fraction of a token to
            # accrue, then re-check (another waiter may have taken it).
            wait = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait)

# Shared limiter: sustained 20 requests per minute (one token every
# 3 seconds), with bursts of up to 5 back-to-back requests.
limiter = TokenBucket(
    capacity=5,
    rate=20 / 60,
)

async def call_ai(message, model="gpt-5.4-mini"):
    """Send a single-turn chat completion, throttled by the shared limiter.

    Args:
        message: Text sent as the sole user message.
        model: Model name passed to the completion call. Defaults to the
            model this function previously hard-coded, so existing callers
            are unaffected.

    Returns:
        Whatever ``openai_client.chat.completions.create`` returns.
    """
    # Wait for a token first so the upstream call never exceeds the limit.
    await limiter.acquire()
    return await openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}],
    )

Per-user rate limiting

import redis

# Shared Redis client used by the rate-limit helpers in this file.
# No arguments are passed, so the client library's default connection
# settings apply.
r = redis.Redis()

async def check_user_limit(user_id, max_requests=50, window=3600):
    """Count one request against a user's fixed-window quota.

    Args:
        user_id: Identifier embedded in the Redis key.
        max_requests: Requests allowed per window.
        window: Window length in seconds.

    Returns:
        A ``(allowed, remaining)`` tuple. ``allowed`` is False once the
        request count for the current window exceeds ``max_requests``.
        ``remaining`` is ``max_requests - count`` and is therefore negative
        when the user is over the limit.
    """
    # All requests in the same `window`-second slice of epoch time share a key.
    bucket = int(time.time()) // window
    key = f"ratelimit:{user_id}:{bucket}"

    # Send INCR and EXPIRE together in one MULTI/EXEC transaction. Issuing
    # them as separate round-trips can leave a counter without a TTL if the
    # process fails in between, leaking the key forever.
    # NOTE(review): `r` is a synchronous client, so this blocks the event
    # loop for the duration of the round-trip.
    pipe = r.pipeline()
    pipe.incr(key)
    pipe.expire(key, window)
    count, _ = pipe.execute()

    remaining = max_requests - count
    return count <= max_requests, remaining

Per-user token budget

Rate limiting requests isn’t enough — one request can use 100K tokens:

async def check_token_budget(user_id, estimated_tokens, daily_limit=100_000):
    """Reserve tokens against a user's daily budget.

    Args:
        user_id: Identifier embedded in the Redis key.
        estimated_tokens: Tokens the upcoming request is expected to use.
        daily_limit: Tokens allowed per user per day.

    Returns:
        True if the reservation fits in today's budget (the usage counter
        is incremented); False otherwise (the counter is left unchanged).
    """
    # Imported here because the file has no top-level import for `date`;
    # without it this function raises NameError.
    from datetime import date

    daily_key = f"tokens:{user_id}:{date.today()}"
    used = int(r.get(daily_key) or 0)

    if used + estimated_tokens > daily_limit:
        return False

    # NOTE(review): the read above and this increment are separate commands,
    # so concurrent workers can both pass the check and overshoot the budget.
    r.incrby(daily_key, estimated_tokens)
    # The key name already changes each day; the TTL only cleans up old keys.
    r.expire(daily_key, 86400)
    return True

See our AI agent cost management guide for more budget control strategies.

Express middleware

import rateLimit from 'express-rate-limit';

// Limiter for the AI endpoints: at most 20 requests per one-minute window
// for each client key.
const aiLimiter = rateLimit({
  // Key on the authenticated user id when present, otherwise the client IP.
  keyGenerator(req) {
    return req.user?.id || req.ip;
  },
  max: 20,
  windowMs: 1000 * 60,
  // Body sent back when a client exceeds the limit.
  message: { error: 'Too many AI requests. Please wait a minute.' },
});

// Apply the limiter to every route under /api/ai.
app.use('/api/ai', aiLimiter);

Graceful degradation

When a user hits their limit, don’t just return an error — degrade gracefully:

async def handle_request(user_id, message):
    """Serve one AI request, enforcing the per-user limit.

    Args:
        user_id: Identifier of the caller, passed to ``check_user_limit``.
        message: User message forwarded to ``call_ai``.

    Returns:
        The result of ``call_ai`` when the user is within their limit;
        otherwise a dict with ``"error"`` and ``"retry_after"`` (seconds
        until the current rate-limit window ends).
    """
    # Passed explicitly so retry_after below is computed from the same
    # window length the limiter uses.
    window = 3600
    allowed, remaining = await check_user_limit(user_id, window=window)

    if not allowed:
        # The quota resets when the current fixed window rolls over, so tell
        # the client how long that is instead of a hard-coded 60 seconds.
        retry_after = window - int(time.time()) % window
        return {"error": "Rate limit reached", "retry_after": retry_after}

    # Running low — use the cheaper model instead of gpt-5.4.
    model = "gpt-5.4-mini" if remaining < 5 else "gpt-5.4"

    # NOTE(review): requires call_ai to accept a `model` keyword argument.
    return await call_ai(message, model=model)

Related: AI Agent Cost Management · AI Agent Error Handling · OpenRouter Rate Limit Fix · Deploy AI Agents to Production · What is Rate Limiting · Monitor AI API Spending