Rate Limiting AI API Requests: Protect Your Budget and Stay Under Limits (2026)
Without rate limiting, one user can burn your entire AI API budget in an hour. One automated script, one curious developer, one bot — and your monthly bill goes from $50 to $5,000.
Token bucket rate limiter
import time
import asyncio
class TokenBucket:
    """Asynchronous token-bucket rate limiter.

    Tokens refill continuously at ``rate`` per second, capped at ``capacity``.
    Each ``acquire()`` call consumes one token and waits when none is available.
    """

    def __init__(self, rate, capacity):
        """Create a bucket that starts full.

        Args:
            rate: Tokens added per second; must be greater than zero.
            capacity: Maximum number of stored tokens (the burst size);
                must be at least 1, otherwise a token could never be granted.

        Raises:
            ValueError: If ``rate`` is not positive or ``capacity`` is below 1.
        """
        if rate <= 0:
            raise ValueError("rate must be greater than 0")
        if capacity < 1:
            raise ValueError("capacity must be at least 1")
        self.rate = rate  # tokens per second
        self.capacity = capacity  # max burst
        self.tokens = capacity
        # Monotonic clock: a wall-clock adjustment must never produce a
        # negative elapsed time, which would drain tokens instead of refilling.
        self.last_refill = time.monotonic()

    async def acquire(self):
        """Wait until one token is available, consume it, and return True."""
        while True:
            now = time.monotonic()
            elapsed = now - self.last_refill
            # Refill for the time that passed, never exceeding the burst cap.
            self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return True
            # Sleep just long enough for the missing fraction of a token to
            # refill, then loop to re-check (another waiter may have taken it).
            wait = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait)
# Shared limiter: refills 20 tokens per minute (20/60 per second), burst of up to 5.
limiter = TokenBucket(capacity=5, rate=20 / 60)
async def call_ai(message, model="gpt-5.4-mini"):
    """Send one user message to the chat completions API under the shared limiter.

    Args:
        message: Text sent as the content of a single ``user`` message.
        model: Model name passed to the API. Defaults to ``"gpt-5.4-mini"``,
            so existing one-argument callers behave as before.

    Returns:
        Whatever ``openai_client.chat.completions.create`` returns.
    """
    # Wait for the module-level limiter to grant a token before calling out.
    await limiter.acquire()
    return await openai_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}],
    )
Per-user rate limiting
import redis
# Module-level Redis client used by the per-user limit checks in this file.
# Constructed with no arguments, so the redis-py default connection settings apply.
r = redis.Redis()
async def check_user_limit(user_id, max_requests=50, window=3600):
    """Count this request against the user's fixed-window request quota.

    Args:
        user_id: Identifier embedded in the Redis counter key.
        max_requests: Requests allowed per window.
        window: Window length in seconds.

    Returns:
        A ``(allowed, remaining)`` tuple. ``remaining`` is
        ``max_requests`` minus the requests counted so far, so it is
        negative once the user is over the limit.
    """
    # NOTE(review): `r` appears to be a synchronous client, so these calls
    # would block the event loop while they run — confirm.
    window_index = int(time.time()) // window
    counter_key = f"ratelimit:{user_id}:{window_index}"
    hits = r.incr(counter_key)
    if hits == 1:
        # First request in this window: let the counter expire on its own.
        r.expire(counter_key, window)
    remaining = max_requests - hits
    return hits <= max_requests, remaining
Per-user token budget
Rate limiting requests isn't enough — one request can use 100K tokens:
async def check_token_budget(user_id, estimated_tokens, daily_limit=100_000):
    """Reserve ``estimated_tokens`` from the user's daily token budget.

    Args:
        user_id: Identifier embedded in the Redis counter key.
        estimated_tokens: Tokens the upcoming request is expected to use.
        daily_limit: Maximum tokens per user per day (default 100,000).

    Returns:
        True if the tokens were added to today's counter; False if adding
        them would exceed ``daily_limit`` (the counter is left untouched).
    """
    # Imported here because no file-level import provides `date`.
    from datetime import date

    daily_key = f"tokens:{user_id}:{date.today()}"
    used = int(r.get(daily_key) or 0)  # a missing key counts as zero usage
    # NOTE(review): the read above and the increment below are separate
    # commands, so concurrent requests could both pass this check.
    if used + estimated_tokens > daily_limit:
        return False
    r.incrby(daily_key, estimated_tokens)
    # The key name already contains the date; the expiry only cleans up old keys.
    r.expire(daily_key, 86400)
    return True
See our AI agent cost management guide for more budget control strategies.
Express middleware
import rateLimit from 'express-rate-limit';
// Rate limit for the AI endpoints: at most 20 requests per one-minute window.
const ONE_MINUTE_MS = 60 * 1000;

// Key requests by authenticated user id when present, otherwise by client IP.
const resolveClientKey = (req) => req.user?.id || req.ip;

const aiLimiter = rateLimit({
  windowMs: ONE_MINUTE_MS, // 1 minute
  max: 20, // 20 requests per minute
  message: { error: 'Too many AI requests. Please wait a minute.' },
  keyGenerator: resolveClientKey,
});

// Apply the limiter to every route under /api/ai.
app.use('/api/ai', aiLimiter);
Graceful degradation
When a user hits their limit, don't just return an error — degrade gracefully:
async def handle_request(user_id, message):
    """Answer a user message, degrading to a cheaper model near the limit.

    Args:
        user_id: Identifier passed to ``check_user_limit``.
        message: Text forwarded to ``call_ai``.

    Returns:
        An error dict with ``"error"`` and ``"retry_after"`` keys when the
        user is over the limit; otherwise the result of ``call_ai``.
    """
    is_allowed, quota_left = await check_user_limit(user_id)
    if not is_allowed:
        return {"error": "Rate limit reached", "retry_after": 60}
    # Running low — use the cheaper model instead of gpt-5.4.
    chosen_model = "gpt-5.4-mini" if quota_left < 5 else "gpt-5.4"
    return await call_ai(message, model=chosen_model)
Related: AI Agent Cost Management · AI Agent Error Handling · OpenRouter Rate Limit Fix · Deploy AI Agents to Production · What is Rate Limiting · Monitor AI API Spending