Veni AI
Cost Management

LLM Cost Optimization: API Pricing and Strategy Guide

Comprehensive guide for optimizing LLM API costs, token management, model selection, caching strategies, and enterprise budget management.

Veni AI Technical Team · December 28, 2024 · 6 min read
LLM Cost Optimization: API Pricing and Strategy Guide

LLM Cost Optimization: API Pricing and Strategy Guide

LLM API costs can be a significant expense item in high-volume applications. In this guide, we examine cost optimization strategies.

API Pricing Comparison

OpenAI Pricing (2024)

ModelInput (/1M token)Output (/1M token)
GPT-4 Turbo$10.00$30.00
GPT-4o$5.00$15.00
GPT-4o-mini$0.15$0.60
GPT-3.5 Turbo$0.50$1.50

Anthropic Pricing

ModelInput (/1M token)Output (/1M token)
Claude 3 Opus$15.00$75.00
Claude 3 Sonnet$3.00$15.00
Claude 3 Haiku$0.25$1.25

Google Pricing

ModelInput (/1M token)Output (/1M token)
Gemini Pro$0.50$1.50
Gemini Pro 1.5$3.50$10.50

Cost Calculation

Token Cost Calculator

class CostCalculator:
    """Estimate LLM API request costs from token counts.

    PRICING maps a model name to its USD cost per 1K tokens
    ("input" and "output" rates) — i.e. the published per-1M-token
    prices divided by 1000.
    """

    PRICING = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "claude-3-opus": {"input": 0.015, "output": 0.075},
        "claude-3-sonnet": {"input": 0.003, "output": 0.015},
        "claude-3-haiku": {"input": 0.00025, "output": 0.00125},
    }

    def calculate(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the USD cost of one request.

        Raises:
            ValueError: if *model* has no pricing entry.  (The previous
                version silently priced unknown models at $0, which
                under-reports spend in a cost tracker.)
        """
        try:
            pricing = self.PRICING[model]
        except KeyError:
            raise ValueError(f"Unknown model: {model!r}") from None

        input_cost = (input_tokens / 1000) * pricing["input"]
        output_cost = (output_tokens / 1000) * pricing["output"]

        return input_cost + output_cost

    def estimate_monthly(self, model: str, daily_requests: int,
                         avg_input_tokens: int, avg_output_tokens: int) -> dict:
        """Project spend from average daily usage.

        Note: uses a 30-day month and a 365-day year, so
        "monthly" * 12 deliberately differs from "yearly".
        """
        daily_cost = daily_requests * self.calculate(
            model, avg_input_tokens, avg_output_tokens
        )

        return {
            "daily": daily_cost,
            "weekly": daily_cost * 7,
            "monthly": daily_cost * 30,
            "yearly": daily_cost * 365
        }

# Usage
calc = CostCalculator()
cost = calc.calculate("gpt-4-turbo", 1000, 500)
print(f"Cost: ${cost:.4f}")

monthly = calc.estimate_monthly("gpt-4-turbo", 10000, 500, 200)
print(f"Monthly estimate: ${monthly['monthly']:.2f}")

Model Routing Strategy

Complexity-Based Routing

class ModelRouter:
    """Pick a model tier for a prompt using cheap lexical heuristics."""

    def __init__(self):
        # Tier -> model mapping; the cheapest model serves "simple" prompts.
        self.models = {
            "simple": "gpt-4o-mini",
            "medium": "gpt-4o",
            "complex": "gpt-4-turbo"
        }

    def classify_complexity(self, prompt: str) -> str:
        """Bucket *prompt* into "simple" / "medium" / "complex".

        Heuristic only: word count plus presence of analysis-flavored
        keywords.  "complex" requires BOTH a long prompt (>= 200 words)
        AND at least one keyword.
        """
        n_words = len(prompt.split())

        keywords = (
            "analyze", "compare", "evaluate",
            "strategy", "detailed", "comprehensive"
        )

        lowered = prompt.lower()
        flagged = any(kw in lowered for kw in keywords)

        if n_words < 50 and not flagged:
            return "simple"
        if n_words < 200 or not flagged:
            return "medium"
        return "complex"

    def route(self, prompt: str) -> str:
        """Return the model name chosen for *prompt*."""
        return self.models[self.classify_complexity(prompt)]

# LLM-based complexity classification
def classify_with_llm(prompt: str) -> str:
    """Ask a cheap model to label prompt complexity.

    NOTE(review): relies on a module-level OpenAI `client` defined
    elsewhere in the application — confirm before running standalone.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # Classification with cheap model
        messages=[
            {
                "role": "system",
                "content": "Determine the complexity of the given prompt: simple, medium, complex"
            },
            {"role": "user", "content": prompt}
        ],
        max_tokens=10
    )
    return completion.choices[0].message.content.strip().lower()

Quality-Cost Tradeoff

class AdaptiveRouter:
    """Try cheap models first, escalating until quality is acceptable."""

    def __init__(self, quality_threshold: float = 0.8):
        # Minimum quality score for a response to be accepted as-is.
        self.quality_threshold = quality_threshold
        # Cheapest first; the last entry is the strongest (and priciest).
        self.model_hierarchy = ["gpt-4o-mini", "gpt-4o", "gpt-4-turbo"]

    def get_response_with_fallback(self, prompt: str) -> dict:
        """Walk the hierarchy and return the first good-enough answer.

        If no model meets the threshold, the strongest model's answer is
        returned anyway.
        NOTE(review): `call_model` and `assess_quality` are expected to
        be supplied elsewhere (they are not defined on this class here)
        — confirm before use.
        """
        for candidate in self.model_hierarchy:
            answer = self.call_model(candidate, prompt)
            score = self.assess_quality(answer)
            if score >= self.quality_threshold:
                return {
                    "response": answer,
                    "model_used": candidate,
                    "quality_score": score
                }

        # Continue with strongest model
        return {
            "response": answer,
            "model_used": self.model_hierarchy[-1],
            "quality_score": score
        }

Caching Strategies

Response Caching

import hashlib
import redis
import json

class LLMCache:
    """Redis-backed exact-match cache for chat-completion responses."""

    def __init__(self, redis_url: str, ttl: int = 3600):
        self.redis = redis.from_url(redis_url)
        # Seconds before a cached entry expires.
        self.ttl = ttl
        self.stats = {"hits": 0, "misses": 0}

    def _cache_key(self, model: str, messages: list, **kwargs) -> str:
        """Deterministic key: MD5 over the canonical JSON of the request."""
        payload = json.dumps({
            "model": model,
            "messages": messages,
            **kwargs
        }, sort_keys=True)
        digest = hashlib.md5(payload.encode()).hexdigest()
        return f"llm:{digest}"

    def get(self, model: str, messages: list, **kwargs) -> dict | None:
        """Return the cached response for this request, or None on a miss."""
        entry = self.redis.get(self._cache_key(model, messages, **kwargs))
        if not entry:
            self.stats["misses"] += 1
            return None
        self.stats["hits"] += 1
        return json.loads(entry)

    def set(self, model: str, messages: list, response: dict, **kwargs):
        """Store *response* under the request's cache key with a TTL."""
        self.redis.setex(
            self._cache_key(model, messages, **kwargs),
            self.ttl,
            json.dumps(response)
        )

    def get_savings(self, cost_per_request: float) -> float:
        """USD saved so far, assuming each hit avoided one paid request."""
        return self.stats["hits"] * cost_per_request

# Usage
cache = LLMCache("redis://localhost:6379")

def cached_completion(model: str, messages: list, **kwargs):
    """Check the cache before calling the API; store the answer on a miss.

    NOTE(review): `client` is assumed to be an OpenAI client defined
    elsewhere in the application — confirm before running standalone.
    """
    hit = cache.get(model, messages, **kwargs)
    if hit:
        return hit

    api_response = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs
    )

    content = api_response.choices[0].message.content
    cache.set(model, messages, {"content": content})

    return {"content": content}

Semantic Caching

class SemanticCache:
    """Vector-store cache keyed by embedding similarity, not exact match."""

    def __init__(self, vector_store, similarity_threshold: float = 0.95):
        self.vector_store = vector_store
        # Minimum similarity score for a stored answer to count as a hit.
        self.threshold = similarity_threshold

    def get_similar(self, query: str) -> dict | None:
        """Return the best cached answer if it is similar enough, else None.

        NOTE(review): `get_embedding` is assumed to be provided elsewhere
        in the application — confirm before running standalone.
        """
        hits = self.vector_store.search(
            vector=get_embedding(query),
            top_k=1,
            filter={"type": "cache"}
        )

        if not (hits and hits[0].score >= self.threshold):
            return None

        best = hits[0]
        return {
            "response": best.metadata["response"],
            "similarity": best.score,
            "original_query": best.metadata["query"]
        }

    def store(self, query: str, response: str):
        """Persist a query/response pair for future similarity lookups.

        NOTE(review): `datetime` is not imported in this snippet — it is
        assumed to be imported elsewhere in the module.
        """
        self.vector_store.upsert([{
            "id": f"cache_{hash(query)}",
            "values": get_embedding(query),
            "metadata": {
                "type": "cache",
                "query": query,
                "response": response,
                "timestamp": datetime.now().isoformat()
            }
        }])

Token Optimization

Prompt Compression

def compress_prompt(prompt: str, target_reduction: float = 0.3) -> str:
    """Save tokens by shortening prompt

    Asks a cheap model to rewrite the prompt ~30% shorter by default.
    NOTE(review): `client` is assumed to be an OpenAI client defined
    elsewhere in the application — confirm before running standalone.
    """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"Shorten text by {int(target_reduction*100)}%. Keep important information."
            },
            {"role": "user", "content": prompt}
        ]
    )
    return completion.choices[0].message.content


def remove_redundancy(text: str) -> str:
    """Remove redundant content

    Drops repeated sentences while keeping first-seen order
    (dict preserves insertion order and deduplicates keys).
    """
    deduped = dict.fromkeys(text.split(". "))
    return ". ".join(deduped)

Output Length Control

def optimize_output_tokens(prompt: str, max_tokens: int = None) -> dict:
    """Optimize output token count

    Caps `max_tokens` by detected task type so short-form tasks
    (e.g. classification) cannot run up output cost.
    NOTE(review): `detect_task_type` and `client` are assumed to be
    defined elsewhere in the application — confirm before running.
    """
    # Per-task output budgets (tokens); generative tasks get more room.
    task_limits = {
        "classification": 10,
        "extraction": 200,
        "summarization": 300,
        "generation": 500,
        "analysis": 800
    }

    task_kind = detect_task_type(prompt)
    # An explicit (truthy) max_tokens wins over the per-task default.
    limit = max_tokens or task_limits.get(task_kind, 500)

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=limit
    )

    return {
        "response": completion.choices[0].message.content,
        "tokens_used": completion.usage.completion_tokens,
        "tokens_saved": 4096 - limit  # Potential savings
    }

Batch Processing

Batch API Usage

async def batch_process(prompts: list, model: str = "gpt-4o-mini") -> list:
    """Cost reduction with batch processing

    Builds an OpenAI Batch API payload (one chat request per prompt),
    submits it with a 24h completion window, and awaits the results.
    NOTE(review): `client`, `upload_batch_file` and `wait_for_batch`
    are assumed to be defined elsewhere in the application.
    """
    # OpenAI Batch API
    requests = []
    for idx, text in enumerate(prompts):
        requests.append({
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [{"role": "user", "content": text}]
            }
        })

    # Create batch job
    job = client.batches.create(
        input_file_id=upload_batch_file(requests),
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Wait for and retrieve results
    return await wait_for_batch(job.id)

Budget Monitoring

class BudgetMonitor:
    """Track daily LLM spend against a hard limit, alerting near the cap."""

    def __init__(self, daily_limit: float, alert_threshold: float = 0.8):
        self.daily_limit = daily_limit
        # Fraction of the daily limit at which send_alert() fires.
        self.alert_threshold = alert_threshold
        self.daily_spend = 0
        self.last_reset = datetime.now().date()

    def track_usage(self, cost: float) -> dict:
        """Add *cost* to today's spend and return a status summary.

        Raises BudgetExceededError once the daily limit is reached.
        NOTE(review): `BudgetExceededError` and `datetime` are assumed
        to be defined/imported elsewhere in the application.
        """
        today = datetime.now().date()
        if today > self.last_reset:
            # A new day has started: reset the spend window.
            self.daily_spend = 0
            self.last_reset = today

        self.daily_spend += cost
        fraction = self.daily_spend / self.daily_limit
        status = {
            "current_spend": self.daily_spend,
            "remaining": self.daily_limit - self.daily_spend,
            "percentage": fraction
        }

        if fraction >= self.alert_threshold:
            self.send_alert(status)
        if self.daily_spend >= self.daily_limit:
            raise BudgetExceededError("Daily budget limit exceeded")

        return status

    def send_alert(self, status: dict):
        # Email/Slack notification
        pass

# Usage as Middleware
budget = BudgetMonitor(daily_limit=100.0)

def monitored_completion(model: str, messages: list, **kwargs):
    """Call the API, then record its cost against the daily budget.

    NOTE(review): `client` and `calculate_cost` are assumed to be
    defined elsewhere in the application.
    """
    api_response = client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    spent = calculate_cost(model, api_response.usage)
    budget.track_usage(spent)

    return api_response

ROI Calculation

def calculate_ai_roi(
    manual_cost_per_task: float,
    tasks_per_month: int,
    ai_cost_per_task: float,
    accuracy_rate: float = 0.95
) -> dict:
    """Compare manual vs. AI processing cost and report monthly/yearly ROI.

    AI errors are assumed to be fixed manually at half the manual task
    cost, and that correction cost is charged against the AI side.
    """
    monthly_manual = manual_cost_per_task * tasks_per_month
    monthly_ai = ai_cost_per_task * tasks_per_month

    # Error correction cost: failed tasks redone at 50% of manual cost.
    correction = (1 - accuracy_rate) * tasks_per_month * manual_cost_per_task * 0.5

    ai_total = monthly_ai + correction
    net = monthly_manual - ai_total

    return {
        "manual_cost": monthly_manual,
        "ai_cost": ai_total,
        "monthly_savings": net,
        "yearly_savings": net * 12,
        "roi_percentage": (net / ai_total) * 100
    }

Conclusion

LLM cost optimization is achieved through model routing, caching, token management, and budget monitoring. With the right strategies, cost savings of 50-80% are possible.

At Veni AI, we provide cost-effective AI solutions.

Related Articles