LLM Maliyet Optimizasyonu: API Fiyatlandırma ve Strateji Rehberi
LLM API maliyetleri, yüksek hacimli uygulamalarda önemli bir gider kalemi oluşturabilir. Bu rehberde maliyet optimizasyon stratejilerini inceliyoruz.
API Fiyatlandırma Karşılaştırması
OpenAI Pricing (2024)
| Model | Input (/1M token) | Output (/1M token) |
|---|---|---|
| GPT-4 Turbo | $10.00 | $30.00 |
| GPT-4o | $5.00 | $15.00 |
| GPT-4o-mini | $0.15 | $0.60 |
| GPT-3.5 Turbo | $0.50 | $1.50 |
Anthropic Pricing
| Model | Input (/1M token) | Output (/1M token) |
|---|---|---|
| Claude 3 Opus | $15.00 | $75.00 |
| Claude 3 Sonnet | $3.00 | $15.00 |
| Claude 3 Haiku | $0.25 | $1.25 |
Google Pricing
| Model | Input (/1M token) | Output (/1M token) |
|---|---|---|
| Gemini Pro | $0.50 | $1.50 |
| Gemini Pro 1.5 | $3.50 | $10.50 |
Maliyet Hesaplama
Token Maliyet Hesaplayıcı
class CostCalculator:
    """Estimate LLM API spend from token counts (prices are USD per 1K tokens)."""

    # USD per 1K tokens, keyed by model id.
    PRICING = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "claude-3-opus": {"input": 0.015, "output": 0.075},
        "claude-3-sonnet": {"input": 0.003, "output": 0.015},
        "claude-3-haiku": {"input": 0.00025, "output": 0.00125},
    }

    def calculate(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the USD cost of one request; unknown models cost 0."""
        rates = self.PRICING.get(model, {"input": 0, "output": 0})
        return input_tokens / 1000 * rates["input"] + output_tokens / 1000 * rates["output"]

    def estimate_monthly(self, model: str, daily_requests: int,
                         avg_input_tokens: int, avg_output_tokens: int) -> dict:
        """Project daily/weekly/monthly/yearly spend from an average request size."""
        per_day = self.calculate(model, avg_input_tokens, avg_output_tokens) * daily_requests
        return {
            "daily": per_day,
            "weekly": per_day * 7,
            "monthly": per_day * 30,
            "yearly": per_day * 365
        }

# Usage
calc = CostCalculator()
cost = calc.calculate("gpt-4-turbo", 1000, 500)
print(f"Cost: ${cost:.4f}")

monthly = calc.estimate_monthly("gpt-4-turbo", 10000, 500, 200)
print(f"Monthly estimate: ${monthly['monthly']:.2f}")
Model Routing Stratejisi
Complexity-Based Routing
class ModelRouter:
    """Route prompts to a model tier using a cheap length/keyword heuristic."""

    def __init__(self):
        # Tier name -> model id, cheapest to most capable.
        self.models = {
            "simple": "gpt-4o-mini",
            "medium": "gpt-4o",
            "complex": "gpt-4-turbo"
        }

    def classify_complexity(self, prompt: str) -> str:
        """Bucket a prompt as simple/medium/complex from word count + keyword cues."""
        n_words = len(prompt.split())

        # Turkish keywords that usually signal an involved task.
        indicators = [
            "analiz", "karşılaştır", "değerlendir",
            "strateji", "detaylı", "kapsamlı"
        ]
        lowered = prompt.lower()
        flagged = any(word in lowered for word in indicators)

        if n_words < 50 and not flagged:
            return "simple"
        if n_words < 200 or not flagged:
            return "medium"
        return "complex"

    def route(self, prompt: str) -> str:
        """Return the model id matching this prompt's complexity tier."""
        return self.models[self.classify_complexity(prompt)]

# LLM-based complexity classification
def classify_with_llm(prompt: str) -> str:
    """Ask a cheap model to label the prompt's complexity (simple/medium/complex)."""
    # NOTE(review): relies on a module-level `client` defined elsewhere.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",  # cheap model does the classification
        messages=[
            {
                "role": "system",
                "content": "Verilen prompt'un karmaşıklığını belirle: simple, medium, complex"
            },
            {"role": "user", "content": prompt}
        ],
        max_tokens=10
    )
    return completion.choices[0].message.content.strip().lower()
Quality-Cost Tradeoff
class AdaptiveRouter:
    """Escalate through increasingly capable models until quality is acceptable."""

    def __init__(self, quality_threshold: float = 0.8):
        self.quality_threshold = quality_threshold
        # Cheapest first; the last entry is the strongest (and final fallback).
        self.model_hierarchy = ["gpt-4o-mini", "gpt-4o", "gpt-4-turbo"]

    def get_response_with_fallback(self, prompt: str) -> dict:
        """Try each model in order; return the first answer meeting the threshold.

        If no model qualifies, the strongest model's answer is returned anyway.
        NOTE(review): assumes `call_model` / `assess_quality` are provided
        elsewhere (e.g. on a subclass or injected at runtime).
        """
        answer = None
        score = None
        for candidate in self.model_hierarchy:
            answer = self.call_model(candidate, prompt)
            score = self.assess_quality(answer)
            if score >= self.quality_threshold:
                return {
                    "response": answer,
                    "model_used": candidate,
                    "quality_score": score
                }

        # Nothing met the bar — keep the strongest model's output.
        return {
            "response": answer,
            "model_used": self.model_hierarchy[-1],
            "quality_score": score
        }
Caching Stratejileri
Response Caching
import hashlib
import redis
import json

class LLMCache:
    """Redis-backed exact-match cache for LLM responses.

    Keys are an MD5 hash of the full request payload (model + messages +
    extra kwargs), so any difference in request parameters yields a
    distinct cache entry.
    """

    def __init__(self, redis_url: str, ttl: int = 3600):
        self.redis = redis.from_url(redis_url)
        self.ttl = ttl  # seconds before a cached response expires
        self.stats = {"hits": 0, "misses": 0}

    def _cache_key(self, model: str, messages: list, **kwargs) -> str:
        # sort_keys makes the key deterministic regardless of dict order.
        content = json.dumps({
            "model": model,
            "messages": messages,
            **kwargs
        }, sort_keys=True)
        # MD5 is fine here: the key only needs to be cheap and well-spread,
        # not cryptographically strong.
        return f"llm:{hashlib.md5(content.encode()).hexdigest()}"

    def get(self, model: str, messages: list, **kwargs) -> dict | None:
        """Return the cached response dict, or None on a miss."""
        key = self._cache_key(model, messages, **kwargs)
        cached = self.redis.get(key)

        if cached:
            self.stats["hits"] += 1
            return json.loads(cached)

        self.stats["misses"] += 1
        return None

    def set(self, model: str, messages: list, response: dict, **kwargs):
        """Store a response under the same key shape `get` uses."""
        key = self._cache_key(model, messages, **kwargs)
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_savings(self, cost_per_request: float) -> float:
        """Rough saved spend: every hit avoided one paid request."""
        return self.stats["hits"] * cost_per_request

# Usage
cache = LLMCache("redis://localhost:6379")

def cached_completion(model: str, messages: list, **kwargs):
    """Answer from cache when possible; otherwise call the API and cache it."""
    cached = cache.get(model, messages, **kwargs)
    if cached:
        return cached

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs
    )

    result = response.choices[0].message.content
    # BUGFIX: forward **kwargs so the write uses the same cache key as the
    # read above — previously entries stored without kwargs could never
    # match lookups made with kwargs, so those requests never hit.
    cache.set(model, messages, {"content": result}, **kwargs)

    return {"content": result}
Semantic Caching
class SemanticCache:
    """Vector-store cache that matches queries by embedding similarity."""

    def __init__(self, vector_store, similarity_threshold: float = 0.95):
        self.vector_store = vector_store
        # Minimum similarity score for a stored answer to count as a hit.
        self.threshold = similarity_threshold

    def get_similar(self, query: str) -> dict | None:
        """Return the closest cached answer above the threshold, else None."""
        # NOTE(review): `get_embedding` is resolved from module scope elsewhere.
        vector = get_embedding(query)

        matches = self.vector_store.search(
            vector=vector,
            top_k=1,
            filter={"type": "cache"}
        )

        if not matches:
            return None
        best = matches[0]
        if best.score < self.threshold:
            return None
        return {
            "response": best.metadata["response"],
            "similarity": best.score,
            "original_query": best.metadata["query"]
        }

    def store(self, query: str, response: str):
        """Embed the query and persist it with its response for future hits."""
        vector = get_embedding(query)

        self.vector_store.upsert([{
            "id": f"cache_{hash(query)}",
            "values": vector,
            "metadata": {
                "type": "cache",
                "query": query,
                "response": response,
                "timestamp": datetime.now().isoformat()
            }
        }])
Token Optimization
Prompt Compression
def compress_prompt(prompt: str, target_reduction: float = 0.3) -> str:
    """Shorten a prompt with a cheap model to save input tokens."""
    # NOTE(review): depends on a module-level `client` defined elsewhere.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": f"Metni %{int(target_reduction*100)} kısalt. Önemli bilgileri koru."
            },
            {"role": "user", "content": prompt}
        ]
    )
    return completion.choices[0].message.content

def remove_redundancy(text: str) -> str:
    """Drop repeated sentences while keeping first-occurrence order."""
    # dict.fromkeys de-duplicates and preserves insertion order.
    return ". ".join(dict.fromkeys(text.split(". ")))
Output Length Control
def optimize_output_tokens(prompt: str, max_tokens: int | None = None) -> dict:
    """Cap completion length based on the detected task type to cut output cost.

    Args:
        prompt: The user prompt to send.
        max_tokens: Explicit cap; when None (or falsy) a per-task default is used.
            (FIX: annotation was `int = None`, which mis-declares the default.)

    Returns:
        dict with the model response text, actual completion tokens used, and
        the token headroom saved versus a 4096-token default cap.
    """
    # Conservative caps per task type: short tasks get short completions.
    task_limits = {
        "classification": 10,
        "extraction": 200,
        "summarization": 300,
        "generation": 500,
        "analysis": 800
    }

    # NOTE(review): `detect_task_type` and `client` are defined elsewhere.
    task_type = detect_task_type(prompt)
    optimal_max = max_tokens or task_limits.get(task_type, 500)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=optimal_max
    )

    return {
        "response": response.choices[0].message.content,
        "tokens_used": response.usage.completion_tokens,
        "tokens_saved": 4096 - optimal_max  # headroom vs. default cap (potential saving)
    }
Batch Processing
Batch API Kullanımı
async def batch_process(prompts: list, model: str = "gpt-4o-mini") -> list:
    """Submit prompts through the OpenAI Batch API to reduce per-request cost."""

    # Build one chat-completion request per prompt for the batch input file.
    requests = []
    for idx, prompt in enumerate(prompts):
        requests.append({
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [{"role": "user", "content": prompt}]
            }
        })

    # Create the batch job. NOTE(review): `client`, `upload_batch_file`
    # and `wait_for_batch` are defined elsewhere in the project.
    job = client.batches.create(
        input_file_id=upload_batch_file(requests),
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Poll until the batch completes, then return its results.
    return await wait_for_batch(job.id)
Budget Monitoring
class BudgetMonitor:
    """Track per-day LLM spend, alert near the limit, and hard-stop at it."""

    def __init__(self, daily_limit: float, alert_threshold: float = 0.8):
        self.daily_limit = daily_limit
        # Fraction of the daily limit at which send_alert fires.
        self.alert_threshold = alert_threshold
        self.daily_spend = 0
        self.last_reset = datetime.now().date()

    def track_usage(self, cost: float) -> dict:
        """Add `cost` to today's spend and return a status snapshot.

        Raises BudgetExceededError once the daily limit is reached.
        NOTE(review): BudgetExceededError must be defined elsewhere.
        """
        today = datetime.now().date()
        # A new calendar day restarts the counter.
        if today > self.last_reset:
            self.daily_spend = 0
            self.last_reset = today

        self.daily_spend += cost

        status = {
            "current_spend": self.daily_spend,
            "remaining": self.daily_limit - self.daily_spend,
            "percentage": self.daily_spend / self.daily_limit
        }

        if status["percentage"] >= self.alert_threshold:
            self.send_alert(status)

        if self.daily_spend >= self.daily_limit:
            raise BudgetExceededError("Daily budget limit exceeded")

        return status

    def send_alert(self, status: dict):
        """Hook for an email/Slack notification; intentionally a no-op here."""
        pass

# Used as middleware around completions
budget = BudgetMonitor(daily_limit=100.0)

def monitored_completion(model: str, messages: list, **kwargs):
    """Run a completion and record its cost against the daily budget."""
    response = client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    # NOTE(review): `calculate_cost` is defined elsewhere in the project.
    spent = calculate_cost(model, response.usage)
    budget.track_usage(spent)

    return response
ROI Hesaplama
def calculate_ai_roi(
    manual_cost_per_task: float,
    tasks_per_month: int,
    ai_cost_per_task: float,
    accuracy_rate: float = 0.95
) -> dict:
    """Compare monthly AI automation cost against fully manual processing.

    AI mistakes are assumed to need manual rework at half the manual task
    cost, so imperfect accuracy is charged back against the AI's savings.
    """
    monthly_manual = tasks_per_month * manual_cost_per_task
    monthly_ai = tasks_per_month * ai_cost_per_task

    # Cost of manually fixing the AI's mistakes (half a manual task each).
    rework = (1 - accuracy_rate) * tasks_per_month * manual_cost_per_task * 0.5

    ai_total = monthly_ai + rework
    net_savings = monthly_manual - ai_total

    return {
        "manual_cost": monthly_manual,
        "ai_cost": ai_total,
        "monthly_savings": net_savings,
        "yearly_savings": net_savings * 12,
        "roi_percentage": net_savings / ai_total * 100
    }
Sonuç
LLM maliyet optimizasyonu, model routing, caching, token yönetimi ve bütçe monitoring ile sağlanır. Doğru stratejilerle %50-80 maliyet tasarrufu mümkündür.
Veni AI olarak, maliyet-etkin AI çözümleri sunuyoruz.
