Veni AI
Maliyet Yönetimi

LLM Maliyet Optimizasyonu: API Fiyatlandırma ve Strateji Rehberi

LLM API maliyetlerini optimize etme, token yönetimi, model seçimi, caching stratejileri ve kurumsal bütçe yönetimi için kapsamlı rehber.

Veni AI Teknik Ekibi · 28 Aralık 2024 · 6 dk okuma
LLM Maliyet Optimizasyonu: API Fiyatlandırma ve Strateji Rehberi

LLM Maliyet Optimizasyonu: API Fiyatlandırma ve Strateji Rehberi

LLM API maliyetleri, yüksek hacimli uygulamalarda önemli bir gider kalemi oluşturabilir. Bu rehberde maliyet optimizasyon stratejilerini inceliyoruz.

API Fiyatlandırma Karşılaştırması

OpenAI Pricing (2024)

Model | Input ($/1M token) | Output ($/1M token)
GPT-4 Turbo | $10.00 | $30.00
GPT-4o | $5.00 | $15.00
GPT-4o-mini | $0.15 | $0.60
GPT-3.5 Turbo | $0.50 | $1.50

Anthropic Pricing

Model | Input ($/1M token) | Output ($/1M token)
Claude 3 Opus | $15.00 | $75.00
Claude 3 Sonnet | $3.00 | $15.00
Claude 3 Haiku | $0.25 | $1.25

Google Pricing

Model | Input ($/1M token) | Output ($/1M token)
Gemini Pro | $0.50 | $1.50
Gemini Pro 1.5 | $3.50 | $10.50

Maliyet Hesaplama

Token Maliyet Hesaplayıcı

class CostCalculator:
    """Estimate USD spend for LLM API calls.

    PRICING stores cost per 1K tokens (the article's tables quote
    per-1M prices; these values are those prices divided by 1000).
    """

    PRICING = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "claude-3-opus": {"input": 0.015, "output": 0.075},
        "claude-3-sonnet": {"input": 0.003, "output": 0.015},
        "claude-3-haiku": {"input": 0.00025, "output": 0.00125},
        # Added for consistency with the article's Google pricing table.
        "gemini-pro": {"input": 0.0005, "output": 0.0015},
        "gemini-pro-1.5": {"input": 0.0035, "output": 0.0105},
    }

    def calculate(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the USD cost of a single request.

        Unknown models price at $0 (kept for backward compatibility);
        validate model names upstream if that matters to the caller.
        """
        pricing = self.PRICING.get(model, {"input": 0, "output": 0})

        input_cost = (input_tokens / 1000) * pricing["input"]
        output_cost = (output_tokens / 1000) * pricing["output"]

        return input_cost + output_cost

    def estimate_monthly(self, model: str, daily_requests: int,
                         avg_input_tokens: int, avg_output_tokens: int) -> dict:
        """Project daily/weekly/monthly/yearly spend from average usage."""
        daily_cost = daily_requests * self.calculate(
            model, avg_input_tokens, avg_output_tokens
        )

        return {
            "daily": daily_cost,
            "weekly": daily_cost * 7,
            "monthly": daily_cost * 30,  # 30-day month approximation
            "yearly": daily_cost * 365
        }

# Usage
calc = CostCalculator()
cost = calc.calculate("gpt-4-turbo", 1000, 500)
print(f"Cost: ${cost:.4f}")

monthly = calc.estimate_monthly("gpt-4-turbo", 10000, 500, 200)
print(f"Monthly estimate: ${monthly['monthly']:.2f}")

Model Routing Stratejisi

Complexity-Based Routing

class ModelRouter:
    """Route prompts to cheaper or stronger models via heuristic complexity."""

    def __init__(self):
        # Tier -> model mapping, cheapest first.
        self.models = {
            "simple": "gpt-4o-mini",
            "medium": "gpt-4o",
            "complex": "gpt-4-turbo"
        }

    def classify_complexity(self, prompt: str) -> str:
        """Bucket a prompt as simple/medium/complex using cheap heuristics."""
        words = prompt.split()

        # Turkish keywords hinting at analytical / in-depth requests.
        complexity_indicators = [
            "analiz", "karşılaştır", "değerlendir",
            "strateji", "detaylı", "kapsamlı"
        ]

        lowered = prompt.lower()
        has_complexity = any(ind in lowered for ind in complexity_indicators)

        if not has_complexity and len(words) < 50:
            return "simple"
        if len(words) < 200 or not has_complexity:
            return "medium"
        return "complex"

    def route(self, prompt: str) -> str:
        """Return the model name to use for this prompt."""
        return self.models[self.classify_complexity(prompt)]

# LLM-based complexity classification
def classify_with_llm(prompt: str) -> str:
    """Classify prompt complexity with a cheap model instead of heuristics."""
    messages = [
        {
            "role": "system",
            "content": "Verilen prompt'un karmaşıklığını belirle: simple, medium, complex"
        },
        {"role": "user", "content": prompt}
    ]
    # Classification done by the cheap model to keep routing overhead low.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=10
    )
    return response.choices[0].message.content.strip().lower()

Quality-Cost Tradeoff

class AdaptiveRouter:
    """Escalate through increasingly strong (and costly) models until the
    response quality clears a threshold.

    NOTE(review): call_model / assess_quality are not defined in this
    snippet — presumably provided elsewhere; confirm before reuse.
    """

    def __init__(self, quality_threshold: float = 0.8):
        self.quality_threshold = quality_threshold
        # Cheapest model first; strongest last.
        self.model_hierarchy = ["gpt-4o-mini", "gpt-4o", "gpt-4-turbo"]

    def get_response_with_fallback(self, prompt: str) -> dict:
        """Try each model in order; return the first good-enough answer."""
        for candidate in self.model_hierarchy:
            response = self.call_model(candidate, prompt)
            quality = self.assess_quality(response)

            if quality >= self.quality_threshold:
                return {
                    "response": response,
                    "model_used": candidate,
                    "quality_score": quality
                }

        # Nothing cleared the bar: the loop's last iteration was the strongest
        # model, so fall back to its answer and score.
        return {
            "response": response,
            "model_used": self.model_hierarchy[-1],
            "quality_score": quality
        }

Caching Stratejileri

Response Caching

import hashlib
import redis
import json

class LLMCache:
    """Redis-backed exact-match cache for chat-completion responses."""

    def __init__(self, redis_url: str, ttl: int = 3600):
        self.redis = redis.from_url(redis_url)
        self.ttl = ttl  # seconds before a cached entry expires
        self.stats = {"hits": 0, "misses": 0}

    def _cache_key(self, model: str, messages: list, **kwargs) -> str:
        """Deterministic key derived from the full request payload."""
        payload = {"model": model, "messages": messages}
        payload.update(kwargs)
        serialized = json.dumps(payload, sort_keys=True)
        digest = hashlib.md5(serialized.encode()).hexdigest()
        return f"llm:{digest}"

    def get(self, model: str, messages: list, **kwargs) -> dict | None:
        """Return the cached response, or None on a miss."""
        cached = self.redis.get(self._cache_key(model, messages, **kwargs))

        if cached:
            self.stats["hits"] += 1
            return json.loads(cached)

        self.stats["misses"] += 1
        return None

    def set(self, model: str, messages: list, response: dict, **kwargs):
        """Store a response under the request's cache key, with TTL."""
        key = self._cache_key(model, messages, **kwargs)
        self.redis.setex(key, self.ttl, json.dumps(response))

    def get_savings(self, cost_per_request: float) -> float:
        """Estimated dollars saved so far: cache hits x per-request cost."""
        return self.stats["hits"] * cost_per_request

# Usage
cache = LLMCache("redis://localhost:6379")

def cached_completion(model: str, messages: list, **kwargs):
    """Serve from cache when possible; otherwise call the API and cache it."""
    hit = cache.get(model, messages, **kwargs)
    if hit:
        return hit

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs
    )

    content = response.choices[0].message.content
    cache.set(model, messages, {"content": content})

    return {"content": content}

Semantic Caching

from datetime import datetime  # was missing: store() calls datetime.now()

class SemanticCache:
    """Cache keyed by embedding similarity rather than exact text match.

    NOTE(review): relies on a module-level `get_embedding` helper and a
    vector store exposing `search`/`upsert` — both defined elsewhere;
    confirm their interfaces before reuse.
    """

    def __init__(self, vector_store, similarity_threshold: float = 0.95):
        self.vector_store = vector_store
        # Only reuse answers whose query embedding is at least this similar.
        self.threshold = similarity_threshold

    def get_similar(self, query: str) -> dict | None:
        """Return the cached response for a semantically similar query, if any."""
        query_embedding = get_embedding(query)

        results = self.vector_store.search(
            vector=query_embedding,
            top_k=1,
            filter={"type": "cache"}
        )

        if results and results[0].score >= self.threshold:
            return {
                "response": results[0].metadata["response"],
                "similarity": results[0].score,
                "original_query": results[0].metadata["query"]
            }

        return None

    def store(self, query: str, response: str):
        """Embed the query and persist the query/response pair.

        NOTE(review): built-in hash() is salted per process (PYTHONHASHSEED),
        so ids are not stable across runs; a content digest would be.
        """
        embedding = get_embedding(query)

        self.vector_store.upsert([{
            "id": f"cache_{hash(query)}",
            "values": embedding,
            "metadata": {
                "type": "cache",
                "query": query,
                "response": response,
                "timestamp": datetime.now().isoformat()
            }
        }])

Token Optimization

Prompt Compression

def compress_prompt(prompt: str, target_reduction: float = 0.3) -> str:
    """Ask a cheap model to shorten the prompt, saving input tokens."""
    instruction = f"Metni %{int(target_reduction*100)} kısalt. Önemli bilgileri koru."
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

def remove_redundancy(text: str) -> str:
    """Drop repeated sentences, keeping first occurrences in order."""
    # dict.fromkeys dedupes while preserving insertion order.
    deduped = dict.fromkeys(text.split(". "))
    return ". ".join(deduped)

Output Length Control

def optimize_output_tokens(prompt: str, max_tokens: int | None = None) -> dict:
    """Cap completion tokens per task type to avoid paying for long outputs.

    Args:
        prompt: User prompt to send.
        max_tokens: Explicit cap; when None (or 0), a per-task default applies.

    Returns:
        dict with the response text, completion tokens actually used, and
        the potential saving versus a 4096-token default cap.
    """
    # Per-task output budgets (tokens).
    task_limits = {
        "classification": 10,
        "extraction": 200,
        "summarization": 300,
        "generation": 500,
        "analysis": 800
    }

    # NOTE(review): detect_task_type is defined elsewhere — confirm its
    # labels match the keys above; unknown types fall back to 500.
    task_type = detect_task_type(prompt)
    optimal_max = max_tokens or task_limits.get(task_type, 500)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=optimal_max
    )

    return {
        "response": response.choices[0].message.content,
        "tokens_used": response.usage.completion_tokens,
        "tokens_saved": 4096 - optimal_max  # potential saving vs default cap
    }

Batch Processing

Batch API Kullanımı

async def batch_process(prompts: list, model: str = "gpt-4o-mini") -> list:
    """Cut costs by running prompts through the OpenAI Batch API."""
    # One request entry per prompt, tagged with a stable custom id.
    batch_input = []
    for idx, prompt in enumerate(prompts):
        batch_input.append({
            "custom_id": f"request-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [{"role": "user", "content": prompt}]
            }
        })

    # Create the batch job; the 24h completion window is the discounted tier.
    batch = client.batches.create(
        input_file_id=upload_batch_file(batch_input),
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Wait for completion and return the collected results.
    return await wait_for_batch(batch.id)

Budget Monitoring

from datetime import datetime  # was missing: daily-reset logic needs dates

class BudgetExceededError(Exception):
    """Raised when the daily spend limit is reached (was never defined)."""

class BudgetMonitor:
    """Track daily LLM spend, alert near the limit, hard-stop past it."""

    def __init__(self, daily_limit: float, alert_threshold: float = 0.8):
        self.daily_limit = daily_limit
        self.alert_threshold = alert_threshold  # alert at this fraction of limit
        self.daily_spend = 0
        self.last_reset = datetime.now().date()

    def track_usage(self, cost: float) -> dict:
        """Add `cost` to today's total; alert or raise as thresholds are hit.

        Returns a status dict (current_spend / remaining / percentage).
        Raises BudgetExceededError once the daily limit is reached.
        """
        # Reset the counter when the calendar day rolls over.
        if datetime.now().date() > self.last_reset:
            self.daily_spend = 0
            self.last_reset = datetime.now().date()

        self.daily_spend += cost

        status = {
            "current_spend": self.daily_spend,
            "remaining": self.daily_limit - self.daily_spend,
            "percentage": self.daily_spend / self.daily_limit
        }

        if status["percentage"] >= self.alert_threshold:
            self.send_alert(status)

        if self.daily_spend >= self.daily_limit:
            raise BudgetExceededError("Daily budget limit exceeded")

        return status

    def send_alert(self, status: dict):
        """Notify the team (email/Slack) — intentionally a no-op stub here."""
        pass

# Usage as middleware
budget = BudgetMonitor(daily_limit=100.0)

def monitored_completion(model: str, messages: list, **kwargs):
    """Wrap a completion call with cost tracking against the daily budget."""
    response = client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    # NOTE(review): calculate_cost is defined elsewhere — confirm signature.
    cost = calculate_cost(model, response.usage)
    budget.track_usage(cost)

    return response

ROI Hesaplama

def calculate_ai_roi(
    manual_cost_per_task: float,
    tasks_per_month: int,
    ai_cost_per_task: float,
    accuracy_rate: float = 0.95
) -> dict:
    """Compare monthly manual vs AI task costs and report savings and ROI.

    Fixing an AI mistake is assumed to cost half a manual task, applied to
    the (1 - accuracy_rate) share of tasks.
    """
    monthly_manual = tasks_per_month * manual_cost_per_task
    monthly_ai_raw = tasks_per_month * ai_cost_per_task

    # Cost of manually correcting the AI's mistakes.
    correction_cost = (1 - accuracy_rate) * tasks_per_month * manual_cost_per_task * 0.5
    monthly_ai_total = monthly_ai_raw + correction_cost

    net_savings = monthly_manual - monthly_ai_total

    return {
        "manual_cost": monthly_manual,
        "ai_cost": monthly_ai_total,
        "monthly_savings": net_savings,
        "yearly_savings": net_savings * 12,
        "roi_percentage": (net_savings / monthly_ai_total) * 100
    }

Sonuç

LLM maliyet optimizasyonu, model routing, caching, token yönetimi ve bütçe monitoring ile sağlanır. Doğru stratejilerle %50-80 maliyet tasarrufu mümkündür.

Veni AI olarak, maliyet-etkin AI çözümleri sunuyoruz.

İlgili Makaleler