Veni AI
Διαχείριση Κόστους

Βελτιστοποίηση Κόστους LLM: Οδηγός Τιμολόγησης API και Στρατηγικής

Πλήρης οδηγός για τη βελτιστοποίηση του κόστους API LLM, τη διαχείριση token, την επιλογή μοντέλων, τις στρατηγικές caching και τη διαχείριση προϋπολογισμού σε επιχειρησιακό επίπεδο.

Veni AI Technical Team · 28 Δεκεμβρίου 2024 · 6 λεπτά ανάγνωσης
Βελτιστοποίηση Κόστους LLM: Οδηγός Τιμολόγησης API και Στρατηγικής

Βελτιστοποίηση Κόστους LLM: Οδηγός Τιμολόγησης API και Στρατηγικής

Τα κόστη API για LLM μπορούν να αποτελέσουν σημαντικό έξοδο σε εφαρμογές μεγάλου όγκου. Σε αυτόν τον οδηγό, εξετάζουμε στρατηγικές βελτιστοποίησης κόστους.

Σύγκριση Τιμών API

Τιμολόγηση OpenAI (2024)

ΜοντέλοΕίσοδος (/1M token)Έξοδος (/1M token)
GPT-4 Turbo$10.00$30.00
GPT-4o$5.00$15.00
GPT-4o-mini$0.15$0.60
GPT-3.5 Turbo$0.50$1.50

Τιμολόγηση Anthropic

ΜοντέλοΕίσοδος (/1M token)Έξοδος (/1M token)
Claude 3 Opus$15.00$75.00
Claude 3 Sonnet$3.00$15.00
Claude 3 Haiku$0.25$1.25

Τιμολόγηση Google

ΜοντέλοΕίσοδος (/1M token)Έξοδος (/1M token)
Gemini Pro$0.50$1.50
Gemini Pro 1.5$3.50$10.50

Υπολογισμός Κόστους

Υπολογιστής Κόστους Token

1class CostCalculator: 2 PRICING = { 3 "gpt-4-turbo": {"input": 0.01, "output": 0.03}, 4 "gpt-4o": {"input": 0.005, "output": 0.015}, 5 "gpt-4o-mini": {"input": 0.00015, "output": 0.0006}, 6 "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}, 7 "claude-3-opus": {"input": 0.015, "output": 0.075}, 8 "claude-3-sonnet": {"input": 0.003, "output": 0.015}, 9 "claude-3-haiku": {"input": 0.00025, "output": 0.00125}, 10 } 11 12 def calculate(self, model: str, input_tokens: int, output_tokens: int) -> float: 13 pricing = self.PRICING.get(model, {"input": 0, "output": 0}) 14 15 input_cost = (input_tokens / 1000) * pricing["input"] 16 output_cost = (output_tokens / 1000) * pricing["output"] 17 18 return input_cost + output_cost 19 20 def estimate_monthly(self, model: str, daily_requests: int, 21 avg_input_tokens: int, avg_output_tokens: int) -> dict: 22 daily_cost = daily_requests * self.calculate( 23 model, avg_input_tokens, avg_output_tokens 24 ) 25 26 return { 27 "daily": daily_cost, 28 "weekly": daily_cost * 7, 29 "monthly": daily_cost * 30, 30 "yearly": daily_cost * 365 31 } 32 33# Usage 34calc = CostCalculator() 35cost = calc.calculate("gpt-4-turbo", 1000, 500) 36print(f"Cost: ${cost:.4f}") 37 38monthly = calc.estimate_monthly("gpt-4-turbo", 10000, 500, 200) 39print(f"Monthly estimate: ${monthly['monthly']:.2f}") 40## Στρατηγική Δρομολόγησης Μοντέλων 41 42### Δρομολόγηση με Βάση την Πολυπλοκότητα 43 44```python 45class ModelRouter: 46 def __init__(self): 47 self.models = { 48 "simple": "gpt-4o-mini", 49 "medium": "gpt-4o", 50 "complex": "gpt-4-turbo" 51 } 52 53 def classify_complexity(self, prompt: str) -> str: 54 # Simple heuristics 55 word_count = len(prompt.split()) 56 57 complexity_indicators = [ 58 "analyze", "compare", "evaluate", 59 "strategy", "detailed", "comprehensive" 60 ] 61 62 has_complexity = any(ind in prompt.lower() for ind in complexity_indicators) 63 64 if word_count < 50 and not has_complexity: 65 return "simple" 66 elif word_count < 200 or not 
has_complexity: 67 return "medium" 68 else: 69 return "complex" 70 71 def route(self, prompt: str) -> str: 72 complexity = self.classify_complexity(prompt) 73 return self.models[complexity] 74 75# LLM-based complexity classification 76def classify_with_llm(prompt: str) -> str: 77 response = client.chat.completions.create( 78 model="gpt-4o-mini", # Classification with cheap model 79 messages=[ 80 { 81 "role": "system", 82 "content": "Determine the complexity of the given prompt: simple, medium, complex" 83 }, 84 {"role": "user", "content": prompt} 85 ], 86 max_tokens=10 87 ) 88 return response.choices[0].message.content.strip().lower()

Αντιστάθμιση Ποιότητας–Κόστους

1class AdaptiveRouter: 2 def __init__(self, quality_threshold: float = 0.8): 3 self.quality_threshold = quality_threshold 4 self.model_hierarchy = ["gpt-4o-mini", "gpt-4o", "gpt-4-turbo"] 5 6 def get_response_with_fallback(self, prompt: str) -> dict: 7 for model in self.model_hierarchy: 8 response = self.call_model(model, prompt) 9 quality = self.assess_quality(response) 10 11 if quality >= self.quality_threshold: 12 return { 13 "response": response, 14 "model_used": model, 15 "quality_score": quality 16 } 17 18 # Continue with strongest model 19 return { 20 "response": response, 21 "model_used": self.model_hierarchy[-1], 22 "quality_score": quality 23 } 24## Στρατηγικές Caching 25 26### Response Caching 27 28```python 29import hashlib 30import redis 31import json 32 33class LLMCache: 34 def __init__(self, redis_url: str, ttl: int = 3600): 35 self.redis = redis.from_url(redis_url) 36 self.ttl = ttl 37 self.stats = {"hits": 0, "misses": 0} 38 39 def _cache_key(self, model: str, messages: list, **kwargs) -> str: 40 content = json.dumps({ 41 "model": model, 42 "messages": messages, 43 **kwargs 44 }, sort_keys=True) 45 return f"llm:{hashlib.md5(content.encode()).hexdigest()}" 46 47 def get(self, model: str, messages: list, **kwargs) -> dict | None: 48 key = self._cache_key(model, messages, **kwargs) 49 cached = self.redis.get(key) 50 51 if cached: 52 self.stats["hits"] += 1 53 return json.loads(cached) 54 55 self.stats["misses"] += 1 56 return None 57 58 def set(self, model: str, messages: list, response: dict, **kwargs): 59 key = self._cache_key(model, messages, **kwargs) 60 self.redis.setex(key, self.ttl, json.dumps(response)) 61 62 def get_savings(self, cost_per_request: float) -> float: 63 return self.stats["hits"] * cost_per_request 64 65# Usage 66cache = LLMCache("redis://localhost:6379") 67 68def cached_completion(model: str, messages: list, **kwargs): 69 cached = cache.get(model, messages, **kwargs) 70 if cached: 71 return cached 72 73 response = 
client.chat.completions.create( 74 model=model, 75 messages=messages, 76 **kwargs 77 ) 78 79 result = response.choices[0].message.content 80 cache.set(model, messages, {"content": result}) 81 82 return {"content": result}

Semantic Caching

1class SemanticCache: 2 def __init__(self, vector_store, similarity_threshold: float = 0.95): 3 self.vector_store = vector_store 4 self.threshold = similarity_threshold 5 6 def get_similar(self, query: str) -> dict | None: 7 query_embedding = get_embedding(query) 8 9 results = self.vector_store.search( 10 vector=query_embedding, 11 top_k=1, 12 filter={"type": "cache"} 13 ) 14 15 if results and results[0].score >= self.threshold: 16 return { 17 "response": results[0].metadata["response"], 18 "similarity": results[0].score, 19 "original_query": results[0].metadata["query"] 20 } 21 22 return None 23 24 def store(self, query: str, response: str): 25 embedding = get_embedding(query) 26 27 self.vector_store.upsert([{ 28 "id": f"cache_{hash(query)}", 29 "values": embedding, 30 "metadata": { 31 "type": "cache", 32 "query": query, 33 "response": response, 34 "timestamp": datetime.now().isoformat() 35 } 36 }]) 37## Βελτιστοποίηση Token 38 39### Συμπίεση Prompt 40 41```python 42def compress_prompt(prompt: str, target_reduction: float = 0.3) -> str: 43 """Save tokens by shortening prompt""" 44 45 response = client.chat.completions.create( 46 model="gpt-4o-mini", 47 messages=[ 48 { 49 "role": "system", 50 "content": f"Shorten text by {int(target_reduction*100)}%. Keep important information." 51 }, 52 {"role": "user", "content": prompt} 53 ] 54 ) 55 56 return response.choices[0].message.content 57 58def remove_redundancy(text: str) -> str: 59 """Remove redundant content""" 60 sentences = text.split(". ") 61 unique_sentences = list(dict.fromkeys(sentences)) 62 return ". ".join(unique_sentences)

Έλεγχος Μήκους Output

def optimize_output_tokens(prompt: str, max_tokens: int = None) -> dict:
    """Optimize output token count.

    Picks a per-task max_tokens cap so the model cannot generate (and bill)
    more output than the task type needs. An explicit max_tokens overrides
    the heuristic.
    """

    # Set max_tokens based on task type
    task_limits = {
        "classification": 10,
        "extraction": 200,
        "summarization": 300,
        "generation": 500,
        "analysis": 800
    }

    task_type = detect_task_type(prompt)
    optimal_max = max_tokens or task_limits.get(task_type, 500)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=optimal_max
    )

    return {
        "response": response.choices[0].message.content,
        "tokens_used": response.usage.completion_tokens,
        "tokens_saved": 4096 - optimal_max  # Potential savings vs. default cap
    }

Επεξεργασία Batch

Χρήση Batch API

async def batch_process(prompts: list, model: str = "gpt-4o-mini") -> list:
    """Cost reduction with batch processing.

    Builds an OpenAI Batch API input (one request per prompt), submits it
    with a 24h completion window, and awaits the collected results.
    """

    # OpenAI Batch API
    batch_input = [
        {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [{"role": "user", "content": prompt}]
            }
        }
        for i, prompt in enumerate(prompts)
    ]

    # Create batch job
    batch = client.batches.create(
        input_file_id=upload_batch_file(batch_input),
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Wait for and retrieve results
    return await wait_for_batch(batch.id)

Παρακολούθηση Προϋπολογισμού

```python
class BudgetMonitor:
    """Track daily spend against a hard limit, alerting near the threshold."""

    def __init__(self, daily_limit: float, alert_threshold: float = 0.8):
        self.daily_limit = daily_limit
        self.alert_threshold = alert_threshold
        self.daily_spend = 0
        self.last_reset = datetime.now().date()

    def track_usage(self, cost: float) -> dict:
        """Add a request's cost; raise once the daily limit is exceeded."""
        # Check daily reset
        if datetime.now().date() > self.last_reset:
            self.daily_spend = 0
            self.last_reset = datetime.now().date()

        self.daily_spend += cost

        status = {
            "current_spend": self.daily_spend,
            "remaining": self.daily_limit - self.daily_spend,
            "percentage": self.daily_spend / self.daily_limit
        }

        if status["percentage"] >= self.alert_threshold:
            self.send_alert(status)

        if self.daily_spend >= self.daily_limit:
            raise BudgetExceededError("Daily budget limit exceeded")

        return status

    def send_alert(self, status: dict):
        # Email/Slack notification
        pass

# Usage as Middleware
budget = BudgetMonitor(daily_limit=100.0)

def monitored_completion(model: str, messages: list, **kwargs):
    """Wrap an API call so every request is charged against the budget."""
    response = client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    cost = calculate_cost(model, response.usage)
    budget.track_usage(cost)

    return response
```

## Υπολογισμός ROI

```python
def calculate_ai_roi(
    manual_cost_per_task: float,
    tasks_per_month: int,
    ai_cost_per_task: float,
    accuracy_rate: float = 0.95
) -> dict:
    """Compare monthly manual cost vs. AI cost (including error rework)."""

    manual_monthly = manual_cost_per_task * tasks_per_month
    ai_monthly = ai_cost_per_task * tasks_per_month

    # Error correction cost: failed tasks cost half a manual task to fix.
    error_cost = (1 - accuracy_rate) * tasks_per_month * manual_cost_per_task * 0.5

    total_ai_cost = ai_monthly + error_cost
    savings = manual_monthly - total_ai_cost

    return {
        "manual_cost": manual_monthly,
        "ai_cost": total_ai_cost,
        "monthly_savings": savings,
        "yearly_savings": savings * 12,
        "roi_percentage": (savings / total_ai_cost) * 100
    }
```

Συμπέρασμα

Η βελτιστοποίηση κόστους LLM επιτυγχάνεται μέσω model routing, caching, token management και παρακολούθησης προϋπολογισμού. Με τις σωστές στρατηγικές, είναι δυνατή η εξοικονόμηση κόστους της τάξης του 50-80%.

Στη Veni AI, προσφέρουμε αποδοτικές και οικονομικές λύσεις AI.

Σχετικά Άρθρα