Field	Value	Source
Canonical Path	/blog/llm-maliyet-optimizasyonu-api-fiyatlandirma-stratejileri	Veni AI Blog
Primary Category	إدارة التكاليف	Post Metadata
Author	Veni AI Technical Team	Post Metadata

تحسين تكلفة نماذج LLM: دليل التسعير والاستراتيجية لواجهات API

يمكن أن تشكّل تكاليف واجهات LLM API بندًا كبيرًا في التطبيقات ذات الحجم الكبير. في هذا الدليل، نستعرض استراتيجيات تحسين التكلفة.

مقارنة تسعير واجهات API

تسعير OpenAI (2024)

النموذج	الإدخال (/1M token)	الإخراج (/1M token)
GPT-4 Turbo	$10.00	$30.00
GPT-4o	$5.00	$15.00
GPT-4o-mini	$0.15	$0.60
GPT-3.5 Turbo	$0.50	$1.50

تسعير Anthropic

النموذج	الإدخال (/1M token)	الإخراج (/1M token)
Claude 3 Opus	$15.00	$75.00
Claude 3 Sonnet	$3.00	$15.00
Claude 3 Haiku	$0.25	$1.25

تسعير Google

النموذج	الإدخال (/1M token)	الإخراج (/1M token)
Gemini Pro	$0.50	$1.50
Gemini Pro 1.5	$3.50	$10.50

حساب التكلفة

حاسبة تكلفة التوكن

class CostCalculator:
    """Estimate LLM API spend from token counts.

    ``PRICING`` holds USD rates per 1,000 tokens for each known model,
    split into ``"input"`` (prompt) and ``"output"`` (completion) rates.
    """

    # USD per 1,000 tokens, keyed by model identifier.
    PRICING = {
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "claude-3-opus": {"input": 0.015, "output": 0.075},
        "claude-3-sonnet": {"input": 0.003, "output": 0.015},
        "claude-3-haiku": {"input": 0.00025, "output": 0.00125},
    }

    # Number of tokens covered by one PRICING rate.
    TOKENS_PER_PRICE_UNIT = 1000

    def calculate(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the USD cost of one request.

        Args:
            model: Key into ``PRICING``.
            input_tokens: Prompt tokens sent.
            output_tokens: Completion tokens received.

        Returns:
            Input cost plus output cost. A model missing from ``PRICING``
            is priced at zero instead of raising.
        """
        rates = self.PRICING.get(model, {"input": 0, "output": 0})
        unit = self.TOKENS_PER_PRICE_UNIT

        input_cost = (input_tokens / unit) * rates["input"]
        output_cost = (output_tokens / unit) * rates["output"]
        return input_cost + output_cost

    def estimate_monthly(self, model: str, daily_requests: int,
                         avg_input_tokens: int, avg_output_tokens: int) -> dict:
        """Project spend from an average request profile.

        Returns:
            Dict with ``"daily"``, ``"weekly"`` (7 days), ``"monthly"``
            (30 days) and ``"yearly"`` (365 days) USD estimates.
        """
        per_request = self.calculate(model, avg_input_tokens, avg_output_tokens)
        daily_cost = daily_requests * per_request

        return {
            "daily": daily_cost,
            "weekly": daily_cost * 7,
            "monthly": daily_cost * 30,
            "yearly": daily_cost * 365,
        }
# Usage
calc = CostCalculator()

# One request: 1,000 prompt tokens and 500 completion tokens.
cost = calc.calculate("gpt-4-turbo", 1000, 500)
print(f"Cost: ${cost:.4f}")  # Cost: $0.0250

# 10,000 requests per day averaging 500 prompt / 200 completion tokens.
monthly = calc.estimate_monthly("gpt-4-turbo", 10000, 500, 200)
print(f"Monthly estimate: ${monthly['monthly']:.2f}")  # Monthly estimate: $3300.00
استراتيجية توجيه النماذج

التوجيه بناءً على التعقيد

class ModelRouter:
    """Choose a model tier for a prompt using length and keyword heuristics."""

    # A prompt containing any of these (case-insensitive substring match)
    # is treated as asking for demanding work.
    COMPLEXITY_INDICATORS = (
        "analyze", "compare", "evaluate",
        "strategy", "detailed", "comprehensive",
    )

    def __init__(self):
        # Complexity tier -> model identifier.
        self.models = {
            "simple": "gpt-4o-mini",
            "medium": "gpt-4o",
            "complex": "gpt-4-turbo",
        }

    def classify_complexity(self, prompt: str) -> str:
        """Return ``"simple"``, ``"medium"`` or ``"complex"`` for *prompt*.

        * simple: under 50 words and no indicator keyword.
        * complex: 200 words or more and at least one indicator keyword.
        * medium: everything else.
        """
        word_count = len(prompt.split())
        lowered = prompt.lower()  # lower-case once, not once per keyword
        has_complexity = any(ind in lowered for ind in self.COMPLEXITY_INDICATORS)

        if word_count < 50 and not has_complexity:
            return "simple"
        if word_count < 200 or not has_complexity:
            return "medium"
        return "complex"

    def route(self, prompt: str) -> str:
        """Return the model identifier mapped to the prompt's complexity tier."""
        return self.models[self.classify_complexity(prompt)]
# LLM-based complexity classification
_COMPLEXITY_LABELS = ("simple", "medium", "complex")


def normalize_complexity_label(raw, default: str = "medium") -> str:
    """Reduce a free-form model reply to one of the three complexity labels.

    The reply is lower-cased and split on every non-letter character; the
    first word that is exactly a known label wins. ``None``, empty, or
    unrecognized replies yield *default*.
    """
    text = (raw or "").lower()
    words = "".join(ch if ch.isalpha() else " " for ch in text).split()
    for word in words:
        if word in _COMPLEXITY_LABELS:
            return word
    return default


def classify_with_llm(prompt: str) -> str:
    """Ask a cheap model to label *prompt* as simple, medium or complex.

    Returns:
        One of ``"simple"``, ``"medium"``, ``"complex"``. Replies carrying
        punctuation or extra words are normalized; replies that name no
        label fall back to ``"medium"``.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Classification with cheap model
        messages=[
            {
                "role": "system",
                "content": "Determine the complexity of the given prompt: simple, medium, complex"
            },
            {"role": "user", "content": prompt}
        ],
        max_tokens=10
    )
    return normalize_complexity_label(response.choices[0].message.content)

الموازنة بين الجودة والتكلفة

class AdaptiveRouter:
    """Try models in order until one produces a good-enough response.

    This class calls ``self.call_model(model, prompt)`` and
    ``self.assess_quality(response)`` but does not define them; they must
    be supplied by a subclass or assigned on the instance.
    """

    def __init__(self, quality_threshold: float = 0.8):
        self.quality_threshold = quality_threshold
        # Tried first to last; the last entry is the final fallback.
        self.model_hierarchy = ["gpt-4o-mini", "gpt-4o", "gpt-4-turbo"]

    def get_response_with_fallback(self, prompt: str) -> dict:
        """Return the first response whose quality meets the threshold.

        Returns:
            Dict with ``"response"``, ``"model_used"`` and
            ``"quality_score"``. If no model reaches the threshold, the
            result from the last model in the hierarchy is returned.

        Raises:
            ValueError: If ``model_hierarchy`` is empty.
        """
        if not self.model_hierarchy:
            raise ValueError("model_hierarchy must contain at least one model")

        for model in self.model_hierarchy:
            response = self.call_model(model, prompt)
            quality = self.assess_quality(response)
            attempt = {
                "response": response,
                "model_used": model,
                "quality_score": quality,
            }
            if quality >= self.quality_threshold:
                return attempt

        # Nothing met the threshold: fall back to the last model's attempt.
        return attempt
استراتيجيات التخزين المؤقت

التخزين المؤقت للاستجابة

import hashlib
import json
from datetime import datetime

import redis

33class LLMCache:
34    def __init__(self, redis_url: str, ttl: int = 3600):
35        self.redis = redis.from_url(redis_url)
36        self.ttl = ttl
37        self.stats = {"hits": 0, "misses": 0}
38    
39    def _cache_key(self, model: str, messages: list, **kwargs) -> str:
40        content = json.dumps({
41            "model": model,
42            "messages": messages,
43            **kwargs
44        }, sort_keys=True)
45        return f"llm:{hashlib.md5(content.encode()).hexdigest()}"
46    
47    def get(self, model: str, messages: list, **kwargs) -> dict | None:
48        key = self._cache_key(model, messages, **kwargs)
49        cached = self.redis.get(key)
50        
51        if cached:
52            self.stats["hits"] += 1
53            return json.loads(cached)
54        
55        self.stats["misses"] += 1
56        return None
57    
58    def set(self, model: str, messages: list, response: dict, **kwargs):
59        key = self._cache_key(model, messages, **kwargs)
60        self.redis.setex(key, self.ttl, json.dumps(response))
61    
62    def get_savings(self, cost_per_request: float) -> float:
63        return self.stats["hits"] * cost_per_request
64
# Usage
cache = LLMCache("redis://localhost:6379")


def cached_completion(model: str, messages: list, **kwargs):
    """Return a chat completion, served from ``cache`` when possible.

    Args:
        model: Model identifier passed to the API.
        messages: Chat messages passed to the API.
        **kwargs: Extra request options; they are forwarded to the API and
            are part of the cache key.

    Returns:
        ``{"content": <completion text>}``, either cached or freshly fetched.
    """
    cached = cache.get(model, messages, **kwargs)
    if cached is not None:
        return cached

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs
    )

    payload = {"content": response.choices[0].message.content}
    # Write under the same key that `get` looked up: kwargs must be passed
    # here too, otherwise requests with options are stored under a key
    # that is never read back.
    cache.set(model, messages, payload, **kwargs)

    return payload

التخزين المؤقت الدلالي

1class SemanticCache:
2    def __init__(self, vector_store, similarity_threshold: float = 0.95):
3        self.vector_store = vector_store
4        self.threshold = similarity_threshold
5    
6    def get_similar(self, query: str) -> dict | None:
7        query_embedding = get_embedding(query)
8        
9        results = self.vector_store.search(
10            vector=query_embedding,
11            top_k=1,
12            filter={"type": "cache"}
13        )
14        
15        if results and results[0].score >= self.threshold:
16            return {
17                "response": results[0].metadata["response"],
18                "similarity": results[0].score,
19                "original_query": results[0].metadata["query"]
20            }
21        
22        return None
23    
24    def store(self, query: str, response: str):
25        embedding = get_embedding(query)
26        
27        self.vector_store.upsert([{
28            "id": f"cache_{hash(query)}",
29            "values": embedding,
30            "metadata": {
31                "type": "cache",
32                "query": query,
33                "response": response,
34                "timestamp": datetime.now().isoformat()
35            }
36        }])
تحسين إدارة التوكن

ضغط الـ Prompt

def build_compression_instruction(target_reduction: float) -> str:
    """Return the system instruction asking for a given shortening ratio.

    Args:
        target_reduction: Fraction to cut, between 0 and 1 inclusive.

    Raises:
        ValueError: If *target_reduction* is outside that range.
    """
    if not 0 <= target_reduction <= 1:
        raise ValueError("target_reduction must be between 0 and 1")
    # round(), not int(): 0.29 * 100 is 28.999999999999996 in floating
    # point, which int() would truncate to 28.
    percent = round(target_reduction * 100)
    return f"Shorten text by {percent}%. Keep important information."


def compress_prompt(prompt: str, target_reduction: float = 0.3) -> str:
    """Save tokens by shortening prompt

    Sends *prompt* to a cheap model with an instruction to shorten it by
    ``target_reduction`` (a fraction between 0 and 1) and returns the
    model's reply.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": build_compression_instruction(target_reduction)
            },
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content
def remove_redundancy(text: str) -> str:
    """Remove redundant content

    Splits *text* on ``". "``, keeps the first occurrence of each sentence
    and rejoins the survivors in their original order. Sentences are
    compared without surrounding whitespace or trailing periods, so a
    repeated final sentence (which still carries the closing ".") is
    recognized as a duplicate.
    """
    seen = set()
    kept = []
    for sentence in text.split(". "):
        key = sentence.strip().rstrip(".")
        if key in seen:
            continue
        seen.add(key)
        kept.append(sentence)

    result = ". ".join(kept)
    # If the dropped duplicate was the last sentence, its closing period
    # went with it; put the period back.
    if text.endswith(".") and not result.endswith("."):
        result += "."
    return result

التحكم في طول المخرجات

def optimize_output_tokens(prompt: str, max_tokens: int | None = None) -> dict:
    """Optimize output token count

    Args:
        prompt: User prompt to send.
        max_tokens: Explicit completion cap. When omitted (or 0), the cap
            is chosen from the task type returned by ``detect_task_type``,
            defaulting to 500 for unknown types.

    Returns:
        Dict with ``"response"`` (completion text), ``"tokens_used"``
        (completion tokens reported by the API) and ``"tokens_saved"``
        (how far the chosen cap sits below the 4096 baseline, never
        negative).
    """
    # Set max_tokens based on task type
    task_limits = {
        "classification": 10,
        "extraction": 200,
        "summarization": 300,
        "generation": 500,
        "analysis": 800
    }
    baseline_cap = 4096  # reference cap that savings are measured against

    if max_tokens:
        # Caller supplied a cap; no need to detect the task type.
        optimal_max = max_tokens
    else:
        optimal_max = task_limits.get(detect_task_type(prompt), 500)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=optimal_max
    )

    return {
        "response": response.choices[0].message.content,
        "tokens_used": response.usage.completion_tokens,
        # Potential savings; a cap above the baseline saves nothing.
        "tokens_saved": max(0, baseline_cap - optimal_max)
    }

معالجة الدُفعات

استخدام Batch API

async def batch_process(prompts: list, model: str = "gpt-4o-mini") -> list:
    """Cost reduction with batch processing

    Builds one chat-completion request per prompt, uploads them as a batch
    input file, creates a batch job with a 24h completion window and
    awaits its results.

    Args:
        prompts: Prompt strings; each becomes a request whose
            ``custom_id`` is ``request-<index>``.
        model: Model used for every request.

    Returns:
        Whatever ``wait_for_batch`` yields for the created batch, or an
        empty list when *prompts* is empty (no batch is created).
    """
    if not prompts:
        # Nothing to send: skip the upload and the batch job entirely.
        return []

    # OpenAI Batch API
    batch_input = [
        {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "messages": [{"role": "user", "content": prompt}]
            }
        }
        for i, prompt in enumerate(prompts)
    ]

    # Create batch job
    batch = client.batches.create(
        input_file_id=upload_batch_file(batch_input),
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    # Wait for and retrieve results
    return await wait_for_batch(batch.id)

مراقبة الميزانية

class BudgetExceededError(Exception):
    """Raised when the day's accumulated spend reaches the daily limit."""


class BudgetMonitor:
    """Track daily spend against a limit, alerting and then raising."""

    def __init__(self, daily_limit: float, alert_threshold: float = 0.8):
        """
        Args:
            daily_limit: Maximum spend per day; must be positive.
            alert_threshold: Fraction of the limit at which
                ``send_alert`` starts being called.

        Raises:
            ValueError: If *daily_limit* is not positive.
        """
        if daily_limit <= 0:
            # A zero limit would divide by zero when computing percentage.
            raise ValueError("daily_limit must be positive")
        self.daily_limit = daily_limit
        self.alert_threshold = alert_threshold
        self.daily_spend = 0
        self.last_reset = datetime.now().date()

    def track_usage(self, cost: float) -> dict:
        """Add *cost* to today's spend and return the updated status.

        Returns:
            Dict with ``"current_spend"``, ``"remaining"`` and
            ``"percentage"`` (spend divided by limit).

        Raises:
            BudgetExceededError: If spend has reached the daily limit.
                The cost is recorded, and any alert sent, before raising.
        """
        # Check daily reset. Read the clock once so the comparison and the
        # stored date cannot disagree around midnight.
        today = datetime.now().date()
        if today > self.last_reset:
            self.daily_spend = 0
            self.last_reset = today

        self.daily_spend += cost

        status = {
            "current_spend": self.daily_spend,
            "remaining": self.daily_limit - self.daily_spend,
            "percentage": self.daily_spend / self.daily_limit
        }

        if status["percentage"] >= self.alert_threshold:
            self.send_alert(status)

        if self.daily_spend >= self.daily_limit:
            raise BudgetExceededError("Daily budget limit exceeded")

        return status

    def send_alert(self, status: dict):
        """Notification hook; does nothing by default."""
        # Email/Slack notification
        pass
# Usage as Middleware
budget = BudgetMonitor(daily_limit=100.0)


def monitored_completion(model: str, messages: list, **kwargs):
    """Run a chat completion and record its cost against ``budget``.

    Returns:
        The API response object, unchanged.

    Raises:
        Whatever ``budget.track_usage`` raises once the limit is reached.
    """
    response = client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )

    # Cost is recorded after the call returns, so the request that crosses
    # the limit has already been made when track_usage raises.
    cost = calculate_cost(model, response.usage)
    budget.track_usage(cost)

    return response
حساب العائد على الاستثمار (ROI)

def calculate_ai_roi(
    manual_cost_per_task: float,
    tasks_per_month: int,
    ai_cost_per_task: float,
    accuracy_rate: float = 0.95,
    error_correction_factor: float = 0.5
) -> dict:
    """Compare monthly manual cost with AI cost including error correction.

    Args:
        manual_cost_per_task: Cost of doing one task by hand.
        tasks_per_month: Monthly task volume.
        ai_cost_per_task: Cost of doing one task with AI.
        accuracy_rate: Fraction of AI results that need no correction.
        error_correction_factor: Cost of correcting one wrong result, as a
            fraction of ``manual_cost_per_task``.

    Returns:
        Dict with ``"manual_cost"``, ``"ai_cost"`` (AI plus correction),
        ``"monthly_savings"``, ``"yearly_savings"`` and
        ``"roi_percentage"``. When the total AI cost is zero the ROI is
        ``0.0`` if savings are also zero, otherwise signed infinity.
    """
    manual_monthly = manual_cost_per_task * tasks_per_month
    ai_monthly = ai_cost_per_task * tasks_per_month

    # Error correction cost
    error_cost = (
        (1 - accuracy_rate) * tasks_per_month
        * manual_cost_per_task * error_correction_factor
    )

    total_ai_cost = ai_monthly + error_cost
    savings = manual_monthly - total_ai_cost

    if total_ai_cost:
        roi_percentage = (savings / total_ai_cost) * 100
    elif savings == 0:
        # No cost and no savings (e.g. zero tasks): avoid dividing by zero.
        roi_percentage = 0.0
    else:
        roi_percentage = float("inf") if savings > 0 else float("-inf")

    return {
        "manual_cost": manual_monthly,
        "ai_cost": total_ai_cost,
        "monthly_savings": savings,
        "yearly_savings": savings * 12,
        "roi_percentage": roi_percentage
    }

الخلاصة

يتم تحقيق تحسين تكاليف الـ LLM من خلال توجيه النماذج، والتخزين المؤقت، وإدارة الرموز (tokens)، ومراقبة الميزانية. ومع اتباع الاستراتيجيات المناسبة، يمكن تحقيق وفورات في التكاليف تتراوح بين 50-80%.

في Veni AI، نقدم حلول ذكاء اصطناعي فعّالة من حيث التكلفة.

تحسين تكلفة نماذج اللغة الكبيرة: دليل التسعير واستراتيجيات الاستخدام

Reference Overview