Get started with Fiddler’s LLM-as-a-Judge evaluation using Prompt Specs in minutes. Learn to create custom evaluations, test them, and deploy to production monitoring.
Use this file to discover all available pages before exploring further.
Get your first custom LLM evaluation running in minutes with Fiddler’s LLM-as-a-Judge solution. This guide walks you through creating, testing, and deploying a custom evaluation using Prompt Specs.
def get_prediction(prompt_spec, input_data):
    """Request a prediction for ``input_data`` using ``prompt_spec``.

    POSTs both arguments as a JSON body to ``{PROMPT_SPEC_URL}/predict``
    with ``FIDDLER_HEADERS``.

    Returns:
        The ``"prediction"`` entry of the JSON response when the endpoint
        answers with HTTP 200; otherwise a placeholder dict whose ``topic``
        and ``reasoning`` are both ``None``.
    """
    payload = {"prompt_spec": prompt_spec, "input_data": input_data}
    resp = requests.post(
        f"{PROMPT_SPEC_URL}/predict",
        headers=FIDDLER_HEADERS,
        json=payload,
    )
    # Any non-200 answer is reported as an "empty" prediction rather than raised.
    if resp.status_code != 200:
        return {"topic": None, "reasoning": None}
    return resp.json()["prediction"]


# Test with a single example
test_result = get_prediction(
    basic_prompt_spec,
    {"news_summary": "Wimbledon 2025 is under way!"},
)
print(json.dumps(test_result, indent=2))
6
Improve Accuracy With Descriptions
Add field descriptions to improve classification accuracy:
# Prompt Spec with an explicit instruction and per-field descriptions.
# The descriptions give the judge guidance on how to pick each output value.
enhanced_prompt_spec = {
    "instruction": "Determine the topic of the given news summary.",
    "input_fields": {
        "news_summary": {"type": "string"},
    },
    "output_fields": {
        "topic": {
            "type": "string",
            "choices": ["World", "Sports", "Business", "Sci/Tech"],
            # One guideline per line; without the separators the sentences
            # run together ("...research.Use 'Sports'...").
            "description": (
                "Use 'Sci/Tech' for technology companies, scientific "
                "discoveries, or health/medical research.\n"
                "Use 'Sports' for sports events or athletes.\n"
                "Use 'Business' for companies outside of tech/sports.\n"
                "Use 'World' for global events or issues."
            ),
        },
        "reasoning": {
            "type": "string",
            "description": "Explain why you chose this topic.",
        },
    },
}
7
Evaluate Performance
Test your enhanced Prompt Spec on multiple examples:
# Test on your dataset: collect one prediction per row of df_news
results = []
for _, row in df_news.iterrows():
    prediction = get_prediction(
        enhanced_prompt_spec,
        {"news_summary": row["text"]},
    )
    results.append(
        dict(
            original=row["original_topic"],
            predicted=prediction["topic"],
            reasoning=prediction["reasoning"],
        )
    )

# Calculate accuracy as the share of rows whose predicted topic equals the label
df_results = pd.DataFrame(results)
is_correct = df_results["original"] == df_results["predicted"]
accuracy = is_correct.mean()
print(f"Accuracy: {accuracy:.1%}")
8
Deploy to Production Monitoring
Once satisfied with your Prompt Spec, deploy it as a Fiddler enrichment:
import fiddler as fdl

# Initialize Fiddler client
fdl.init(url=FIDDLER_BASE_URL, token=FIDDLER_TOKEN)

# Create project and enrichment
project = fdl.Project.get_or_create(name="llm_evaluation_demo")

enrichment = fdl.Enrichment(
    name="news_topic_classifier",
    enrichment="llm_as_a_judge",
    columns=["news_summary"],
    config={"prompt_spec": enhanced_prompt_spec},
)

# Create model with enrichment
model_spec = fdl.ModelSpec(
    inputs=["news_summary"],
    custom_features=[enrichment],
)

# The sample data stores the article under "text"; the Prompt Spec and the
# model spec both refer to it as "news_summary", so rename before use.
model = fdl.Model.from_data(
    source=df_news.rename(columns={"text": "news_summary"}),
    name="news_classifier",
    project_id=project.id,
    spec=model_spec,
    task=fdl.ModelTask.LLM,
)
model.create()

print(f"Model created: {model.name}")
9
Publish Events and Monitor
Publish your data and start monitoring:
# Publish production events (column renamed to match the model's input name)
production_events = df_news.rename(columns={"text": "news_summary"})
job = model.publish(production_events)

# Block until the publish job finishes before checking its status
job.wait()

if job.status == "SUCCESS":
    print("✅ Data published successfully!")
    print("🎯 Your evaluation is now running in production monitoring")
Full Script Copy
import json
import os
from datetime import datetime

import fiddler as fdl
import pandas as pd
import requests

# Replace the placeholders with your actual values, or export FIDDLER_TOKEN and
# FIDDLER_BASE_URL in your environment. Never hard-code a real access token in
# a script or documentation page.
FIDDLER_TOKEN = os.environ.get("FIDDLER_TOKEN", "your_token_here")
FIDDLER_BASE_URL = os.environ.get(
    "FIDDLER_BASE_URL", "https://your_company.fiddler.ai"
)

PROMPT_SPEC_URL = f"{FIDDLER_BASE_URL}/v3/llm-as-a-judge/prompt-spec"
FIDDLER_HEADERS = {
    "Authorization": f"Bearer {FIDDLER_TOKEN}",
    "Content-Type": "application/json",
}

# Load sample news data (using AG News dataset)
df_news = pd.read_parquet(
    "hf://datasets/fancyzhx/ag_news/data/test-00000-of-00001.parquet"
).sample(20, random_state=25)

# Map labels to topic names
df_news["original_topic"] = df_news["label"].map(
    {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
)
print(df_news["original_topic"].value_counts())

# Minimal Prompt Spec: one input field and two output fields, no instruction
# or descriptions.
basic_prompt_spec = {
    "input_fields": {
        "news_summary": {"type": "string"},
    },
    "output_fields": {
        "topic": {
            "type": "string",
            "choices": ["World", "Sports", "Business", "Sci/Tech"],
        },
        "reasoning": {"type": "string"},
    },
}

# Ask the service to validate the schema before requesting predictions
validate_response = requests.post(
    f"{PROMPT_SPEC_URL}/validate",
    headers=FIDDLER_HEADERS,
    json={"prompt_spec": basic_prompt_spec},
)
if validate_response.status_code == 200:
    print("✅ Schema validation successful!")
else:
    print("❌ Validation failed:", validate_response.text)


def get_prediction(prompt_spec, input_data):
    """Request a prediction for ``input_data`` using ``prompt_spec``.

    POSTs both arguments as a JSON body to ``{PROMPT_SPEC_URL}/predict``.

    Returns:
        The ``"prediction"`` entry of the JSON response on HTTP 200;
        otherwise a placeholder dict whose ``topic`` and ``reasoning``
        are both ``None``.
    """
    response = requests.post(
        f"{PROMPT_SPEC_URL}/predict",
        headers=FIDDLER_HEADERS,
        json={"prompt_spec": prompt_spec, "input_data": input_data},
    )
    if response.status_code == 200:
        return response.json()["prediction"]
    return {"topic": None, "reasoning": None}


# Test with a single example
test_result = get_prediction(
    basic_prompt_spec,
    {"news_summary": "Wimbledon 2025 is under way!"},
)
print(json.dumps(test_result, indent=2))

# Prompt Spec with an explicit instruction and per-field descriptions
enhanced_prompt_spec = {
    "instruction": "Determine the topic of the given news summary.",
    "input_fields": {
        "news_summary": {"type": "string"},
    },
    "output_fields": {
        "topic": {
            "type": "string",
            "choices": ["World", "Sports", "Business", "Sci/Tech"],
            # One guideline per line; without the separators the sentences
            # run together ("...research.Use 'Sports'...").
            "description": (
                "Use 'Sci/Tech' for technology companies, scientific "
                "discoveries, or health/medical research.\n"
                "Use 'Sports' for sports events or athletes.\n"
                "Use 'Business' for companies outside of tech/sports.\n"
                "Use 'World' for global events or issues."
            ),
        },
        "reasoning": {
            "type": "string",
            "description": "Explain why you chose this topic.",
        },
    },
}

# Test on your dataset
results = []
for _, row in df_news.iterrows():
    prediction = get_prediction(
        enhanced_prompt_spec,
        {"news_summary": row["text"]},
    )
    results.append(
        {
            "original": row["original_topic"],
            "predicted": prediction["topic"],
            "reasoning": prediction["reasoning"],
        }
    )

# Calculate accuracy
df_results = pd.DataFrame(results)
accuracy = (df_results["original"] == df_results["predicted"]).mean()
print(f"Accuracy: {accuracy:.1%}")