Lab 05 - Variance and Standard Deviation
Goal: compare run-to-run stability for two prompt profiles.
Info: Stability metric
Wildly swinging quality scores suggest instability (data drift, retraining, prompt changes). Track the standard deviation as closely as the mean.
Info: Profile comparison
Two profiles with equal means but different variances should not be treated equally. The low-variance profile is safer for automation.
In [ ]:
Copied!
import math
# Per-run quality scores for the two prompt profiles being compared.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
def mean(xs):
    """Return the arithmetic mean of the values in ``xs``.

    ``xs`` must support ``len()``; an empty input raises ZeroDivisionError.
    """
    total = sum(xs)
    count = len(xs)
    return total / count
def variance(xs):
    """Return the population variance of ``xs``.

    The sum of squared deviations from the mean is divided by ``len(xs)``
    (not ``len(xs) - 1``).
    """
    center = mean(xs)
    squared_deviations = [(value - center) ** 2 for value in xs]
    return sum(squared_deviations) / len(xs)
def stddev(xs):
    """Return the population standard deviation of ``xs``.

    This is the square root of ``variance(xs)``.
    """
    spread = variance(xs)
    return math.sqrt(spread)
# Print mean, population variance and standard deviation for each profile.
for name, data in (("A", profile_a), ("B", profile_b)):
    print(
        name,
        "mean=", round(mean(data), 3),
        "var=", round(variance(data), 4),
        "std=", round(stddev(data), 4),
    )
import math
# Per-run quality scores for the two prompt profiles being compared.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
def mean(xs):
    """Return the arithmetic mean of the values in ``xs``.

    ``xs`` must support ``len()``; an empty input raises ZeroDivisionError.
    """
    total = sum(xs)
    count = len(xs)
    return total / count
def variance(xs):
    """Return the population variance of ``xs``.

    The sum of squared deviations from the mean is divided by ``len(xs)``
    (not ``len(xs) - 1``).
    """
    center = mean(xs)
    squared_deviations = [(value - center) ** 2 for value in xs]
    return sum(squared_deviations) / len(xs)
def stddev(xs):
    """Return the population standard deviation of ``xs``.

    This is the square root of ``variance(xs)``.
    """
    spread = variance(xs)
    return math.sqrt(spread)
# Print mean, population variance and standard deviation for each profile.
for name, data in (("A", profile_a), ("B", profile_b)):
    print(
        name,
        "mean=", round(mean(data), 3),
        "var=", round(variance(data), 4),
        "std=", round(stddev(data), 4),
    )
In [ ]:
Copied!
def stability_decision(std, good=0.20, watch=0.35):
    """Translate a standard deviation into a stability decision.

    Returns "go" when ``std`` is at most ``good``, "monitor" when it is at
    most ``watch``, and "hold" otherwise. The thresholds are inclusive and
    are checked in that order.
    """
    return "go" if std <= good else ("monitor" if std <= watch else "hold")
# Print the stability decision for each profile, based on its standard deviation.
for name, data in (("A", profile_a), ("B", profile_b)):
    s = stddev(data)
    decision = stability_decision(s)
    print(name, "decision:", decision)
def stability_decision(std, good=0.20, watch=0.35):
    """Translate a standard deviation into a stability decision.

    Returns "go" when ``std`` is at most ``good``, "monitor" when it is at
    most ``watch``, and "hold" otherwise. The thresholds are inclusive and
    are checked in that order.
    """
    return "go" if std <= good else ("monitor" if std <= watch else "hold")
# Print the stability decision for each profile, based on its standard deviation.
for name, data in (("A", profile_a), ("B", profile_b)):
    s = stddev(data)
    decision = stability_decision(s)
    print(name, "decision:", decision)
Visualization: stability comparison
The lines compare run-to-run variability between the profiles; the dashed horizontal lines mark each profile's mean. Lower spread around the mean means higher stability.
In [ ]:
Copied!
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Plot both profiles' per-run scores on one figure, with each profile's mean
# drawn as a dashed reference line.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
# 1-based run numbers for the x-axis; the count is taken from profile_a and
# reused for profile_b, so both lists must have the same length.
runs = list(range(1, len(profile_a)+1))
plt.figure(figsize=(9,4))
plt.plot(runs, profile_a, marker='o', label='Profile A')
plt.plot(runs, profile_b, marker='o', label='Profile B')
# Mean of each profile as a faint dashed horizontal line. 'C0'/'C1' are the
# first two colors of the active color cycle, which the two plot() calls above
# consume in order, so each mean line matches its series.
plt.axhline(sum(profile_a)/len(profile_a), linestyle='--', alpha=0.4, color='C0')
plt.axhline(sum(profile_b)/len(profile_b), linestyle='--', alpha=0.4, color='C1')
plt.title('Run-to-run stability comparison')
plt.xlabel('Run')
plt.ylabel('Quality score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): this cell selects the non-interactive 'Agg' backend before
# importing pyplot, so show() will not open a window — confirm the figure is
# captured or saved as intended.
plt.show()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Plot both profiles' per-run scores on one figure, with each profile's mean
# drawn as a dashed reference line.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
# 1-based run numbers for the x-axis; the count is taken from profile_a and
# reused for profile_b, so both lists must have the same length.
runs = list(range(1, len(profile_a)+1))
plt.figure(figsize=(9,4))
plt.plot(runs, profile_a, marker='o', label='Profile A')
plt.plot(runs, profile_b, marker='o', label='Profile B')
# Mean of each profile as a faint dashed horizontal line. 'C0'/'C1' are the
# first two colors of the active color cycle, which the two plot() calls above
# consume in order, so each mean line matches its series.
plt.axhline(sum(profile_a)/len(profile_a), linestyle='--', alpha=0.4, color='C0')
plt.axhline(sum(profile_b)/len(profile_b), linestyle='--', alpha=0.4, color='C1')
plt.title('Run-to-run stability comparison')
plt.xlabel('Run')
plt.ylabel('Quality score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): this cell selects the non-interactive 'Agg' backend before
# importing pyplot, so show() will not open a window — confirm the figure is
# captured or saved as intended.
plt.show()