Lab 05 - Variance and Standard Deviation

Goal: compare run-to-run stability for two prompt profiles.

Info: Stability metric

Wildly swinging quality scores suggest instability (for example data drift, retraining, or prompt changes). Track the standard deviation as closely as the mean.

Info: Profile comparison

Two profiles with equal means but different variances should not be treated as equivalent. The low-variance profile is the safer choice for automation.

In [ ]:

Copied!





import math

# Quality scores for the two prompt profiles, one value per run (seven runs each).
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]

def mean(xs):
    """Return the arithmetic mean of ``xs``.

    Args:
        xs: A sized collection of numbers (it must support ``len``).

    Returns:
        ``sum(xs) / len(xs)``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
    """
    return sum(xs) / len(xs)


def variance(xs, ddof=0):
    """Return the variance of ``xs``.

    Args:
        xs: A sized collection of numbers that can be iterated more than
            once (it is traversed for the mean and again for the squared
            deviations).
        ddof: Delta degrees of freedom; the divisor is ``len(xs) - ddof``.
            The default ``0`` gives the population variance, ``1`` gives
            the sample variance.

    Returns:
        The sum of squared deviations from the mean divided by
        ``len(xs) - ddof``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty (raised by ``mean``).
        ValueError: If ``len(xs) - ddof`` is not positive.
    """
    mu = mean(xs)
    divisor = len(xs) - ddof
    # A non-positive divisor would yield a division by zero or a negative
    # "variance"; reject it explicitly instead.
    if divisor <= 0:
        raise ValueError(
            f"variance needs more than {ddof} data point(s), got {len(xs)}"
        )
    return sum((x - mu) ** 2 for x in xs) / divisor


def stddev(xs, ddof=0):
    """Return the standard deviation of ``xs``.

    Args:
        xs: A sized, re-iterable collection of numbers.
        ddof: Delta degrees of freedom forwarded to ``variance``
            (``0`` = population, ``1`` = sample).

    Returns:
        The square root of ``variance(xs, ddof)``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
        ValueError: If ``len(xs) - ddof`` is not positive.
    """
    return math.sqrt(variance(xs, ddof))

# Print the rounded mean, variance and standard deviation of each profile.
for name, data in zip("AB", (profile_a, profile_b)):
    avg = round(mean(data), 3)
    var = round(variance(data), 4)
    std = round(stddev(data), 4)
    print(name, "mean=", avg, "var=", var, "std=", std)
import math

# Quality scores for the two prompt profiles, one value per run (seven runs each).
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]

def mean(xs):
    """Return the arithmetic mean of ``xs``.

    Args:
        xs: A sized collection of numbers (it must support ``len``).

    Returns:
        ``sum(xs) / len(xs)``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
    """
    return sum(xs) / len(xs)


def variance(xs, ddof=0):
    """Return the variance of ``xs``.

    Args:
        xs: A sized collection of numbers that can be iterated more than
            once (it is traversed for the mean and again for the squared
            deviations).
        ddof: Delta degrees of freedom; the divisor is ``len(xs) - ddof``.
            The default ``0`` gives the population variance, ``1`` gives
            the sample variance.

    Returns:
        The sum of squared deviations from the mean divided by
        ``len(xs) - ddof``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty (raised by ``mean``).
        ValueError: If ``len(xs) - ddof`` is not positive.
    """
    mu = mean(xs)
    divisor = len(xs) - ddof
    # A non-positive divisor would yield a division by zero or a negative
    # "variance"; reject it explicitly instead.
    if divisor <= 0:
        raise ValueError(
            f"variance needs more than {ddof} data point(s), got {len(xs)}"
        )
    return sum((x - mu) ** 2 for x in xs) / divisor


def stddev(xs, ddof=0):
    """Return the standard deviation of ``xs``.

    Args:
        xs: A sized, re-iterable collection of numbers.
        ddof: Delta degrees of freedom forwarded to ``variance``
            (``0`` = population, ``1`` = sample).

    Returns:
        The square root of ``variance(xs, ddof)``.

    Raises:
        ZeroDivisionError: If ``xs`` is empty.
        ValueError: If ``len(xs) - ddof`` is not positive.
    """
    return math.sqrt(variance(xs, ddof))

# Print the rounded mean, variance and standard deviation of each profile.
for name, data in zip("AB", (profile_a, profile_b)):
    avg = round(mean(data), 3)
    var = round(variance(data), 4)
    std = round(stddev(data), 4)
    print(name, "mean=", avg, "var=", var, "std=", std)

In [ ]:

Copied!





def stability_decision(std, good=0.20, watch=0.35):
    """Map a standard deviation onto a rollout decision.

    Args:
        std: The standard deviation to classify.
        good: Inclusive upper bound for the "go" decision.
        watch: Inclusive upper bound for the "monitor" decision.

    Returns:
        "go" if ``std <= good``, otherwise "monitor" if ``std <= watch``,
        otherwise "hold".
    """
    # Thresholds are checked in order; the first ceiling that is not
    # exceeded decides the outcome.
    for ceiling, verdict in ((good, "go"), (watch, "monitor")):
        if std <= ceiling:
            return verdict
    return "hold"

# Classify each profile by its standard deviation and print the decision.
for name, data in zip("AB", (profile_a, profile_b)):
    s = stddev(data)
    verdict = stability_decision(s)
    print(name, "decision:", verdict)
def stability_decision(std, good=0.20, watch=0.35):
    """Map a standard deviation onto a rollout decision.

    Args:
        std: The standard deviation to classify.
        good: Inclusive upper bound for the "go" decision.
        watch: Inclusive upper bound for the "monitor" decision.

    Returns:
        "go" if ``std <= good``, otherwise "monitor" if ``std <= watch``,
        otherwise "hold".
    """
    # Thresholds are checked in order; the first ceiling that is not
    # exceeded decides the outcome.
    for ceiling, verdict in ((good, "go"), (watch, "monitor")):
        if std <= ceiling:
            return verdict
    return "hold"

# Classify each profile by its standard deviation and print the decision.
for name, data in zip("AB", (profile_a, profile_b)):
    s = stddev(data)
    verdict = stability_decision(s)
    print(name, "decision:", verdict)

Visualization: stability comparison

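The solid lines show each profile's quality score on every run, and the dashed horizontal lines mark each profile's mean. The further a solid line swings around its dashed mean, the larger the run-to-run variability; lower spread means higher stability.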

In [ ]:

Copied!





import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Per-run quality scores for the two profiles being compared.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
# 1-based run numbers for the x-axis.
runs = [run for run, _ in enumerate(profile_a, start=1)]

# (legend label, scores, colour of the mean line) for each profile.
series = (
    ("Profile A", profile_a, "C0"),
    ("Profile B", profile_b, "C1"),
)

plt.figure(figsize=(9, 4))

# Solid lines with markers: the score observed on each run.
for label, scores, _ in series:
    plt.plot(runs, scores, marker="o", label=label)

# Dashed horizontal lines: the mean score of each profile.
for _, scores, color in series:
    plt.axhline(sum(scores) / len(scores), linestyle="--", alpha=0.4, color=color)

plt.title("Run-to-run stability comparison")
plt.xlabel("Run")
plt.ylabel("Quality score")
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Per-run quality scores for the two profiles being compared.
profile_a = [4.1, 4.0, 4.2, 4.1, 4.3, 4.2, 4.1]
profile_b = [3.6, 4.8, 3.9, 4.7, 3.7, 4.6, 4.1]
# 1-based run numbers for the x-axis.
runs = [run for run, _ in enumerate(profile_a, start=1)]

# (legend label, scores, colour of the mean line) for each profile.
series = (
    ("Profile A", profile_a, "C0"),
    ("Profile B", profile_b, "C1"),
)

plt.figure(figsize=(9, 4))

# Solid lines with markers: the score observed on each run.
for label, scores, _ in series:
    plt.plot(runs, scores, marker="o", label=label)

# Dashed horizontal lines: the mean score of each profile.
for _, scores, color in series:
    plt.axhline(sum(scores) / len(scores), linestyle="--", alpha=0.4, color=color)

plt.title("Run-to-run stability comparison")
plt.xlabel("Run")
plt.ylabel("Quality score")
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()