Lab 08 - Classification and Calibration

Goal: tune thresholds and check reliability of predicted probabilities.

Info: Calibration is trust

If the model says it is 80% confident but is correct only 60% of the time, the model is miscalibrated, and any guardrails that rely on its confidence cannot be trusted. Recalibrate against live data.

Info: Precision-recall trade-off

Favoring high precision escalates uncertain cases for review (safer). Favoring high recall automates more cases (riskier). The business context decides which trade-off is acceptable.

In [ ]:

Copied!





# Toy scored examples: "p" is the model's predicted probability and "y" is the
# 0/1 label it is compared against.
samples = [
    {"p": prob, "y": label}
    for prob, label in (
        (0.92, 1), (0.81, 1), (0.76, 1),
        (0.72, 0), (0.64, 1), (0.59, 0),
        (0.47, 0), (0.42, 1), (0.33, 0),
        (0.21, 0),
    )
]

def metrics(data, threshold):
    """Compute precision, recall and confusion counts at one threshold.

    Each row of ``data`` is a mapping with a score under ``"p"`` and a label
    under ``"y"``. A row is predicted positive when its score is greater than
    or equal to ``threshold``.

    Returns:
        ``(precision, recall, (tp, fp, fn, tn))``. Precision (or recall) is
        ``0.0`` when its denominator is zero.
    """
    counts = {"tp": 0, "fp": 0, "fn": 0, "tn": 0}
    for row in data:
        flagged = bool(row["p"] >= threshold)
        actual = row["y"]
        if flagged and actual == 1:
            outcome = "tp"
        elif flagged and actual == 0:
            outcome = "fp"
        elif not flagged and actual == 1:
            outcome = "fn"
        else:
            # Everything else is tallied as a true negative.
            outcome = "tn"
        counts[outcome] += 1

    tp, fp, fn, tn = counts["tp"], counts["fp"], counts["fn"], counts["tn"]

    predicted_positive = tp + fp
    if predicted_positive:
        precision = tp / predicted_positive
    else:
        precision = 0.0

    actual_positive = tp + fn
    if actual_positive:
        recall = tp / actual_positive
    else:
        recall = 0.0

    return precision, recall, (tp, fp, fn, tn)

# Print precision, recall and the (tp, fp, fn, tn) counts at a few thresholds.
for t in (0.5, 0.7, 0.8):
    precision, recall, confusion = metrics(samples, t)
    print(
        f"threshold {t} "
        f"precision {round(precision, 3)} "
        f"recall {round(recall, 3)} "
        f"confusion {confusion}"
    )
# Toy scored examples: "p" is the model's predicted probability and "y" is the
# 0/1 label it is compared against. Scores and labels are paired by position.
samples = [
    {"p": prob, "y": label}
    for prob, label in zip(
        (0.92, 0.81, 0.76, 0.72, 0.64, 0.59, 0.47, 0.42, 0.33, 0.21),
        (1, 1, 1, 0, 1, 0, 0, 1, 0, 0),
    )
]

def metrics(data, threshold):
    """Return precision, recall and the confusion counts for ``data``.

    A row (mapping with ``"p"`` and ``"y"``) is predicted positive when
    ``row["p"] >= threshold``.

    Returns:
        A tuple ``(precision, recall, (tp, fp, fn, tn))``; a metric whose
        denominator is zero is reported as ``0.0``.
    """
    tp = fp = fn = tn = 0
    for row in data:
        positive = bool(row["p"] >= threshold)
        label = row["y"]
        if positive:
            if label == 1:
                tp += 1
            elif label == 0:
                fp += 1
            else:
                # Labels other than 0/1 fall through to the tn bucket.
                tn += 1
        elif label == 1:
            fn += 1
        else:
            tn += 1

    flagged_total = tp + fp
    positive_total = tp + fn
    precision = tp / flagged_total if flagged_total else 0.0
    recall = tp / positive_total if positive_total else 0.0
    return precision, recall, (tp, fp, fn, tn)

# Print precision, recall and the (tp, fp, fn, tn) counts at a few thresholds.
for t in (0.5, 0.7, 0.8):
    precision, recall, confusion = metrics(samples, t)
    fields = (
        "threshold", t,
        "precision", round(precision, 3),
        "recall", round(recall, 3),
        "confusion", confusion,
    )
    print(*fields)

In [ ]:

Copied!





# Reliability table: group samples into half-open score bins [lo, hi) and
# compare the mean predicted score with the observed rate of y in each bin.
# The last upper edge is 1.01 so a score of exactly 1.0 still lands in a bin
# (the printed label rounds it to 1.0).
bins = [(0.0, 0.4), (0.4, 0.7), (0.7, 1.01)]
print("bin\tcount\tavg_pred\tobserved_rate")
for lo, hi in bins:
    chunk = [r for r in samples if lo <= r["p"] < hi]
    if chunk:  # empty bins are left out of the table
        size = len(chunk)
        avg_pred = sum(r["p"] for r in chunk) / size
        obs = sum(r["y"] for r in chunk) / size
        print(f"[{lo:.1f},{hi:.1f})\t{len(chunk)}\t{avg_pred:.3f}\t{obs:.3f}")
# Reliability table: for each half-open score bin [lo, hi), print how many
# samples fall in it, their mean predicted score, and the observed rate of y.
# The top edge is 1.01 (not 1.0) so that p == 1.0 is not excluded by `< hi`.
bins = [(0.0, 0.4), (0.4, 0.7), (0.7, 1.01)]
print("bin\tcount\tavg_pred\tobserved_rate")
for lo, hi in bins:
    chunk = [r for r in samples if lo <= r["p"] < hi]
    if len(chunk) == 0:
        # Nothing scored in this range; skip rather than divide by zero.
        continue
    total_pred = sum(r["p"] for r in chunk)
    total_pos = sum(r["y"] for r in chunk)
    avg_pred = total_pred / len(chunk)
    obs = total_pos / len(chunk)
    print(f"[{lo:.1f},{hi:.1f})\t{len(chunk)}\t{avg_pred:.3f}\t{obs:.3f}")

Visualization: threshold precision/recall curves

This compares precision and recall at different thresholds so you can choose policy trade-offs.

In [ ]:

Copied!





import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Toy scored examples: 'p' is the model's predicted probability and 'y' is the
# 0/1 label it is compared against.
samples = [
    dict(p=0.92, y=1),
    dict(p=0.81, y=1),
    dict(p=0.76, y=1),
    dict(p=0.72, y=0),
    dict(p=0.64, y=1),
    dict(p=0.59, y=0),
    dict(p=0.47, y=0),
    dict(p=0.42, y=1),
    dict(p=0.33, y=0),
    dict(p=0.21, y=0),
]
def metrics(data, t):
    """Return ``(precision, recall)`` for ``data`` at decision threshold ``t``.

    Each row is a mapping with a score under ``'p'`` and a label under
    ``'y'``; a row counts as predicted positive when ``r['p'] >= t``.

    Both values are always floats. When a denominator is zero (nothing was
    predicted positive, or there are no positive labels) the metric is
    ``0.0`` rather than the integer ``0``, so callers get one numeric type
    regardless of the input.
    """
    tp = fp = fn = 0
    for r in data:
        predicted_positive = r['p'] >= t
        if predicted_positive and r['y'] == 1:
            tp += 1
        elif predicted_positive and r['y'] == 0:
            fp += 1
        elif not predicted_positive and r['y'] == 1:
            fn += 1
        # True negatives are not needed for precision or recall.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return precision, recall

# Evaluate each threshold once, then split the results into the two series
# that are plotted against the threshold values.
thresholds = [0.5, 0.6, 0.7, 0.8]
scores = [metrics(samples, t) for t in thresholds]
prec = [pair[0] for pair in scores]
rec = [pair[1] for pair in scores]

plt.figure(figsize=(8, 4))
for series, label in ((prec, 'Precision'), (rec, 'Recall')):
    plt.plot(thresholds, series, marker='o', label=label)
plt.ylim(0, 1)
plt.title('Precision/Recall vs threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): this cell selects the 'Agg' backend before importing pyplot;
# with a non-interactive backend show() may not display anything - confirm
# whether plt.savefig(...) is what is wanted here.
plt.show()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Toy scored examples: 'p' is the model's predicted probability and 'y' is the
# 0/1 label it is compared against.
samples = [
    {'p': 0.92, 'y': 1},
    {'p': 0.81, 'y': 1},
    {'p': 0.76, 'y': 1},
    {'p': 0.72, 'y': 0},
    {'p': 0.64, 'y': 1},
    {'p': 0.59, 'y': 0},
    {'p': 0.47, 'y': 0},
    {'p': 0.42, 'y': 1},
    {'p': 0.33, 'y': 0},
    {'p': 0.21, 'y': 0},
]
def metrics(data, t):
    """Compute precision and recall of ``data`` at threshold ``t``.

    Rows are mappings with a score under ``'p'`` and a label under ``'y'``.
    A row is treated as a positive prediction when ``r['p'] >= t``.

    Returns:
        ``(precision, recall)`` as floats. A metric whose denominator is zero
        is returned as ``0.0`` (not the integer ``0``) so the return type does
        not depend on the input.
    """
    tp = fp = fn = 0
    for r in data:
        pred = 1 if r['p'] >= t else 0
        if pred == 1 and r['y'] == 1:
            tp += 1
        elif pred == 1 and r['y'] == 0:
            fp += 1
        elif pred == 0 and r['y'] == 1:
            fn += 1
        # Remaining rows (true negatives) do not affect either metric.
    flagged = tp + fp
    positives = tp + fn
    precision = tp / flagged if flagged else 0.0
    recall = tp / positives if positives else 0.0
    return precision, recall

# Build the precision and recall series, one point per threshold.
thresholds = [0.5, 0.6, 0.7, 0.8]
prec = []
rec = []
for t in thresholds:
    point = metrics(samples, t)
    prec.append(point[0])
    rec.append(point[1])

# Draw both series on one axis so the trade-off is visible at a glance.
plt.figure(figsize=(8, 4))
plt.plot(thresholds, prec, marker='o', label='Precision')
plt.plot(thresholds, rec, marker='o', label='Recall')
plt.ylim(0, 1)
plt.title('Precision/Recall vs threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): this cell selects the 'Agg' backend before importing pyplot;
# with a non-interactive backend show() may not display anything - confirm
# whether plt.savefig(...) is what is wanted here.
plt.show()