Lab 08 - Classification and Calibration
Goal: tune thresholds and check reliability of predicted probabilities.
Info: Calibration is trust
If the model reports 80% confidence but is correct only 60% of the time, the guardrails that rely on that confidence are miscalibrated. Recalibrate against live data.
Info: Precision-recall trade-off
Optimizing for high precision escalates uncertain cases (safer); optimizing for high recall automates more of them (riskier). The business context decides which trade-off to accept.
In [ ]:
Copied!
# Toy evaluation set: "p" is the model's predicted probability and "y" is the
# 0/1 label that the prediction is scored against.
samples = [
    {"p": score, "y": label}
    for score, label in (
        (0.92, 1), (0.81, 1), (0.76, 1),
        (0.72, 0), (0.64, 1), (0.59, 0),
        (0.47, 0), (0.42, 1), (0.33, 0),
        (0.21, 0),
    )
]
def metrics(data, threshold):
    """Score thresholded predictions against their labels.

    Each row is predicted positive when ``row["p"] >= threshold`` and
    negative otherwise; the prediction is then compared with ``row["y"]``.

    Returns:
        ``(precision, recall, (tp, fp, fn, tn))``. Precision or recall is
        ``0.0`` when its denominator is zero.
    """
    tp = fp = fn = tn = 0
    for record in data:
        flagged = record["p"] >= threshold
        label = record["y"]
        if flagged:
            if label == 1:
                tp += 1
            elif label == 0:
                fp += 1
            else:
                # Any other label value falls through to the catch-all count.
                tn += 1
        elif label == 1:
            fn += 1
        else:
            tn += 1

    predicted_positive = tp + fp
    actual_positive = tp + fn

    precision = 0.0
    if predicted_positive:
        precision = tp / predicted_positive

    recall = 0.0
    if actual_positive:
        recall = tp / actual_positive

    return precision, recall, (tp, fp, fn, tn)
# Print precision, recall and the (tp, fp, fn, tn) counts at three thresholds.
for t in (0.5, 0.7, 0.8):
    precision, recall, confusion = metrics(samples, t)
    print(
        "threshold", t,
        "precision", round(precision, 3),
        "recall", round(recall, 3),
        "confusion", confusion,
    )

# Scored examples: "p" is the predicted probability, "y" the 0/1 label.
samples = [
    {"p": 0.92, "y": 1},
    {"p": 0.81, "y": 1},
    {"p": 0.76, "y": 1},
    {"p": 0.72, "y": 0},
    {"p": 0.64, "y": 1},
    {"p": 0.59, "y": 0},
    {"p": 0.47, "y": 0},
    {"p": 0.42, "y": 1},
    {"p": 0.33, "y": 0},
    {"p": 0.21, "y": 0},
]
def metrics(data, threshold):
    """Compute precision, recall and confusion counts at ``threshold``.

    A row counts as a positive prediction when its ``"p"`` value is at least
    ``threshold``; the row's ``"y"`` value is the label it is checked against.

    Returns:
        A 3-tuple ``(precision, recall, (tp, fp, fn, tn))``. A ratio whose
        denominator is zero is reported as ``0.0``.
    """
    tp = fp = fn = tn = 0
    for row in data:
        outcome = (1 if row["p"] >= threshold else 0, row["y"])
        if outcome == (1, 1):
            tp += 1
        elif outcome == (1, 0):
            fp += 1
        elif outcome == (0, 1):
            fn += 1
        else:
            # Everything else, including unexpected label values, lands here.
            tn += 1

    if tp + fp:
        precision = tp / (tp + fp)
    else:
        precision = 0.0

    if tp + fn:
        recall = tp / (tp + fn)
    else:
        recall = 0.0

    return precision, recall, (tp, fp, fn, tn)
# Print precision, recall and the (tp, fp, fn, tn) counts at three thresholds.
for t in (0.5, 0.7, 0.8):
    precision, recall, confusion = metrics(samples, t)
    print(
        "threshold", t,
        "precision", round(precision, 3),
        "recall", round(recall, 3),
        "confusion", confusion,
    )
In [ ]:
Copied!
# Reliability table: for each probability bin, compare the mean predicted
# probability with the observed fraction of positive labels.
# The last upper bound is 1.01 so that p == 1.0 still satisfies `p < hi`;
# the printed label formats it with one decimal, i.e. as "1.0".
bins = [(0.0, 0.4), (0.4, 0.7), (0.7, 1.01)]
print("bin\tcount\tavg_pred\tobserved_rate")
for lo, hi in bins:
    chunk = [r for r in samples if r["p"] >= lo and r["p"] < hi]
    if chunk:
        # Empty bins are skipped, which also avoids dividing by zero.
        avg_pred = sum([r["p"] for r in chunk]) / len(chunk)
        obs = sum([r["y"] for r in chunk]) / len(chunk)
        print(f"[{lo:.1f},{hi:.1f})\t{len(chunk)}\t{avg_pred:.3f}\t{obs:.3f}")
# Reliability table: one row per non-empty probability bin, showing the mean
# predicted probability next to the observed positive rate.
# The final bin's upper bound is 1.01 so a prediction of exactly 1.0 is still
# inside the half-open range [lo, hi).
bins = [(0.0, 0.4), (0.4, 0.7), (0.7, 1.01)]
print("bin\tcount\tavg_pred\tobserved_rate")
for lo, hi in bins:
    chunk = [r for r in samples if r["p"] >= lo and r["p"] < hi]
    if chunk:
        # Only bins that contain samples are reported.
        avg_pred = sum([r["p"] for r in chunk]) / len(chunk)
        obs = sum([r["y"] for r in chunk]) / len(chunk)
        print(f"[{lo:.1f},{hi:.1f})\t{len(chunk)}\t{avg_pred:.3f}\t{obs:.3f}")
Visualization: threshold precision/recall curves
This compares precision and recall at different thresholds so you can choose policy trade-offs.
In [ ]:
Copied!
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Scored examples used for the plot: 'p' is the predicted probability and
# 'y' is the 0/1 label.
samples = [
    dict(p=0.92, y=1),
    dict(p=0.81, y=1),
    dict(p=0.76, y=1),
    dict(p=0.72, y=0),
    dict(p=0.64, y=1),
    dict(p=0.59, y=0),
    dict(p=0.47, y=0),
    dict(p=0.42, y=1),
    dict(p=0.33, y=0),
    dict(p=0.21, y=0),
]
def metrics(data, t):
    """Return ``(precision, recall)`` for ``data`` at threshold ``t``.

    A row is predicted positive when ``r['p'] >= t``; the prediction is then
    compared with the row's ``r['y']`` label. True negatives are not needed
    for either ratio, so they are not counted.

    Returns:
        A 2-tuple of floats. A ratio whose denominator is zero is reported as
        ``0.0`` (a float, so the return type does not depend on the data).
    """
    tp = fp = fn = 0
    for r in data:
        pred = 1 if r['p'] >= t else 0
        if pred == 1 and r['y'] == 1:
            tp += 1
        elif pred == 1 and r['y'] == 0:
            fp += 1
        elif pred == 0 and r['y'] == 1:
            fn += 1
    # Fall back to 0.0 rather than int 0 so both results are always floats.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return precision, recall
# Plot precision and recall against the decision threshold.
thresholds = [0.5, 0.6, 0.7, 0.8]
# Evaluate each threshold once and split the results, instead of calling
# metrics() separately for precision and for recall.
scores = [metrics(samples, t) for t in thresholds]
prec = [score[0] for score in scores]
rec = [score[1] for score in scores]

plt.figure(figsize=(8, 4))
plt.plot(thresholds, prec, marker='o', label='Precision')
plt.plot(thresholds, rec, marker='o', label='Recall')
plt.ylim(0, 1)
plt.title('Precision/Recall vs threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): the cell selects the non-interactive 'Agg' backend before
# importing pyplot, so show() is not expected to open a window - confirm
# whether the figure should be saved or displayed another way.
plt.show()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
# Scored examples used for the plot: 'p' is the predicted probability and
# 'y' is the 0/1 label, paired position by position.
samples = [
    {'p': prob, 'y': label}
    for prob, label in zip(
        (0.92, 0.81, 0.76, 0.72, 0.64, 0.59, 0.47, 0.42, 0.33, 0.21),
        (1, 1, 1, 0, 1, 0, 0, 1, 0, 0),
    )
]
def metrics(data, t):
    """Return ``(precision, recall)`` for ``data`` at threshold ``t``.

    A row is predicted positive when ``r['p'] >= t``; the prediction is then
    compared with the row's ``r['y']`` label. True negatives are not needed
    for either ratio, so they are not counted.

    Returns:
        A 2-tuple of floats. A ratio whose denominator is zero is reported as
        ``0.0`` (a float, so the return type does not depend on the data).
    """
    tp = fp = fn = 0
    for r in data:
        pred = 1 if r['p'] >= t else 0
        if pred == 1 and r['y'] == 1:
            tp += 1
        elif pred == 1 and r['y'] == 0:
            fp += 1
        elif pred == 0 and r['y'] == 1:
            fn += 1
    # Fall back to 0.0 rather than int 0 so both results are always floats.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return precision, recall
# Plot precision and recall against the decision threshold.
thresholds = [0.5, 0.6, 0.7, 0.8]
# Evaluate each threshold once and split the results, instead of calling
# metrics() separately for precision and for recall.
scores = [metrics(samples, t) for t in thresholds]
prec = [score[0] for score in scores]
rec = [score[1] for score in scores]

plt.figure(figsize=(8, 4))
plt.plot(thresholds, prec, marker='o', label='Precision')
plt.plot(thresholds, rec, marker='o', label='Recall')
plt.ylim(0, 1)
plt.title('Precision/Recall vs threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
# NOTE(review): the cell selects the non-interactive 'Agg' backend before
# importing pyplot, so show() is not expected to open a window - confirm
# whether the figure should be saved or displayed another way.
plt.show()