Lab 09 - Correlation vs Causation
Goal: detect association while avoiding incorrect causal conclusions.
Info: Segmentation breaks false correlations
Always segment by potential confounders (severity, queue, time). A correlation seen in the pooled data often weakens, disappears, or even reverses once the data is stratified.
Info: Policy pilots first
A correlation can justify running a pilot, but only the results of a controlled pilot justify a production policy change. Use controlled experiments; do not act on correlation alone.
In [ ]:
Copied!
import math
def corr(xs, ys):
    """Return the Pearson correlation coefficient of two paired samples.

    Args:
        xs: Numeric observations of the first variable.
        ys: Numeric observations of the second variable, paired with
            ``xs`` by position.

    Returns:
        The sample correlation coefficient as a float.

    Raises:
        ValueError: If the inputs differ in length, contain fewer than
            two pairs, or either input has zero variance; the
            coefficient is undefined in all of these cases.
    """
    # Materialize once: the data is traversed several times below.
    xs, ys = list(xs), list(ys)
    if len(xs) != len(ys):
        # zip() would silently drop the unmatched tail while the means
        # were still computed over the full lists, giving a wrong result.
        raise ValueError('corr requires inputs of equal length')
    if len(xs) < 2:
        raise ValueError('corr requires at least two paired observations')
    mx = sum(xs) / len(xs)
    my = sum(ys) / len(ys)
    num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    den_x = math.sqrt(sum((x - mx) ** 2 for x in xs))
    den_y = math.sqrt(sum((y - my) ** 2 for y in ys))
    den = den_x * den_y
    if den == 0:
        # A constant input has no spread, so there is nothing to correlate.
        raise ValueError('corr is undefined when an input has zero variance')
    return num / den
# Fixed sample data for the lab: the two lists are paired by position,
# i.e. element k of each list belongs to the same observation.
prompt_tokens = [60, 80, 90, 120, 140, 160, 180, 200]
resolution_mins = [35, 42, 45, 58, 70, 78, 90, 98]

# Correlation over the pooled (unsegmented) data, shown to 4 decimals.
print(
    'Overall correlation:',
    round(corr(prompt_tokens, resolution_mins), 4),
)
import math
def corr(xs, ys):
    """Return the Pearson correlation coefficient of two paired samples.

    Args:
        xs: Numeric observations of the first variable.
        ys: Numeric observations of the second variable, paired with
            ``xs`` by position.

    Returns:
        The sample correlation coefficient as a float.

    Raises:
        ValueError: If the inputs differ in length, contain fewer than
            two pairs, or either input has zero variance; the
            coefficient is undefined in all of these cases.
    """
    # Materialize once: the data is traversed several times below.
    xs, ys = list(xs), list(ys)
    if len(xs) != len(ys):
        # zip() would silently drop the unmatched tail while the means
        # were still computed over the full lists, giving a wrong result.
        raise ValueError('corr requires inputs of equal length')
    if len(xs) < 2:
        raise ValueError('corr requires at least two paired observations')
    mx = sum(xs) / len(xs)
    my = sum(ys) / len(ys)
    num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    den_x = math.sqrt(sum((x - mx) ** 2 for x in xs))
    den_y = math.sqrt(sum((y - my) ** 2 for y in ys))
    den = den_x * den_y
    if den == 0:
        # A constant input has no spread, so there is nothing to correlate.
        raise ValueError('corr is undefined when an input has zero variance')
    return num / den
# Fixed sample data for the lab: the two lists are paired by position,
# i.e. element k of each list belongs to the same observation.
prompt_tokens = [60, 80, 90, 120, 140, 160, 180, 200]
resolution_mins = [35, 42, 45, 58, 70, 78, 90, 98]

# Correlation over the pooled (unsegmented) data, shown to 4 decimals.
print(
    'Overall correlation:',
    round(corr(prompt_tokens, resolution_mins), 4),
)
In [ ]:
Copied!
# Severity label per observation, aligned by position with
# prompt_tokens / resolution_mins (the same indices are used for all three).
severity = ['low', 'low', 'med', 'med', 'high', 'high', 'high', 'high']

# Positions belonging to each segment. Note that the "low" segment
# deliberately pools both 'low' and 'med' labels.
low_idx = [
    pos for pos, level in enumerate(severity) if level in ('low', 'med')
]
high_idx = [pos for pos, level in enumerate(severity) if level == 'high']

# Slice both variables with the same positions so pairs stay aligned.
pt_low = [prompt_tokens[pos] for pos in low_idx]
rm_low = [resolution_mins[pos] for pos in low_idx]
pt_high = [prompt_tokens[pos] for pos in high_idx]
rm_high = [resolution_mins[pos] for pos in high_idx]

# Report the within-segment correlations, rounded to 4 decimals.
print(
    'Low/med segment correlation:',
    round(corr(pt_low, rm_low), 4),
)
print(
    'High segment correlation:',
    round(corr(pt_high, rm_high), 4),
)
print('Interpretation: segment before deciding policy changes.')
# Severity label per observation, aligned by position with
# prompt_tokens / resolution_mins (the same indices are used for all three).
severity = ['low', 'low', 'med', 'med', 'high', 'high', 'high', 'high']

# Positions belonging to each segment. Note that the "low" segment
# deliberately pools both 'low' and 'med' labels.
low_idx = [
    pos for pos, level in enumerate(severity) if level in ('low', 'med')
]
high_idx = [pos for pos, level in enumerate(severity) if level == 'high']

# Slice both variables with the same positions so pairs stay aligned.
pt_low = [prompt_tokens[pos] for pos in low_idx]
rm_low = [resolution_mins[pos] for pos in low_idx]
pt_high = [prompt_tokens[pos] for pos in high_idx]
rm_high = [resolution_mins[pos] for pos in high_idx]

# Report the within-segment correlations, rounded to 4 decimals.
print(
    'Low/med segment correlation:',
    round(corr(pt_low, rm_low), 4),
)
print(
    'High segment correlation:',
    round(corr(pt_high, rm_high), 4),
)
print('Interpretation: segment before deciding policy changes.')