Lab 09 - Correlation vs Causation

Goal: detect association while avoiding incorrect causal conclusions.

Info: Segmentation breaks false correlations

Always segment by potential confounders (severity, queue, time). A correlation seen in the pooled data often disappears or even reverses once the data are stratified.

Info: Policy pilots first

A correlation can suggest what to pilot, but only the results of a pilot justify a production policy change. Use controlled experiments, not correlation alone.

In [ ]:

Copied!





import math

def corr(xs, ys):
    """Return the Pearson correlation coefficient of two paired samples.

    Args:
        xs: Sequence of numbers. Must support ``len()`` and repeated
            iteration (e.g. a list or tuple).
        ys: Sequence of numbers paired element-wise with ``xs``.

    Returns:
        The sample correlation as a float, between -1 and 1 up to
        floating-point rounding.

    Raises:
        ValueError: If ``xs`` and ``ys`` have different lengths.
        ZeroDivisionError: If there are fewer than two pairs, or if either
            series has zero variance, so the coefficient is undefined.
    """
    n = len(xs)
    # zip() silently truncates to the shorter input while the means below
    # would still use the full lists, producing a meaningless coefficient.
    if n != len(ys):
        raise ValueError(
            f"corr() requires equal-length inputs, got {n} and {len(ys)}"
        )
    # The degenerate cases already raised ZeroDivisionError before; keep the
    # exception type but say what actually went wrong.
    if n < 2:
        raise ZeroDivisionError(
            "correlation is undefined for fewer than two paired observations"
        )

    mx = sum(xs) / n
    my = sum(ys) / n
    num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    den_x = math.sqrt(sum((x - mx) ** 2 for x in xs))
    den_y = math.sqrt(sum((y - my) ** 2 for y in ys))
    den = den_x * den_y
    if den == 0:
        raise ZeroDivisionError(
            "correlation is undefined when an input has zero variance"
        )
    return num / den

# Paired samples: corr() matches the two lists up element by element.
prompt_tokens = [60, 80, 90, 120, 140, 160, 180, 200]
resolution_mins = [35, 42, 45, 58, 70, 78, 90, 98]
# Correlation over the pooled data, before any segmentation.
print(
    'Overall correlation:',
    round(corr(prompt_tokens, resolution_mins), 4),
)
import math

def corr(xs, ys):
    """Return the Pearson correlation coefficient of two paired samples.

    Args:
        xs: Sequence of numbers. Must support ``len()`` and repeated
            iteration (e.g. a list or tuple).
        ys: Sequence of numbers paired element-wise with ``xs``.

    Returns:
        The sample correlation as a float, between -1 and 1 up to
        floating-point rounding.

    Raises:
        ValueError: If ``xs`` and ``ys`` have different lengths.
        ZeroDivisionError: If there are fewer than two pairs, or if either
            series has zero variance, so the coefficient is undefined.
    """
    n = len(xs)
    # zip() silently truncates to the shorter input while the means below
    # would still use the full lists, producing a meaningless coefficient.
    if n != len(ys):
        raise ValueError(
            f"corr() requires equal-length inputs, got {n} and {len(ys)}"
        )
    # The degenerate cases already raised ZeroDivisionError before; keep the
    # exception type but say what actually went wrong.
    if n < 2:
        raise ZeroDivisionError(
            "correlation is undefined for fewer than two paired observations"
        )

    mx = sum(xs) / n
    my = sum(ys) / n
    num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    den_x = math.sqrt(sum((x - mx) ** 2 for x in xs))
    den_y = math.sqrt(sum((y - my) ** 2 for y in ys))
    den = den_x * den_y
    if den == 0:
        raise ZeroDivisionError(
            "correlation is undefined when an input has zero variance"
        )
    return num / den

# Paired samples: corr() matches the two lists up element by element.
prompt_tokens = [60, 80, 90, 120, 140, 160, 180, 200]
resolution_mins = [35, 42, 45, 58, 70, 78, 90, 98]
# Correlation over the pooled data, before any segmentation.
print(
    'Overall correlation:',
    round(corr(prompt_tokens, resolution_mins), 4),
)

In [ ]:

Copied!





# One severity label per position; the index lists built from it are used
# below to select matching entries of prompt_tokens and resolution_mins.
severity = ['low', 'low', 'med', 'med', 'high', 'high', 'high', 'high']

# Two strata: low and med pooled together, and high on its own.
low_idx = [pos for pos, level in enumerate(severity) if level in ('low', 'med')]
high_idx = [pos for pos, level in enumerate(severity) if level == 'high']

# Slice both series by stratum.
pt_low = [prompt_tokens[pos] for pos in low_idx]
pt_high = [prompt_tokens[pos] for pos in high_idx]
rm_low = [resolution_mins[pos] for pos in low_idx]
rm_high = [resolution_mins[pos] for pos in high_idx]

# Report the within-stratum correlations for comparison with the overall one.
print(
    'Low/med segment correlation:',
    round(corr(pt_low, rm_low), 4),
)
print(
    'High segment correlation:',
    round(corr(pt_high, rm_high), 4),
)
print('Interpretation: segment before deciding policy changes.')
# One severity label per position; the index lists built from it are used
# below to select matching entries of prompt_tokens and resolution_mins.
severity = ['low', 'low', 'med', 'med', 'high', 'high', 'high', 'high']

# Two strata: low and med pooled together, and high on its own.
low_idx = [pos for pos, level in enumerate(severity) if level in ('low', 'med')]
high_idx = [pos for pos, level in enumerate(severity) if level == 'high']

# Slice both series by stratum.
pt_low = [prompt_tokens[pos] for pos in low_idx]
pt_high = [prompt_tokens[pos] for pos in high_idx]
rm_low = [resolution_mins[pos] for pos in low_idx]
rm_high = [resolution_mins[pos] for pos in high_idx]

# Report the within-stratum correlations for comparison with the overall one.
print(
    'Low/med segment correlation:',
    round(corr(pt_low, rm_low), 4),
)
print(
    'High segment correlation:',
    round(corr(pt_high, rm_high), 4),
)
print('Interpretation: segment before deciding policy changes.')