
Retrieval & Augmentation: Building Context

After ingestion, the retrieval phase finds relevant documents and uses them to augment the LLM prompt.

Retrieval Strategy (Recap)

Use the complete hybrid approach you've learned:

def retrieve(query, vector_db, bm25, top_k=10):
    """Hybrid retrieval with filtering."""

    # 1. Extract constraints
    constraints = extract_constraints(query)

    # 2. Dense search (embed the query first; assumes an embedding
    #    helper from the ingestion step)
    query_embedding = embed_query(query)
    dense_results = vector_db.search(
        query_embedding,
        limit=100
    )

    # 3. Sparse search
    sparse_results = bm25.search(query, top_k=100)

    # 4. Combine (hybrid)
    hybrid = combine_results(dense_results, sparse_results)

    # 5. Filter by constraints
    filtered = [r for r in hybrid if passes_filter(r, constraints)]

    # 6. Return top-K
    return filtered[:top_k]
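
The combine_results and passes_filter helpers above are placeholders from earlier lessons. One common way to merge the dense and sparse rankings is reciprocal rank fusion (RRF); a minimal sketch, assuming each result exposes id and score attributes:

def combine_results(dense_results, sparse_results, k=60):
    """Merge two ranked lists with reciprocal rank fusion (RRF).

    Each document's fused score is the sum of 1 / (k + rank) over the
    lists it appears in; k=60 is the commonly used default constant.
    """
    fused_scores = {}
    docs_by_id = {}

    for results in (dense_results, sparse_results):
        for rank, result in enumerate(results):
            fused_scores[result.id] = fused_scores.get(result.id, 0.0) + 1.0 / (k + rank + 1)
            docs_by_id[result.id] = result

    # Return documents ordered by fused score, best first
    ranked_ids = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [docs_by_id[doc_id] for doc_id in ranked_ids]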

Context Assembly

Convert retrieved documents into prompt context:

def assemble_context(retrieved_results, max_context_length=2000):
    """Build context string for LLM."""

    context_parts = []
    total_length = 0

    for i, result in enumerate(retrieved_results):
        # Format: "Source N (confidence: X.XX): [text]"
        formatted = f"Source {i+1} (confidence: {result.score:.2f}):\n{result.text}"

        if total_length + len(formatted) > max_context_length:
            break

        context_parts.append(formatted)
        total_length += len(formatted)

    return "\n\n".join(context_parts)

# Usage
context = assemble_context(top_results)
# Output:
# Source 1 (confidence: 0.95):
# Order #1766 confirmed...
#
# Source 2 (confidence: 0.92):
# Order #1766 tracking...

Prompt Engineering

Build effective prompts that use retrieved context:

class RAGPromptBuilder:
    def build_prompt(self, query, context, system_role="customer service"):
        """Build prompt with context for LLM."""

        if system_role == "customer service":
            return f"""
You are a helpful customer service assistant.
Use the provided documentation to answer customer questions accurately and courteously.
If the information is not in the documentation, say you don't have that information.

Documentation:
{context}

Customer Question: {query}

Answer:
"""

        elif system_role == "order specialist":
            return f"""
You are an order tracking specialist.
Use the order information provided to give accurate status updates.
Always cite the source when providing information.

Order Information:
{context}

Customer Question: {query}

Answer (with source citation):
"""

    def build_chat_prompt(self, conversation_history, context):
        """Build a message list for a multi-turn conversation.

        conversation_history is a list of (user, assistant) pairs,
        with the final entry being the latest user message as a string.
        """

        messages = [
            {
                "role": "system",
                "content": f"""You are a helpful assistant.
Use this context to inform your answers:

{context}"""
            }
        ]

        for user_msg, assistant_msg in conversation_history[:-1]:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

        messages.append({"role": "user", "content": conversation_history[-1]})

        return messages
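
A quick usage sketch for the chat builder, assuming the history format described in the docstring (earlier turns as pairs, the latest user message as a string) and the context string assembled earlier:

builder = RAGPromptBuilder()

history = [
    ("Where is my order?", "Could you share your order number?"),  # earlier turn
    "It's order #1766",                                            # latest user message
]

messages = builder.build_chat_prompt(history, context)
# messages[0] is the system prompt containing the retrieved context;
# the remaining entries alternate user/assistant, ending with the new question.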

Managing Context Window

LLMs have a limited context window (4K-8K tokens is a typical budget), so the retrieved context must fit alongside the prompt:

def optimize_context(query, retrieved_results, max_tokens=2000):
    """Use most relevant context within token limit."""

    # Estimate tokens (rough: 1 token ≈ 4 chars)
    query_tokens = len(query) / 4
    overhead_tokens = 200  # Prompt template, instructions
    available = max_tokens - query_tokens - overhead_tokens

    # Prioritize by score
    sorted_results = sorted(retrieved_results, key=lambda x: x.score, reverse=True)

    context_parts = []
    used_tokens = 0

    for result in sorted_results:
        result_tokens = len(result.text) / 4

        if used_tokens + result_tokens > available:
            break

        context_parts.append(result.text)
        used_tokens += result_tokens

    return "\n\n".join(context_parts)
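
The 1-token-per-4-characters heuristic is deliberately rough. If the tiktoken library is available, counting actual tokens gives a tighter budget; a sketch assuming the cl100k_base encoding:

import tiktoken

def count_tokens(text, encoding_name="cl100k_base"):
    """Count tokens exactly instead of estimating len(text) / 4."""
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

# Drop-in replacement inside optimize_context:
#   query_tokens = count_tokens(query)
#   result_tokens = count_tokens(result.text)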

Retrieval with Follow-up Questions

For complex queries, retrieve multiple times:

def iterative_retrieval(initial_query, vector_db, bm25, max_iterations=3):
    """Refine retrieval through multiple steps."""

    all_results = []
    current_query = initial_query
    seen_ids = set()

    for iteration in range(max_iterations):
        # Retrieve for current query
        results = retrieve(current_query, vector_db, bm25)

        # Deduplicate
        new_results = [r for r in results if r.id not in seen_ids]
        all_results.extend(new_results)
        seen_ids.update(r.id for r in new_results)

        # Generate follow-up question
        # (In real system, LLM would generate this)
        follow_up = generate_follow_up(current_query, results)

        if not follow_up:
            break

        current_query = follow_up

    return all_results
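
The generate_follow_up helper is left undefined above; in a real system an LLM would draft the follow-up from the query and what has been retrieved so far. A minimal sketch using the OpenAI chat API, where the model name and prompt wording are illustrative assumptions:

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def generate_follow_up(query, results):
    """Ask an LLM whether a follow-up query would fill a gap; return it or None."""
    snippets = "\n".join(r.text[:200] for r in results[:3])
    prompt = (
        f"Original question: {query}\n\n"
        f"Retrieved so far:\n{snippets}\n\n"
        "If the retrieved text fully answers the question, reply NONE. "
        "Otherwise reply with ONE short follow-up search query."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # illustrative model choice
        messages=[{"role": "user", "content": prompt}],
    )
    follow_up = response.choices[0].message.content.strip()
    return None if follow_up.upper() == "NONE" else follow_up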

Quality Metrics

Track retrieval quality during augmentation:

def evaluate_retrieval(query, retrieved_results, ground_truth_ids):
    """Measure retrieval quality against a set of known-relevant document IDs."""

    retrieved_ids = {r.id for r in retrieved_results}

    # Recall: how many relevant docs did we find?
    recall = len(retrieved_ids & ground_truth_ids) / len(ground_truth_ids)

    # Precision: how many retrieved docs were relevant?
    precision = len(retrieved_ids & ground_truth_ids) / len(retrieved_ids) if retrieved_ids else 0

    # Reciprocal rank: how early did the first relevant result appear?
    mrr = 0
    for i, result in enumerate(retrieved_results):
        if result.id in ground_truth_ids:
            mrr = 1 / (i + 1)
            break

    return {
        'recall': recall,
        'precision': precision,
        'mrr': mrr,
        'f1': 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    }
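
A small worked example, assuming document IDs are plain strings and using a namedtuple to stand in for retrieval results:

from collections import namedtuple

Result = namedtuple("Result", ["id", "score"])

retrieved = [Result("doc-1", 0.95), Result("doc-4", 0.90), Result("doc-2", 0.85)]
ground_truth = {"doc-1", "doc-2", "doc-3"}

metrics = evaluate_retrieval("order status", retrieved, ground_truth)
# recall    = 2/3 ≈ 0.67  (doc-1 and doc-2 found, doc-3 missed)
# precision = 2/3 ≈ 0.67  (doc-4 was not relevant)
# mrr       = 1.0         (first result is relevant)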

Next Step

Generation - LLM interaction and answer generation