Generation: LLM Integration

The final step: using the LLM with retrieved context to generate answers.

Basic Generation

from openai import OpenAI

def generate_answer(query, context, model="gpt-3.5-turbo"):
    """Generate answer using LLM."""

    client = OpenAI(api_key="your-key")  # or omit and rely on the OPENAI_API_KEY environment variable

    prompt = f"""
Use the following context to answer the question.
If you don't know, say so.

Context:
{context}

Question: {query}

Answer:
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=500
    )

    return response.choices[0].message.content
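
Several snippets below call a shorter generate(...) helper instead of generate_answer. It is not defined on this page; a minimal sketch, assuming it wraps the same chat.completions call and accepts either a prompt string or a prepared message list, could look like this:

from openai import OpenAI

def generate(prompt, model="gpt-3.5-turbo"):
    """Assumed helper used by the examples below: send a prompt string or message list to the chat API."""
    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    if isinstance(prompt, str):
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
    else:
        messages = prompt  # already a list of chat messages (see Multi-Turn Conversation)

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.7,
        max_tokens=500,
    )
    return response.choices[0].message.content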

With Citations

Track which sources were used:

def generate_with_citations(query, retrieved_results, model="gpt-3.5-turbo"):
    """Generate answer with source citations."""

    # Build context with source markers
    context = "\n\n".join([
        f"[Source {i+1}]: {result.text}"
        for i, result in enumerate(retrieved_results)
    ])

    prompt = f"""
Answer the question using the provided sources.
Cite sources by referring to [Source N] when relevant.
If the answer is not in the sources, say you don't know.

{context}

Question: {query}

Answer:
"""

    response = generate(prompt, model)

    # Extract source references
    citations = extract_citations(response, len(retrieved_results))

    return {
        'answer': response,
        'sources': [retrieved_results[i] for i in citations],
        'confidence': calculate_confidence(citations, len(retrieved_results))
    }
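
extract_citations and calculate_confidence are not defined above. One way to implement them is sketched below; the regex-based marker parsing and the simple coverage ratio are illustrative assumptions, not a fixed recipe:

import re

def extract_citations(answer, num_sources):
    """Find which [Source N] markers the model actually cited."""
    cited = set()
    for match in re.findall(r"\[Source (\d+)\]", answer):
        index = int(match) - 1  # markers are 1-based, result indices are 0-based
        if 0 <= index < num_sources:
            cited.add(index)
    return sorted(cited)

def calculate_confidence(citations, num_sources):
    """Crude confidence: fraction of retrieved sources the answer cited."""
    if num_sources == 0:
        return 0.0
    return len(citations) / num_sources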

Streaming Response

For better UX, stream LLM output:

def generate_streaming(query, context):
    """Stream answer as it's generated."""

    client = OpenAI()

    stream = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": f"{context}\n\n{query}"}
        ],
        stream=True
    )

    for chunk in stream:
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
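
The loop above only prints the chunks; in practice you usually also want the full answer back, for example to store in conversation history or to score. A variant that accumulates the streamed text (same API as above, helper name assumed) could be:

from openai import OpenAI

def generate_streaming_collect(query, context, model="gpt-3.5-turbo"):
    """Stream the answer to the console and also return the full text."""
    client = OpenAI()

    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": f"{context}\n\n{query}"}],
        stream=True,
    )

    parts = []
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
            parts.append(delta)
    return "".join(parts)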

Handling Long Contexts

When the retrieved context is too long for the model's context window, summarize it first:

def summarize_context(long_context, max_tokens=500):
    """Summarize long context to fit in model."""

    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": f"Summarize this in {max_tokens} tokens:\n{long_context}"
            }
        ],
        max_tokens=max_tokens
    )

    return response.choices[0].message.content
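
To decide whether summarization is needed at all, count tokens before calling the model. A sketch using tiktoken follows; the fit_context name and the 3,000-token budget are illustrative assumptions, not part of the code above:

import tiktoken

def fit_context(context, model="gpt-3.5-turbo", budget=3000):
    """Return the context unchanged if it fits the token budget, otherwise summarize it."""
    encoding = tiktoken.encoding_for_model(model)
    if len(encoding.encode(context)) <= budget:
        return context
    # Too long for the prompt: compress before generation
    return summarize_context(context, max_tokens=500)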

Advanced: Chain-of-Thought

Improve reasoning by asking the LLM to think step by step:

def generate_with_reasoning(query, context):
    """Use chain-of-thought prompting."""

    prompt = f"""
Context:
{context}

Question: {query}

Think step by step:
1. What information from the context is relevant?
2. How does it address the question?
3. What is the final answer?

Answer:
"""

    return generate(prompt)

Multi-Turn Conversation

Maintain conversation history:

class RAGConversation:
    def __init__(self, vector_db):
        self.vector_db = vector_db
        self.history = []

    def ask(self, user_message):
        """Process user message and maintain context."""

        # Retrieve based on latest message
        context = self.retrieve(user_message)

        # Build messages with history
        messages = [
            {"role": "system", "content": "You are a helpful assistant."}
        ]

        # Add conversation history (limit to recent 5 exchanges)
        for user_msg, assistant_msg in self.history[-5:]:
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": assistant_msg})

        # Add current message with context
        messages.append({
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {user_message}"
        })

        # Generate response (generate() receives the full message list here;
        # see the helper sketch in the Basic Generation section)
        response = generate(messages)

        # Store in history
        self.history.append((user_message, response))

        return response

    def retrieve(self, message):
        # Hybrid retrieval logic
        pass
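
The retrieve method above is left as a stub. A minimal dense-only version, assuming the vector_db object exposes a search(query, top_k) method whose results carry a .text attribute (both of which are assumptions for illustration), might look like this:

class SimpleRAGConversation(RAGConversation):
    def retrieve(self, message, top_k=3):
        """Fetch the most relevant chunks for the latest user message."""
        # Assumed interface: vector_db.search(...) returns objects with a .text field
        results = self.vector_db.search(message, top_k=top_k)
        return "\n\n".join(result.text for result in results)

# Usage (requires a populated vector database):
# conversation = SimpleRAGConversation(vector_db)
# print(conversation.ask("What is the refund policy?"))
# print(conversation.ask("Does it cover digital purchases too?"))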

Confidence Scoring

Estimate answer confidence:

def score_confidence(answer, context, retrieved_scores):
    """Estimate how confident the answer is."""

    factors = {
        'source_quality': sum(retrieved_scores) / len(retrieved_scores),  # Avg retrieval score
        'answer_length': min(len(answer.split()) / 100, 1.0),  # Longer = more detail
        'source_count': min(len(retrieved_scores) / 3, 1.0),   # More sources = better
    }

    # Weighted average
    weights = {'source_quality': 0.5, 'answer_length': 0.2, 'source_count': 0.3}
    confidence = sum(factors[k] * weights[k] for k in factors)

    return min(confidence, 1.0)  # Cap at 1.0
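
For example, with three retrieved chunks and a short answer (illustrative numbers only):

confidence = score_confidence(
    answer="Refunds are accepted within 30 days of purchase, per the policy document.",
    context="(retrieved context here)",
    retrieved_scores=[0.82, 0.74, 0.69],  # similarity scores from retrieval
)
print(f"Confidence: {confidence:.2f}")  # ~0.70 with these inputs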

Error Handling

Gracefully handle failures:

from openai import APIError, BadRequestError

def generate_safe(query, context, model="gpt-3.5-turbo"):
    """Generate with fallbacks."""

    try:
        # Try the primary model with the full context
        return generate_answer(query, context, model)

    except BadRequestError:
        # Context likely exceeds the model's window (reported as a 400 error): shrink and retry
        summarized = summarize_context(context, max_tokens=500)
        return generate_answer(query, summarized, model)

    except APIError:
        # API-level failure: fall back to simpler extraction from the context
        return extract_answer_from_context(query, context)

    except Exception:
        # Last resort: return the retrieved context itself
        return f"Unable to process. Here's the relevant information:\n{context}"

Next Step

Evaluation - Measuring system quality