Context management strategies

The attention dilution problem

Transformer models attend less reliably to content in the middle of a long context than to content near its beginning or end. This is not a context-window-size limitation — the content still fits in the window; it is a property of how attention behaves over long inputs.

Document position in context vs. attention reliability:

Beginning  ████████████  High reliability
Middle     ████░░░░░░░░  Lower reliability ("lost in the middle")
End        ████████████  High reliability

The exam trap: "Use a larger context window model" is always the wrong answer for attention dilution. A 200K window helps with fitting the content, but does not fix attention quality in the middle.

Per-section pass architecture

def analyze_long_document(document: str, max_tokens: int = 4096) -> dict:
    """Analyze a long document section by section, then synthesize the results.

    Pass 1 sends every section to the model in its own request, so each
    section is analyzed in a short context of its own. Pass 2 sends only the
    per-section analyses (not the raw sections) in a single integration
    request.

    Args:
        document: Full text of the document to analyze.
        max_tokens: Output-token cap applied to every model request. The
            Messages API requires this parameter on each call.

    Returns:
        A dict with two keys:
        ``"sections"`` is a list of dicts, one per section, holding
        ``section_index`` (0-based), ``content`` (a reference snippet of at
        most the first 200 characters, suffixed with ``"..."`` only when the
        section was actually truncated) and ``analysis`` (the model's text).
        ``"synthesis"`` is the text returned by the integration pass.
        When the document yields no sections, the list is empty, the
        synthesis is ``""`` and no model request is made.
    """
    sections = split_into_sections(document)
    if not sections:
        # Nothing to analyze: do not issue a synthesis request over nothing.
        return {"sections": [], "synthesis": ""}

    total = len(sections)
    section_analyses = []

    # Pass 1: Each section gets full, focused attention
    for i, section in enumerate(sections):
        analysis = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=max_tokens,
            system="Analyze this document section thoroughly. Extract all claims, dates, entities, and key facts.",
            messages=[{
                "role": "user",
                "content": f"Section {i+1} of {total}:\n\n{section}"
            }]
        )
        # Only mark the snippet as truncated when text was really cut off.
        snippet = section[:200] + "..." if len(section) > 200 else section
        section_analyses.append({
            "section_index": i,
            "content": snippet,  # Reference snippet
            "analysis": analysis.content[0].text
        })

    # Pass 2: Integration pass — synthesize all section analyses
    combined = "\n\n---\n\n".join(
        f"Section {a['section_index']+1}:\n{a['analysis']}"
        for a in section_analyses
    )
    synthesis = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=max_tokens,
        system="Synthesize these section analyses into a coherent whole. Identify cross-section themes, contradictions, and the overall narrative arc.",
        messages=[{
            "role": "user",
            "content": f"Synthesize these {total} section analyses:\n\n" + combined
        }]
    )

    return {"sections": section_analyses, "synthesis": synthesis.content[0].text}

Rolling summary for long conversations

class ConversationManager:
    """Keep a conversation history bounded by using a rolling summary.

    Turns are stored verbatim until ``summary_threshold`` messages have
    accumulated. At that point every message except the most recent
    ``max_live_turns`` is replaced by a two-message summary exchange, so the
    history handed to the model stays short while older context survives in
    condensed form.
    """

    def __init__(self, max_live_turns: int = 20, summary_threshold: int = 40):
        """Create an empty conversation.

        Args:
            max_live_turns: Number of most recent messages kept verbatim
                when the history is compressed.
            summary_threshold: Message count at which compression is
                attempted. After a compression the history holds
                ``max_live_turns + 2`` messages, so a threshold at or below
                that value makes compression run on every new turn.
        """
        self.messages = []
        self.max_live_turns = max_live_turns
        self.summary_threshold = summary_threshold
        self.summary = None  # Text of the most recent summary; None until the first compression

    def add_turn(self, role: str, content: str) -> None:
        """Append one message and compress the history if it is now too long.

        Args:
            role: Message role, e.g. ``"user"`` or ``"assistant"``.
            content: Message text.
        """
        self.messages.append({"role": role, "content": content})

        # Compress when threshold is reached
        if len(self.messages) >= self.summary_threshold:
            self._compress()

    def _compress(self) -> None:
        """Replace all but the last ``max_live_turns`` messages with a summary.

        Does nothing when there are no messages older than the live window,
        so a window at least as large as the history never produces an empty
        summary or grows the history.
        """
        # Use an explicit split index: slicing with ``-max_live_turns`` breaks
        # for 0, because ``[:-0]`` is empty and ``[-0:]`` is the whole list.
        keep = max(self.max_live_turns, 0)
        split = len(self.messages) - keep
        if split <= 0:
            return

        old_messages = self.messages[:split]
        recent_messages = self.messages[split:]

        summary_text = self._summarize(old_messages)
        self.summary = summary_text

        # Replace old messages with compact summary
        self.messages = [
            {"role": "user", "content": f"[Previous conversation summary]\n{summary_text}"},
            {"role": "assistant", "content": "Understood. I'll continue with that context in mind."},
            *recent_messages
        ]

    def _summarize(self, old_messages: list) -> str:
        """Return a model-written summary of ``old_messages``.

        The messages are flattened into a single ``role: content`` transcript
        and sent as one user message. A summary exchange left by an earlier
        compression is part of ``old_messages`` and is therefore folded into
        the new summary.

        Args:
            old_messages: Messages (dicts with ``role`` and ``content``)
                that are about to be dropped from the live history.

        Returns:
            The summary text.
        """
        transcript = "\n\n".join(
            f"{m['role']}: {m['content']}" for m in old_messages
        )
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=1024,
            system="Summarize the conversation so far. Preserve decisions, facts, names, numbers, and open questions.",
            messages=[{"role": "user", "content": transcript}]
        )
        return response.content[0].text

    def get_messages(self) -> list:
        """Return the current message list (summary exchange plus live turns).

        The returned list is the manager's own list, not a copy.
        """
        return self.messages

Prompt caching for repeated prefixes

# Large stable prefix — cache it to save cost and latency.
# The system prompt is passed as a list of content blocks (instead of a plain
# string) so that a cache_control marker can be attached to the block.
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,  # Required by the Messages API; caps the generated output
    system=[
        {
            "type": "text",
            "text": LARGE_SYSTEM_PROMPT,  # 50K tokens, reused on every request
            "cache_control": {"type": "ephemeral"}  # Mark for caching
        }
    ],
    # Only the user query varies between requests; it comes after the cached prefix.
    messages=[{"role": "user", "content": user_query}]
)

Cache invalidation warning: Any change to the cached prefix — even adding a timestamp, a version string, or a single space — breaks the cache and forces full re-processing. Keep cached prefixes strictly immutable.

The attention dilution problem

Per-section pass architecture

Rolling summary for long conversations

Prompt caching for repeated prefixes

Official documentation

The attention dilution problem

Per-section pass architecture

Rolling summary for long conversations

Prompt caching for repeated prefixes

Official documentation