Context management strategies
The attention dilution problem
Transformer models attend less reliably to content in the middle of long contexts. This is not a context window size limitation — it is a property of the attention mechanism.
Document position in context vs. attention reliability:
Beginning ████████████ High reliability
Middle ████░░░░░░░░ Lower reliability ("lost in the middle")
End ████████████ High reliability
The exam trap: "Use a larger context window model" is always the wrong answer for attention dilution. A 200K window helps with fitting the content, but does not fix attention quality in the middle.
Per-section pass architecture
def analyze_long_document(
    document: str,
    *,
    model: str = "claude-sonnet-4-6",
    max_tokens: int = 4096,
) -> dict:
    """Analyze a long document one section at a time, then synthesize.

    Each section is sent in its own request so that no content sits in the
    middle of a long context; a second request then integrates the
    per-section analyses.

    Args:
        document: Full document text, split via ``split_into_sections``.
        model: Model identifier used for both passes.
        max_tokens: Output token cap applied to every request. The Messages
            API requires this parameter on each call.

    Returns:
        A dict with two keys: ``"sections"``, a list of dicts holding
        ``section_index``, a short ``content`` reference snippet and the
        ``analysis`` text; and ``"synthesis"``, the integrated analysis text
        (an empty string when the document yields no sections).
    """
    sections = split_into_sections(document)
    if not sections:
        # Nothing to analyze, so skip the synthesis request entirely.
        return {"sections": [], "synthesis": ""}

    section_analyses = []
    # Pass 1: Each section gets full, focused attention
    for i, section in enumerate(sections):
        analysis = client.messages.create(
            model=model,
            max_tokens=max_tokens,
            system="Analyze this document section thoroughly. Extract all claims, dates, entities, and key facts.",
            messages=[{
                "role": "user",
                "content": f"Section {i+1} of {len(sections)}:\n\n{section}",
            }],
        )
        # Reference snippet: only mark with an ellipsis when text was cut.
        snippet = section if len(section) <= 200 else section[:200] + "..."
        section_analyses.append({
            "section_index": i,
            "content": snippet,
            "analysis": analysis.content[0].text,
        })

    # Pass 2: Integration pass — synthesize all section analyses
    combined = "\n\n---\n\n".join(
        f"Section {a['section_index']+1}:\n{a['analysis']}"
        for a in section_analyses
    )
    synthesis = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system="Synthesize these section analyses into a coherent whole. Identify cross-section themes, contradictions, and the overall narrative arc.",
        messages=[{
            "role": "user",
            "content": f"Synthesize these {len(sections)} section analyses:\n\n" + combined,
        }],
    )
    return {"sections": section_analyses, "synthesis": synthesis.content[0].text}
Rolling summary for long conversations
class ConversationManager:
    """Keep a long conversation compact with a rolling summary.

    Turns are appended verbatim. Once the stored message count reaches
    ``summary_threshold``, everything except the last ``max_live_turns``
    messages is replaced by a summary message pair.

    Choose ``summary_threshold`` comfortably larger than
    ``max_live_turns + 2``; otherwise the list is back at the threshold right
    after compressing and every subsequent turn triggers another summary.
    """

    def __init__(self, max_live_turns: int = 20, summary_threshold: int = 40):
        """Store the limits and start with an empty history.

        Args:
            max_live_turns: Number of most recent messages kept verbatim.
            summary_threshold: Message count at which compression runs.
        """
        self.messages = []
        self.max_live_turns = max_live_turns
        self.summary_threshold = summary_threshold
        # Text of the most recent summary; None until the first compression.
        self.summary = None

    def add_turn(self, role: str, content: str) -> None:
        """Append one message and compress if the threshold is reached."""
        self.messages.append({"role": role, "content": content})
        # Compress when threshold is reached
        if len(self.messages) >= self.summary_threshold:
            self._compress()

    def _compress(self) -> None:
        """Replace all but the last ``max_live_turns`` messages with a summary."""
        # Use an explicit split index: a negative-slice form such as
        # ``messages[:-0]`` is empty, which would summarize nothing and keep
        # everything when max_live_turns is 0.
        split_at = max(len(self.messages) - self.max_live_turns, 0)
        old_messages = self.messages[:split_at]
        recent_messages = self.messages[split_at:]
        if not old_messages:
            # Everything is still within the live window; nothing to fold.
            return
        summary_text = self._summarize(old_messages)
        self.summary = summary_text
        # Replace old messages with compact summary
        self.messages = [
            {"role": "user", "content": f"[Previous conversation summary]\n{summary_text}"},
            {"role": "assistant", "content": "Understood. I'll continue with that context in mind."},
            *recent_messages,
        ]

    def _summarize(self, messages: list) -> str:
        """Return a text summary of ``messages``.

        Default implementation: renders the messages as a ``role: content``
        transcript and asks the model for a summary. Override in a subclass
        to change the summarization strategy.
        """
        transcript = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
        response = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=1024,
            system="Summarize the conversation so far. Preserve decisions, open questions, names, dates, and other key facts.",
            messages=[{"role": "user", "content": transcript}],
        )
        return response.content[0].text

    def get_messages(self) -> list:
        """Return the current message list (summary pair plus live turns)."""
        return self.messages
Prompt caching for repeated prefixes
# Large stable prefix — cache it to save cost and latency.
# The system prompt is passed as a list of content blocks so that
# cache_control can be attached to the block that should be cached.
response = client.messages.create(
    model="claude-sonnet-4-6",
    max_tokens=1024,  # Required by the Messages API on every request
    system=[
        {
            "type": "text",
            "text": LARGE_SYSTEM_PROMPT,  # 50K tokens, reused on every request
            "cache_control": {"type": "ephemeral"},  # Mark for caching
        }
    ],
    # The per-request query stays outside the cached prefix.
    messages=[{"role": "user", "content": user_query}],
)
Cache invalidation warning: Any change to the cached prefix — even adding a timestamp, a version string, or a single space — breaks the cache and forces full re-processing. Keep cached prefixes strictly immutable.