What it does
Multi-document comparison
Clause extraction
Risk flagging
Plain-English summaries
Stack
Claude API · LlamaIndex · ChromaDB · Streamlit
Deploy on
✓ Local · ✓ Streamlit Cloud · ✓ Hugging Face Spaces
Full source code
Install commands are in the top comments. Copy and run.
# Legal Document RAG Agent
# Uses Claude for its 200k context window — ideal for long legal documents
# Stack: Anthropic Claude + LlamaIndex + ChromaDB + Streamlit
# pip install anthropic llama-index llama-index-llms-anthropic \
#     llama-index-embeddings-huggingface llama-index-vector-stores-chroma \
#     chromadb streamlit pypdf
import streamlit as st
import os, tempfile
from pathlib import Path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.core.storage.storage_context import StorageContext
from llama_index.llms.anthropic import Anthropic
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
# ── SETUP ─────────────────────────────────────────────────────────
ANTHROPIC_KEY = os.getenv("ANTHROPIC_API_KEY")


# Embeddings come from a local HuggingFace model (free); Claude is the LLM.
@st.cache_resource
def init_components():
    """Configure the global LlamaIndex settings and open the Chroma store.

    Assigns an Anthropic LLM to ``Settings.llm`` and a HuggingFace embedding
    model to ``Settings.embed_model``, then opens (creating it if needed) the
    ``legal_docs`` collection of the on-disk Chroma database at
    ``./legal_chroma`` and wraps it for LlamaIndex.

    Returns:
        A ``(vector_store, storage_context, collection)`` tuple.
    """
    claude = Anthropic(
        model="claude-sonnet-4-5",  # Best for long docs
        api_key=ANTHROPIC_KEY,
        max_tokens=4096,
    )
    Settings.llm = claude

    # Free local embeddings - no API cost
    embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    Settings.embed_model = embedder

    client = chromadb.PersistentClient(path="./legal_chroma")
    legal_collection = client.get_or_create_collection("legal_docs")
    store = ChromaVectorStore(chroma_collection=legal_collection)
    context = StorageContext.from_defaults(vector_store=store)
    return store, context, legal_collection


vector_store, storage_context, collection = init_components()
# ── LEGAL ANALYSIS PROMPTS ─────────────────────────────────────────
# Instructions for the model: one rule per line, closing with the disclaimer.
LEGAL_SYSTEM_PROMPT = "\n".join(
    (
        "You are an expert legal analyst. When analysing documents:",
        "1. Quote exact clauses when answering — use quotation marks",
        "2. Flag potential risks or unusual terms with ⚠️",
        "3. Clearly state if something is NOT found in the documents",
        "4. Compare documents objectively when asked",
        "5. Use plain English — explain legal jargon when you use it",
        "6. Always cite which document and which section your answer is from",
        "IMPORTANT: This is for informational purposes only. "
        "Always recommend consulting a qualified lawyer for legal decisions.",
    )
)

# Canned questions keyed by the label shown on each quick-analysis button.
ANALYSIS_TEMPLATES = {
    "Summary": (
        "Provide a comprehensive summary of this document including: "
        "parties involved, key obligations, important dates, payment terms, "
        "and any unusual clauses."
    ),
    "Risk Analysis": (
        "Identify all potential risks and unfavourable terms in this contract. "
        "Flag anything that could be problematic. "
        "Rate each risk as High/Medium/Low."
    ),
    "Termination Clauses": (
        "Extract and explain all termination clauses, including notice periods, "
        "grounds for termination, and consequences."
    ),
    "IP & Confidentiality": (
        "Extract all clauses related to intellectual property, ownership, "
        "and confidentiality obligations."
    ),
    "Payment Terms": (
        "Extract all payment-related clauses including amounts, schedules, "
        "penalties, and dispute processes."
    ),
}
# ── INDEX DOCUMENTS ────────────────────────────────────────────────
def index_documents(uploaded_files):
    """Stage uploaded files on disk, parse them, and add them to the vector index.

    Args:
        uploaded_files: Iterable of uploaded-file objects exposing ``name`` and
            ``getvalue()`` (Streamlit's ``UploadedFile`` is a ``BytesIO``
            subclass, so it provides both).

    Returns:
        ``(index, n_documents)``: the ``VectorStoreIndex`` built on the shared
        ``storage_context`` and the number of documents the reader produced.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        staging = Path(tmp_dir)
        for position, upload in enumerate(uploaded_files):
            # The name is client-supplied: keep only its final component so it
            # cannot point outside the staging directory.
            safe_name = Path(upload.name).name
            if safe_name in ("", ".", ".."):
                safe_name = f"upload_{position}"
            dest = staging / safe_name
            if dest.exists():
                # Two uploads sharing a name would otherwise overwrite each other.
                dest = staging / f"{dest.stem}_{position}{dest.suffix}"
            # getvalue() returns the full buffer regardless of the stream
            # position; read() yields b"" once the same upload object has
            # already been consumed (e.g. the button is clicked a second time).
            dest.write_bytes(upload.getvalue())

        # Parsing must happen while the temporary directory still exists.
        documents = SimpleDirectoryReader(
            tmp_dir, required_exts=[".pdf", ".txt"]
        ).load_data()

    # NOTE(review): the "uploaded_at" key stores the session's "session_id"
    # value; nothing in the visible code sets "session_id", so this is likely
    # always "" — confirm the intended value.
    session_tag = str(st.session_state.get("session_id", ""))
    for doc in documents:
        doc.metadata["uploaded_at"] = session_tag

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        show_progress=False,
    )
    return index, len(documents)
# ── QUERY ─────────────────────────────────────────────────────────
def query_legal(question: str, index):
    """Answer a question against the index and list the retrieved sources.

    Args:
        question: Natural-language question about the indexed documents.
        index: Index object exposing ``as_query_engine`` (as returned by
            ``index_documents``).

    Returns:
        ``(answer, sources)``: the response rendered as a string, and one dict
        per retrieved node with keys ``"file"``, ``"page"`` and ``"score"``
        (score rounded to 3 decimals, 0 when the node carries no score).
    """
    # Local import: PromptTemplate is not part of the file-level imports.
    from llama_index.core import PromptTemplate

    # NOTE(review): as_query_engine appears to accept a ``system_prompt``
    # keyword without applying it to the query pipeline, so the legal
    # instructions are injected through the QA prompt template instead.
    # The system prompt contains no "{" or "}", so it is safe to embed in a
    # format-style template.
    qa_template = PromptTemplate(
        LEGAL_SYSTEM_PROMPT
        + "\n\n"
        + "Context information is below.\n"
        + "---------------------\n"
        + "{context_str}\n"
        + "---------------------\n"
        + "Given the context information and not prior knowledge, "
        + "answer the query.\n"
        + "Query: {query_str}\n"
        + "Answer: "
    )

    query_engine = index.as_query_engine(
        similarity_top_k=8,
        text_qa_template=qa_template,
    )
    response = query_engine.query(question)

    sources = [
        {
            "file": node.metadata.get("file_name", "Document"),
            "page": node.metadata.get("page_label", "?"),
            "score": round(node.score or 0, 3),
        }
        for node in response.source_nodes
    ]
    return str(response), sources
# ── STREAMLIT UI ───────────────────────────────────────────────────
# NOTE(review): some Streamlit releases require set_page_config to be the first
# Streamlit command in the script. The cached init_components() call earlier in
# the file may emit its spinner before this line runs — confirm against the
# deployed Streamlit version, or move this call to the top of the script.
st.set_page_config(page_title="Legal Document Agent", layout="wide", page_icon="⚖️")
st.title("⚖️ Legal Document Analysis Agent")
st.caption("Upload contracts and legal documents — ask questions in plain English")

# Per-session state: the chat transcript and the most recently built index.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "index" not in st.session_state:
    st.session_state.index = None


def _render_sources(sources):
    """Show the retrieved sources in an expander; render nothing when empty.

    Used for both fresh answers and replayed history so the two views list the
    same entries in the same format.
    """
    if not sources:
        return
    with st.expander(f"📚 {len(sources)} source sections"):
        for s in sources:
            st.caption(f"📄 {s['file']} (p.{s['page']}) — score: {s['score']}")


# Sidebar
with st.sidebar:
    st.subheader("📄 Upload Documents")
    uploads = st.file_uploader(
        "Upload PDF contracts or text files",
        type=["pdf", "txt"],
        accept_multiple_files=True,
    )
    if uploads and st.button("Index Documents", type="primary"):
        with st.spinner("Analysing documents..."):
            idx, count = index_documents(uploads)
            st.session_state.index = idx
        st.success(f"✅ Indexed {len(uploads)} documents ({count} sections)")

    st.divider()
    st.subheader("⚡ Quick Analysis")
    for template_name, template_prompt in ANALYSIS_TEMPLATES.items():
        if st.button(f"📋 {template_name}"):
            # Stash the canned prompt; it is consumed below as if typed in chat.
            st.session_state.pending_query = template_prompt

# Main chat: replay the transcript stored in session state.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])
        _render_sources(msg.get("sources"))

# A typed question takes precedence over a queued quick-analysis prompt.
pending = st.session_state.pop("pending_query", None)
prompt = st.chat_input("Ask about your legal documents...") or pending

if prompt:
    # Explicit None check: do not rely on the index object's truthiness.
    if st.session_state.index is None:
        st.warning("⚠️ Please upload and index documents first using the sidebar.")
    else:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            with st.spinner("Analysing..."):
                answer, sources = query_legal(prompt, st.session_state.index)
            st.markdown(answer)
            _render_sources(sources)

        st.session_state.messages.append(
            {"role": "assistant", "content": answer, "sources": sources}
        )