What it does
Understands code structure
Semantic code search
Dependency mapping
Function-level indexing
Stack
Tree-sitter · OpenAI · ChromaDB · FastAPI
Deploy on
✓ Local ✓ Docker
Full source code
Install commands are in the top comments. Copy and run.
# Codebase Understanding RAG Agent
# Indexes any codebase and answers questions about it
# pip install openai chromadb tree-sitter gitpython fastapi uvicorn
import logging
import os
import subprocess
from pathlib import Path

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from openai import OpenAI
# API key is read from the environment; os.environ.get yields None when
# OPENAI_API_KEY is not set.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
# Module-level OpenAI client, used by query_codebase for chat completions.
client = OpenAI(api_key=OPENAI_KEY)
# Files are indexed only when their lower-cased suffix is a key here; the
# value is the language label recorded in each chunk's metadata.
CODE_EXTENSIONS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".jsx": "jsx",
    ".java": "java",
    ".go": "golang",
    ".rs": "rust",
    ".cs": "csharp",
    ".cpp": "cpp",
    ".md": "markdown",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
}

# Path components that exclude a file from indexing (VCS data, dependency
# folders, caches and build output).
IGNORE_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".venv",
    "dist",
    "build",
    ".next",
}
# ── INDEXING ──────────────────────────────────────────────────────
def index_codebase(repo_path: str, collection_name: str = "codebase") -> dict:
    """Index a local codebase into ChromaDB.

    Walks ``repo_path`` recursively and, for every regular file whose suffix
    is a key of ``CODE_EXTENSIONS`` and whose repo-relative path contains no
    ``IGNORE_DIRS`` component, splits the text with ``chunk_code`` and stores
    the chunks in the persistent collection under ``./codebase_chroma``.
    Chunks previously stored for the same file are removed first so that
    re-indexing replaces them.

    Args:
        repo_path: Root directory of the codebase to index.
        collection_name: Name of the ChromaDB collection to write to.

    Returns:
        ``{"files": <number of files indexed>, "chunks": <chunks written>}``.
    """
    log = logging.getLogger(__name__)

    chroma = chromadb.PersistentClient(path="./codebase_chroma")
    ef = OpenAIEmbeddingFunction(api_key=OPENAI_KEY, model_name="text-embedding-3-small")
    collection = chroma.get_or_create_collection(collection_name, embedding_function=ef)

    repo = Path(repo_path)
    files_indexed = 0
    chunks_total = 0

    for file_path in repo.rglob("*"):
        rel_path = file_path.relative_to(repo)

        # Match ignored names against the path *inside* the repo only.
        # Checking the full path would skip every file when the repo itself
        # lives under a directory called e.g. "build" or "dist".
        if any(part in IGNORE_DIRS for part in rel_path.parts):
            continue
        if not file_path.is_file():
            continue

        ext = file_path.suffix.lower()
        if ext not in CODE_EXTENSIONS:
            continue

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError as exc:
            # Unreadable files are skipped, but no longer silently.
            log.warning("Skipping unreadable file %s: %s", file_path, exc)
            continue

        if len(content) < 50:  # Skip tiny files
            continue

        rel_name = str(rel_path)
        chunks = chunk_code(content, rel_name, ext)
        if not chunks:
            continue

        batch_ids = [f"{rel_name}::chunk_{i}" for i in range(len(chunks))]

        # Remove existing chunks for this file (for re-indexing). This is
        # best-effort: a failure is logged rather than aborting the run.
        try:
            existing = collection.get(where={"file_path": rel_name})
            if existing["ids"]:
                collection.delete(ids=existing["ids"])
        except Exception:
            log.warning("Could not remove stale chunks for %s", rel_name, exc_info=True)

        # upsert (rather than add) so that IDs which survived a failed
        # delete are overwritten with the fresh content.
        collection.upsert(
            documents=[c["content"] for c in chunks],
            metadatas=[c["metadata"] for c in chunks],
            ids=batch_ids,
        )
        files_indexed += 1
        chunks_total += len(chunks)

    return {"files": files_indexed, "chunks": chunks_total}
def chunk_code(
    content: str,
    file_path: str,
    ext: str,
    chunk_size: int = 50,
    overlap: int = 10,
) -> list[dict]:
    """Split code into overlapping line windows with per-chunk metadata.

    The text is cut into windows of ``chunk_size`` lines that advance by
    ``chunk_size - overlap`` lines. Windows whose stripped text is shorter
    than 30 characters are dropped. Each chunk's content is prefixed with a
    ``# File`` / ``# Lines`` header, and its metadata records the file path,
    language label, 1-based inclusive line range, and up to two
    definition-like lines found among the window's first five lines.

    Args:
        content: Full text of the file.
        file_path: Path label stored in the header and metadata.
        ext: File suffix used to look up the language in ``CODE_EXTENSIONS``.
        chunk_size: Number of lines per window.
        overlap: Number of lines shared by consecutive windows.

    Returns:
        A list of ``{"content": str, "metadata": dict}`` entries.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            in ``[0, chunk_size)``.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    # "async def " needs no separate entry: "def " already matches it.
    definition_keywords = ("def ", "class ", "function ", "const ", "export ")

    lines = content.split("\n")
    # A terminal newline yields a trailing empty string that is not a real
    # line; dropping it keeps end_line within the actual file.
    if lines and lines[-1] == "":
        lines.pop()

    total = len(lines)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, total, step):
        window = lines[start:start + chunk_size]
        end_line = start + len(window)
        text = "\n".join(window)

        if len(text.strip()) >= 30:
            # Only the first five lines of the window are inspected.
            definitions = [
                line.strip()
                for line in window[:5]
                if any(kw in line for kw in definition_keywords)
            ]
            chunks.append({
                "content": f"# File: {file_path}\n# Lines: {start + 1}-{end_line}\n\n{text}",
                "metadata": {
                    "file_path": file_path,
                    "language": CODE_EXTENSIONS.get(ext, "unknown"),
                    "start_line": start + 1,
                    "end_line": end_line,
                    "has_definition": str(bool(definitions)),
                    "definitions": str(definitions[:2]),
                },
            })

        # Once a window reaches the end of the file, any further window
        # would lie entirely inside it and only duplicate content.
        if end_line >= total:
            break

    return chunks
# ── QUERY ─────────────────────────────────────────────────────────
def query_codebase(question: str, collection_name: str = "codebase", n: int = 6) -> dict:
    """Answer a question about the indexed codebase.

    Retrieves the ``n`` chunks closest to ``question`` from the persistent
    collection under ``./codebase_chroma``, passes them as context to the
    ``gpt-4o`` chat model, and returns the model's answer together with the
    location of every retrieved chunk.

    Args:
        question: Natural-language question about the code.
        collection_name: Name of the ChromaDB collection to search.
        n: Number of chunks to retrieve.

    Returns:
        ``{"answer": str, "sources": list[dict]}``. Each source has the keys
        ``file``, ``lines``, ``language`` and ``definitions``. ``sources`` is
        empty when the question is blank or nothing was retrieved.
    """
    # A blank question cannot be answered meaningfully; return before
    # making any embedding or chat request.
    if not question or not question.strip():
        return {"answer": "Please enter a question.", "sources": []}

    chroma = chromadb.PersistentClient(path="./codebase_chroma")
    ef = OpenAIEmbeddingFunction(api_key=OPENAI_KEY, model_name="text-embedding-3-small")
    collection = chroma.get_or_create_collection(collection_name, embedding_function=ef)

    results = collection.query(query_texts=[question], n_results=n)
    documents = results["documents"][0]
    if not documents:
        return {"answer": "Could not find relevant code. Try indexing the repository first.", "sources": []}
    metadatas = results["metadatas"][0]

    context = "".join(f"\n\n{doc}" for doc in documents)
    sources = [
        {
            "file": meta["file_path"],
            "lines": f"{meta['start_line']}-{meta['end_line']}",
            "language": meta["language"],
            "definitions": meta.get("definitions", ""),
        }
        for meta in metadatas
    ]

    system_prompt = (
        "You are an expert code reviewer helping developers understand a codebase.\n"
        "Answer questions based on the provided code snippets. Be specific — reference file names and line numbers.\n"
        "Explain the code clearly, describe the architecture, and point to relevant files."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Code context:\n---\n{context}\n---\n\nQuestion: {question}"},
        ],
    )
    # Normalise a missing message body to an empty string so callers always
    # receive a str under "answer".
    answer = response.choices[0].message.content or ""
    return {"answer": answer, "sources": sources}
# ── CLI INTERFACE ─────────────────────────────────────────────────
if __name__ == "__main__":
    # Two modes: `index [path]` builds the index (path defaults to "."),
    # anything else starts an interactive question loop.
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "index":
        repo_path = sys.argv[2] if len(sys.argv) > 2 else "."
        print(f"🔍 Indexing: {repo_path}")
        result = index_codebase(repo_path)
        print(f"✅ Indexed {result['files']} files → {result['chunks']} chunks")
    else:
        print("💻 Codebase Agent — Ask questions about your code")
        print("Usage: python agent.py index ./my-project (to index)")
        print("Then run without args to start chat\n")
        while True:
            try:
                question = input("❓ > ").strip()
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C ends the session without a traceback.
                print()
                break
            if not question:
                # Nothing to ask; prompt again.
                continue
            if question.lower() in ("exit", "quit"):
                break
            result = query_codebase(question)
            print(f"\n💡 {result['answer']}")
            if result["sources"]:
                print("\n📁 Sources:")
                # Show only the top three retrieved locations.
                for s in result["sources"][:3]:
                    print(f" {s['file']} (lines {s['lines']})")