💻 RAG Agent · Development

Codebase Understanding Agent

Index any GitHub repo or local codebase. Ask "How does the auth flow work?" or "Where is user validation done?" Perfect for onboarding to new codebases.

Development · Intermediate · Local · Docker

Quick info

Category: Development

Difficulty: Intermediate

Deploy on: Local

Get the code

Includes install commands in comments

What it does

Understands code structure

Semantic code search

Dependency mapping

Function-level indexing

Stack

Tree-sitter · OpenAI · ChromaDB · FastAPI

Deploy on

✓ Local ✓ Docker

Full source code

Install commands are in the top comments. Copy and run.

# Codebase Understanding RAG Agent
# Indexes any codebase and answers questions about it
# pip install openai chromadb tree-sitter gitpython fastapi uvicorn
import logging
import os
import subprocess
import sys
from pathlib import Path

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from openai import OpenAI

logger = logging.getLogger(__name__)

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_KEY)

# Storage location and model names shared by indexing and querying
CHROMA_PATH = "./codebase_chroma"
EMBEDDING_MODEL = "text-embedding-3-small"
CHAT_MODEL = "gpt-4o"

# File types to index
CODE_EXTENSIONS = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".jsx": "jsx",
    ".java": "java",
    ".go": "golang",
    ".rs": "rust",
    ".cs": "csharp",
    ".cpp": "cpp",
    ".md": "markdown",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
}

IGNORE_DIRS = {".git", "node_modules", "__pycache__", ".venv", "dist", "build", ".next"}

# Files shorter than this many characters are not worth indexing
MIN_FILE_CHARS = 50

# Chunks whose stripped text is shorter than this are dropped
MIN_CHUNK_CHARS = 30

# Substrings that mark a line as a probable definition ("def " also
# matches "async def ", so no separate entry is needed)
DEFINITION_KEYWORDS = ("def ", "class ", "function ", "const ", "export ")

SYSTEM_PROMPT = (
    "You are an expert code reviewer helping developers understand a codebase. "
    "Answer questions based on the provided code snippets. "
    "Be specific — reference file names and line numbers. "
    "Explain the code clearly, describe the architecture, and point to relevant files."
)


def _get_collection(collection_name: str):
    """Open (creating if needed) the persistent collection with OpenAI embeddings."""
    chroma = chromadb.PersistentClient(path=CHROMA_PATH)
    ef = OpenAIEmbeddingFunction(api_key=OPENAI_KEY, model_name=EMBEDDING_MODEL)
    return chroma.get_or_create_collection(collection_name, embedding_function=ef)


# ── INDEXING ──────────────────────────────────────────────────────

def _iter_source_files(repo: Path):
    """Yield files under *repo* with an indexable extension, skipping IGNORE_DIRS."""
    for dirpath, dirnames, filenames in os.walk(repo):
        # Prune in place so the walk never descends into ignored trees.
        # Only directories *below* the repo root are tested, so a repo that
        # itself lives under e.g. ".../build/" is still indexed.
        dirnames[:] = sorted(d for d in dirnames if d not in IGNORE_DIRS)
        for name in sorted(filenames):
            candidate = Path(dirpath) / name
            if candidate.suffix.lower() in CODE_EXTENSIONS and candidate.is_file():
                yield candidate


def _delete_file_chunks(collection, rel_path: str) -> None:
    """Best-effort removal of chunks previously stored for *rel_path*."""
    try:
        existing = collection.get(where={"file_path": rel_path})
        if existing["ids"]:
            collection.delete(ids=existing["ids"])
    except Exception:
        # Deliberately non-fatal: the upsert that follows still refreshes
        # every chunk ID that is written again.
        logger.warning(
            "Could not remove previously indexed chunks for %s", rel_path, exc_info=True
        )


def index_codebase(repo_path: str, collection_name: str = "codebase") -> dict:
    """Index a local codebase into ChromaDB.

    Args:
        repo_path: Directory to index.
        collection_name: Name of the collection to write into.

    Returns:
        ``{"files": <files indexed>, "chunks": <chunks stored>}``.
    """
    collection = _get_collection(collection_name)
    repo = Path(repo_path)
    if not repo.is_dir():
        logger.warning("%s is not a directory; nothing to index", repo_path)

    files_indexed = 0
    chunks_total = 0

    for file_path in _iter_source_files(repo):
        rel_path = str(file_path.relative_to(repo))

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError as exc:
            logger.warning("Skipping unreadable file %s: %s", rel_path, exc)
            continue

        if len(content) < MIN_FILE_CHARS:  # Skip tiny files
            continue

        chunks = chunk_code(content, rel_path, file_path.suffix.lower())
        if not chunks:
            continue

        # Remove existing chunks for this file (for re-indexing); this clears
        # chunk IDs that no longer exist when the file has shrunk.
        _delete_file_chunks(collection, rel_path)

        # upsert rather than add: if the cleanup above failed, the stored
        # content for these IDs is still replaced with the current text.
        collection.upsert(
            documents=[c["content"] for c in chunks],
            metadatas=[c["metadata"] for c in chunks],
            ids=[f"{rel_path}::chunk_{i}" for i in range(len(chunks))],
        )
        files_indexed += 1
        chunks_total += len(chunks)

    return {"files": files_indexed, "chunks": chunks_total}


def chunk_code(
    content: str,
    file_path: str,
    ext: str,
    chunk_size: int = 50,
    overlap: int = 10,
) -> list[dict]:
    """Split code into overlapping line windows.

    Args:
        content: Full text of the file.
        file_path: Path recorded in each chunk's header and metadata.
        ext: Lower-cased file suffix, used to look up the language.
        chunk_size: Number of lines per window.
        overlap: Number of lines shared by consecutive windows.

    Returns:
        A list of ``{"content": ..., "metadata": ...}`` dicts. Line numbers
        in the metadata are 1-based and inclusive.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            in ``[0, chunk_size)``.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be positive and 0 <= overlap < chunk_size")

    lines = content.split("\n")
    # A trailing newline produces a phantom empty last element; drop it so
    # end_line matches the real number of lines in the file.
    if lines[-1] == "":
        lines.pop()

    language = CODE_EXTENSIONS.get(ext, "unknown")
    total = len(lines)
    chunks = []

    for start in range(0, total, chunk_size - overlap):
        window = lines[start:start + chunk_size]
        text = "\n".join(window)
        end_line = start + len(window)

        if len(text.strip()) >= MIN_CHUNK_CHARS:
            # Only the first few lines are inspected for definition keywords
            definitions = [
                line.strip()
                for line in window[:5]
                if any(kw in line for kw in DEFINITION_KEYWORDS)
            ]
            chunks.append({
                "content": f"# File: {file_path}\n# Lines: {start+1}-{end_line}\n\n{text}",
                "metadata": {
                    "file_path": file_path,
                    "language": language,
                    "start_line": start + 1,
                    "end_line": end_line,
                    "has_definition": str(bool(definitions)),
                    "definitions": str(definitions[:2]),
                },
            })

        # Once a window reaches the end of the file, any further window would
        # lie entirely inside this one and only duplicate its content.
        if start + chunk_size >= total:
            break

    return chunks


# ── QUERY ─────────────────────────────────────────────────────────

def query_codebase(question: str, collection_name: str = "codebase", n: int = 6) -> dict:
    """Answer a question using the most relevant indexed chunks.

    Args:
        question: Natural-language question about the code.
        collection_name: Name of the collection to search.
        n: Number of chunks to retrieve as context.

    Returns:
        ``{"answer": <text>, "sources": [<file/lines/language/definitions>, ...]}``.
        ``sources`` is empty when nothing was retrieved.
    """
    collection = _get_collection(collection_name)
    results = collection.query(query_texts=[question], n_results=n)

    documents = results["documents"][0]
    if not documents:
        return {
            "answer": "Could not find relevant code. Try indexing the repository first.",
            "sources": [],
        }

    metadatas = results["metadatas"][0]
    context = "".join(f"\n\n{doc}" for doc in documents)
    sources = [
        {
            "file": meta["file_path"],
            "lines": f"{meta['start_line']}-{meta['end_line']}",
            "language": meta["language"],
            "definitions": meta.get("definitions", ""),
        }
        for meta in metadatas
    ]

    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Code context:\n---\n{context}\n---\n\nQuestion: {question}",
            },
        ],
    )

    # message.content may be None; normalise so callers always get a string
    return {"answer": response.choices[0].message.content or "", "sources": sources}


# ── CLI INTERFACE ─────────────────────────────────────────────────

def main() -> None:
    """Run ``index <path>`` to index a repo, or no arguments for interactive chat."""
    if len(sys.argv) > 1 and sys.argv[1] == "index":
        repo_path = sys.argv[2] if len(sys.argv) > 2 else "."
        print(f"🔍 Indexing: {repo_path}")
        result = index_codebase(repo_path)
        print(f"✅ Indexed {result['files']} files → {result['chunks']} chunks")
        return

    print("💻 Codebase Agent — Ask questions about your code")
    print("Usage: python agent.py index ./my-project (to index)")
    print("Then run without args to start chat\n")

    while True:
        try:
            question = input("❓ > ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the chat without a traceback
            print()
            break

        if not question:
            # Do not send blank input to the embedding API
            continue
        if question.lower() in {"exit", "quit"}:
            break

        result = query_codebase(question)
        print(f"\n💡 {result['answer']}")
        print("\n📁 Sources:")
        for s in result["sources"][:3]:
            print(f" {s['file']} (lines {s['lines']})")


if __name__ == "__main__":
    main()

Codebase Understanding Agent

Quick info

Get the code

What it does

Stack

Deploy on

Full source code

More agents

🧠 Personal Knowledge Base Agent

🏢 Company Docs & Policy Agent

📈 NSE Stock Research RAG Agent