Learn 🧠 All Concepts (20) 🤖 What is an LLM? 📚 RAG Explained ⚡ AI Agents 💻 Run AI Locally 🇮🇳 AI in India 📖 Learn Tracks 🔧 DevOps Track ⚙️ AI Ops Track 🗺️ AI Engineer Roadmap
Tools 🔧 AI Tools Directory 🔓 Open Source AI ⭐ Top GitHub Repos ✦ Claude Skill Repos 🚀 Ready-to-Deploy Projects
Build 🏗️ Build Hub 🎯 Master Prompts 🧩 RAG Agents 🚀 App Megaprompts
Workflows ⚡ All Workflows (22) 🎥 Text to Video 🎞️ Image to Video 🔊 Text to Speech ♻️ Automation
Resources 🧪 Colab Notebooks ⚙️ n8n Workflows 📈 Algo Trading 💰 Passive Income
🗂️ Browse All Topics About AItheGuru
← RAG agents
💻 RAG Agent · Development

Codebase Understanding Agent

Index any GitHub repo or local codebase. Ask "How does the auth flow work?" or "Where is user validation done?" Perfect for onboarding to new codebases.

Development · Intermediate · Local · Docker

Quick info

Category: Development
Difficulty: Intermediate
Deploy on: Local

Get the code

Includes install commands in comments

What it does

Understands code structure
Semantic code search
Dependency mapping
Function-level indexing

Stack

Tree-sitter · OpenAI · ChromaDB · FastAPI

Deploy on

✓ Local  ✓ Docker

Full source code

Install commands are in the top comments. Copy and run.

# Codebase Understanding RAG Agent
# Indexes any codebase and answers questions about it
# pip install openai chromadb tree-sitter gitpython fastapi uvicorn
import logging
import os
import subprocess
import sys
from pathlib import Path

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from openai import OpenAI

logger = logging.getLogger(__name__)

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_KEY)

# File types to index (suffix -> language label stored in chunk metadata)
CODE_EXTENSIONS = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".tsx": "tsx", ".jsx": "jsx", ".java": "java", ".go": "golang",
    ".rs": "rust", ".cs": "csharp", ".cpp": "cpp", ".md": "markdown",
    ".yaml": "yaml", ".yml": "yaml", ".json": "json"
}

# Directory names that are never indexed, at any depth inside the repo
IGNORE_DIRS = {".git", "node_modules", "__pycache__", ".venv", "dist", "build", ".next"}

# Shared by indexing and querying so both always open the same store
# with the same embedding model.
CHROMA_PATH = "./codebase_chroma"
EMBEDDING_MODEL = "text-embedding-3-small"

# Substrings that mark a line as a probable definition (heuristic only).
_DEFINITION_KEYWORDS = ("def ", "class ", "function ", "const ", "export ", "async def ")


def _get_collection(collection_name: str):
    """Open (creating if needed) the persistent Chroma collection."""
    chroma = chromadb.PersistentClient(path=CHROMA_PATH)
    ef = OpenAIEmbeddingFunction(api_key=OPENAI_KEY, model_name=EMBEDDING_MODEL)
    return chroma.get_or_create_collection(collection_name, embedding_function=ef)


# ── INDEXING ──────────────────────────────────────────────────────
def index_codebase(repo_path: str, collection_name: str = "codebase") -> dict:
    """Index a local codebase into ChromaDB.

    Walks ``repo_path`` recursively, chunks every file whose suffix is in
    ``CODE_EXTENSIONS`` and stores the chunks in ``collection_name``.
    Chunks previously stored for a file are deleted before the new ones
    are added, so the function can be re-run on the same repository.

    Args:
        repo_path: Root directory of the codebase.
        collection_name: Name of the Chroma collection to write to.

    Returns:
        ``{"files": <files indexed>, "chunks": <chunks stored>}``.
    """
    collection = _get_collection(collection_name)
    repo = Path(repo_path)
    files_indexed = 0
    chunks_total = 0

    for file_path in repo.rglob("*"):
        rel_path = file_path.relative_to(repo)

        # Test only the components *below* the repo root: checking the
        # full path would skip every file when the repo itself lives
        # under a directory called e.g. "build" or "dist".
        if any(part in IGNORE_DIRS for part in rel_path.parts):
            continue
        if not file_path.is_file():
            continue

        ext = file_path.suffix.lower()
        if ext not in CODE_EXTENSIONS:
            continue

        try:
            content = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError as err:
            logger.warning("Skipping unreadable file %s: %s", file_path, err)
            continue

        if len(content) < 50:  # Skip tiny files
            continue

        rel_name = str(rel_path)
        chunks = chunk_code(content, rel_name, ext)
        if not chunks:
            continue

        # Remove existing chunks for this file (for re-indexing).
        # Best-effort: a failure here is reported but does not stop indexing.
        try:
            existing = collection.get(where={"file_path": rel_name})
            if existing["ids"]:
                collection.delete(ids=existing["ids"])
        except Exception as err:
            logger.warning("Could not remove old chunks for %s: %s", rel_name, err)

        collection.add(
            documents=[c["content"] for c in chunks],
            metadatas=[c["metadata"] for c in chunks],
            ids=[f"{rel_name}::chunk_{i}" for i in range(len(chunks))],
        )
        files_indexed += 1
        chunks_total += len(chunks)

    return {"files": files_indexed, "chunks": chunks_total}


def chunk_code(content: str, file_path: str, ext: str, *,
               chunk_size: int = 50, overlap: int = 10) -> list[dict]:
    """Split file content into overlapping line windows.

    Args:
        content: Full text of the file.
        file_path: Path recorded in each chunk's header and metadata.
        ext: Lower-cased file suffix, looked up in ``CODE_EXTENSIONS``.
        chunk_size: Maximum number of lines per chunk.
        overlap: Number of lines shared by consecutive chunks.

    Returns:
        A list of ``{"content": str, "metadata": dict}`` items. Line
        numbers in the metadata are 1-based and inclusive.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would not advance the window.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap < chunk_size")

    lines = content.split("\n")
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(lines), step):
        window = lines[start:start + chunk_size]
        text = "\n".join(window)

        if len(text.strip()) >= 30:
            # Heuristic: only the first five lines of the window are
            # inspected for something that looks like a definition.
            definitions = [
                line.strip()
                for line in window[:5]
                if any(kw in line for kw in _DEFINITION_KEYWORDS)
            ]
            end_line = start + len(window)
            chunks.append({
                "content": f"# File: {file_path}\n# Lines: {start+1}-{end_line}\n\n{text}",
                "metadata": {
                    "file_path": file_path,
                    "language": CODE_EXTENSIONS.get(ext, "unknown"),
                    "start_line": start + 1,
                    "end_line": end_line,
                    "has_definition": str(bool(definitions)),
                    "definitions": str(definitions[:2])
                }
            })

        # Once a window reaches the end of the file, any later window
        # would be a pure subset of it; stop instead of storing duplicates.
        if start + chunk_size >= len(lines):
            break

    return chunks


# ── QUERY ─────────────────────────────────────────────────────────
def query_codebase(question: str, collection_name: str = "codebase", n: int = 6) -> dict:
    """Answer a question using the most similar indexed chunks.

    Args:
        question: Natural-language question about the codebase.
        collection_name: Name of the Chroma collection to search.
        n: Number of chunks to retrieve as context.

    Returns:
        ``{"answer": str, "sources": list[dict]}`` where each source has
        the keys ``file``, ``lines``, ``language`` and ``definitions``.
    """
    collection = _get_collection(collection_name)
    results = collection.query(query_texts=[question], n_results=n)

    documents = results["documents"][0]
    if not documents:
        return {
            "answer": "Could not find relevant code. Try indexing the repository first.",
            "sources": []
        }

    context = "".join(f"\n\n{doc}" for doc in documents)
    sources = [
        {
            "file": meta["file_path"],
            "lines": f"{meta['start_line']}-{meta['end_line']}",
            "language": meta["language"],
            "definitions": meta.get("definitions", "")
        }
        for meta in results["metadatas"][0]
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": (
                "You are an expert code reviewer helping developers understand a codebase. "
                "Answer questions based on the provided code snippets. "
                "Be specific — reference file names and line numbers. "
                "Explain the code clearly, describe the architecture, and point to relevant files."
            )},
            {"role": "user", "content": f"Code context:\n---\n{context}\n---\n\nQuestion: {question}"}
        ]
    )

    return {"answer": response.choices[0].message.content, "sources": sources}


# ── CLI INTERFACE ─────────────────────────────────────────────────
def main() -> None:
    """Run the indexer (``index [path]``) or the interactive chat loop."""
    if len(sys.argv) > 1 and sys.argv[1] == "index":
        repo_path = sys.argv[2] if len(sys.argv) > 2 else "."
        print(f"🔍 Indexing: {repo_path}")
        result = index_codebase(repo_path)
        print(f"✅ Indexed {result['files']} files → {result['chunks']} chunks")
        return

    print("💻 Codebase Agent — Ask questions about your code")
    print("Usage: python agent.py index ./my-project (to index)")
    print("Then run without args to start chat\n")

    while True:
        try:
            question = input("❓ > ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session instead of printing a traceback.
            print()
            break

        if not question:
            continue  # do not spend an API call on an empty prompt
        if question.lower() in {"exit", "quit"}:
            break

        result = query_codebase(question)
        print(f"\n💡 {result['answer']}")
        print("\n📁 Sources:")
        for s in result["sources"][:3]:
            print(f" {s['file']} (lines {s['lines']})")


if __name__ == "__main__":
    main()