What it does
Indexes PDF annual reports
Live NSE price data
Earnings call transcripts
Competitor comparison
Stack
LangChain · OpenAI · Pinecone · yfinance · Streamlit
Deploy on
✓ Streamlit Cloud (free) ✓ Hugging Face Spaces ✓ Railway
Full source code
Install commands are in the top comments. Copy and run.
# NSE Stock Research RAG Agent
# Stack: LangChain + OpenAI + Pinecone + yfinance + Streamlit
# pip install langchain langchain-community langchain-openai langchain-pinecone \
#     pinecone-client yfinance streamlit pypdf requests beautifulsoup4
import streamlit as st
import yfinance as yf
import os, requests
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferWindowMemory
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from pinecone import Pinecone, ServerlessSpec
from datetime import datetime
import tempfile
# ── INIT ──────────────────────────────────────────────────────────
# Credentials are read from the environment; each is None when the variable is unset.
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_KEY = os.getenv("PINECONE_API_KEY")

# Name of the Pinecone index that every read and write in this app targets.
INDEX_NAME = "stock-research"
@st.cache_resource
def init_pinecone():
    """Return a handle to the ``INDEX_NAME`` Pinecone index, creating it if absent.

    The result is memoised by ``st.cache_resource``, so the existence check
    and any index creation happen once rather than on every Streamlit rerun.
    """
    client = Pinecone(api_key=PINECONE_KEY)

    known_indexes = client.list_indexes().names()
    index_missing = INDEX_NAME not in known_indexes

    if index_missing:
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(
            name=INDEX_NAME,
            # NOTE(review): presumably matches the vector width produced by the
            # OpenAIEmbeddings model used elsewhere — confirm if the model changes.
            dimension=1536,
            metric="cosine",
            spec=serverless,
        )

    return client.Index(INDEX_NAME)
def init_chain():
    """Return this session's conversational retrieval chain, building it on first use.

    The chain owns a ``ConversationBufferWindowMemory``. It is therefore kept
    in ``st.session_state`` (one per browser session) instead of behind
    ``st.cache_resource``: a resource-cached chain would be a single object
    shared by every visitor, so one user's chat history would bleed into
    another user's follow-up questions.

    Returns:
        A ``ConversationalRetrievalChain`` whose ``invoke`` result contains
        ``"answer"`` and ``"source_documents"``.
    """
    session_key = "_stock_research_chain"
    if session_key in st.session_state:
        return st.session_state[session_key]

    # Make sure the index exists before the vector store tries to attach to it.
    # init_pinecone is resource-cached, so this is a no-op after the first call.
    init_pinecone()

    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
    vectorstore = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        openai_api_key=OPENAI_KEY,
    )
    memory = ConversationBufferWindowMemory(
        memory_key="chat_history",
        k=5,  # keep only the five most recent exchanges
        return_messages=True,
        # The chain also returns source documents, so memory is told
        # explicitly which output field to record.
        output_key="answer",
    )
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 6}),
        memory=memory,
        return_source_documents=True,
        verbose=False,
        combine_docs_chain_kwargs={
            "prompt": get_stock_prompt(),
        },
    )

    st.session_state[session_key] = chain
    return chain
def get_stock_prompt():
    """Build the answer-synthesis prompt used by the retrieval chain.

    The prompt casts the model as an Indian-markets equity analyst, restricts
    it to the retrieved context, and asks for citations. It exposes two
    template variables: ``context`` and ``question``.
    """
    from langchain.prompts import PromptTemplate

    analyst_template = """You are a senior equity research analyst specialising in Indian markets (NSE/BSE).
Answer using ONLY the provided context from annual reports and filings.
If data isn't in the context, say "Not found in indexed documents" — never guess.
Always cite: which document, which year, and direct quotes where possible.
Context: {context}
Question: {question}
Analysis:"""

    return PromptTemplate(
        input_variables=["context", "question"],
        template=analyst_template,
    )
# ── LIVE STOCK DATA ────────────────────────────────────────────────
def get_stock_summary(symbol: str) -> dict:
    """Fetch a snapshot of live NSE data for one ticker via yfinance.

    Args:
        symbol: NSE trading symbol such as ``"RELIANCE"``. Surrounding
            whitespace is ignored and an already-present ``.NS`` suffix is
            accepted, so ``"RELIANCE.NS"`` does not become ``"RELIANCE.NS.NS"``.

    Returns:
        A dict with the keys ``name``, ``price``, ``pe_ratio``, ``pb_ratio``,
        ``market_cap_cr`` (market cap divided by 1e7, i.e. in crore),
        ``52w_high``, ``52w_low``, ``revenue_growth``, ``roe``,
        ``analyst_rating`` and ``1y_return`` (percent change between the first
        and last close of the past year, or ``None`` without price history).
        Fields yfinance does not report are ``None``; ``market_cap_cr`` is
        ``0.0`` in that case.
    """
    base = symbol.strip()
    if base.upper().endswith(".NS"):
        base = base[:-3]

    ticker = yf.Ticker(f"{base}.NS")
    info = ticker.info or {}
    hist = ticker.history(period="1y")

    # `.get(key, 0)` still yields None when the key is present with a null
    # value, which would crash the division below; `or 0` covers both cases.
    market_cap = info.get("marketCap") or 0

    one_year_return = None
    if not hist.empty:
        # Drop NaN closes so a gap at either end cannot poison the ratio.
        closes = hist["Close"].dropna()
        if not closes.empty:
            one_year_return = round((closes.iloc[-1] / closes.iloc[0] - 1) * 100, 1)

    return {
        "name": info.get("longName", symbol),
        "price": info.get("currentPrice"),
        "pe_ratio": info.get("trailingPE"),
        "pb_ratio": info.get("priceToBook"),
        "market_cap_cr": round(market_cap / 1e7, 0),
        "52w_high": info.get("fiftyTwoWeekHigh"),
        "52w_low": info.get("fiftyTwoWeekLow"),
        "revenue_growth": info.get("revenueGrowth"),
        "roe": info.get("returnOnEquity"),
        "analyst_rating": info.get("recommendationKey"),
        "1y_return": one_year_return,
    }
# ── INDEX DOCUMENTS ────────────────────────────────────────────────
def index_pdf(uploaded_file):
    """Index an uploaded PDF (annual report, transcript) into Pinecone.

    The upload is spooled to a temporary ``.pdf`` file so ``PyPDFLoader`` can
    open it by path, split into overlapping chunks, tagged with its filename
    and an indexing timestamp, then embedded and written to ``INDEX_NAME``.

    Args:
        uploaded_file: File-like object exposing ``read()`` and ``name``
            (e.g. the value returned by ``st.file_uploader``).

    Returns:
        The number of chunks written; ``0`` when the PDF yields no text.
    """
    # Rewind first: a file-like object that has already been read once
    # would otherwise hand back empty bytes.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp_path = tmp.name
            tmp.write(uploaded_file.read())
        documents = PyPDFLoader(tmp_path).load()
    finally:
        # Remove the temp file even when writing or parsing fails.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " "],
    )
    chunks = splitter.split_documents(documents)
    if not chunks:
        # Nothing extractable (e.g. an image-only scan): skip the embedding
        # and upsert calls entirely.
        return 0

    # One timestamp for the whole upload so its chunks can be grouped later.
    indexed_at = datetime.now().isoformat()
    for chunk in chunks:
        chunk.metadata["source_file"] = uploaded_file.name
        chunk.metadata["indexed_at"] = indexed_at

    # Create the index on first use; the call is resource-cached afterwards.
    init_pinecone()
    embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
    PineconeVectorStore.from_documents(chunks, embeddings, index_name=INDEX_NAME)
    return len(chunks)
# ── STREAMLIT UI ───────────────────────────────────────────────────
# Configure the browser tab and layout, then render the page header.
st.set_page_config(
    page_icon="📈",
    layout="wide",
    page_title="Stock Research Agent",
)
st.title("📈 NSE Stock Research Agent")
st.caption("Ask questions across annual reports, earnings transcripts, and filings")
# Sidebar: Stock live data + PDF upload
with st.sidebar:
    # ── Live quote panel ──
    st.subheader("📊 Live Data")
    symbol = st.text_input("NSE Symbol", "RELIANCE").strip().upper()
    if st.button("Fetch Data"):
        if not symbol:
            st.warning("Enter an NSE symbol first.")
        else:
            try:
                with st.spinner("Fetching..."):
                    data = get_stock_summary(symbol)
            except Exception as exc:
                # UI boundary: show a readable error instead of a traceback when
                # the lookup fails (unknown symbol, network problem, ...).
                st.error(f"Could not fetch data for {symbol}: {exc}")
            else:
                st.metric("Price", f"₹{data['price']:,}" if data['price'] else "N/A")
                st.metric("P/E Ratio", round(data['pe_ratio'], 2) if data['pe_ratio'] else "N/A")
                st.metric("Market Cap", f"₹{data['market_cap_cr']:,.0f} Cr" if data['market_cap_cr'] else "N/A")
                # Compare against None so a genuine 0.0 is shown rather than "N/A".
                st.metric("1Y Return", f"{data['1y_return']}%" if data['1y_return'] is not None else "N/A")
                st.metric("ROE", f"{round(data['roe']*100,1)}%" if data['roe'] is not None else "N/A")

    st.divider()

    # ── Document upload / indexing panel ──
    st.subheader("📄 Index Documents")
    uploaded = st.file_uploader("Upload Annual Report / Transcript PDF", type=["pdf"])
    if uploaded and st.button("Index PDF"):
        try:
            with st.spinner("Indexing..."):
                chunks = index_pdf(uploaded)
        except Exception as exc:
            # UI boundary: report parsing / embedding / Pinecone failures inline.
            st.error(f"Could not index {uploaded.name}: {exc}")
        else:
            st.success(f"✅ Indexed {chunks} chunks from {uploaded.name}")
# Main chat interface: the transcript lives in session state so it survives reruns.
if "messages" not in st.session_state:
    st.session_state["messages"] = []

# Replay every stored turn, with its cited sources when it has any.
for entry in st.session_state["messages"]:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])
        cited = entry.get("sources")
        if not cited:
            continue
        with st.expander(f"📚 {len(cited)} sources"):
            for doc in cited:
                file_name = doc.metadata.get('source_file', 'Unknown')
                page = doc.metadata.get('page', '?')
                st.caption(f"📄 {file_name} — p.{page}")
# Chat input: record the question, run the retrieval chain, render and store the answer.
if prompt := st.chat_input("Ask about any indexed stock documents..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Researching..."):
            chain = init_chain()
            result = chain.invoke({"question": prompt})
        answer = result["answer"]
        sources = result.get("source_documents", [])
        st.markdown(answer)
        if sources:
            with st.expander(f"📚 {len(sources)} sources used"):
                # List every source counted in the header, labelled with the
                # same file/page format the history replay uses. No relevance
                # score is available here, so none is claimed.
                for src in sources:
                    st.caption(f"📄 {src.metadata.get('source_file','Unknown')} — p.{src.metadata.get('page','?')}")

    st.session_state.messages.append({
        "role": "assistant",
        "content": answer,
        "sources": sources,
    })