What it does
Daily website scraping with Playwright
Redis deduplication
FAISS semantic search
Company-filtered queries
Source URL attribution
Stack
Python, Playwright, FAISS, OpenAI
Deploy on
✓ Render ✓ Railway (cron)
Full source code
Install commands are in the top comments. Copy and run.
import asyncio
import hashlib
import logging
import os
from pathlib import Path

import redis
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright
# Dedup store shared by all crawl runs. REDIS_URL lets a hosted deployment
# point at a managed Redis instance; when it is unset the client targets a
# local server on the default port and database, as before.
r = redis.Redis.from_url(
    os.environ.get('REDIS_URL', 'redis://localhost:6379/0'),
    decode_responses=True,
)

# Competitor name -> list of pages scraped on every crawl.
COMPETITORS = {
    'CompA': ['https://compa.com/blog'],
    'CompB': ['https://compb.io/changelog'],
}
async def scrape(url, timeout=30000, max_chars=5000):
    """Return the visible text of ``url`` as rendered by headless Chromium.

    Args:
        url: Page to load.
        timeout: Limit, in milliseconds, applied to the navigation and to the
            subsequent wait for network quiet.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        ``document.body.innerText`` of the page, truncated to ``max_chars``
        characters.

    Raises:
        Any Playwright error raised while launching the browser, navigating,
        or evaluating the page. The browser is closed before the error
        propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=timeout)
            try:
                # Give client-rendered content a chance to settle.
                await page.wait_for_load_state('networkidle', timeout=timeout)
            except PlaywrightTimeoutError:
                # Pages that keep connections open never go idle. Navigation
                # already succeeded, so use whatever text has rendered rather
                # than discarding the page.
                pass
            text = await page.evaluate('() => document.body.innerText')
        finally:
            # Close on every path so a failed navigation does not skip it.
            await browser.close()
    return text[:max_chars]
async def crawl():
    """Scrape every competitor URL and add unseen pages to the FAISS index.

    Each URL in ``COMPETITORS`` is fetched with ``scrape`` and fingerprinted.
    Pages whose fingerprint is already recorded in Redis are skipped. The
    remaining pages are embedded and appended to the on-disk index in
    ``comp_db``, which is created on the first run.

    Fingerprints are written to Redis (30-day expiry) only after the index
    has been saved, so a failed embedding or save is retried on the next run
    instead of the page being treated as already indexed.

    A failure on a single URL is logged and the crawl continues. Failures
    while loading, extending, or saving the index propagate to the caller.
    """
    log = logging.getLogger(__name__)
    seen_ttl_seconds = 86400 * 30
    pending = {}  # Redis key -> Document waiting to be indexed

    for company, urls in COMPETITORS.items():
        for url in urls:
            try:
                text = await scrape(url)
                # NOTE(review): only the first 100 characters feed the
                # fingerprint, so a page whose opening text is constant is
                # treated as seen even when later content changes. Left
                # unchanged here to keep existing Redis keys valid.
                uid = hashlib.md5((url + text[:100]).encode()).hexdigest()
                key = f'seen:{uid}'
                if key in pending or r.get(key):
                    continue
            except Exception:
                # Best effort per URL: record the failure and move on.
                log.exception('Skipping %s (%s)', url, company)
                continue
            pending[key] = Document(
                page_content=text,
                metadata={'company': company, 'url': url},
            )

    if not pending:
        return

    docs = list(pending.values())
    embs = OpenAIEmbeddings()
    if (Path('comp_db') / 'index.faiss').exists():
        # Extend the existing index. Errors here must not fall through to a
        # rebuild, which would overwrite the saved index with only new pages.
        # NOTE: load_local unpickles data from disk; comp_db must only ever
        # be written by this program.
        db = FAISS.load_local('comp_db', embs, allow_dangerous_deserialization=True)
        db.add_documents(docs)
    else:
        db = FAISS.from_documents(docs, embs)
    db.save_local('comp_db')

    # Mark pages as seen only once they are safely persisted.
    for key in pending:
        r.setex(key, seen_ttl_seconds, '1')
def query(question, company=None, k=6):
    """Answer ``question`` from the indexed competitor pages.

    Args:
        question: Natural-language question to answer.
        company: Optional company name. When given, only documents whose
            ``company`` metadata equals this value are used.
        k: Maximum number of documents to retrieve.

    Returns:
        A dict with ``'answer'`` (the model's reply text) and ``'sources'``
        (the URL of each retrieved document, in retrieval order).

    Raises:
        Whatever ``FAISS.load_local`` raises when ``comp_db`` is missing or
        unreadable, and any error from the embedding or chat calls.
    """
    # NOTE: load_local unpickles data from disk; comp_db must only ever be
    # written by this program.
    db = FAISS.load_local('comp_db', OpenAIEmbeddings(), allow_dangerous_deserialization=True)

    # Only pass a filter when asked, so the default call is unchanged.
    # NOTE(review): the store filters after fetching candidates, so a filtered
    # search may return fewer than k documents — confirm against library docs.
    search_kwargs = {'filter': {'company': company}} if company is not None else {}
    docs = db.similarity_search(question, k=k, **search_kwargs)

    # Tag each excerpt with its company so the model can attribute claims.
    ctx = '\n---\n'.join(f"[{d.metadata['company']}] {d.page_content[:400]}" for d in docs)
    res = ChatOpenAI(model='gpt-4o', temperature=0).invoke(f'Intel:\n{ctx}\n\nAnswer: {question}')
    return {'answer': res.content, 'sources': [d.metadata['url'] for d in docs]}
# Run one crawl pass only when executed as a script (e.g. from cron), so that
# importing this module to call query() does not launch a crawl.
if __name__ == '__main__':
    asyncio.run(crawl())