RAG / Retrieval
Full RAG pipeline
Embedding retrieval → rerank re-scoring → chat answering. Answering over 10k docs costs ~$0.06 this way, versus ~$5+ if the raw text were stuffed directly into the prompt.
python
from nexevo_ai import Nexevo

client = Nexevo()  # reads NEXEVO_API_KEY env by default

# ── 1. Offline one-time indexing ──
# Embedding model MUST be pinned (vector DB dimension lock) — bge-m3 is 1024-dim, multilingual
all_docs = ["...full text of doc 1...", "...full text of doc 2...", "..."]  # 10000 docs

# Embed in batches rather than one 10k-document request: embedding APIs cap
# inputs per request, and small batches are cheap to retry if one call fails.
# `vectors[i]` still corresponds to `all_docs[i]` — order is preserved.
BATCH_SIZE = 512  # adjust to the provider's documented per-request limit
vectors = []
for start in range(0, len(all_docs), BATCH_SIZE):
    batch = all_docs[start:start + BATCH_SIZE]
    emb_resp = client.embeddings.create(model="bge-m3", input=batch)
    vectors.extend(d["embedding"] for d in emb_resp["data"])
# (vector store code omitted — use Pinecone / Milvus / pgvector)

# ── 2. On user query (per-request) ──
question = "How to reset employee VPN?"

# 2a) embed query → vector store top-50 retrieval (same bge-m3 as index → matching dim)
q_emb = client.embeddings.create(model="bge-m3", input=question)
top50 = vector_db.query(q_emb["data"][0]["embedding"], k=50)

# 2b) rerank top-50 → top-5 most relevant — auto-routed (backend picks best reranker per query)
# `r["index"]` points back into the `documents` list we sent, i.e. into `top50`.
ranked = client.rerank.create(
    model="nexevo-auto",
    query=question,
    documents=[d.text for d in top50],
    top_n=5,
)["results"]
top5 = [top50[r["index"]] for r in ranked]

# 2c) chat answers using top-5 — system prompt pins the model to the retrieved context
context = "\n\n".join(d.text for d in top5)
ans = client.chat.completions.create(
    model="nexevo-auto",
    messages=[
        {"role": "system",
         "content": f"Answer based on the following materials (use only these, do not fabricate):\n\n{context}"},
        {"role": "user", "content": question},
    ],
)
print(ans.choices[0].message.content)