How to Build a RAG App with Cohere Embeddings
By the APIScout Team
Tags: cohere, rag, embeddings, ai, tutorial
How to Build a RAG App with Cohere Embeddings
RAG (Retrieval-Augmented Generation) lets LLMs answer questions about your data without fine-tuning. Embed your documents, store vectors, search for relevant chunks, and pass them as context to the model. Cohere's Embed v3 and Command R+ make this straightforward.
What You'll Build
- Document chunking and embedding pipeline
- Vector storage and similarity search
- RAG query endpoint with citations
- Conversational RAG with chat history
- Reranking for better relevance
Prerequisites: Node.js 18+, Cohere API key (free tier: 100 API calls/min).
1. Setup
npm install cohere-ai
// lib/cohere.ts
import { CohereClient } from 'cohere-ai';
// Singleton Cohere client shared by every module in this app.
// The non-null assertion assumes COHERE_API_KEY is set in the environment;
// if it is missing, requests will fail at call time rather than at startup.
export const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY!,
});
2. Document Chunking
Split documents into chunks that fit in the embedding model's context window.
// lib/chunker.ts
interface Chunk {
  id: string;
  text: string;
  metadata: {
    source: string;      // originating document (e.g. filename)
    chunkIndex: number;  // 0-based position within the document
    totalChunks: number; // total chunks produced for this document
  };
}

/**
 * Split a document into overlapping chunks suitable for embedding.
 *
 * Paragraphs (blank-line separated) are the primary split boundary;
 * consecutive paragraphs are packed together until `chunkSize` characters
 * is reached. A paragraph that alone exceeds `chunkSize` is split on word
 * boundaries so no chunk wildly exceeds the limit. Consecutive chunks share
 * the last `overlap` words so context is not lost at chunk boundaries.
 *
 * @param text    Full document text.
 * @param source  Identifier stored in chunk ids/metadata (e.g. filename).
 * @param options chunkSize — target max characters per chunk (default 500);
 *                overlap — words carried over between chunks (default 50).
 * @returns Chunks in document order with complete metadata.
 */
export function chunkDocument(
  text: string,
  source: string,
  options: {
    chunkSize?: number;
    overlap?: number;
  } = {}
): Chunk[] {
  const { chunkSize = 500, overlap = 50 } = options;
  // Split by paragraphs first; break any paragraph that alone exceeds
  // chunkSize into word-boundary pieces (previously such a paragraph was
  // emitted as a single oversized chunk).
  const paragraphs = text
    .split(/\n\n+/)
    .filter(p => p.trim().length > 0)
    .flatMap(p => splitLongParagraph(p, chunkSize));
  const chunks: Chunk[] = [];
  let currentChunk = '';
  let chunkIndex = 0;
  for (const paragraph of paragraphs) {
    if (currentChunk.length + paragraph.length > chunkSize && currentChunk.length > 0) {
      chunks.push({
        id: `${source}_chunk_${chunkIndex}`,
        text: currentChunk.trim(),
        metadata: { source, chunkIndex, totalChunks: 0 },
      });
      // Keep overlap from the end of the previous chunk.
      // BUG FIX: words.slice(-overlap) with overlap === 0 is slice(-0),
      // which equals slice(0) and returns the WHOLE array — every previous
      // chunk was duplicated into the next one. Guard explicitly.
      const words = currentChunk.split(' ');
      const carried = overlap > 0 ? words.slice(-overlap) : [];
      currentChunk = carried.length > 0
        ? carried.join(' ') + '\n\n' + paragraph
        : paragraph;
      chunkIndex++;
    } else {
      currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
    }
  }
  // Flush the final partial chunk.
  if (currentChunk.trim()) {
    chunks.push({
      id: `${source}_chunk_${chunkIndex}`,
      text: currentChunk.trim(),
      metadata: { source, chunkIndex, totalChunks: 0 },
    });
  }
  // Backfill totalChunks now that the final count is known.
  return chunks.map(c => ({
    ...c,
    metadata: { ...c.metadata, totalChunks: chunks.length },
  }));
}

// Break a single over-long paragraph into word-boundary pieces of at most
// `chunkSize` characters. A single word longer than chunkSize becomes its
// own piece (we never split inside a word).
function splitLongParagraph(paragraph: string, chunkSize: number): string[] {
  if (paragraph.length <= chunkSize) return [paragraph];
  const pieces: string[] = [];
  let piece = '';
  for (const word of paragraph.split(' ')) {
    if (piece.length > 0 && piece.length + 1 + word.length > chunkSize) {
      pieces.push(piece);
      piece = word;
    } else {
      piece = piece ? `${piece} ${word}` : word;
    }
  }
  if (piece) pieces.push(piece);
  return pieces;
}
3. Generate Embeddings
// lib/embeddings.ts
import { cohere } from './cohere';
/**
 * Embed document texts with Cohere Embed v4, batching requests to stay
 * under the API's per-call limit of 96 texts.
 * Returns one float vector per input text, in input order.
 */
export async function embedDocuments(texts: string[]): Promise<number[][]> {
  const BATCH_SIZE = 96; // Cohere embed API maximum texts per request
  const vectors: number[][] = [];
  for (let start = 0; start < texts.length; start += BATCH_SIZE) {
    const { embeddings } = await cohere.v2.embed({
      texts: texts.slice(start, start + BATCH_SIZE),
      model: 'embed-v4.0',
      inputType: 'search_document',
      embeddingTypes: ['float'],
    });
    for (const vec of embeddings.float ?? []) {
      vectors.push(vec);
    }
  }
  return vectors;
}
/**
 * Embed a single search query.
 * Uses inputType 'search_query' (documents use 'search_document') so the
 * model produces retrieval-optimized query vectors.
 */
export async function embedQuery(query: string): Promise<number[]> {
  const { embeddings } = await cohere.v2.embed({
    model: 'embed-v4.0',
    texts: [query],
    inputType: 'search_query', // Different input type for queries
    embeddingTypes: ['float'],
  });
  return embeddings.float![0];
}
4. Vector Store
Simple In-Memory Store
// lib/vector-store.ts
// NOTE(review): this module calls embedDocuments/embedQuery and
// cosineSimilarity — the embeddings helpers must be imported from
// './embeddings' (the import is not shown in this snippet); confirm.
interface StoredDocument {
  id: string;
  text: string;
  embedding: number[];
  metadata: Record<string, any>;
}

/** Minimal in-memory vector store: embeds on insert, brute-force cosine search. */
class VectorStore {
  private documents: StoredDocument[] = [];

  /** Embed `docs` in one pipeline call and append them to the store. */
  async add(docs: { id: string; text: string; metadata: Record<string, any> }[]) {
    const embeddings = await embedDocuments(docs.map(d => d.text));
    docs.forEach((doc, i) => {
      this.documents.push({ ...doc, embedding: embeddings[i] });
    });
  }

  /** Return the `topK` stored documents most similar to `query` (cosine). */
  async search(query: string, topK: number = 5): Promise<StoredDocument[]> {
    const queryEmbedding = await embedQuery(query);
    // Score every stored document, then rank best-first.
    const ranked = this.documents
      .map(doc => ({ ...doc, score: cosineSimilarity(queryEmbedding, doc.embedding) }))
      .sort((a, b) => b.score - a.score);
    return ranked.slice(0, topK);
  }
}
/**
 * Cosine similarity between two equal-length vectors, in [-1, 1].
 * Returns 0 when either vector has zero magnitude — the original divided
 * by zero and produced NaN, which poisons the score sort in search().
 */
function cosineSimilarity(a: number[], b: number[]): number {
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dotProduct / denom;
}
export const vectorStore = new VectorStore();
With PostgreSQL + pgvector (Production)
// lib/vector-store-pg.ts
import { Pool } from 'pg';
// Connection pool for the pgvector-backed store. DATABASE_URL must point at
// a Postgres instance where the `vector` extension can be enabled.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
// Setup (run once)
// One-time setup: enables pgvector and creates the documents table.
// vector(1024) fixes the embedding dimension — NOTE(review): confirm 1024
// matches the output size of the embed-v4.0 configuration used in
// lib/embeddings.ts; inserts with a different dimension will fail.
// The ivfflat index accelerates approximate nearest-neighbour search;
// `lists = 100` is a tuning knob (commonly sized relative to row count).
export async function initVectorStore() {
  await pool.query(`
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE IF NOT EXISTS documents (
id TEXT PRIMARY KEY,
text TEXT NOT NULL,
embedding vector(1024),
metadata JSONB DEFAULT '{}'
);
CREATE INDEX IF NOT EXISTS documents_embedding_idx
ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
`);
}
/**
 * Upsert document rows (id, text, embedding, metadata) into Postgres.
 * Existing ids are overwritten. Each embedding is serialized to pgvector's
 * `[x,y,...]` literal form before binding.
 */
export async function addDocuments(
  docs: { id: string; text: string; embedding: number[]; metadata: Record<string, any> }[]
) {
  const query = `
INSERT INTO documents (id, text, embedding, metadata)
VALUES ($1, $2, $3::vector, $4)
ON CONFLICT (id) DO UPDATE SET
text = EXCLUDED.text,
embedding = EXCLUDED.embedding,
metadata = EXCLUDED.metadata
`;
  // Insert sequentially so a failure identifies the offending document.
  for (const { id, text, embedding, metadata } of docs) {
    const params = [id, text, `[${embedding.join(',')}]`, JSON.stringify(metadata)];
    await pool.query(query, params);
  }
}
/**
 * Nearest-neighbour search over the documents table.
 * `<=>` is pgvector's cosine-distance operator; score = 1 - distance, so
 * larger scores mean more similar.
 */
export async function searchDocuments(queryEmbedding: number[], topK: number = 5) {
  const vectorLiteral = `[${queryEmbedding.join(',')}]`;
  const { rows } = await pool.query(
    `SELECT id, text, metadata, 1 - (embedding <=> $1::vector) as score
FROM documents
ORDER BY embedding <=> $1::vector
LIMIT $2`,
    [vectorLiteral, topK]
  );
  return rows;
}
5. RAG Query
// lib/rag.ts
import { cohere } from './cohere';
import { vectorStore } from './vector-store';
/**
 * Answer `question` with a retrieve → rerank → generate pipeline:
 * vector search finds candidate chunks, Cohere Rerank keeps the best three,
 * and Command R+ writes an answer grounded in those chunks.
 * Returns the answer plus truncated source snippets for display.
 */
export async function ragQuery(question: string): Promise<{
  answer: string;
  sources: { text: string; source: string }[];
}> {
  // 1. Retrieve candidate chunks by embedding similarity.
  const hits = await vectorStore.search(question, 5);

  // 2. Rerank the candidates; keep only the top 3 to reduce prompt noise.
  const reranking = await cohere.v2.rerank({
    model: 'rerank-v3.5',
    query: question,
    documents: hits.map(d => ({ text: d.text })),
    topN: 3,
  });
  const selected = reranking.results.map(r => hits[r.index]);

  // 3. Generate an answer with the selected chunks inlined as context.
  const context = selected
    .map((doc, i) => `[Source ${i + 1}]: ${doc.text}`)
    .join('\n\n');
  const response = await cohere.v2.chat({
    model: 'command-r-plus',
    messages: [
      {
        role: 'system',
        content: `You are a helpful assistant. Answer questions based on the provided context.
If the context doesn't contain the answer, say so. Always cite your sources using [Source N] format.`,
      },
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${question}`,
      },
    ],
  });

  return {
    answer: response.message?.content?.[0]?.text ?? 'No answer generated.',
    sources: selected.map(d => ({
      text: d.text.substring(0, 200) + '...',
      source: d.metadata.source,
    })),
  };
}
6. Cohere Chat with RAG (Built-In)
Cohere's Chat API has built-in RAG with automatic citations:
/**
 * RAG via Cohere's built-in `documents` parameter: the model grounds its
 * answer in the supplied documents and returns automatic citations, so no
 * manual context stuffing or [Source N] prompting is needed.
 */
export async function cohereRAG(question: string) {
  const hits = await vectorStore.search(question, 5);
  // Shape search hits into the documents payload the Chat API expects.
  const documents = hits.map(doc => ({
    id: doc.id,
    data: {
      text: doc.text,
      source: doc.metadata.source,
    },
  }));
  const response = await cohere.v2.chat({
    model: 'command-r-plus',
    messages: [{ role: 'user', content: question }],
    documents,
  });
  return {
    answer: response.message?.content?.[0]?.text,
    citations: response.message?.citations ?? [],
  };
}
7. Conversational RAG
// lib/conversational-rag.ts
import { cohere } from './cohere';
import { vectorStore } from './vector-store';
// A single prior turn in the conversation.
interface Message {
  role: 'user' | 'assistant';
  content: string;
}

/**
 * Conversational RAG: answers a follow-up question in the context of prior
 * chat turns.
 *
 * Pipeline: (1) rewrite the question into a self-contained search query
 * using the chat history, (2) vector-search with the rewritten query,
 * (3) rerank down to the top 3 chunks, (4) generate the final answer with
 * the full history plus the chunks passed via Cohere's `documents` field.
 *
 * @param question    The user's latest (possibly context-dependent) question.
 * @param chatHistory Prior turns, oldest first.
 * @returns The answer text plus truncated source snippets.
 */
export async function conversationalRAG(
  question: string,
  chatHistory: Message[]
): Promise<{ answer: string; sources: any[] }> {
  // 1. Rewrite question with context from chat history, so follow-ups like
  //    "what about its pricing?" become searchable on their own.
  let searchQuery = question;
  if (chatHistory.length > 0) {
    const rewrite = await cohere.v2.chat({
      model: 'command-r-plus',
      messages: [
        {
          role: 'system',
          content: 'Rewrite the user question to be self-contained, incorporating context from chat history. Return only the rewritten question.',
        },
        ...chatHistory.map(m => ({
          role: m.role as 'user' | 'assistant',
          content: m.content,
        })),
        { role: 'user' as const, content: question },
      ],
    });
    // Fall back to the raw question if the model returned no text.
    searchQuery = rewrite.message?.content?.[0]?.text ?? question;
  }
  // 2. Search with the (possibly rewritten) standalone query.
  const relevantDocs = await vectorStore.search(searchQuery, 5);
  // 3. Rerank the candidates and keep the top 3.
  const reranked = await cohere.v2.rerank({
    model: 'rerank-v3.5',
    query: searchQuery,
    documents: relevantDocs.map(d => ({ text: d.text })),
    topN: 3,
  });
  const topDocs = reranked.results.map(r => relevantDocs[r.index]);
  // 4. Generate with the full chat history; documents are passed via the
  //    dedicated field so Cohere can attach citations automatically. Note
  //    the ORIGINAL question goes to the model — the rewrite was only for
  //    retrieval.
  const response = await cohere.v2.chat({
    model: 'command-r-plus',
    messages: [
      {
        role: 'system',
        content: 'Answer based on the provided documents. Cite sources.',
      },
      ...chatHistory.map(m => ({
        role: m.role as 'user' | 'assistant',
        content: m.content,
      })),
      {
        role: 'user',
        content: question,
      },
    ],
    documents: topDocs.map(doc => ({
      id: doc.id,
      data: {
        text: doc.text,
        source: doc.metadata.source,
      },
    })),
  });
  return {
    answer: response.message?.content?.[0]?.text ?? '',
    sources: topDocs.map(d => ({ text: d.text.substring(0, 150), source: d.metadata.source })),
  };
}
8. API Route
// app/api/rag/route.ts
import { NextResponse } from 'next/server';
import { ragQuery } from '@/lib/rag';
/**
 * POST /api/rag — expects JSON body { question: string }.
 * Responds with { answer, sources } on success, or a JSON error object
 * with an appropriate status code.
 */
export async function POST(req: Request) {
  // Malformed JSON previously surfaced as an unhandled 500; reject as 400.
  let body: { question?: unknown };
  try {
    body = await req.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }
  const question = body?.question;
  if (typeof question !== 'string' || question.trim().length === 0) {
    return NextResponse.json({ error: 'Question required' }, { status: 400 });
  }
  try {
    const result = await ragQuery(question);
    return NextResponse.json(result);
  } catch (e: unknown) {
    // Don't leak internals (API keys, stack traces) to the client.
    console.error('RAG query failed:', e);
    return NextResponse.json({ error: 'Internal error' }, { status: 500 });
  }
}
9. Ingestion Script
// scripts/ingest.ts
import { readFileSync, readdirSync } from 'fs';
import { join } from 'path';
import { chunkDocument } from '../lib/chunker';
import { vectorStore } from '../lib/vector-store';
/**
 * Ingest every .md/.txt file in `docsDir`: chunk each document and add the
 * chunks (embedding happens inside vectorStore.add) to the shared store.
 */
async function ingest(docsDir: string) {
  const files = readdirSync(docsDir).filter(f => f.endsWith('.md') || f.endsWith('.txt'));
  console.log(`Found ${files.length} documents`);
  // Sequential on purpose: keeps embedding API usage within rate limits.
  for (const file of files) {
    const contents = readFileSync(join(docsDir, file), 'utf-8');
    const chunks = chunkDocument(contents, file);
    console.log(`  ${file}: ${chunks.length} chunks`);
    await vectorStore.add(chunks);
  }
  console.log('Ingestion complete!');
}

// Entry point: ingest ./docs, surfacing any failure on stderr.
ingest('./docs').catch(console.error);
Cohere Pricing
| Model | Price |
|---|---|
| Embed v4 | $0.10 / 1M tokens |
| Command R+ | $2.50 / 1M input, $10 / 1M output |
| Command R | $0.15 / 1M input, $0.60 / 1M output |
| Rerank v3.5 | $2.00 / 1K searches |
Example cost: 1,000 documents (500 tokens each) embedded + 100 RAG queries/day:
- Embedding: $0.05 (one-time)
- Queries: ~$0.30/day (embed + rerank + generate)
- Total: ~$9/month
Common Mistakes
| Mistake | Impact | Fix |
|---|---|---|
| Chunks too large | Poor retrieval precision | Keep chunks to 200-500 tokens |
| Chunks too small | Missing context | Use overlap between chunks |
| Same input type for docs and queries | Poor search quality | Use search_document and search_query |
| No reranking | Irrelevant results ranked high | Add Cohere Rerank step |
| Stuffing all results into prompt | Noise overwhelms signal | Rerank and use top 3-5 results |
| No chat history rewriting | Follow-up questions fail | Rewrite questions with context |
Building AI-powered search? Compare Cohere vs OpenAI Embeddings vs Voyage AI on APIScout — embedding quality, pricing, and RAG performance.