Retrieval-Augmented Generation (RAG) systems enhance large language models (LLMs) by incorporating external knowledge sources, which lets the model generate more accurate, contextually relevant, and grounded responses. Several libraries and frameworks facilitate the implementation of RAG, each with its own strengths and weaknesses, but building a simple system yourself is the best way to understand how the pieces fit together.
Creating a RAG system involves several key steps:
The first step is loading your data and preparing it for indexing. Chunking, breaking large documents into smaller pieces, is the standard technique here, since embedding and generation models can only handle limited amounts of text at once.
import pandas as pd

# Load data (assumes a CSV with a 'text' column)
data = pd.read_csv('your_data.csv')

# Preprocess data (simple character-based chunking)
def chunk_data(text, chunk_size=512):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

# Each row becomes a list of chunks; flatten them into one list for indexing
chunked = data['text'].apply(chunk_data)
chunks = [chunk for chunk_list in chunked for chunk in chunk_list]
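Character-based chunking can cut words and sentences in half and does not map cleanly onto model token limits. If that matters for your data, a token-aware variant is easy to write with a Hugging Face tokenizer; the sketch below is illustrative, with bert-base-uncased and a 512-token window as arbitrary choices:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def chunk_by_tokens(text, max_tokens=512):
    # Tokenize once, then slice the token ids into fixed-size windows
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
    chunks = []
    for i in range(0, len(token_ids), max_tokens):
        chunks.append(tokenizer.decode(token_ids[i:i + max_tokens]))
    return chunks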
The next step is to generate embeddings for the text chunks and index them for efficient retrieval.
from sentence_transformers import SentenceTransformer
# Embedding model for indexing
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Generate embeddings for chunks
embeddings = embedding_model.encode(chunks)
With embeddings in hand, create an index and implement a function that retrieves the most relevant chunks for a given query.
import faiss

# Create a flat L2 index for exact nearest-neighbour search
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

def retrieve_chunks(query, k=5):
    # Embed the query and return the ids of the k closest chunks
    query_embedding = embedding_model.encode([query])
    distances, ids = index.search(query_embedding, k)
    return ids[0]

# Example query
query = "What is the meaning of life?"
retrieved_chunk_ids = retrieve_chunks(query)
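IndexFlatL2 ranks chunks by Euclidean distance. If you would rather rank by cosine similarity, one common option (a sketch, not required by anything above) is to L2-normalize the embeddings and use an inner-product index instead:

# Cosine-similarity retrieval: normalize the vectors, then use inner product
norm_embeddings = embedding_model.encode(chunks, normalize_embeddings=True)
cosine_index = faiss.IndexFlatIP(norm_embeddings.shape[1])
cosine_index.add(norm_embeddings)

def retrieve_chunks_cosine(query, k=5):
    query_embedding = embedding_model.encode([query], normalize_embeddings=True)
    scores, ids = cosine_index.search(query_embedding, k)
    return ids[0]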
Next, integrate a language model that generates a response conditioned on both the retrieved context and the query.
from transformers import pipeline

# Load an open text-generation model; gpt-3.5-turbo is only available through the
# OpenAI API, so gpt2 stands in here as a small placeholder
generator = pipeline('text-generation', model='gpt2')

def generate_response(query, retrieved_chunk_ids):
    # Build a prompt from the retrieved chunks and the user's question
    context = ' '.join(chunks[i] for i in retrieved_chunk_ids)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = generator(prompt, max_new_tokens=200)
    return response[0]['generated_text']

# Generate response
response = generate_response(query, retrieved_chunk_ids)
Finally, rerank candidate responses and post-process them to ensure coherence and accuracy.
from transformers import pipeline

# Placeholder scorer: a sentiment classifier stands in for a real reranker here;
# a query-aware cross-encoder (see the sketch below) is the more usual choice
reranker = pipeline('text-classification', model='distilbert-base-uncased-finetuned-sst-2-english')

def rerank_responses(responses):
    # Each result is a dict like {'label': ..., 'score': ...}; keep the highest-scoring response
    scores = [result['score'] for result in reranker(responses)]
    best_response = responses[scores.index(max(scores))]
    return best_response

# Rerank and post-process the response
final_response = rerank_responses([response])
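A sentiment classifier does not actually measure how well a response answers the question. When multiple candidate responses are available, a query-aware cross-encoder is the more usual reranker; the sketch below uses the sentence-transformers CrossEncoder class, with the ms-marco-MiniLM-L-6-v2 checkpoint chosen purely for illustration:

from sentence_transformers import CrossEncoder

# Cross-encoder that scores how well each candidate answers the query
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def rerank_by_relevance(query, candidates):
    # Score each (query, candidate) pair and keep the best-scoring candidate
    pairs = [(query, candidate) for candidate in candidates]
    scores = cross_encoder.predict(pairs)
    return candidates[int(scores.argmax())]

With several sampled responses, rerank_by_relevance(query, responses) would replace the call to rerank_responses above.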
Here is a simplified example combining these steps:
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss

# Load and preprocess data (assumes a CSV with a 'text' column)
data = pd.read_csv('your_data.csv')

def chunk_data(text, chunk_size=512):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunks.append(text[i:i + chunk_size])
    return chunks

# Flatten the per-row chunk lists so chunk ids line up with the index positions
chunks = [chunk for chunk_list in data['text'].apply(chunk_data) for chunk in chunk_list]

# Generate embeddings and build the FAISS index
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedding_model.encode(chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Define retrieval and response generation functions
def retrieve_chunks(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, ids = index.search(query_embedding, k)
    return ids[0]

generator = pipeline('text-generation', model='gpt2')

def generate_response(query, retrieved_chunk_ids):
    context = ' '.join(chunks[i] for i in retrieved_chunk_ids)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = generator(prompt, max_new_tokens=200)
    return response[0]['generated_text']

# Example usage
query = "What is the meaning of life?"
retrieved_chunk_ids = retrieve_chunks(query)
response = generate_response(query, retrieved_chunk_ids)
print(response)
Here's a basic RAG implementation using mock components:
import numpy as np
from typing import List, Dict, Any

class DocumentStore:
    def __init__(self):
        self.documents = {}
        self.embeddings = {}

    def add_document(self, doc_id: str, text: str, embedding: np.ndarray):
        self.documents[doc_id] = text
        self.embeddings[doc_id] = embedding

    def get_documents(self, doc_ids: List[str]) -> List[str]:
        return [self.documents[doc_id] for doc_id in doc_ids]

class EmbeddingModel:
    def encode(self, text: str) -> np.ndarray:
        # Mock embedding model: random 768-dimensional vectors, so retrieval
        # order is arbitrary; swap in a real encoder for meaningful results
        return np.random.rand(768)

class Retriever:
    def __init__(self, document_store: DocumentStore):
        self.document_store = document_store

    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        query_embedding = EmbeddingModel().encode(query)
        similarities = {}
        for doc_id, doc_embedding in self.document_store.embeddings.items():
            # Cosine similarity between the query and each stored document
            similarity = np.dot(query_embedding, doc_embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
            )
            similarities[doc_id] = similarity
        top_docs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return self.document_store.get_documents([doc_id for doc_id, _ in top_docs])

class Generator:
    def generate(self, query: str, context: List[str]) -> str:
        # Mock generator: echoes the query and the retrieved context
        context_str = " ".join(context)
        return f"Generated response for query '{query}' based on context: {context_str}"

class RAG:
    def __init__(self):
        self.document_store = DocumentStore()
        self.retriever = Retriever(self.document_store)
        self.generator = Generator()

    def add_document(self, doc_id: str, text: str):
        embedding = EmbeddingModel().encode(text)
        self.document_store.add_document(doc_id, text, embedding)

    def query(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        context = self.retriever.retrieve(query, top_k)
        generated_response = self.generator.generate(query, context)
        return {
            "query": query,
            "context": context,
            "response": generated_response
        }

# Example usage
rag = RAG()

# Add some sample documents
rag.add_document("doc1", "The quick brown fox jumps over the lazy dog.")
rag.add_document("doc2", "Python is a versatile programming language.")
rag.add_document("doc3", "Artificial intelligence is transforming many industries.")

# Query the RAG system
result = rag.query("What is Python used for?")
print(result)
Here's a basic retrieval layer for RAG, using Sentence Transformers for embeddings and cosine similarity for ranking (a generation step can be added on top, as sketched after the code):
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Tuple

class SimpleRAGSearch:
    def __init__(self):
        # Initialize the embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.documents: List[Dict] = []
        self.embeddings: List[np.ndarray] = []

    def add_document(self, text: str, metadata: Dict = None):
        """Add a document to the search index."""
        # Generate embedding for the document
        embedding = self.embedding_model.encode([text])[0]
        # Store document and its embedding
        doc = {
            'text': text,
            'metadata': metadata or {},
            'id': len(self.documents)
        }
        self.documents.append(doc)
        self.embeddings.append(embedding)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Dict, float]]:
        """Search for similar documents using cosine similarity."""
        # Generate embedding for the query
        query_embedding = self.embedding_model.encode([query])[0]
        # Calculate similarities against all stored embeddings
        similarities = cosine_similarity(
            [query_embedding],
            self.embeddings
        )[0]
        # Get the top_k results, highest similarity first
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        results = [
            (self.documents[idx], float(similarities[idx]))
            for idx in top_indices
        ]
        return results

    def batch_add_documents(self, texts: List[str], metadata_list: List[Dict] = None):
        """Add multiple documents at once."""
        if metadata_list is None:
            metadata_list = [None] * len(texts)
        embeddings = self.embedding_model.encode(texts)
        for text, embedding, metadata in zip(texts, embeddings, metadata_list):
            doc = {
                'text': text,
                'metadata': metadata or {},
                'id': len(self.documents)
            }
            self.documents.append(doc)
            self.embeddings.append(embedding)

def main():
    # Initialize the search engine
    search_engine = SimpleRAGSearch()

    # Add some sample documents
    documents = [
        "The quick brown fox jumps over the lazy dog.",
        "Machine learning is a subset of artificial intelligence.",
        "Python is a popular programming language.",
        "Natural language processing helps computers understand human language.",
        "Deep learning models require significant computational resources."
    ]
    metadata_list = [
        {'category': 'phrase'},
        {'category': 'tech'},
        {'category': 'programming'},
        {'category': 'nlp'},
        {'category': 'ai'}
    ]
    search_engine.batch_add_documents(documents, metadata_list)

    # Search for a query
    query = "What is Python used for?"
    results = search_engine.search(query, top_k=3)
    print(f"Query: {query}")
    for doc, score in results:
        print(f"Score: {score:.4f}, Document: {doc['text']}, Metadata: {doc['metadata']}")

if __name__ == "__main__":
    main()
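SimpleRAGSearch covers only the retrieval half of RAG. One minimal way to close the loop is to feed its search results into a text-generation model; in the sketch below, the transformers pipeline with gpt2 is a stand-in and the prompt format is an arbitrary choice:

from transformers import pipeline

# Placeholder generator; substitute a stronger instruction-tuned model in practice
generator = pipeline('text-generation', model='gpt2')

def answer_with_context(search_engine: SimpleRAGSearch, query: str, top_k: int = 3) -> str:
    # Retrieve the top_k documents and pack them into a simple prompt
    results = search_engine.search(query, top_k=top_k)
    context = " ".join(doc['text'] for doc, _ in results)
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    output = generator(prompt, max_new_tokens=100)
    return output[0]['generated_text']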
Building a RAG system comes down to integrating an efficient retrieval mechanism with a capable generative model so that responses are accurate and contextually grounded. Existing libraries such as Haystack, LangChain, and LlamaIndex offer comprehensive, scalable solutions, while a custom RAG system allows functionality tailored to specific needs. The examples above serve as a foundational blueprint: depending on your requirements, you can extend and optimize them with more advanced retrieval techniques (one option is sketched below), a larger knowledge base, and a stronger generative model.
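As one concrete example of a more advanced retrieval technique, dense retrieval can be blended with a classic keyword scorer. The sketch below combines BM25 scores from the rank_bm25 package with cosine similarities from the Sentence Transformers model used earlier, reusing chunks and embedding_model from the combined example; the rank_bm25 dependency and the 50/50 weighting are assumptions made for illustration:

import numpy as np
from rank_bm25 import BM25Okapi

# Keyword index over the same chunks used for the dense index
bm25 = BM25Okapi([chunk.lower().split() for chunk in chunks])

# Precompute normalized dense vectors so cosine similarity is a dot product
chunk_vecs = embedding_model.encode(chunks, normalize_embeddings=True)

def hybrid_retrieve(query, k=5, alpha=0.5):
    # Dense signal: cosine similarity of the query against every chunk
    query_vec = embedding_model.encode([query], normalize_embeddings=True)[0]
    dense_scores = chunk_vecs @ query_vec

    # Sparse signal: BM25 over whitespace-tokenized chunks, rescaled to [0, 1]
    sparse_scores = bm25.get_scores(query.lower().split())
    sparse_scores = sparse_scores / (sparse_scores.max() + 1e-9)

    # Blend both signals and return the top-k chunk ids
    combined = alpha * dense_scores + (1 - alpha) * sparse_scores
    return np.argsort(combined)[::-1][:k]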