Retrieval Augmented Generation (RAG) with LangChain
Meri Nova
Machine Learning Engineer
chunk_size
chunk_overlap
from langchain_text_splitters import CharacterTextSplitter
# Sample text: paragraphs are separated by blank lines ("\n\n"), and some
# paragraphs also contain single line breaks ("\n").
text = """Machine learning is a fascinating field.\n\nIt involves algorithms and models that can learn from data. These models can then make predictions or decisions without being explicitly programmed to perform the task.\nThis capability is increasingly valuable in various industries, from finance to healthcare.\n\nThere are many types of machine learning, including supervised, unsupervised, and reinforcement learning.\nEach type has its own strengths and applications."""

# Split on the paragraph separator only. NOTE: with a single separator the
# splitter cannot break up a paragraph that is longer than chunk_size, so
# chunks may exceed the requested 100 characters.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=10,
)
chunks = text_splitter.split_text(text)

# Show the chunks and the length of each one.
print(chunks)
print([len(chunk) for chunk in chunks])
['Machine learning is a fascinating field.', 'It involves algorithms and models that can learn from data. These models can...', 'There are many types of machine learning, including supervised, unsupervised...']
[40, 260, 155]
chunk_size
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators are listed from coarsest to finest: paragraph break, line break,
# space, and finally the empty string (individual characters).
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=100,
    chunk_overlap=10,
)
chunks = splitter.split_text(text)

# Show the chunks and the length of each one.
print(chunks)
print([len(chunk) for chunk in chunks])
['Machine learning is a fascinating field.',
'It involves algorithms and models that can learn from data. These models ...',
'or decisions without being explicitly programmed to perform the task.',
'This capability is increasingly valuable in various industries, from ...',
'There are many types of machine learning, including supervised, ...',
'learning.',
'Each type has its own strengths and applications.']
[40, 98, 69, 91, 95, 9, 49]
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF into a list of documents.
loader = PyPDFLoader("research_paper.pdf")
documents = loader.load()

# Split the loaded documents into chunks of roughly 1000 characters, with a
# 200-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(documents)

# Each chunk exposes its text as `page_content`; print the chunks and the
# length of each chunk's text.
print(chunks)
print([len(chunk.page_content) for chunk in chunks])
[Document(metadata={'source': 'research_paper.pdf', 'page': 0}, page_content='...'), Document(metadata={'source': 'research_paper.pdf', 'page': 0}, page_content='...'), Document(metadata={'source': 'research_paper.pdf', 'page': 0}, page_content='...')]
[928, 946, 921,...]
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Embedding model used to turn each chunk into a vector.
# NOTE(review): `openai_api_key` is not defined in this snippet; it is
# presumably set earlier (e.g. read from the environment) — confirm.
embedding_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-small",
)

# Build a Chroma vector store from the document chunks, embedding them with
# the model configured above.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
)
Retrieval Augmented Generation (RAG) with LangChain