Retrieval Augmented Generation (RAG) with LangChain
Meri Nova
Machine Learning Engineer





chunk_size

chunk_overlap

from langchain_text_splitters import CharacterTextSplitter

# Sample text: three paragraphs separated by blank lines ("\n\n"); the second
# and third paragraphs each contain a single line break ("\n") as well.
text = (
    "Machine learning is a fascinating field.\n\n"
    "It involves algorithms and models that can learn from data. "
    "These models can then make predictions or decisions without being "
    "explicitly programmed to perform the task.\n"
    "This capability is increasingly valuable in various industries, "
    "from finance to healthcare.\n\n"
    "There are many types of machine learning, including supervised, "
    "unsupervised, and reinforcement learning.\n"
    "Each type has its own strengths and applications."
)

# Split on paragraph breaks only. Because "\n\n" is the sole separator, a
# paragraph longer than chunk_size is not broken up further -- the lengths
# printed on the slide (40, 260, 155) show chunks exceeding chunk_size=100.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=10,
)

chunks = text_splitter.split_text(text)
print(chunks)
print([len(chunk) for chunk in chunks])
['Machine learning is a fascinating field.', 'It involves algorithms and models that can learn from data. These models can...', 'There are many types of machine learning, including supervised, unsupervised...']
[40, 260, 155]
# chunk_size
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators are listed from coarsest to finest: paragraph break, line break,
# space, and finally the empty string (individual characters). Per the
# LangChain documentation the splitter falls back to the next separator when
# a piece is still larger than chunk_size.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=100,
    chunk_overlap=10,
)

# `text` is the sample string defined in the earlier CharacterTextSplitter
# example.
chunks = splitter.split_text(text)
print(chunks)
print([len(chunk) for chunk in chunks])
['Machine learning is a fascinating field.',
'It involves algorithms and models that can learn from data. These models ...',
'or decisions without being explicitly programmed to perform the task.',
'This capability is increasingly valuable in various industries, from ...',
'There are many types of machine learning, including supervised, ...',
'learning.',
'Each type has its own strengths and applications.']
[40, 98, 69, 91, 95, 9, 49]
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF into a list of documents.
loader = PyPDFLoader("research_paper.pdf")
documents = loader.load()

# RecursiveCharacterTextSplitter is imported in the earlier splitting example.
# No separators are passed here, so the splitter's defaults apply.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# split_documents takes the loaded documents (rather than a raw string); each
# resulting chunk exposes its text as `page_content`.
chunks = splitter.split_documents(documents)
print(chunks)
print([len(chunk.page_content) for chunk in chunks])
[Document(metadata={'source': 'Rag Paper.pdf', 'page': 0}, page_content='...'), Document(metadata={'source': 'Rag Paper.pdf', 'page': 0}, page_content='...'), Document(metadata={'source': 'Rag Paper.pdf', 'page': 0}, page_content='...')]
[928, 946, 921,...]





from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# `openai_api_key` is not defined in this snippet; it must already hold the
# OpenAI API key before this code runs.
embedding_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-small",
)

# Build a Chroma vector store from the document chunks produced by the
# splitting step, using the embedding model configured above.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
)
Retrieval Augmented Generation (RAG) with LangChain