Cookbook
RAG Application
Build a Retrieval-Augmented Generation application with Docling
Building a RAG Application
Warning: This page has not been verified and may be inaccurate. Validate each item before relying on it.
Learn how to build a Retrieval-Augmented Generation (RAG) application using Docling for IBM watsonx to prepare documents for vector databases and semantic search.
Overview
RAG applications combine document retrieval with large language models to provide accurate, context-aware responses. Docling helps by converting complex documents into clean, structured formats that preserve semantic meaning and document structure.
What You'll Build
- Document processing pipeline for RAG
- Vector database integration
- Semantic search with preserved structure
- Context-aware question answering
Prerequisites
- Docling for IBM watsonx account with Service URL and API Key
- Python 3.8+
- Vector database (e.g., Pinecone, Weaviate, or Chroma)
- OpenAI API key (or credentials for another LLM provider)
Architecture
Step 1: Install Dependencies
pip install docling-service-client
pip install langchain
pip install chromadb
pip install openai

Step 2: Convert Documents with Docling
import os
from pathlib import Path
from docling.service_client import DoclingServiceClient
SERVICE_URL = os.getenv("DOCLING_SERVICE_URL")
API_KEY = os.getenv("DOCLING_API_KEY")
def convert_documents(document_paths):
"""Convert documents to Markdown format."""
converted_docs = []
with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
for doc_path in document_paths:
result = client.convert(source=Path(doc_path))
markdown = result.document.export_to_markdown()
converted_docs.append({
'path': doc_path,
'content': markdown,
'metadata': {
'source': doc_path,
'format': 'markdown'
}
})
return converted_docsStep 3: Chunk Documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
def chunk_documents(converted_docs, chunk_size=1000, chunk_overlap=200):
"""Split documents into chunks for embedding."""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
separators=["\n\n", "\n", " ", ""]
)
chunks = []
for doc in converted_docs:
doc_chunks = text_splitter.split_text(doc['content'])
for i, chunk in enumerate(doc_chunks):
chunks.append({
'content': chunk,
'metadata': {
**doc['metadata'],
'chunk_id': i,
'total_chunks': len(doc_chunks)
}
})
return chunksStep 4: Create Embeddings and Store
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
def create_vector_store(chunks):
"""Create vector store from document chunks."""
embeddings = OpenAIEmbeddings()
texts = [chunk['content'] for chunk in chunks]
metadatas = [chunk['metadata'] for chunk in chunks]
vectorstore = Chroma.from_texts(
texts=texts,
embedding=embeddings,
metadatas=metadatas,
persist_directory="./chroma_db"
)
return vectorstoreStep 5: Build RAG Chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
def create_rag_chain(vectorstore):
"""Create RAG chain for question answering."""
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=vectorstore.as_retriever(
search_kwargs={"k": 4}
),
return_source_documents=True
)
return qa_chainStep 6: Query the System
def query_documents(qa_chain, question):
"""Query the RAG system."""
result = qa_chain({"query": question})
print(f"Question: {question}")
print(f"\nAnswer: {result['result']}")
print(f"\nSources:")
for doc in result['source_documents']:
print(f"- {doc.metadata['source']} (chunk {doc.metadata['chunk_id']})")Complete Example
import os
from pathlib import Path
from docling.service_client import DoclingServiceClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
# Configuration
SERVICE_URL = os.getenv("DOCLING_SERVICE_URL")
API_KEY = os.getenv("DOCLING_API_KEY")
def build_rag_system(document_paths):
    """Build the complete RAG system: convert, chunk, embed, and chain.

    Args:
        document_paths: Iterable of document paths (``str`` or
            ``pathlib.Path``) to ingest.

    Returns:
        A ``RetrievalQA`` chain over the ingested documents that returns
        source documents alongside each answer.
    """
    # Step 1: Convert documents
    print("Converting documents...")
    converted_docs = []
    with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
        for doc_path in document_paths:
            result = client.convert(source=Path(doc_path))
            converted_docs.append({
                'path': doc_path,
                'content': result.document.export_to_markdown(),
                # Keep metadata values primitive: vector stores such as
                # Chroma reject non-scalar values like Path objects.
                'metadata': {'source': str(doc_path)},
            })

    # Step 2: Chunk documents
    print("Chunking documents...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    chunks = []
    for doc in converted_docs:
        doc_chunks = text_splitter.split_text(doc['content'])
        chunks.extend(
            {
                'content': chunk,
                'metadata': {**doc['metadata'], 'chunk_id': i},
            }
            for i, chunk in enumerate(doc_chunks)
        )

    # Step 3: Create vector store
    print("Creating vector store...")
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_texts(
        texts=[c['content'] for c in chunks],
        embedding=embeddings,
        metadatas=[c['metadata'] for c in chunks],
    )

    # Step 4: Create RAG chain
    print("Building RAG chain...")
    llm = ChatOpenAI(model_name="gpt-4", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        # Explicit for parity with the step-by-step version ("stuff" is the default).
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 4}),
        return_source_documents=True,
    )
    return qa_chain
# Usage
if __name__ == "__main__":
documents = ["report1.pdf", "report2.pdf", "report3.pdf"]
qa_chain = build_rag_system(documents)
# Query the system
result = qa_chain({"query": "What are the main findings?"})
print(f"Answer: {result['result']}")Best Practices
Document Preparation
- Use Docling's structure preservation - Tables, lists, and headings improve retrieval
- Convert to Markdown - Clean, semantic format for chunking
- Maintain metadata - Track source documents and page numbers
Chunking Strategy
- Respect document structure - Split on headings and paragraphs
- Overlap chunks - Ensure context continuity
- Optimize chunk size - Balance between context and precision
Retrieval Optimization
- Tune k parameter - Number of chunks to retrieve
- Use metadata filtering - Filter by document type or date
- Implement reranking - Improve relevance of retrieved chunks
Advanced Features
Multi-modal RAG
Include images and tables from documents:
result = client.convert(
source=Path("document.pdf"),
options={
"image_ref_mode": "embedded",
"output_format": "json"
}
)
# Access structured elements
for item in result.document.items:
if item.type == "table":
# Process table data
pass
elif item.type == "image":
# Process image
passHybrid Search
Combine semantic and keyword search:
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import BM25Retriever
# Create hybrid retriever
bm25_retriever = BM25Retriever.from_texts(texts)
ensemble_retriever = EnsembleRetriever(
retrievers=[vectorstore.as_retriever(), bm25_retriever],
weights=[0.7, 0.3]
)Next Steps
- Explore Agentic Workflows for interactive RAG
- Learn about Pipeline Integration for automation
- Check the API Reference for advanced conversion options