Docling | Docling for IBM watsonx
This is a preview with content being developed and subject to change. Rely on the official announcement and documentation about the Docling for IBM watsonx product.
Cookbook

RAG Application

Build a Retrieval-Augmented Generation application with Docling

Building a RAG Application

This page is not accurate! Each item needs to be validated.

Learn how to build a Retrieval-Augmented Generation (RAG) application using Docling for IBM watsonx to prepare documents for vector databases and semantic search.

Overview

RAG applications combine document retrieval with large language models to provide accurate, context-aware responses. Docling helps by converting complex documents into clean, structured formats that preserve semantic meaning and document structure.

What You'll Build

  • Document processing pipeline for RAG
  • Vector database integration
  • Semantic search with preserved structure
  • Context-aware question answering

Prerequisites

  • Docling for IBM watsonx account with Service URL and API Key
  • Python 3.8+
  • Vector database (e.g., Pinecone, Weaviate, or Chroma)
  • OpenAI API key (or other LLM provider)

Architecture

Step 1: Install Dependencies

pip install docling-service-client
pip install langchain
pip install chromadb
pip install openai

Step 2: Convert Documents with Docling

import os
from pathlib import Path
from docling.service_client import DoclingServiceClient

# Connection settings are read from the environment; each value is None
# when the corresponding variable is not set.
SERVICE_URL = os.environ.get("DOCLING_SERVICE_URL")
API_KEY = os.environ.get("DOCLING_API_KEY")

def convert_documents(document_paths):
    """Convert every document to Markdown through the Docling service.

    A single DoclingServiceClient, configured from SERVICE_URL and API_KEY,
    is opened for the whole batch and used for each path in turn.

    Args:
        document_paths: Iterable of document paths. Each one is wrapped in
            ``Path`` before being handed to the client.

    Returns:
        A list with one dict per input path, in input order. Each dict has
        'path' (the path as given), 'content' (the value returned by
        ``export_to_markdown()``), and 'metadata' (with 'source' and
        'format' entries).
    """
    records = []

    with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
        for source_path in document_paths:
            conversion = client.convert(source=Path(source_path))
            record = {
                'path': source_path,
                'content': conversion.document.export_to_markdown(),
                'metadata': {'source': source_path, 'format': 'markdown'},
            }
            records.append(record)

    return records

Step 3: Chunk Documents

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(converted_docs, chunk_size=1000, chunk_overlap=200):
    """Break converted documents into overlapping chunks ready for embedding.

    Args:
        converted_docs: Iterable of dicts carrying a 'content' string and a
            'metadata' dict.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of dicts. Each has 'content' (one chunk) and 'metadata'
        (the parent document's metadata plus 'chunk_id', the chunk's index
        within its document, and 'total_chunks', the number of chunks that
        document produced).
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )

    pieces = []
    for record in converted_docs:
        parts = splitter.split_text(record['content'])
        part_count = len(parts)
        pieces.extend(
            {
                'content': part,
                'metadata': {
                    **record['metadata'],
                    'chunk_id': index,
                    'total_chunks': part_count,
                },
            }
            for index, part in enumerate(parts)
        )

    return pieces

Step 4: Create Embeddings and Store

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

def create_vector_store(chunks):
    """Embed the chunks and store them in a Chroma collection.

    Args:
        chunks: Sequence of dicts with 'content' and 'metadata' keys. It is
            traversed twice, once per key.

    Returns:
        The object returned by ``Chroma.from_texts``, created with
        ``persist_directory="./chroma_db"``.
    """
    embedder = OpenAIEmbeddings()

    return Chroma.from_texts(
        texts=[entry['content'] for entry in chunks],
        embedding=embedder,
        metadatas=[entry['metadata'] for entry in chunks],
        persist_directory="./chroma_db",
    )

Step 5: Build RAG Chain

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

def create_rag_chain(vectorstore):
    """Wire a retriever over the vector store into a RetrievalQA chain.

    Args:
        vectorstore: Object exposing ``as_retriever``; the retriever is
            created with ``search_kwargs={"k": 4}``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with the "stuff"
        chain type and ``return_source_documents=True``.
    """
    chat_model = ChatOpenAI(model_name="gpt-4", temperature=0)
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
    )

Step 6: Query the System

def query_documents(qa_chain, question):
    """Ask the RAG chain one question and print the answer with its sources.

    Args:
        qa_chain: Callable invoked with ``{"query": question}``. Its return
            value must provide 'result' and 'source_documents' entries.
        question: The question text to send to the chain.

    Prints the question, the answer, and one line per source document
    (its 'source' and 'chunk_id' metadata). Returns nothing.
    """
    response = qa_chain({"query": question})

    print(f"Question: {question}")
    print(f"\nAnswer: {response['result']}")
    print(f"\nSources:")
    for source_doc in response['source_documents']:
        meta = source_doc.metadata
        print(f"- {meta['source']} (chunk {meta['chunk_id']})")

Complete Example

import os
from pathlib import Path
from docling.service_client import DoclingServiceClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Configuration: both values come from the environment and are None when
# the corresponding variable is not set.
SERVICE_URL = os.environ.get("DOCLING_SERVICE_URL")
API_KEY = os.environ.get("DOCLING_API_KEY")

def build_rag_system(document_paths):
    """Run the whole pipeline and return the question-answering chain.

    Four stages run in order, each announced with a progress line on
    stdout: convert the documents to Markdown with the Docling service,
    split the Markdown into chunks, embed the chunks into a Chroma store,
    and build a RetrievalQA chain over that store.

    Args:
        document_paths: Iterable of document paths. Each one is wrapped in
            ``Path`` before conversion.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with
        ``return_source_documents=True``.
    """
    # Stage 1: one Docling client serves the whole batch.
    print("Converting documents...")
    markdown_records = []
    with DoclingServiceClient(url=SERVICE_URL, api_key=API_KEY) as client:
        for source_path in document_paths:
            conversion = client.convert(source=Path(source_path))
            record = {
                'path': source_path,
                'content': conversion.document.export_to_markdown(),
                'metadata': {'source': source_path},
            }
            markdown_records.append(record)

    # Stage 2: split each document, tagging every piece with its index
    # within that document.
    print("Chunking documents...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunk_texts = []
    chunk_metadata = []
    for record in markdown_records:
        for position, piece in enumerate(splitter.split_text(record['content'])):
            chunk_texts.append(piece)
            chunk_metadata.append({**record['metadata'], 'chunk_id': position})

    # Stage 3: embed the chunk texts alongside their metadata.
    print("Creating vector store...")
    embedder = OpenAIEmbeddings()
    store = Chroma.from_texts(
        texts=chunk_texts,
        embedding=embedder,
        metadatas=chunk_metadata,
    )

    # Stage 4: the retriever is configured with k=4.
    print("Building RAG chain...")
    chat_model = ChatOpenAI(model_name="gpt-4", temperature=0)
    top_k_retriever = store.as_retriever(search_kwargs={"k": 4})
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        retriever=top_k_retriever,
        return_source_documents=True,
    )

# Usage
if __name__ == "__main__":
    # Build the pipeline over the sample reports, then ask one question.
    sample_reports = ["report1.pdf", "report2.pdf", "report3.pdf"]
    rag_chain = build_rag_system(sample_reports)

    answer = rag_chain({"query": "What are the main findings?"})
    print(f"Answer: {answer['result']}")

Best Practices

Document Preparation

  1. Use Docling's structure preservation - Tables, lists, and headings improve retrieval
  2. Convert to Markdown - Clean, semantic format for chunking
  3. Maintain metadata - Track source documents and page numbers

Chunking Strategy

  1. Respect document structure - Split on headings and paragraphs
  2. Overlap chunks - Ensure context continuity
  3. Optimize chunk size - Balance between context and precision

Retrieval Optimization

  1. Tune k parameter - Number of chunks to retrieve
  2. Use metadata filtering - Filter by document type or date
  3. Implement reranking - Improve relevance of retrieved chunks

Advanced Features

Multi-modal RAG

Include images and tables from documents:

# Options forwarded to client.convert, with the values used in this cookbook.
# NOTE(review): confirm these option names against the service API.
conversion_options = {
    "image_ref_mode": "embedded",
    "output_format": "json",
}
result = client.convert(source=Path("document.pdf"), options=conversion_options)

# Access structured elements, branching on each element's type.
for element in result.document.items:
    element_kind = element.type
    if element_kind == "table":
        # Process table data
        pass
    elif element_kind == "image":
        # Process image
        pass

Hybrid search: combine semantic (vector) search with keyword (BM25) search:

from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import BM25Retriever

# Create hybrid retriever: a BM25 retriever built from the raw texts is
# paired with the vector store's own retriever.
bm25_retriever = BM25Retriever.from_texts(texts)
hybrid_members = [vectorstore.as_retriever(), bm25_retriever]

# Weights follow the order of hybrid_members: 0.7 for the vector store
# retriever, 0.3 for BM25.
ensemble_retriever = EnsembleRetriever(
    retrievers=hybrid_members,
    weights=[0.7, 0.3],
)

Next Steps

Resources

On this page