Skip to content

Commit 653940e

Browse files
authored
docs: add examples and fix docstring bug in DocumentConverter (#3064)
docs: add docstrings to DocumentConverter methods. Add Examples sections to the __init__, convert, convert_all, and convert_string methods with runnable usage examples. Fix a documentation bug in convert_all where max_file_size was described as 'Maximum number of pages' instead of a file size in bytes. Refs #2748. Signed-off-by: easedu <[email protected]>
1 parent 8b99085 commit 653940e

1 file changed

Lines changed: 80 additions & 3 deletions

File tree

docling/document_converter.py

Lines changed: 80 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -226,6 +226,29 @@ def __init__(
226226
allowed_formats: List of allowed input formats. By default, any
227227
format supported by Docling is allowed.
228228
format_options: Dictionary of format-specific options.
229+
230+
Examples:
231+
Create a converter with default settings (all formats allowed):
232+
233+
>>> converter = DocumentConverter()
234+
235+
Allow only PDF and DOCX formats:
236+
237+
>>> from docling.datamodel.base_models import InputFormat
238+
>>> converter = DocumentConverter(
239+
... allowed_formats=[InputFormat.PDF, InputFormat.DOCX]
240+
... )
241+
242+
Customize pipeline options for PDF:
243+
244+
>>> from docling.datamodel.pipeline_options import PdfPipelineOptions
>>> from docling.document_converter import PdfFormatOption
245+
>>> converter = DocumentConverter(
246+
... format_options={
247+
... InputFormat.PDF: PdfFormatOption(
248+
... pipeline_options=PdfPipelineOptions()
249+
... ),
250+
... }
251+
... )
229252
"""
230253
self.allowed_formats: list[InputFormat] = (
231254
allowed_formats if allowed_formats is not None else list(InputFormat)
@@ -333,6 +356,26 @@ def convert(
333356
334357
Raises:
335358
ConversionError: An error occurred during conversion.
359+
360+
Examples:
361+
Convert a local PDF file:
362+
363+
>>> from pathlib import Path
364+
>>> converter = DocumentConverter()
365+
>>> result = converter.convert(Path("path/to/document.pdf"))
366+
>>> print(result.document.export_to_markdown())
367+
368+
Convert a document from a URL:
369+
370+
>>> result = converter.convert("https://example.com/paper.pdf")
371+
372+
Convert from an in-memory stream:
373+
374+
>>> from io import BytesIO
375+
>>> from docling.datamodel.base_models import DocumentStream
376+
>>> buf = BytesIO(b"<html><body>Hello</body></html>")
377+
>>> stream = DocumentStream(name="page.html", stream=buf)
378+
>>> result = converter.convert(stream)
336379
"""
337380
all_res = self.convert_all(
338381
source=[source],
@@ -362,9 +405,10 @@ def convert_all(
362405
headers: Optional headers given as a (single) dictionary of string
363406
key-value pairs, in case of URL input source.
364407
raises_on_error: Whether to raise an error on the first conversion failure.
365-
max_num_pages: Maximum number of pages to convert.
366-
max_file_size: Maximum number of pages accepted per document. Documents
367-
exceeding this number will be skipped.
408+
max_num_pages: Maximum number of pages accepted per document.
409+
Documents exceeding this number will not be converted.
410+
max_file_size: Maximum file size in bytes. Documents exceeding this
411+
limit will be skipped.
368412
page_range: Range of pages to convert in each document.
369413
370414
Yields:
@@ -373,6 +417,21 @@ def convert_all(
373417
374418
Raises:
375419
ConversionError: An error occurred during conversion.
420+
421+
Examples:
422+
Convert a batch of local files:
423+
424+
>>> from pathlib import Path
425+
>>> converter = DocumentConverter()
426+
>>> paths = list(Path("docs/").glob("*.pdf"))
427+
>>> for result in converter.convert_all(paths):
428+
... print(result.document.export_to_markdown()[:100])
429+
430+
Convert with a file size limit of 20 MB:
431+
432+
>>> results = converter.convert_all(
433+
... paths, max_file_size=20 * 1024 * 1024
434+
... )
376435
"""
377436
limits = DocumentLimits(
378437
max_num_pages=max_num_pages,
@@ -435,6 +494,24 @@ def convert_string(
435494
Raises:
436495
ValueError: If format is neither `InputFormat.MD` nor `InputFormat.HTML`.
437496
ConversionError: An error occurred during conversion.
497+
498+
Examples:
499+
Convert a Markdown string:
500+
501+
>>> from docling.datamodel.base_models import InputFormat
502+
>>> converter = DocumentConverter()
503+
>>> result = converter.convert_string(
504+
... "# Title\nSome text.", format=InputFormat.MD
505+
... )
506+
>>> print(result.document.export_to_markdown())
507+
508+
Convert an HTML string:
509+
510+
>>> result = converter.convert_string(
511+
... "<h1>Title</h1><p>Some text.</p>",
512+
... format=InputFormat.HTML,
513+
... name="my_page",
514+
... )
438515
"""
439516
name = name or datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
440517

0 commit comments

Comments
 (0)