Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1e5181b
Add lockfile tracking for exported pages
naoki-tateyama Feb 2, 2026
90f211c
feat: Incremental page download using lockfile
naoki-tateyama Feb 3, 2026
8691524
add prune command
naoki-tateyama Feb 3, 2026
0b5dbd3
add incremental export option to pages command
naoki-tateyama Feb 8, 2026
6bac022
fix performance issue by adding Descendant type
naoki-tateyama Feb 8, 2026
6edb622
integrate version property into Descendant class
naoki-tateyama Feb 8, 2026
70617ea
fix performance issue by avoiding ancestor fetching in the _template_…
naoki-tateyama Feb 8, 2026
d34eff7
add documentation for the new incremental option
naoki-tateyama Feb 8, 2026
bd13853
remove expand parameters from from_id and from_url methods.
naoki-tateyama Feb 8, 2026
2516f41
move LockfileManager to outside of the Page class
naoki-tateyama Feb 9, 2026
5029d3d
parse ancestor in simpler way
naoki-tateyama Feb 9, 2026
4201ccf
add tests for lockfile utils
naoki-tateyama Feb 11, 2026
8603a3b
refactor Ancestor and Descendant classes
naoki-tateyama Feb 11, 2026
dd86763
atomic file writes
naoki-tateyama Feb 15, 2026
5263c39
Update confluence_markdown_exporter/utils/lockfile.py
naoki-tateyama Mar 4, 2026
6f15581
fix Ancestor.id type from str to int
naoki-tateyama Mar 4, 2026
02562ff
prefilter pages before tqdm progress bar
naoki-tateyama Mar 4, 2026
509d3e6
sort lockfile entries by key for stable git diffs
naoki-tateyama Mar 4, 2026
4253132
make skip_unchanged a config option enabled by default
naoki-tateyama Mar 4, 2026
ed2f5a1
add automatic cleanup for deleted and moved pages
naoki-tateyama Mar 4, 2026
f899267
Introduce new config options and refactor clean
Spenhouet Mar 5, 2026
a115f39
Fix connection config
Spenhouet Mar 6, 2026
1439186
Redownload if file is missing
Spenhouet Mar 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
- Converts Confluence macros to equivalent Markdown syntax where possible.
- Handles images and attachments by linking them appropriately in the Markdown output.
- Supports extended Markdown features like tasks, alerts, and front matter.
- Supports incremental exports — only re-exports pages that have changed since the last run.
- Supports Confluence add-ons: [draw.io](https://marketplace.atlassian.com/apps/1210933/draw-io-diagrams-uml-bpmn-aws-erd-flowcharts), [PlantUML](https://marketplace.atlassian.com/apps/1222993/flowchart-plantuml-diagrams-for-confluence)

## Supported Markdown Elements
Expand Down Expand Up @@ -94,14 +95,38 @@ Export all Confluence pages of a single Space:
confluence-markdown-exporter spaces <space-key e.g. MYSPACE> <output path e.g. ./output_path/>
```

#### 2.3. Export all Spaces
#### 2.4. Export all Spaces

Export all Confluence pages across all spaces:

```sh
confluence-markdown-exporter all-spaces <output path e.g. ./output_path/>
```

#### 2.5. Incremental Export

All export commands (`pages`, `pages-with-descendants`, `spaces`, `all-spaces`) support the `--incremental` flag. When enabled, only pages that have changed since the last export are re-exported:

```sh
confluence-markdown-exporter spaces <space-key> --incremental
```

This uses a lockfile to track previously exported pages and their versions, making subsequent exports significantly faster.

#### 2.6. Prune Untracked Files

After using incremental exports, you can clean up exported files that are no longer tracked in the lockfile (e.g. deleted pages):

```sh
confluence-markdown-exporter prune
```

Use `--dry-run` to preview which files would be deleted without actually deleting them:

```sh
confluence-markdown-exporter prune --dry-run
```

### 3. Output

The exported Markdown file(s) will be saved in the specified `output` directory e.g.:
Expand Down
111 changes: 75 additions & 36 deletions confluence_markdown_exporter/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from confluence_markdown_exporter.utils.export import sanitize_filename
from confluence_markdown_exporter.utils.export import sanitize_key
from confluence_markdown_exporter.utils.export import save_file
from confluence_markdown_exporter.utils.lockfile import LockfileManager
from confluence_markdown_exporter.utils.table_converter import TableConverter
from confluence_markdown_exporter.utils.type_converter import str_to_bool

Expand Down Expand Up @@ -133,7 +134,7 @@ class Organization(BaseModel):
spaces: list["Space"]

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page | Descendant"]:
return [page for space in self.spaces for page in space.pages]

def export(self) -> None:
Expand Down Expand Up @@ -165,15 +166,15 @@ class Space(BaseModel):
homepage: int | None

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page | Descendant"]:
if self.homepage is None:
logger.warning(
f"Space '{self.name}' (key: {self.key}) has no homepage. No pages will be exported."
)
return []

homepage = Page.from_id(self.homepage)
return [self.homepage, *homepage.descendants]
return [homepage, *homepage.descendants]

def export(self) -> None:
export_pages(self.pages)
Expand Down Expand Up @@ -212,7 +213,8 @@ def from_json(cls, data: JsonResponse) -> "Label":
class Document(BaseModel):
title: str
space: Space
ancestors: list[int]
ancestors: list["Ancestor"]
version: Version

@property
def _template_vars(self) -> dict[str, str]:
Expand All @@ -221,10 +223,8 @@ def _template_vars(self) -> dict[str, str]:
"space_name": sanitize_filename(self.space.name),
"homepage_id": str(self.space.homepage),
"homepage_title": sanitize_filename(Page.from_id(self.space.homepage).title),
"ancestor_ids": "/".join(str(a) for a in self.ancestors),
"ancestor_titles": "/".join(
sanitize_filename(Page.from_id(a).title) for a in self.ancestors
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the _template_vars property, although the results are cached, `Page.from_id(a).title` is called for every ancestor in `self.ancestors`, which makes an unnecessary `from_id` call per ancestor. This stands in the way of speeding up exports with the incremental option.
So I added an Ancestor type to store ancestor information.

),
"ancestor_ids": "/".join(str(a.id) for a in self.ancestors),
"ancestor_titles": "/".join(sanitize_filename(a.title) for a in self.ancestors),
}


Expand All @@ -237,7 +237,6 @@ class Attachment(Document):
collection_name: str
download_link: str
comment: str
version: Version

@property
def extension(self) -> str:
Expand Down Expand Up @@ -284,8 +283,8 @@ def from_json(cls, data: JsonResponse) -> "Attachment":
download_link=data.get("_links", {}).get("download", ""),
comment=extensions.get("comment", ""),
ancestors=[
*[ancestor.get("id") for ancestor in container.get("ancestors", [])],
container.get("id"),
*[Ancestor.from_json(ancestor) for ancestor in container.get("ancestors", [])],
Ancestor.from_json(container),
][1:],
version=Version.from_json(data.get("version", {})),
)
Expand Down Expand Up @@ -333,6 +332,47 @@ def export(self) -> None:
)


class Ancestor(Document):
    """Lightweight reference to a parent page, carrying just enough metadata
    (id, title, space) to build export paths without extra `Page.from_id` API calls.
    """

    # Confluence content id. Declared as int so callers such as
    # convert_page_link(ancestor.id) receive the integer they expect.
    id: int

    @classmethod
    def from_json(cls, data: JsonResponse) -> "Ancestor":
        """Build an Ancestor from a Confluence REST ancestor payload.

        Args:
            data: A single entry of the "ancestors" array returned by the API.

        Returns:
            An Ancestor with empty `ancestors` and a blank `version`, since
            neither is needed for path templating.
        """
        return cls(
            # Confluence serializes ids as strings; coerce explicitly to int.
            id=int(data.get("id", 0)),
            title=data.get("title", ""),
            # The space key is only present via the expandable link,
            # e.g. ".../rest/api/space/KEY" — take the last path segment.
            space=Space.from_key(data.get("_expandable", {}).get("space", "").split("/")[-1]),
            ancestors=[],  # Ancestors of an ancestor are not needed for now.
            version=Version.from_json({}),  # Version of an ancestor is not needed for now.
        )


class Descendant(Document):
    """A page found via a descendants search, holding enough metadata
    (id, title, ancestors, version) to compute its export path and decide
    whether a full export is required.
    """

    # Confluence content id of the descendant page.
    id: int

    @property
    def _template_vars(self) -> dict[str, str]:
        """Template variables for path building, extending Document's set."""
        template_vars = dict(super()._template_vars)
        template_vars["page_id"] = str(self.id)
        template_vars["page_title"] = sanitize_filename(self.title)
        return template_vars

    @property
    def export_path(self) -> Path:
        """Filesystem path this page would be exported to, per the configured template."""
        # The configured template uses "{name}" placeholders; convert to
        # string.Template's "${name}" syntax before substituting.
        path_template = Template(settings.export.page_path.replace("{", "${"))
        return Path(path_template.safe_substitute(self._template_vars))

    @classmethod
    def from_json(cls, data: JsonResponse) -> "Descendant":
        """Build a Descendant from a Confluence search-result payload."""
        # Drop the first ancestor entry ([1:]) to mirror Page.from_json —
        # presumably the space root/homepage; verify against the API payload.
        parsed_ancestors = [Ancestor.from_json(raw) for raw in data.get("ancestors", [])][1:]
        return cls(
            id=data.get("id", 0),
            title=data.get("title", ""),
            # Space key comes from the expandable link's last path segment.
            space=Space.from_key(data.get("_expandable", {}).get("space", "").split("/")[-1]),
            ancestors=parsed_ancestors,
            version=Version.from_json(data.get("version", {})),
        )


class Page(Document):
id: int
body: str
Expand All @@ -342,10 +382,11 @@ class Page(Document):
attachments: list["Attachment"]

@property
def descendants(self) -> list[int]:
def descendants(self) -> list["Descendant"]:
url = "rest/api/content/search"
params = {
"cql": f"type=page AND ancestor={self.id}",
"expand": "metadata.properties,ancestors,version",
"limit": 100,
}
results = []
Expand All @@ -372,8 +413,7 @@ def descendants(self) -> list[int]:
f"Unexpected error when fetching descendants for content ID {self.id}."
)
return []

return [result["id"] for result in results]
return [Descendant.from_json(result) for result in results]

@property
def _template_vars(self) -> dict[str, str]:
Expand Down Expand Up @@ -410,7 +450,7 @@ def export(self) -> None:
self.export_markdown()

def export_with_descendants(self) -> None:
export_pages([self.id, *self.descendants])
export_pages([self, *self.descendants])

def export_body(self) -> None:
soup = BeautifulSoup(self.html, "html.parser")
Expand Down Expand Up @@ -498,7 +538,8 @@ def from_json(cls, data: JsonResponse) -> "Page":
for label in data.get("metadata", {}).get("labels", {}).get("results", [])
],
attachments=Attachment.from_page_id(data.get("id", 0)),
ancestors=[ancestor.get("id") for ancestor in data.get("ancestors", [])][1:],
ancestors=[Ancestor.from_json(ancestor) for ancestor in data.get("ancestors", [])][1:],
version=Version.from_json(data.get("version", {})),
)

@classmethod
Expand All @@ -511,7 +552,7 @@ def from_id(cls, page_id: int) -> "Page":
confluence.get_page_by_id(
page_id,
expand="body.view,body.export_view,body.editor2,metadata.labels,"
"metadata.properties,ancestors",
"metadata.properties,ancestors,version",
),
)
)
Expand All @@ -528,6 +569,7 @@ def from_id(cls, page_id: int) -> "Page":
labels=[],
attachments=[],
ancestors=[],
version=Version.from_json({}),
)

@classmethod
Expand Down Expand Up @@ -596,7 +638,9 @@ def front_matter(self) -> str:
@property
def breadcrumbs(self) -> str:
return (
" > ".join([self.convert_page_link(ancestor) for ancestor in self.page.ancestors])
" > ".join(
[self.convert_page_link(ancestor.id) for ancestor in self.page.ancestors]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

convert_page_link takes an integer while ancestor.id is a string. Can you check what it is and either adjust ancestor.id to int or ensure that convert_page_link can handle the string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 6f15581.

)
+ "\n"
)

Expand Down Expand Up @@ -1001,7 +1045,7 @@ def convert_drawio(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -

return ""

def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
"""Convert PlantUML diagrams from editor2 XML to Markdown code blocks.

PlantUML diagrams are stored in the editor2 XML as structured macros with
Expand Down Expand Up @@ -1092,24 +1136,19 @@ def _get_path_for_href(self, path: Path, style: Literal["absolute", "relative"])
return result


def export_page(page_id: int) -> None:
    """Export a single Confluence page to Markdown.

    Args:
        page_id: The id of the page to export.
    """
    # Output location is derived from the page's own export_path template,
    # so no output-path argument is needed here.
    page = Page.from_id(page_id)
    page.export()


def export_pages(pages: list["Page | Descendant"]) -> None:
    """Export a list of Confluence pages to Markdown.

    Pages whose lockfile entry indicates no change since the last export
    are skipped entirely.

    Args:
        pages: Pages (or lightweight Descendants) to consider for export.
    """
    # Prefilter before creating the progress bar so it only reflects
    # pages that will actually be exported (new or updated ones).
    pages_to_export = [page for page in pages if LockfileManager.should_export(page)]

    if not pages_to_export:
        logger.info("No pages to export based on lockfile state.")
        return

    for page in (pbar := tqdm(pages_to_export, smoothing=0.05)):
        pbar.set_postfix_str(f"Exporting page {page.id}")
        # A Descendant only carries metadata; fetch the full page for export.
        full_page = Page.from_id(page.id)
        full_page.export()
        # Record the exported version so the next run can skip it if unchanged.
        LockfileManager.record_page(full_page)
Loading