Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1e5181b
Add lockfile tracking for exported pages
naoki-tateyama Feb 2, 2026
90f211c
feat: Incremental page download using lockfile
naoki-tateyama Feb 3, 2026
8691524
add prune command
naoki-tateyama Feb 3, 2026
0b5dbd3
add incremental export option to pages command
naoki-tateyama Feb 8, 2026
6bac022
fix performance issue by adding Descendant type
naoki-tateyama Feb 8, 2026
6edb622
integrate version property into Descendant class
naoki-tateyama Feb 8, 2026
70617ea
fix performance issue by avoiding ancestor fetching in the _template_…
naoki-tateyama Feb 8, 2026
d34eff7
add documentation for the new incremental option
naoki-tateyama Feb 8, 2026
bd13853
remove expand parameters from from_id and from_url methods.
naoki-tateyama Feb 8, 2026
2516f41
move LockfileManager to outside of the Page class
naoki-tateyama Feb 9, 2026
5029d3d
parse ancestor in simpler way
naoki-tateyama Feb 9, 2026
4201ccf
add tests for lockfile utils
naoki-tateyama Feb 11, 2026
8603a3b
refactor Ancestor and Descendant classes
naoki-tateyama Feb 11, 2026
dd86763
atomic file writes
naoki-tateyama Feb 15, 2026
5263c39
Update confluence_markdown_exporter/utils/lockfile.py
naoki-tateyama Mar 4, 2026
6f15581
fix Ancestor.id type from str to int
naoki-tateyama Mar 4, 2026
02562ff
prefilter pages before tqdm progress bar
naoki-tateyama Mar 4, 2026
509d3e6
sort lockfile entries by key for stable git diffs
naoki-tateyama Mar 4, 2026
4253132
make skip_unchanged a config option enabled by default
naoki-tateyama Mar 4, 2026
ed2f5a1
add automatic cleanup for deleted and moved pages
naoki-tateyama Mar 4, 2026
f899267
Introduce new config options and refactor clean
Spenhouet Mar 5, 2026
a115f39
Fix connection config
Spenhouet Mar 6, 2026
1439186
Redownload if file is missing
Spenhouet Mar 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .vscode/extensions.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
"ms-python.python",
"ms-python.vscode-pylance",
"njpwerner.autodocstring",
"visualstudioexptteam.vscodeintellicode",
"charliermarsh.ruff",
],
// List of extensions recommended by VS Code that should not be recommended for users of this workspace.
"unwantedRecommendations": []
}
}
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
- Converts Confluence macros to equivalent Markdown syntax where possible.
- Handles images and attachments by linking them appropriately in the Markdown output.
- Supports extended Markdown features like tasks, alerts, and front matter.
- Skips unchanged pages by default — only re-exports pages that have changed since the last run.
- Supports Confluence add-ons: [draw.io](https://marketplace.atlassian.com/apps/1210933/draw-io-diagrams-uml-bpmn-aws-erd-flowcharts), [PlantUML](https://marketplace.atlassian.com/apps/1222993/flowchart-plantuml-diagrams-for-confluence)

## Supported Markdown Elements
Expand Down Expand Up @@ -94,7 +95,7 @@ Export all Confluence pages of a single Space:
confluence-markdown-exporter spaces <space-key e.g. MYSPACE> <output path e.g. ./output_path/>
```

#### 2.3. Export all Spaces
#### 2.4. Export all Spaces

Export all Confluence pages across all spaces:

Expand Down Expand Up @@ -149,12 +150,17 @@ This will open a menu where you can:
| export.filename_encoding | Character mapping for filename encoding. | Default mappings for forbidden characters. |
| export.filename_length | Maximum length of filenames. | 255 |
| export.include_document_title | Whether to include the document title in the exported markdown file. | True |
| export.skip_unchanged | Skip exporting pages that have not changed since last export. Uses a lockfile to track page versions. | True |
| export.cleanup_stale | After export, delete local files for pages removed from Confluence or whose export path has changed. | True |
| export.lockfile_name | Name of the lock file used to track exported pages. | confluence-lock.json |
| export.existence_check_batch_size | Number of page IDs per batch when checking page existence during cleanup. Capped at 25 for self-hosted (CQL). | 250 |
| connection_config.backoff_and_retry | Enable automatic retry with exponential backoff | True |
| connection_config.backoff_factor | Multiplier for exponential backoff | 2 |
| connection_config.max_backoff_seconds | Maximum seconds to wait between retries | 60 |
| connection_config.max_backoff_retries | Maximum number of retry attempts | 5 |
| connection_config.retry_status_codes | HTTP status codes that trigger a retry | \[413, 429, 502, 503, 504\] |
| connection_config.verify_ssl | Whether to verify SSL certificates for HTTPS requests. | True |
| connection_config.use_v2_api | Enable Confluence REST API v2 endpoints. Supported on Atlassian Cloud and Data Center 8+. Disable for self-hosted Server instances. | False |
| auth.confluence.url | Confluence instance URL | "" |
| auth.confluence.username | Confluence username/email | "" |
| auth.confluence.api_token | Confluence API token | "" |
Expand Down
8 changes: 4 additions & 4 deletions confluence_markdown_exporter/api_clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ def get_confluence_instance() -> ConfluenceApiSdk:
"""Get authenticated Confluence API client using current settings."""
settings = get_settings()
auth = settings.auth
connection_config = settings.connection_config.model_dump()
connection_config = settings.connection_config.model_dump(exclude={"use_v2_api"})

while True:
try:
confluence = ApiClientFactory(connection_config).create_confluence(auth.confluence)
break
except ConnectionError:
except ConnectionError as e:
questionary.print(
"Confluence connection failed: Redirecting to Confluence authentication config...",
f"{e}\nRedirecting to Confluence authentication config...",
style="fg:red bold",
)
main_config_menu_loop("auth.confluence")
Expand All @@ -99,7 +99,7 @@ def get_jira_instance() -> JiraApiSdk:
"""Get authenticated Jira API client using current settings with required authentication."""
settings = get_settings()
auth = settings.auth
connection_config = settings.connection_config.model_dump()
connection_config = settings.connection_config.model_dump(exclude={"use_v2_api"})

while True:
try:
Expand Down
188 changes: 155 additions & 33 deletions confluence_markdown_exporter/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from confluence_markdown_exporter.utils.export import sanitize_filename
from confluence_markdown_exporter.utils.export import sanitize_key
from confluence_markdown_exporter.utils.export import save_file
from confluence_markdown_exporter.utils.lockfile import LockfileManager
from confluence_markdown_exporter.utils.table_converter import TableConverter
from confluence_markdown_exporter.utils.type_converter import str_to_bool

Expand Down Expand Up @@ -133,7 +134,7 @@ class Organization(BaseModel):
spaces: list["Space"]

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page | Descendant"]:
return [page for space in self.spaces for page in space.pages]

def export(self) -> None:
Expand Down Expand Up @@ -165,15 +166,15 @@ class Space(BaseModel):
homepage: int | None

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page | Descendant"]:
if self.homepage is None:
logger.warning(
f"Space '{self.name}' (key: {self.key}) has no homepage. No pages will be exported."
)
return []

homepage = Page.from_id(self.homepage)
return [self.homepage, *homepage.descendants]
return [homepage, *homepage.descendants]

def export(self) -> None:
export_pages(self.pages)
Expand Down Expand Up @@ -212,7 +213,8 @@ def from_json(cls, data: JsonResponse) -> "Label":
class Document(BaseModel):
title: str
space: Space
ancestors: list[int]
ancestors: list["Ancestor"]
version: Version

@property
def _template_vars(self) -> dict[str, str]:
Expand All @@ -221,10 +223,8 @@ def _template_vars(self) -> dict[str, str]:
"space_name": sanitize_filename(self.space.name),
"homepage_id": str(self.space.homepage),
"homepage_title": sanitize_filename(Page.from_id(self.space.homepage).title),
"ancestor_ids": "/".join(str(a) for a in self.ancestors),
"ancestor_titles": "/".join(
sanitize_filename(Page.from_id(a).title) for a in self.ancestors
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the _template_vars property, although the results are cached, Page.from_id(a).title for a in self.ancestors unnecessarily calls from_id. This is inconvenient for speeding up with the incremental option.
So I added an Ancestor type to store ancestor information.

),
"ancestor_ids": "/".join(str(a.id) for a in self.ancestors),
"ancestor_titles": "/".join(sanitize_filename(a.title) for a in self.ancestors),
}


Expand All @@ -237,7 +237,6 @@ class Attachment(Document):
collection_name: str
download_link: str
comment: str
version: Version

@property
def extension(self) -> str:
Expand Down Expand Up @@ -284,8 +283,8 @@ def from_json(cls, data: JsonResponse) -> "Attachment":
download_link=data.get("_links", {}).get("download", ""),
comment=extensions.get("comment", ""),
ancestors=[
*[ancestor.get("id") for ancestor in container.get("ancestors", [])],
container.get("id"),
*[Ancestor.from_json(ancestor) for ancestor in container.get("ancestors", [])],
Ancestor.from_json(container),
][1:],
version=Version.from_json(data.get("version", {})),
)
Expand Down Expand Up @@ -333,6 +332,47 @@ def export(self) -> None:
)


class Ancestor(Document):
    """Lightweight reference to a parent page: carries id/title without a full fetch."""

    id: int

    @classmethod
    def from_json(cls, data: JsonResponse) -> "Ancestor":
        """Build an :class:`Ancestor` from a Confluence API JSON object.

        Only id, title, and space are populated; ancestor chains and version
        information of an ancestor are intentionally left empty.
        """
        expandable_space = data.get("_expandable", {}).get("space", "")
        space_key = expandable_space.split("/")[-1]
        return cls(
            id=data.get("id", 0),
            title=data.get("title", ""),
            space=Space.from_key(space_key),
            # Ancestors of an ancestor are not needed for now.
            ancestors=[],
            # Version of an ancestor is not needed for now.
            version=Version.from_json({}),
        )


class Descendant(Document):
    """A page discovered via descendant search: enough metadata to compute its
    export path and version without fetching the full page body."""

    id: int

    @property
    def _template_vars(self) -> dict[str, str]:
        """Template variables for path construction (extends Document's set)."""
        template_vars = dict(super()._template_vars)
        template_vars["page_id"] = str(self.id)
        template_vars["page_title"] = sanitize_filename(self.title)
        return template_vars

    @property
    def export_path(self) -> Path:
        """Resolve the configured page path template against this page's vars."""
        # The config uses `{placeholder}` syntax; string.Template wants `${...}`.
        template = Template(settings.export.page_path.replace("{", "${"))
        return Path(template.safe_substitute(self._template_vars))

    @classmethod
    def from_json(cls, data: JsonResponse) -> "Descendant":
        """Build a :class:`Descendant` from a Confluence API JSON object."""
        expandable_space = data.get("_expandable", {}).get("space", "")
        # The first ancestor is the space homepage itself; drop it.
        parents = [Ancestor.from_json(item) for item in data.get("ancestors", [])][1:]
        return cls(
            id=data.get("id", 0),
            title=data.get("title", ""),
            space=Space.from_key(expandable_space.split("/")[-1]),
            ancestors=parents,
            version=Version.from_json(data.get("version", {})),
        )


class Page(Document):
id: int
body: str
Expand All @@ -342,11 +382,12 @@ class Page(Document):
attachments: list["Attachment"]

@property
def descendants(self) -> list[int]:
def descendants(self) -> list["Descendant"]:
url = "rest/api/content/search"
params = {
"cql": f"type=page AND ancestor={self.id}",
"limit": 100,
"expand": "metadata.properties,ancestors,version",
"limit": 250,
}
results = []

Expand All @@ -372,8 +413,7 @@ def descendants(self) -> list[int]:
f"Unexpected error when fetching descendants for content ID {self.id}."
)
return []

return [result["id"] for result in results]
return [Descendant.from_json(result) for result in results]

@property
def _template_vars(self) -> dict[str, str]:
Expand Down Expand Up @@ -410,7 +450,7 @@ def export(self) -> None:
self.export_markdown()

def export_with_descendants(self) -> None:
export_pages([self.id, *self.descendants])
export_pages([self, *self.descendants])

def export_body(self) -> None:
soup = BeautifulSoup(self.html, "html.parser")
Expand Down Expand Up @@ -498,7 +538,8 @@ def from_json(cls, data: JsonResponse) -> "Page":
for label in data.get("metadata", {}).get("labels", {}).get("results", [])
],
attachments=Attachment.from_page_id(data.get("id", 0)),
ancestors=[ancestor.get("id") for ancestor in data.get("ancestors", [])][1:],
ancestors=[Ancestor.from_json(ancestor) for ancestor in data.get("ancestors", [])][1:],
version=Version.from_json(data.get("version", {})),
)

@classmethod
Expand All @@ -511,7 +552,7 @@ def from_id(cls, page_id: int) -> "Page":
confluence.get_page_by_id(
page_id,
expand="body.view,body.export_view,body.editor2,metadata.labels,"
"metadata.properties,ancestors",
"metadata.properties,ancestors,version",
),
)
)
Expand All @@ -528,6 +569,7 @@ def from_id(cls, page_id: int) -> "Page":
labels=[],
attachments=[],
ancestors=[],
version=Version.from_json({}),
)

@classmethod
Expand Down Expand Up @@ -596,7 +638,9 @@ def front_matter(self) -> str:
@property
def breadcrumbs(self) -> str:
return (
" > ".join([self.convert_page_link(ancestor) for ancestor in self.page.ancestors])
" > ".join(
[self.convert_page_link(ancestor.id) for ancestor in self.page.ancestors]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

convert_page_link takes an integer while ancestor.id is a string. Can you check what it is and either adjust ancestor.id to int or ensure that convert_page_link can handle the string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 6f15581.

)
+ "\n"
)

Expand Down Expand Up @@ -1001,7 +1045,7 @@ def convert_drawio(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -

return ""

def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
"""Convert PlantUML diagrams from editor2 XML to Markdown code blocks.

PlantUML diagrams are stored in the editor2 XML as structured macros with
Expand Down Expand Up @@ -1092,24 +1136,102 @@ def _get_path_for_href(self, path: Path, style: Literal["absolute", "relative"])
return result


def export_page(page_id: int) -> None:
"""Export a Confluence page to Markdown.
_CQL_MAX_BATCH_SIZE: int = 25

Args:
page_id: The page id.
output_path: The output path.

def _fetch_page_ids_v2_batch(batch: list[str]) -> set[str]:
    """Single v2 API request for a batch of page IDs.

    Uses GET /api/v2/pages?id=X&id=Y&... (Atlassian Cloud).
    The v2 API accepts multiple ``id`` params, so they are encoded directly
    into the URL path since the SDK only accepts a dict for ``params``.
    """
    # Build repeated id=... pairs plus a limit matching the batch size.
    pairs: list[tuple[str, object]] = [("id", page_id) for page_id in batch]
    pairs.append(("limit", len(batch)))
    response = confluence.get(f"api/v2/pages?{urllib.parse.urlencode(pairs)}")
    if not response:
        return set()
    found: set[str] = set()
    for item in response.get("results", []):
        found.add(str(item["id"]))
    return found


def _fetch_page_ids_cql_batch(batch: list[str]) -> set[str]:
    """Single CQL v1 request for a batch of page IDs.

    Uses GET /rest/api/content/search with id in (...) (self-hosted / fallback).
    """
    joined_ids = ",".join(batch)
    response = confluence.get(
        "rest/api/content/search",
        params={"cql": "id in ({})".format(joined_ids), "limit": len(batch), "fields": "id"},
    )
    if not response:
        return set()
    return {str(result["id"]) for result in response.get("results", [])}


def fetch_deleted_page_ids(page_ids: list[str]) -> set[str]:
    """Return the subset of *page_ids* that no longer exist in Confluence.

    Uses the v2 REST API when ``connection_config.use_v2_api`` is enabled
    (multiple ``id`` query params, up to ``export.existence_check_batch_size``
    IDs per request), or the v1 CQL content search otherwise (capped at
    :data:`_CQL_MAX_BATCH_SIZE` IDs per request).

    Per-batch API failures are handled safely: affected IDs are assumed to
    still exist so they are never accidentally deleted.

    Args:
        page_ids: Page IDs (as strings) to check for existence.

    Returns:
        The IDs from *page_ids* that the API did not report as existing.
    """
    if not page_ids:
        return set()

    use_v2 = settings.connection_config.use_v2_api
    batch_size = settings.export.existence_check_batch_size
    # CQL "id in (...)" queries degrade with long ID lists; cap the batch size.
    effective_batch_size = batch_size if use_v2 else min(batch_size, _CQL_MAX_BATCH_SIZE)
    existing: set[str] = set()

    for i in range(0, len(page_ids), effective_batch_size):
        batch = page_ids[i : i + effective_batch_size]
        try:
            if use_v2:
                existing.update(_fetch_page_ids_v2_batch(batch))
            else:
                existing.update(_fetch_page_ids_cql_batch(batch))
        except Exception:  # noqa: BLE001
            # Fail safe: treat the whole batch as still existing so a flaky
            # API call can never trigger local file deletion.
            logger.warning(
                "Failed to check page existence for batch (%d IDs). "
                "Skipping deletion for these pages.",
                len(batch),
            )
            existing.update(batch)

    return set(page_ids) - existing


def sync_removed_pages() -> None:
    """Orchestrate stale-file cleanup: check API for deleted pages, then clean up."""
    if not settings.export.cleanup_stale:
        return

    unseen = LockfileManager.unseen_ids()
    if unseen:
        deleted = fetch_deleted_page_ids(sorted(unseen))
    else:
        deleted = set()
    LockfileManager.remove_pages(deleted)


def export_pages(page_ids: list[int]) -> None:
def export_pages(pages: list["Page | Descendant"]) -> None:
"""Export a list of Confluence pages to Markdown.

Args:
page_ids: List of pages to export.
output_path: The output path.
pages: List of pages to export.
"""
for page_id in (pbar := tqdm(page_ids, smoothing=0.05)):
pbar.set_postfix_str(f"Exporting page {page_id}")
export_page(page_id)
# Mark all pages as seen so cleanup skips API checks for unchanged pages
LockfileManager.mark_seen([p.id for p in pages])
pages_to_export = [page for page in pages if LockfileManager.should_export(page)]

if not pages_to_export:
logger.info("No pages to export based on lockfile state.")
return

for page in (pbar := tqdm(pages_to_export, smoothing=0.05)):
pbar.set_postfix_str(f"Exporting page {page.id}")
_page = Page.from_id(page.id)
_page.export()
# Record to lockfile if enabled
LockfileManager.record_page(_page)
Loading