Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1e5181b
Add lockfile tracking for exported pages
naoki-tateyama Feb 2, 2026
90f211c
feat: Incremental page download using lockfile
naoki-tateyama Feb 3, 2026
8691524
add prune command
naoki-tateyama Feb 3, 2026
0b5dbd3
add incremental export option to pages command
naoki-tateyama Feb 8, 2026
6bac022
fix performance issue by adding Descendant type
naoki-tateyama Feb 8, 2026
6edb622
integrate version property into Descendant class
naoki-tateyama Feb 8, 2026
70617ea
fix performance issue by avoiding ancestor fetching in the _template_…
naoki-tateyama Feb 8, 2026
d34eff7
add documentation for the new incremental option
naoki-tateyama Feb 8, 2026
bd13853
remove expand parameters from from_id and from_url methods.
naoki-tateyama Feb 8, 2026
2516f41
move LockfileManager to outside of the Page class
naoki-tateyama Feb 9, 2026
5029d3d
parse ancestor in simpler way
naoki-tateyama Feb 9, 2026
4201ccf
add tests for lockfile utils
naoki-tateyama Feb 11, 2026
8603a3b
refactor Ancestor and Descendant classes
naoki-tateyama Feb 11, 2026
dd86763
atomic file writes
naoki-tateyama Feb 15, 2026
5263c39
Update confluence_markdown_exporter/utils/lockfile.py
naoki-tateyama Mar 4, 2026
6f15581
fix Ancestor.id type from str to int
naoki-tateyama Mar 4, 2026
02562ff
prefilter pages before tqdm progress bar
naoki-tateyama Mar 4, 2026
509d3e6
sort lockfile entries by key for stable git diffs
naoki-tateyama Mar 4, 2026
4253132
make skip_unchanged a config option enabled by default
naoki-tateyama Mar 4, 2026
ed2f5a1
add automatic cleanup for deleted and moved pages
naoki-tateyama Mar 4, 2026
f899267
Introduce new config options and refactor clean
Spenhouet Mar 5, 2026
a115f39
Fix connection config
Spenhouet Mar 6, 2026
1439186
Redownload if file is missing
Spenhouet Mar 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 25 additions & 14 deletions confluence_markdown_exporter/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from confluence_markdown_exporter.utils.export import sanitize_filename
from confluence_markdown_exporter.utils.export import sanitize_key
from confluence_markdown_exporter.utils.export import save_file
from confluence_markdown_exporter.utils.lockfile import LockfileManager
from confluence_markdown_exporter.utils.table_converter import TableConverter
from confluence_markdown_exporter.utils.type_converter import str_to_bool

Expand Down Expand Up @@ -133,7 +134,7 @@ class Organization(BaseModel):
spaces: list["Space"]

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page"]:
return [page for space in self.spaces for page in space.pages]

def export(self) -> None:
Expand Down Expand Up @@ -165,15 +166,15 @@ class Space(BaseModel):
homepage: int | None

@property
def pages(self) -> list[int]:
def pages(self) -> list["Page"]:
if self.homepage is None:
logger.warning(
f"Space '{self.name}' (key: {self.key}) has no homepage. No pages will be exported."
)
return []

homepage = Page.from_id(self.homepage)
return [self.homepage, *homepage.descendants]
return [homepage, *homepage.descendants]

def export(self) -> None:
export_pages(self.pages)
Expand Down Expand Up @@ -340,12 +341,14 @@ class Page(Document):
editor2: str
labels: list["Label"]
attachments: list["Attachment"]
version: Version

@property
def descendants(self) -> list[int]:
def descendants(self) -> list["Page"]:
url = "rest/api/content/search"
params = {
"cql": f"type=page AND ancestor={self.id}",
"expand": "metadata.properties,ancestors,version",
"limit": 100,
}
results = []
Expand All @@ -372,8 +375,7 @@ def descendants(self) -> list[int]:
f"Unexpected error when fetching descendants for content ID {self.id}."
)
return []

return [result["id"] for result in results]
return [self.from_json(result) for result in results]

@property
def _template_vars(self) -> dict[str, str]:
Expand Down Expand Up @@ -409,8 +411,11 @@ def export(self) -> None:
self.export_attachments()
self.export_markdown()

# Record to lockfile if enabled
LockfileManager.record_page(self)

def export_with_descendants(self) -> None:
export_pages([self.id, *self.descendants])
export_pages([self, *self.descendants])

def export_body(self) -> None:
soup = BeautifulSoup(self.html, "html.parser")
Expand Down Expand Up @@ -499,6 +504,7 @@ def from_json(cls, data: JsonResponse) -> "Page":
],
attachments=Attachment.from_page_id(data.get("id", 0)),
ancestors=[ancestor.get("id") for ancestor in data.get("ancestors", [])][1:],
version=Version.from_json(data.get("version", {})),
)

@classmethod
Expand All @@ -511,7 +517,7 @@ def from_id(cls, page_id: int) -> "Page":
confluence.get_page_by_id(
page_id,
expand="body.view,body.export_view,body.editor2,metadata.labels,"
"metadata.properties,ancestors",
"metadata.properties,ancestors,version",
),
)
)
Expand All @@ -528,6 +534,7 @@ def from_id(cls, page_id: int) -> "Page":
labels=[],
attachments=[],
ancestors=[],
version=Version.from_json({}),
)

@classmethod
Expand Down Expand Up @@ -1001,7 +1008,7 @@ def convert_drawio(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -

return ""

def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911
"""Convert PlantUML diagrams from editor2 XML to Markdown code blocks.

PlantUML diagrams are stored in the editor2 XML as structured macros with
Expand Down Expand Up @@ -1103,13 +1110,17 @@ def export_page(page_id: int) -> None:
page.export()


def export_pages(page_ids: list[int]) -> None:
def export_pages(pages: list["Page"]) -> None:
"""Export a list of Confluence pages to Markdown.

Args:
page_ids: List of pages to export.
pages: List of pages to export.
output_path: The output path.
"""
for page_id in (pbar := tqdm(page_ids, smoothing=0.05)):
pbar.set_postfix_str(f"Exporting page {page_id}")
export_page(page_id)
for page in (pbar := tqdm(pages, smoothing=0.05)):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could prefilter before starting the tqdm to only show a progressbar for pages to be exported:

    pages_to_export = [page for page in pages if LockfileManager.should_export(page)]

    if not pages_to_export:
        logger.info("No pages to export based on lockfile state.")
        return

    for page in (pbar := tqdm(pages_to_export, smoothing=0.05)):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 02562ff.

# filter pages new and updated only
if LockfileManager.should_export(page):
pbar.set_postfix_str(f"Exporting page {page.id}")
export_page(page.id)
else:
pbar.set_postfix_str(f"Skipping page {page.id} (no changes)")
95 changes: 92 additions & 3 deletions confluence_markdown_exporter/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from confluence_markdown_exporter.utils.app_data_store import get_settings
from confluence_markdown_exporter.utils.app_data_store import set_setting
from confluence_markdown_exporter.utils.config_interactive import main_config_menu_loop
from confluence_markdown_exporter.utils.lockfile import LockfileManager
from confluence_markdown_exporter.utils.measure_time import measure
from confluence_markdown_exporter.utils.platform_compat import handle_powershell_tilde_expansion
from confluence_markdown_exporter.utils.type_converter import str_to_bool
Expand All @@ -32,12 +33,22 @@ def pages(
help="Directory to write exported Markdown files to. Overrides config if set."
),
] = None,
*,
lockfile: Annotated[
bool,
typer.Option(
"--lockfile",
help="Enable lock file tracking for exported pages.",
),
] = False,
) -> None:
from confluence_markdown_exporter.confluence import Page

with measure(f"Export pages {', '.join(pages)}"):
override_output_path_config(output_path)
if lockfile:
LockfileManager.init()
for page in pages:
override_output_path_config(output_path)
_page = Page.from_id(int(page)) if page.isdigit() else Page.from_url(page)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pages command downloads the page body directly here, so it's difficult to filter using version information.

_page.export()

Expand All @@ -51,12 +62,29 @@ def pages_with_descendants(
help="Directory to write exported Markdown files to. Overrides config if set."
),
] = None,
*,
lockfile: Annotated[
bool,
typer.Option(
"--lockfile",
help="Enable lock file tracking for exported pages.",
),
] = False,
incremental: Annotated[
bool,
typer.Option(
"--incremental",
help="Only export pages that have changed since last export.",
),
] = False,
) -> None:
from confluence_markdown_exporter.confluence import Page

with measure(f"Export pages {', '.join(pages)} with descendants"):
override_output_path_config(output_path)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

override_output_path_config does not need to be called multiple times, so I moved this line out of the loop.

if lockfile or incremental:
LockfileManager.init()
for page in pages:
override_output_path_config(output_path)
_page = Page.from_id(int(page)) if page.isdigit() else Page.from_url(page)
_page.export_with_descendants()

Expand All @@ -70,6 +98,21 @@ def spaces(
help="Directory to write exported Markdown files to. Overrides config if set."
),
] = None,
*,
lockfile: Annotated[
bool,
typer.Option(
"--lockfile",
help="Enable lock file tracking for exported pages.",
),
] = False,
incremental: Annotated[
bool,
typer.Option(
"--incremental",
help="Only export pages that have changed since last export.",
),
] = False,
) -> None:
from confluence_markdown_exporter.confluence import Space

Expand All @@ -78,8 +121,10 @@ def spaces(
normalized_space_keys = [handle_powershell_tilde_expansion(key) for key in space_keys]

with measure(f"Export spaces {', '.join(normalized_space_keys)}"):
override_output_path_config(output_path)
if lockfile or incremental:
LockfileManager.init()
for space_key in normalized_space_keys:
override_output_path_config(output_path)
space = Space.from_key(space_key)
space.export()

Expand All @@ -92,11 +137,28 @@ def all_spaces(
help="Directory to write exported Markdown files to. Overrides config if set."
),
] = None,
*,
lockfile: Annotated[
bool,
typer.Option(
"--lockfile",
help="Enable lock file tracking for exported pages.",
),
] = False,
incremental: Annotated[
bool,
typer.Option(
"--incremental",
help="Only export pages that have changed since last export.",
),
] = False,
) -> None:
from confluence_markdown_exporter.confluence import Organization

with measure("Export all spaces"):
override_output_path_config(output_path)
if lockfile or incremental:
LockfileManager.init()
org = Organization.from_api()
org.export()

Expand Down Expand Up @@ -125,6 +187,33 @@ def config(
main_config_menu_loop(jump_to)


@app.command(help="Delete exported files that are not tracked in the lockfile.")
def prune(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do this as a separate command? Because it could be destructive?

Why not always run this? Could also make this a config option which someone can disable if this is not desired.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Spenhouet Thank you for your comments! I'll work on your comments soon.

Regarding this comment, for users who use prior versions of cme, the files already downloaded are not tracked by the confluence-lock.json. If prune command is always executed, files not on the lockfile would be deleted.
This may be inconvenient for users using multiple executions of export commands like

cme pages 1234
cme pages 5678

The execution of downloading 1234 would delete 5678.
After all, the 5678 file would be downloaded by the second command, so this may not be a problem, but deleting may not be intended by users.
To allow users to explicitly control the behavior, I intentionally separated prune command.
What do you think?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we need to rethink how the pruning works. Imo we need to detect which files previously tracked in the lock file are now gone, i.e. an entry removed from the lock file results in removal of the file on disk. Similarly for renamed or moved pages.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some brainstorming here:

  1. For every page a new scan hits and compares against the lock file (record_page) we can tell if the export_path changed and we do know the previous export_path so we can delete the file at the previous export path at that point in time while executing record_page. This should already cover moved and renamed pages.
  2. That leaves us with deleted pages. These are a bit more tricky. Note that we are not guaranteed that the command executed is always against the whole space (or against the scope of the lock file). Which means we cannot simply "delete everything that is in the lock file but wasn't in the sync". We could use that info to narrow down how many pages of the lock file we might need to check. At the end of a run we could get the list of pages which are in the lock file but were not in the sync results. Then we could query all these pages and see if they still exist. For all pages which are in the lock file but no longer exist in Confluence we delete the old page file on disk. This way we only perform the deletion for previously synced pages and for nothing else. That check is a bit expensive but I don't yet have a better idea.

Copy link
Contributor Author

@naoki-tateyama naoki-tateyama Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 9e4369c. Replaced the separate prune command with automatic cleanup during export. Each lockfile entry now records command and args to define its scope. On cleanup, pages no longer present in the current scope are automatically deleted from disk and the lockfile. Moved pages (changed export_path) also have their old files removed.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked if we can run batch requests and Sonnet provided this script, which uses the v2 API and a fallback via CQL for instances without v2 API:

# ---------------------------------------------------------------------------
# Confluence existence check — batched requests
# ---------------------------------------------------------------------------

def _fetch_existing_ids_v2(
    session: requests.Session,
    v2_base: str,
    all_ids: list[str],
) -> set[str]:
    """Atlassian Cloud: GET /wiki/api/v2/pages?id=X&id=Y&...&limit=250.

    One request per batch of V2_BATCH_SIZE IDs. IDs present in the response
    exist; IDs absent from the response are treated as deleted.
    """
    found: set[str] = set()
    total_batches = math.ceil(len(all_ids) / V2_BATCH_SIZE)

    for index, offset in enumerate(range(0, len(all_ids), V2_BATCH_SIZE), start=1):
        chunk = all_ids[offset : offset + V2_BATCH_SIZE]
        print(
            f"  Batch {index}/{total_batches} ({len(chunk)} IDs) ...",
            end="\r",
            flush=True,
        )
        # v2 accepts repeated ?id= query params, one per page ID in the batch.
        query: list[tuple[str, str | int]] = [("id", pid) for pid in chunk]
        query.append(("limit", len(chunk)))
        response = session.get(f"{v2_base}/pages", params=query)
        if not response.ok:
            print(
                f"\nERROR: v2 pages request failed (HTTP {response.status_code}).\n"
                f"Response: {response.text[:400]}",
                file=sys.stderr,
            )
            sys.exit(1)
        for item in response.json().get("results", []):
            found.add(str(item["id"]))

    # Blank out the in-place progress line before returning.
    print(" " * 60, end="\r")
    return found


def _fetch_existing_ids_cql(
    session: requests.Session,
    api_base: str,
    all_ids: list[str],
) -> set[str]:
    """Self-hosted fallback: CQL ``id in (...)`` in batches of CQL_BATCH_SIZE.

    Smaller batches (25) stay well within the CQL aggregator limits.
    """
    found: set[str] = set()
    total_batches = math.ceil(len(all_ids) / CQL_BATCH_SIZE)

    for index, offset in enumerate(range(0, len(all_ids), CQL_BATCH_SIZE), start=1):
        chunk = all_ids[offset : offset + CQL_BATCH_SIZE]
        print(
            f"  Batch {index}/{total_batches} ({len(chunk)} IDs) ...",
            end="\r",
            flush=True,
        )
        query = {
            "cql": "id in (" + ",".join(chunk) + ")",
            "limit": len(chunk),
            "fields": "id",
        }
        response = session.get(f"{api_base}/content/search", params=query)
        if not response.ok:
            print(
                f"\nERROR: CQL query failed (HTTP {response.status_code}).\n"
                f"Response: {response.text[:400]}",
                file=sys.stderr,
            )
            sys.exit(1)
        for item in response.json().get("results", []):
            found.add(str(item["id"]))

    # Blank out the in-place progress line before returning.
    print(" " * 60, end="\r")
    return found


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Report pages tracked in a .confluence-lock.json that no longer exist in Confluence."""
    parser = argparse.ArgumentParser(
        description="Find pages in a .confluence-lock.json that no longer exist in Confluence.",
    )
    parser.add_argument(
        "--lock",
        default=str(Path(__file__).parent / ".confluence-lock.json"),
        help="Path to .confluence-lock.json (default: <script-dir>/.confluence-lock.json)",
    )
    cli = parser.parse_args()

    lock_file = Path(cli.lock)
    tracked = _load_lock(lock_file)
    tracked_ids = list(tracked.keys())

    print(f"Lock file: {lock_file.resolve()}")
    print(f"Total pages in lock: {len(tracked_ids)}")

    api_base, username, api_token, pat = _get_credentials()
    session = _make_session(username, api_token, pat)

    # Strip the REST suffix from api_base to recover the raw site URL for v2 probing.
    raw_url = api_base.replace("/wiki/rest/api", "").replace("/rest/api", "")
    v2_base = _get_v2_base(raw_url)

    # Prefer the v2 bulk endpoint when available; fall back to v1 CQL otherwise.
    if v2_base:
        batches = math.ceil(len(tracked_ids) / V2_BATCH_SIZE)
        print(
            f"Confluence API (v2): {v2_base}\n"
            f"Checking {len(tracked_ids)} pages in {batches} batch(es) "
            f"of up to {V2_BATCH_SIZE} IDs ..."
        )
        alive = _fetch_existing_ids_v2(session, v2_base, tracked_ids)
    else:
        batches = math.ceil(len(tracked_ids) / CQL_BATCH_SIZE)
        print(
            f"Confluence API (v1 CQL): {api_base}\n"
            f"Checking {len(tracked_ids)} pages in {batches} batch(es) "
            f"of up to {CQL_BATCH_SIZE} IDs ..."
        )
        alive = _fetch_existing_ids_cql(session, api_base, tracked_ids)

    # Numeric sort keeps the report ordering stable and human-friendly.
    gone = sorted(set(tracked_ids) - alive, key=int)

    print(f"Result: {len(alive)} existing, {len(gone)} deleted.\n")

    if not gone:
        print("No deleted pages found.")
        return

    print(f"Deleted pages ({len(gone)}):")
    print(f"{'ID':<15}  {'Title':<50}  export_path")
    print("-" * 120)
    for page_id in gone:
        record = tracked[page_id]
        title = record.get("title", "")
        export_path = record.get("export_path", "")
        print(f"{page_id:<15}  {title:<50}  {export_path}")

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each lockfile entry now records command and args to define its scope

Not a fan of recording the scope and args in the lock file. Also it should be possible to e.g. only sync a single page without all other previously synced pages being deleted. We should only delete pages which were synced before and truly no longer exist on Confluence.

I'd prefer to the do the batch scan at the end of the sync for all pages that were not within the sync results but are in the lock file.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to add config options for the auto prune (Default on) and the batch size v2 and cql.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in ed2f5a1. Replaced the scope-based approach with a v2 API batch check (wiki/api/v2/pages) during cleanup. Unseen lockfile pages are checked against Confluence in batches of 250, and only pages confirmed to no longer exist are deleted from disk and the lockfile. Old files for moved pages (changed export_path) are also cleaned up.

Added export.cleanup_stale config option (default: True) to enable/disable this behavior.

output_path: Annotated[
Path | None,
typer.Option(help="Directory containing exported Markdown files. Overrides config if set."),
] = None,
*,
dry_run: Annotated[
bool,
typer.Option(
"--dry-run",
help="Show files that would be deleted without actually deleting them.",
),
] = False,
) -> None:
"""Delete exported files not tracked in the lockfile."""
override_output_path_config(output_path)
LockfileManager.init()
deleted = LockfileManager.cleanup_untracked(dry_run=dry_run)
if dry_run:
typer.echo(f"Would delete {len(deleted)} file(s):")
for path in deleted:
typer.echo(f" {path}")
else:
typer.echo(f"Deleted {len(deleted)} file(s).")


@app.command(help="Show the current version of confluence-markdown-exporter.")
def version() -> None:
"""Display the current version."""
Expand Down
Loading