Skip to content

Commit 9538fe3

Browse files
add scope-based auto-prune for removed/moved pages
Replace manual prune command with automatic cleanup during export. Each lockfile entry now tracks command and args to define its scope. On cleanup, pages no longer in the current scope are deleted from disk and lockfile. Moved pages have their old files removed. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent 4253132 commit 9538fe3

5 files changed

Lines changed: 376 additions & 106 deletions

File tree

README.md

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
- Converts Confluence macros to equivalent Markdown syntax where possible.
2222
- Handles images and attachments by linking them appropriately in the Markdown output.
2323
- Supports extended Markdown features like tasks, alerts, and front matter.
24-
- Supports incremental exports — only re-exports pages that have changed since the last run.
24+
- Skips unchanged pages by default — only re-exports pages that have changed since the last run.
2525
- Supports Confluence add-ons: [draw.io](https://marketplace.atlassian.com/apps/1210933/draw-io-diagrams-uml-bpmn-aws-erd-flowcharts), [PlantUML](https://marketplace.atlassian.com/apps/1222993/flowchart-plantuml-diagrams-for-confluence)
2626

2727
## Supported Markdown Elements
@@ -103,30 +103,6 @@ Export all Confluence pages across all spaces:
103103
confluence-markdown-exporter all-spaces <output path e.g. ./output_path/>
104104
```
105105

106-
#### 2.5. Incremental Export
107-
108-
All export commands (`pages`, `pages-with-descendants`, `spaces`, `all-spaces`) support the `--incremental` flag. When enabled, only pages that have changed since the last export are re-exported:
109-
110-
```sh
111-
confluence-markdown-exporter spaces <space-key> --incremental
112-
```
113-
114-
This uses a lockfile to track previously exported pages and their versions, making subsequent exports significantly faster.
115-
116-
#### 2.6. Prune Untracked Files
117-
118-
After using incremental exports, you can clean up exported files that are no longer tracked in the lockfile (e.g. deleted pages):
119-
120-
```sh
121-
confluence-markdown-exporter prune
122-
```
123-
124-
Use `--dry-run` to preview which files would be deleted without actually deleting them:
125-
126-
```sh
127-
confluence-markdown-exporter prune --dry-run
128-
```
129-
130106
### 3. Output
131107

132108
The exported Markdown file(s) will be saved in the specified `output` directory e.g.:
@@ -174,6 +150,7 @@ This will open a menu where you can:
174150
| export.filename_encoding | Character mapping for filename encoding. | Default mappings for forbidden characters. |
175151
| export.filename_length | Maximum length of filenames. | 255 |
176152
| export.include_document_title | Whether to include the document title in the exported markdown file. | True |
153+
| export.skip_unchanged | Skip exporting pages that have not changed since the last export. Uses a lockfile to track page versions. | True |
177154
| connection_config.backoff_and_retry | Enable automatic retry with exponential backoff | True |
178155
| connection_config.backoff_factor | Multiplier for exponential backoff | 2 |
179156
| connection_config.max_backoff_seconds | Maximum seconds to wait between retries | 60 |

confluence_markdown_exporter/confluence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,7 @@ def export_pages(pages: list["Page | Descendant"]) -> None:
11421142
Args:
11431143
pages: List of pages to export.
11441144
"""
1145+
LockfileManager.mark_seen([p.id for p in pages])
11451146
pages_to_export = [page for page in pages if LockfileManager.should_export(page)]
11461147

11471148
if not pages_to_export:
@@ -1152,5 +1153,4 @@ def export_pages(pages: list["Page | Descendant"]) -> None:
11521153
pbar.set_postfix_str(f"Exporting page {page.id}")
11531154
_page = Page.from_id(page.id)
11541155
_page.export()
1155-
# Record to lockfile if enabled
11561156
LockfileManager.record_page(_page)

confluence_markdown_exporter/main.py

Lines changed: 8 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,12 @@ def pages(
3838

3939
with measure(f"Export pages {', '.join(pages)}"):
4040
override_output_path_config(output_path)
41-
LockfileManager.init()
41+
LockfileManager.init("pages", pages)
4242
for page in pages:
4343
_page = Page.from_id(int(page)) if page.isdigit() else Page.from_url(page)
4444
_page.export()
4545
LockfileManager.record_page(_page)
46+
LockfileManager.cleanup()
4647

4748

4849
@app.command(help="Export Confluence pages and their descendant pages by ID or URL to Markdown.")
@@ -59,10 +60,11 @@ def pages_with_descendants(
5960

6061
with measure(f"Export pages {', '.join(pages)} with descendants"):
6162
override_output_path_config(output_path)
62-
LockfileManager.init()
6363
for page in pages:
64+
LockfileManager.init("pages_with_descendants", [page])
6465
_page = Page.from_id(int(page)) if page.isdigit() else Page.from_url(page)
6566
_page.export_with_descendants()
67+
LockfileManager.cleanup()
6668

6769

6870
@app.command(help="Export all Confluence pages of one or more spaces to Markdown.")
@@ -83,10 +85,11 @@ def spaces(
8385

8486
with measure(f"Export spaces {', '.join(normalized_space_keys)}"):
8587
override_output_path_config(output_path)
86-
LockfileManager.init()
8788
for space_key in normalized_space_keys:
89+
LockfileManager.init("spaces", [space_key])
8890
space = Space.from_key(space_key)
8991
space.export()
92+
LockfileManager.cleanup()
9093

9194

9295
@app.command(help="Export all Confluence pages across all spaces to Markdown.")
@@ -102,9 +105,10 @@ def all_spaces(
102105

103106
with measure("Export all spaces"):
104107
override_output_path_config(output_path)
105-
LockfileManager.init()
108+
LockfileManager.init("all_spaces", [])
106109
org = Organization.from_api()
107110
org.export()
111+
LockfileManager.cleanup()
108112

109113

110114
@app.command(help="Open the interactive configuration menu or display current configuration.")
@@ -131,33 +135,6 @@ def config(
131135
main_config_menu_loop(jump_to)
132136

133137

134-
@app.command(help="Delete exported files that are not tracked in the lockfile.")
135-
def prune(
136-
output_path: Annotated[
137-
Path | None,
138-
typer.Option(help="Directory containing exported Markdown files. Overrides config if set."),
139-
] = None,
140-
*,
141-
dry_run: Annotated[
142-
bool,
143-
typer.Option(
144-
"--dry-run",
145-
help="Show files that would be deleted without actually deleting them.",
146-
),
147-
] = False,
148-
) -> None:
149-
"""Delete exported files not tracked in the lockfile."""
150-
override_output_path_config(output_path)
151-
LockfileManager.init()
152-
deleted = LockfileManager.cleanup_untracked(dry_run=dry_run)
153-
if dry_run:
154-
typer.echo(f"Would delete {len(deleted)} file(s):")
155-
for path in deleted:
156-
typer.echo(f" {path}")
157-
else:
158-
typer.echo(f"Deleted {len(deleted)} file(s).")
159-
160-
161138
@app.command(help="Show the current version of confluence-markdown-exporter.")
162139
def version() -> None:
163140
"""Display the current version."""

confluence_markdown_exporter/utils/lockfile.py

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from datetime import timezone
1010
from pathlib import Path
1111
from typing import TYPE_CHECKING
12+
from typing import ClassVar
1213

1314
from pydantic import BaseModel
1415
from pydantic import Field
@@ -30,6 +31,8 @@ class PageEntry(BaseModel):
3031
title: str
3132
version: int
3233
export_path: str
34+
command: str
35+
args: list[str] = Field(default_factory=list)
3336

3437

3538
class ConfluenceLock(BaseModel):
@@ -50,7 +53,7 @@ def load(cls, lockfile_path: Path) -> ConfluenceLock:
5053
logger.warning(f"Failed to parse lock file: {lockfile_path}. Starting fresh.")
5154
return cls()
5255

53-
def save(self, lockfile_path: Path) -> None:
56+
def save(self, lockfile_path: Path, *, delete_ids: set[str] | None = None) -> None:
5457
"""Save lock file to disk.
5558
5659
To handle concurrent writes, this method reads the existing lock file
@@ -61,9 +64,12 @@ def save(self, lockfile_path: Path) -> None:
6164
# Read existing lock file and merge to handle concurrent writes
6265
existing = ConfluenceLock.load(lockfile_path)
6366
existing.pages = dict(sorted({**existing.pages, **self.pages}.items()))
67+
if delete_ids:
68+
for page_id in delete_ids:
69+
existing.pages.pop(page_id, None)
6470
existing.last_export = datetime.now(timezone.utc).isoformat()
6571

66-
json_str = json.dumps(existing.model_dump(), indent=2)
72+
json_str = json.dumps(existing.model_dump(), indent=2, ensure_ascii=False)
6773
tmp_path = None
6874
try:
6975
with tempfile.NamedTemporaryFile(
@@ -85,7 +91,7 @@ def save(self, lockfile_path: Path) -> None:
8591
self.pages = existing.pages
8692
self.last_export = existing.last_export
8793

88-
def add_page(self, page: Page) -> None:
94+
def add_page(self, page: Page, command: str = "", args: list[str] | None = None) -> None:
8995
"""Add or update a page entry in the lock file."""
9096
if page.version is None:
9197
logger.warning(f"Page {page.id} has no version info. Skipping lock entry.")
@@ -95,35 +101,57 @@ def add_page(self, page: Page) -> None:
95101
title=page.title,
96102
version=page.version.number,
97103
export_path=str(page.export_path),
104+
command=command,
105+
args=args or [],
98106
)
99107

100108

101109
class LockfileManager:
102110
"""Manager for lock file operations during export."""
103111

104-
_lockfile_path: Path | None = None
105-
_lock: ConfluenceLock | None = None
112+
_lockfile_path: ClassVar[Path | None] = None
113+
_lock: ClassVar[ConfluenceLock | None] = None
114+
_output_path: ClassVar[Path | None] = None
115+
_command: ClassVar[str] = ""
116+
_args: ClassVar[list[str]] = []
117+
_scope_entries: ClassVar[dict[str, PageEntry]] = {}
118+
_seen_page_ids: ClassVar[set[str]] = set()
106119

107120
@classmethod
108-
def init(cls) -> None:
121+
def init(cls, command: str = "", args: list[str] | None = None) -> None:
109122
"""Initialize the lockfile manager if skip_unchanged is enabled."""
110123
from confluence_markdown_exporter.utils.app_data_store import get_settings
111124

112125
settings = get_settings()
113126
if not settings.export.skip_unchanged:
114127
return
115128

116-
cls._lockfile_path = settings.export.output_path / LOCKFILE_FILENAME
129+
cls._output_path = settings.export.output_path
130+
cls._lockfile_path = cls._output_path / LOCKFILE_FILENAME
117131
cls._lock = ConfluenceLock.load(cls._lockfile_path)
132+
cls._command = command
133+
cls._args = args or []
134+
cls._scope_entries = {
135+
k: v
136+
for k, v in cls._lock.pages.items()
137+
if v.command == cls._command and v.args == cls._args
138+
}
139+
cls._seen_page_ids = set()
118140

119141
@classmethod
120142
def record_page(cls, page: Page) -> None:
121143
"""Record a page export to the lock file."""
122144
if cls._lock is None or cls._lockfile_path is None:
123145
return
124146

125-
cls._lock.add_page(page)
147+
cls._lock.add_page(page, command=cls._command, args=cls._args)
126148
cls._lock.save(cls._lockfile_path)
149+
cls._seen_page_ids.add(str(page.id))
150+
151+
@classmethod
152+
def mark_seen(cls, page_ids: list[int]) -> None:
153+
"""Mark page IDs as part of the current export scope."""
154+
cls._seen_page_ids.update(str(pid) for pid in page_ids)
127155

128156
@classmethod
129157
def should_export(cls, page: Page | Descendant) -> bool:
@@ -146,33 +174,23 @@ def should_export(cls, page: Page | Descendant) -> bool:
146174
return entry.version != page.version.number or entry.export_path != str(page.export_path)
147175

148176
@classmethod
149-
def cleanup_untracked(cls, *, dry_run: bool = False) -> list[Path]:
150-
"""Delete exported files that are not in the lockfile.
151-
152-
Args:
153-
dry_run: If True, only return files that would be deleted without deleting.
154-
155-
Returns list of deleted (or would-be-deleted) file paths.
156-
"""
157-
from confluence_markdown_exporter.utils.app_data_store import get_settings
158-
159-
if cls._lock is None:
160-
return []
177+
def cleanup(cls) -> None:
178+
"""Remove lockfile entries and files for pages no longer in the current scope."""
179+
if cls._lock is None or cls._lockfile_path is None or cls._output_path is None:
180+
return
161181

162-
settings = get_settings()
163-
output_path = settings.export.output_path
164-
165-
# Collect all export_paths from lockfile
166-
tracked_paths = {Path(entry.export_path) for entry in cls._lock.pages.values()}
167-
168-
# Find all markdown files in output directory
169-
untracked: list[Path] = []
170-
for md_file in output_path.rglob("*.md"):
171-
relative_path = md_file.relative_to(output_path)
172-
if relative_path not in tracked_paths:
173-
untracked.append(relative_path)
174-
if not dry_run:
175-
md_file.unlink()
176-
logger.info(f"Deleted untracked file: {relative_path}")
177-
178-
return untracked
182+
delete_ids: set[str] = set()
183+
184+
for page_id, old_entry in cls._scope_entries.items():
185+
if page_id not in cls._seen_page_ids:
186+
(cls._output_path / old_entry.export_path).unlink(missing_ok=True)
187+
logger.info(f"Deleted removed page: {old_entry.export_path}")
188+
delete_ids.add(page_id)
189+
elif page_id in cls._lock.pages:
190+
new_entry = cls._lock.pages[page_id]
191+
if old_entry.export_path != new_entry.export_path:
192+
(cls._output_path / old_entry.export_path).unlink(missing_ok=True)
193+
logger.info(f"Deleted old path for moved page: {old_entry.export_path}")
194+
195+
if delete_ids:
196+
cls._lock.save(cls._lockfile_path, delete_ids=delete_ids)

0 commit comments

Comments
 (0)