-
-
Notifications
You must be signed in to change notification settings - Fork 77
feat: incremental download using lockfile #141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
1e5181b
90f211c
8691524
0b5dbd3
6bac022
6edb622
70617ea
d34eff7
bd13853
2516f41
5029d3d
4201ccf
8603a3b
dd86763
5263c39
6f15581
02562ff
509d3e6
4253132
ed2f5a1
f899267
a115f39
1439186
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,7 @@ | |
| from confluence_markdown_exporter.utils.export import sanitize_filename | ||
| from confluence_markdown_exporter.utils.export import sanitize_key | ||
| from confluence_markdown_exporter.utils.export import save_file | ||
| from confluence_markdown_exporter.utils.lockfile import LockfileManager | ||
| from confluence_markdown_exporter.utils.table_converter import TableConverter | ||
| from confluence_markdown_exporter.utils.type_converter import str_to_bool | ||
|
|
||
|
|
@@ -133,7 +134,7 @@ class Organization(BaseModel): | |
| spaces: list["Space"] | ||
|
|
||
| @property | ||
| def pages(self) -> list[int]: | ||
| def pages(self) -> list["Page | Descendant"]: | ||
| return [page for space in self.spaces for page in space.pages] | ||
|
|
||
| def export(self) -> None: | ||
|
|
@@ -165,15 +166,15 @@ class Space(BaseModel): | |
| homepage: int | None | ||
|
|
||
| @property | ||
| def pages(self) -> list[int]: | ||
| def pages(self) -> list["Page | Descendant"]: | ||
| if self.homepage is None: | ||
| logger.warning( | ||
| f"Space '{self.name}' (key: {self.key}) has no homepage. No pages will be exported." | ||
| ) | ||
| return [] | ||
|
|
||
| homepage = Page.from_id(self.homepage) | ||
| return [self.homepage, *homepage.descendants] | ||
| return [homepage, *homepage.descendants] | ||
|
|
||
| def export(self) -> None: | ||
| export_pages(self.pages) | ||
|
|
@@ -212,7 +213,8 @@ def from_json(cls, data: JsonResponse) -> "Label": | |
| class Document(BaseModel): | ||
| title: str | ||
| space: Space | ||
| ancestors: list[int] | ||
| ancestors: list["Ancestor"] | ||
| version: Version | ||
|
|
||
| @property | ||
| def _template_vars(self) -> dict[str, str]: | ||
|
|
@@ -221,10 +223,8 @@ def _template_vars(self) -> dict[str, str]: | |
| "space_name": sanitize_filename(self.space.name), | ||
| "homepage_id": str(self.space.homepage), | ||
| "homepage_title": sanitize_filename(Page.from_id(self.space.homepage).title), | ||
| "ancestor_ids": "/".join(str(a) for a in self.ancestors), | ||
| "ancestor_titles": "/".join( | ||
| sanitize_filename(Page.from_id(a).title) for a in self.ancestors | ||
| ), | ||
| "ancestor_ids": "/".join(str(a.id) for a in self.ancestors), | ||
| "ancestor_titles": "/".join(sanitize_filename(a.title) for a in self.ancestors), | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -237,7 +237,6 @@ class Attachment(Document): | |
| collection_name: str | ||
| download_link: str | ||
| comment: str | ||
| version: Version | ||
|
|
||
| @property | ||
| def extension(self) -> str: | ||
|
|
@@ -284,8 +283,8 @@ def from_json(cls, data: JsonResponse) -> "Attachment": | |
| download_link=data.get("_links", {}).get("download", ""), | ||
| comment=extensions.get("comment", ""), | ||
| ancestors=[ | ||
| *[ancestor.get("id") for ancestor in container.get("ancestors", [])], | ||
| container.get("id"), | ||
| *[Ancestor.from_json(ancestor) for ancestor in container.get("ancestors", [])], | ||
| Ancestor.from_json(container), | ||
| ][1:], | ||
| version=Version.from_json(data.get("version", {})), | ||
| ) | ||
|
|
@@ -333,6 +332,47 @@ def export(self) -> None: | |
| ) | ||
|
|
||
|
|
||
| class Ancestor(Document): | ||
| id: str | ||
|
|
||
| @classmethod | ||
| def from_json(cls, data: JsonResponse) -> "Ancestor": | ||
| return cls( | ||
| id=data.get("id", 0), | ||
| title=data.get("title", ""), | ||
| space=Space.from_key(data.get("_expandable", {}).get("space", "").split("/")[-1]), | ||
| ancestors=[], # Ancestors of ancestor is not needed for now. | ||
| version=Version.from_json({}), # Version of ancestor is not needed for now. | ||
| ) | ||
|
|
||
|
|
||
| class Descendant(Document): | ||
| id: int | ||
|
|
||
| @property | ||
| def _template_vars(self) -> dict[str, str]: | ||
| return { | ||
| **super()._template_vars, | ||
| "page_id": str(self.id), | ||
| "page_title": sanitize_filename(self.title), | ||
| } | ||
|
|
||
| @property | ||
| def export_path(self) -> Path: | ||
| filepath_template = Template(settings.export.page_path.replace("{", "${")) | ||
| return Path(filepath_template.safe_substitute(self._template_vars)) | ||
|
|
||
| @classmethod | ||
| def from_json(cls, data: JsonResponse) -> "Descendant": | ||
| return cls( | ||
| id=data.get("id", 0), | ||
| title=data.get("title", ""), | ||
| space=Space.from_key(data.get("_expandable", {}).get("space", "").split("/")[-1]), | ||
| ancestors=[Ancestor.from_json(ancestor) for ancestor in data.get("ancestors", [])][1:], | ||
| version=Version.from_json(data.get("version", {})), | ||
| ) | ||
|
|
||
|
|
||
| class Page(Document): | ||
| id: int | ||
| body: str | ||
|
|
@@ -342,10 +382,11 @@ class Page(Document): | |
| attachments: list["Attachment"] | ||
|
|
||
| @property | ||
| def descendants(self) -> list[int]: | ||
| def descendants(self) -> list["Descendant"]: | ||
| url = "rest/api/content/search" | ||
| params = { | ||
| "cql": f"type=page AND ancestor={self.id}", | ||
| "expand": "metadata.properties,ancestors,version", | ||
| "limit": 100, | ||
| } | ||
| results = [] | ||
|
|
@@ -372,8 +413,7 @@ def descendants(self) -> list[int]: | |
| f"Unexpected error when fetching descendants for content ID {self.id}." | ||
| ) | ||
| return [] | ||
|
|
||
| return [result["id"] for result in results] | ||
| return [Descendant.from_json(result) for result in results] | ||
|
|
||
| @property | ||
| def _template_vars(self) -> dict[str, str]: | ||
|
|
@@ -410,7 +450,7 @@ def export(self) -> None: | |
| self.export_markdown() | ||
|
|
||
| def export_with_descendants(self) -> None: | ||
| export_pages([self.id, *self.descendants]) | ||
| export_pages([self, *self.descendants]) | ||
|
|
||
| def export_body(self) -> None: | ||
| soup = BeautifulSoup(self.html, "html.parser") | ||
|
|
@@ -498,7 +538,8 @@ def from_json(cls, data: JsonResponse) -> "Page": | |
| for label in data.get("metadata", {}).get("labels", {}).get("results", []) | ||
| ], | ||
| attachments=Attachment.from_page_id(data.get("id", 0)), | ||
| ancestors=[ancestor.get("id") for ancestor in data.get("ancestors", [])][1:], | ||
| ancestors=[Ancestor.from_json(ancestor) for ancestor in data.get("ancestors", [])][1:], | ||
| version=Version.from_json(data.get("version", {})), | ||
| ) | ||
|
|
||
| @classmethod | ||
|
|
@@ -511,7 +552,7 @@ def from_id(cls, page_id: int) -> "Page": | |
| confluence.get_page_by_id( | ||
| page_id, | ||
| expand="body.view,body.export_view,body.editor2,metadata.labels," | ||
| "metadata.properties,ancestors", | ||
| "metadata.properties,ancestors,version", | ||
| ), | ||
| ) | ||
| ) | ||
|
|
@@ -528,6 +569,7 @@ def from_id(cls, page_id: int) -> "Page": | |
| labels=[], | ||
| attachments=[], | ||
| ancestors=[], | ||
| version=Version.from_json({}), | ||
| ) | ||
|
|
||
| @classmethod | ||
|
|
@@ -596,7 +638,9 @@ def front_matter(self) -> str: | |
| @property | ||
| def breadcrumbs(self) -> str: | ||
| return ( | ||
| " > ".join([self.convert_page_link(ancestor) for ancestor in self.page.ancestors]) | ||
| " > ".join( | ||
| [self.convert_page_link(ancestor.id) for ancestor in self.page.ancestors] | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `convert_page_link` takes an integer, while `ancestor.id` is a string. Can you check what it is and either change `ancestor.id` to an int or ensure that `convert_page_link` can handle the string?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Addressed in 6f15581. |
||
| ) | ||
| + "\n" | ||
| ) | ||
|
|
||
|
|
@@ -1001,7 +1045,7 @@ def convert_drawio(self, el: BeautifulSoup, text: str, parent_tags: list[str]) - | |
|
|
||
| return "" | ||
|
|
||
| def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911 | ||
| def convert_plantuml(self, el: BeautifulSoup, text: str, parent_tags: list[str]) -> str: # noqa: PLR0911 | ||
| """Convert PlantUML diagrams from editor2 XML to Markdown code blocks. | ||
|
|
||
| PlantUML diagrams are stored in the editor2 XML as structured macros with | ||
|
|
@@ -1092,24 +1136,19 @@ def _get_path_for_href(self, path: Path, style: Literal["absolute", "relative"]) | |
| return result | ||
|
|
||
|
|
||
| def export_page(page_id: int) -> None: | ||
| """Export a Confluence page to Markdown. | ||
|
|
||
| Args: | ||
| page_id: The page id. | ||
| output_path: The output path. | ||
| """ | ||
| page = Page.from_id(page_id) | ||
| page.export() | ||
|
|
||
|
|
||
| def export_pages(page_ids: list[int]) -> None: | ||
| def export_pages(pages: list["Page | Descendant"]) -> None: | ||
| """Export a list of Confluence pages to Markdown. | ||
|
|
||
| Args: | ||
| page_ids: List of pages to export. | ||
| output_path: The output path. | ||
| pages: List of pages to export. | ||
| """ | ||
| for page_id in (pbar := tqdm(page_ids, smoothing=0.05)): | ||
| pbar.set_postfix_str(f"Exporting page {page_id}") | ||
| export_page(page_id) | ||
| for page in (pbar := tqdm(pages, smoothing=0.05)): | ||
|
||
| # filter pages new and updated only | ||
| if LockfileManager.should_export(page): | ||
| pbar.set_postfix_str(f"Exporting page {page.id}") | ||
| _page = Page.from_id(page.id) | ||
| _page.export() | ||
| # Record to lockfile if enabled | ||
| LockfileManager.record_page(_page) | ||
| else: | ||
| pbar.set_postfix_str(f"Skipping page {page.id} (no changes)") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the
`_template_vars` property, although the results are cached, `Page.from_id(a).title for a in self.ancestors` unnecessarily calls `from_id`. This is inconvenient for speeding up with the `incremental` option. So I added an `Ancestor` type to store ancestor information.