diff --git a/changes/perserve-styles.feature b/changes/perserve-styles.feature new file mode 100644 index 0000000..9f15f5d --- /dev/null +++ b/changes/perserve-styles.feature @@ -0,0 +1 @@ +Optionally preserve styles with the same id of appended documents. [buchi] \ No newline at end of file diff --git a/docxcompose/command.py b/docxcompose/command.py index 0b2e253..be50f73 100644 --- a/docxcompose/command.py +++ b/docxcompose/command.py @@ -27,6 +27,11 @@ def setup_parser(): help="path to the output file", metavar="file", ) + parser.add_argument( + "--preserve-styles", + action="store_true", + default=False, + ) return parser @@ -46,7 +51,10 @@ def parse_args(parser, args): def compose_files(parser, parsed_args): - composer = Composer(Document(parsed_args.master)) + options = { + "preserve_styles": parsed_args.preserve_styles, + } + composer = Composer(Document(parsed_args.master), **options) for slave_path in parsed_args.files: composer.append(Document(slave_path)) diff --git a/docxcompose/composer.py b/docxcompose/composer.py index 05a1088..72cb754 100644 --- a/docxcompose/composer.py +++ b/docxcompose/composer.py @@ -15,7 +15,9 @@ from docxcompose.image import ImageWrapper from docxcompose.properties import CustomProperties +from docxcompose.utils import increment_name from docxcompose.utils import NS +from docxcompose.utils import xml_elements_equal from docxcompose.utils import xpath @@ -34,13 +36,22 @@ RT.FOOTNOTES, ] +IGNORED_STYLE_TAGS = set( + [ + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}name", + "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rsid", + ] +) + class Composer(object): - def __init__(self, doc): + def __init__(self, doc, preserve_styles=False): self.doc = doc self.pkg = doc.part.package self.restart_numbering = True + self.preserve_styles = preserve_styles + self._preserved_styles = {} self.reset_reference_mapping() @@ -59,6 +70,7 @@ def append(self, doc, remove_property_fields=True): def insert(self, index, doc, remove_property_fields=True): """Insert the given document at the given index.""" self.reset_reference_mapping() + self._current_preserved_styles = {} # Remove custom property fields but keep the values if remove_property_fields: @@ -299,24 +311,61 @@ def add_styles(self, doc, element): for style_id in used_style_ids: our_style_id = self.mapped_style_id(style_id) - if our_style_id not in our_style_ids: + # To preserve styles with the same id from added documents, we + # create a copy and append a suffix to the id and name. + if self.preserve_styles and our_style_id in our_style_ids: + if our_style_id not in self._current_preserved_styles: + style_element = deepcopy(doc.styles.element.get_by_id(style_id)) + our_style_element = self.doc.styles.element.get_by_id(our_style_id) + + # Check if we already have an identical style + preserved_style_ids = self._preserved_styles.get( + our_style_id, [our_style_id] + ) + matched_style_id = None + for pstyle_id in preserved_style_ids: + our_style_element = self.doc.styles.element.get_by_id(pstyle_id) + if xml_elements_equal( + style_element, + our_style_element, + ignored_tags=IGNORED_STYLE_TAGS, + ): + matched_style_id = pstyle_id + self._current_preserved_styles[our_style_id] = ( + style_element.styleId + ) + break + # No matching style found, insert style with a new name + if matched_style_id is None: + new_id = increment_name(our_style_id) + new_name = None + if style_element.name is not None: + new_name = increment_name(style_element.name.val) + while new_id in our_style_ids: + new_id = increment_name(new_id) + if new_name is not None: + new_name = increment_name(new_name) + style_element.styleId = new_id + if new_name is not None: + style_element.name.val = new_name + self.doc.styles.element.append(style_element) + self.add_numberings(doc, style_element) + self.add_linked_styles(doc, style_element) + self._current_preserved_styles[our_style_id] = new_id + self._preserved_styles.setdefault( + our_style_id, [our_style_id] + ).append(new_id) + else: + self._current_preserved_styles[our_style_id] = matched_style_id + + for el in xpath(element, ".//w:tblStyle|.//w:pStyle|.//w:rStyle"): + el.val = self._current_preserved_styles[our_style_id] + elif our_style_id not in our_style_ids: style_element = deepcopy(doc.styles.element.get_by_id(style_id)) if style_element is not None: self.doc.styles.element.append(style_element) self.add_numberings(doc, style_element) - # Also add linked styles - linked_style_ids = xpath(style_element, ".//w:link/@w:val") - if linked_style_ids: - linked_style_id = linked_style_ids[0] - our_linked_style_id = self.mapped_style_id(linked_style_id) - if our_linked_style_id not in our_style_ids: - our_linked_style = doc.styles.element.get_by_id( - linked_style_id - ) - if our_linked_style is not None: - self.doc.styles.element.append( - deepcopy(our_linked_style) - ) + self.add_linked_styles(doc, style_element) else: # Create a mapping for abstractNumIds used in existing styles # This is used when adding numberings to avoid having multiple @@ -360,6 +409,17 @@ def add_styles(self, doc, element): # Update our style ids our_style_ids = [s.style_id for s in self.doc.styles] + def add_linked_styles(self, doc, element): + linked_style_ids = xpath(element, ".//w:link/@w:val") + if linked_style_ids: + linked_style_id = linked_style_ids[0] + our_linked_style_id = self.mapped_style_id(linked_style_id) + our_style_ids = [s.style_id for s in self.doc.styles] + if our_linked_style_id not in our_style_ids: + our_linked_style = doc.styles.element.get_by_id(linked_style_id) + if our_linked_style is not None: + self.doc.styles.element.append(deepcopy(our_linked_style)) + def add_numberings(self, doc, element): """Add numberings from the given document used in the given element.""" # Search for numbering references diff --git a/docxcompose/server.py b/docxcompose/server.py index 4a0a8cb..36885f4 100644 --- a/docxcompose/server.py +++ b/docxcompose/server.py @@ -10,6 +10,7 @@ from docx import Document from docxcompose.composer import Composer +from docxcompose.utils import to_bool CHUNK_SIZE = 65536 @@ -48,7 +49,7 @@ async def compose(request): composed_filename = os.path.join(temp_dir, "composed.docx") try: - composer = Composer(Document(documents.pop(0))) + composer = Composer(Document(documents.pop(0)), **compose_options(request)) for document in documents: composer.append(Document(document)) composer.save(composed_filename) @@ -63,6 +64,12 @@ async def compose(request): ) +def compose_options(request): + return { + "preserve_styles": to_bool(request.rel_url.query.get("preserve_styles", "")), + } + + async def save_part_to_file(part, directory): filename = os.path.join(directory, f"{part.name}_{part.filename}") with open(filename, "wb") as file_: diff --git a/docxcompose/utils.py b/docxcompose/utils.py index 36be83a..e287968 100644 --- a/docxcompose/utils.py +++ b/docxcompose/utils.py @@ -48,3 +48,87 @@ def word_to_python_date_format(format_str): for word_format, python_format in date_format_map: format_str = re.sub(word_format, python_format, format_str) return format_str + + +def increment_name(name): + increment_part = name.split("_")[-1] + try: + increment = int(increment_part) + except ValueError: + return f"{name}_1" + return f"{name.removesuffix(increment_part)}{increment + 1}" + + +def to_bool(value): + return value.lower() in ["1", "yes", "true", "on", "ok"] + + +def xml_elements_equal( + left, + right, + ignored_tags=None, + compare_text=True, + compare_tail=False, + compare_attributes=True, +): + return xml_element_signature( + left, + ignored_tags=ignored_tags, + compare_text=compare_text, + compare_tail=compare_tail, + compare_attributes=compare_attributes, + ) == xml_element_signature( + right, + ignored_tags=ignored_tags, + compare_text=compare_text, + compare_tail=compare_tail, + compare_attributes=compare_attributes, + ) + + +def xml_element_signature( + element, + ignored_tags=None, + compare_text=True, + compare_tail=False, + compare_attributes=True, + is_root=True, +): + """ + Creates a canonical, recursive representation of an element. + + Child elements are included as a sorted list of signatures, + so their order is irrelevant. + """ + tag = element.tag + attrs = tuple(sorted(element.attrib.items())) if compare_attributes else () + text = normalize_text(element.text) if compare_text else None + tail = normalize_text(element.tail) if compare_tail else None + + child_signatures = [] + for child in element: + if ignored_tags and child.tag in ignored_tags: + continue + + child_signatures.append( + xml_element_signature( + child, + ignored_tags=ignored_tags, + compare_text=compare_text, + compare_tail=compare_tail, + compare_attributes=compare_attributes, + is_root=False, + ) + ) + child_signatures.sort() + + if is_root: + return (None, None, None, None, tuple(child_signatures)) + else: + return (tag, attrs, text, tail, tuple(child_signatures)) + + +def normalize_text(value): + if value is None: + return "" + return value.strip() diff --git a/tests/docs/composed_fixture/styles_preserve.docx b/tests/docs/composed_fixture/styles_preserve.docx new file mode 100644 index 0000000..4845f7f Binary files /dev/null and b/tests/docs/composed_fixture/styles_preserve.docx differ diff --git a/tests/docs/styles_preserve1.docx b/tests/docs/styles_preserve1.docx new file mode 100644 index 0000000..0329556 Binary files /dev/null and b/tests/docs/styles_preserve1.docx differ diff --git a/tests/docs/styles_preserve2.docx b/tests/docs/styles_preserve2.docx new file mode 100644 index 0000000..6da64a9 Binary files /dev/null and b/tests/docs/styles_preserve2.docx differ diff --git a/tests/test_server.py b/tests/test_server.py index 93f8360..91e3fbc 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -62,6 +62,18 @@ async def test_post_returns_500_if_compose_fails(http_client): assert text == "Failed composing documents" +async def test_post_with_url_parameters(http_client): + files = { + "master": open(docx_path("master.docx"), "rb"), + "table": open(docx_path("table.docx"), "rb"), + } + resp = await http_client.post("/?preserve_styles=1", data=files) + assert resp.status == 200 + composed_doc = ComparableDocument(Document(BytesIO(await resp.read()))) + composed_fixture = FixtureDocument("table.docx") + assert composed_doc == composed_fixture + + async def test_healtcheck_returns_200(http_client): resp = await http_client.get("/healthcheck") assert resp.status == 200 diff --git a/tests/test_styles.py b/tests/test_styles.py index 3b0060e..cc160d7 100644 --- a/tests/test_styles.py +++ b/tests/test_styles.py @@ -1,5 +1,6 @@ import pytest from docx import Document +from utils import ComparableDocument from utils import ComposedDocument from utils import docx_path from utils import FixtureDocument @@ -64,6 +65,42 @@ def test_continue_when_no_styles(): ComposedDocument("aatmay.docx", "aatmay.docx") +def test_preserve_styles_with_same_id(): + composer = Composer( + Document(docx_path("styles_preserve1.docx")), preserve_styles=True + ) + composer.append(Document(docx_path("styles_preserve2.docx"))) + style_ids = [s.style_id for s in composer.doc.styles] + assert "MyCustomStyle" in style_ids + assert "MyCustomStyle_1" in style_ids + + expected = FixtureDocument("styles_preserve.docx") + composed = ComparableDocument(composer.doc) + assert composed == expected + + +def test_ignore_styles_with_same_id(): + composer = Composer(Document(docx_path("styles_preserve1.docx"))) + composer.append(Document(docx_path("styles_preserve2.docx"))) + style_ids = [s.style_id for s in composer.doc.styles] + assert "MyCustomStyle" in style_ids + assert "MyCustomStyle_1" not in style_ids + + +def test_preserve_styles_does_not_duplicate_identical_styles(): + composer = Composer( + Document(docx_path("styles_preserve1.docx")), preserve_styles=True + ) + composer.append(Document(docx_path("styles_preserve2.docx"))) + composer.append(Document(docx_path("styles_preserve2.docx"))) + composer.append(Document(docx_path("styles_preserve1.docx"))) + assert [ + s.style_id + for s in composer.doc.styles + if s.style_id.startswith("MyCustomStyle") + ] == ["MyCustomStyle", "MyCustomStyleZchn", "MyCustomStyle_1"] + + @pytest.fixture def merged_styles(): composer = Composer(Document(docx_path("styles_en.docx"))) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..8509578 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,43 @@ +from lxml import etree + +from docxcompose.utils import xml_elements_equal + + +def test_xml_elements_are_equal(): + xml1 = """ + + Foo + Bar + 123 + + """ + xml2 = """ + + Bar + Foo + 999 + + """ + e1 = etree.fromstring(xml1) + e2 = etree.fromstring(xml2) + assert xml_elements_equal(e1, e2, ignored_tags=["ignore_me"]) is True + + +def test_xml_elements_are_not_equal(): + xml1 = """ + + Foo + Bar + 123 + + """ + xml2 = """ + + Bar + Foo + 999 + + """ + e1 = etree.fromstring(xml1) + e2 = etree.fromstring(xml2) + assert xml_elements_equal(e1, e2, ignored_tags=["ignore_me"]) is False