Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion skills/docx/scripts/office/validators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ def _validate_single_file_xsd(self, xml_file, base_path):
)
schema = lxml.etree.XMLSchema(xsd_doc)

with open(xml_file, "r") as f:
with open(xml_file, "r", encoding="utf-8") as f:
xml_doc = lxml.etree.parse(f)

xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
Expand Down
4 changes: 2 additions & 2 deletions skills/docx/scripts/office/validators/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def compare_paragraph_counts(self):

diff = new_count - original_count
diff_str = f"+{diff}" if diff > 0 else str(diff)
print(f"\nParagraphs: {original_count} {new_count} ({diff_str})")
print(f"\nParagraphs: {original_count} -> {new_count} ({diff_str})")

def _parse_id_value(self, val: str, base: int = 16) -> int:
return int(val, base)
Expand Down Expand Up @@ -428,7 +428,7 @@ def repair_durableId(self) -> int:

elem.setAttribute("w16cid:durableId", new_id)
print(
f" Repaired: {xml_file.name}: durableId {durable_id} {new_id}"
f" Repaired: {xml_file.name}: durableId {durable_id} -> {new_id}"
)
repairs += 1
modified = True
Expand Down
Empty file added tests/__init__.py
Empty file.
102 changes: 102 additions & 0 deletions tests/test_docx_windows_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import io
import sys
import tempfile
import unittest
from pathlib import Path
from unittest import mock

sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'skills' / 'docx' / 'scripts' / 'office'))

from validators.base import BaseSchemaValidator
from validators.docx import DOCXSchemaValidator


class DummySchemaValidator(BaseSchemaValidator):
    """Bare-bones BaseSchemaValidator subclass used as a test double.

    The tests in this module instantiate it only to reach helpers inherited
    from the base class.
    """

    def validate(self):
        """Always report success; no test in this module calls this method."""
        return True


class ValidateSingleFileXsdEncodingTests(unittest.TestCase):
    """Checks how ``_validate_single_file_xsd`` opens the XML file it validates."""

    def test_validate_single_file_xsd_opens_xml_as_utf8_text(self):
        """The XML file must be opened with mode ``'r'`` and ``encoding='utf-8'``."""
        xml_source = '<?xml version="1.0" encoding="UTF-8"?><root/>'
        xsd_source = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">\n'
            ' <xs:element name="root" type="xs:string"/>\n'
            '</xs:schema>\n'
        )

        # Capture the genuine builtin before it gets patched so the spy can
        # delegate to it without recursing into itself.
        builtin_open = open
        recorded = []

        def tracking_open(file, mode='r', *args, **kwargs):
            # Note what was requested, then perform the real open.
            recorded.append((Path(file), mode, kwargs.get('encoding')))
            return builtin_open(file, mode, *args, **kwargs)

        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)

            xml_file = tmp_path / 'word' / 'document.xml'
            xml_file.parent.mkdir(parents=True)
            xml_file.write_text(xml_source, encoding='utf-8')

            schema_path = tmp_path / 'schema.xsd'
            schema_path.write_text(xsd_source, encoding='utf-8')

            validator = DummySchemaValidator(tmp_path)
            # Stub the collaborators on the instance so only the file handling
            # and the XSD check itself are exercised.
            validator._get_schema_path = lambda _: schema_path
            validator._remove_template_tags_from_text_nodes = (
                lambda xml_doc: (xml_doc, False)
            )
            for passthrough in (
                '_preprocess_for_mc_ignorable',
                '_clean_ignorable_namespaces',
            ):
                setattr(validator, passthrough, lambda xml_doc: xml_doc)

            with mock.patch('builtins.open', side_effect=tracking_open):
                valid, errors = validator._validate_single_file_xsd(
                    xml_file, tmp_path
                )

            self.assertTrue(valid)
            self.assertEqual(errors, set())
            self.assertIn((xml_file, 'r', 'utf-8'), recorded)


class WindowsConsoleOutputTests(unittest.TestCase):
    """Checks that validator messages can be written to a strict cp1252 stdout."""

    def _cp1252_stdout(self):
        """Return ``(buffer, stream)`` for capturing cp1252-encoded output.

        ``stream`` is a text wrapper that encodes to cp1252 with
        ``errors='strict'``, so any character outside that code page raises
        instead of being replaced. ``buffer`` receives the encoded bytes.
        """
        raw = io.BytesIO()
        return raw, io.TextIOWrapper(raw, encoding='cp1252', errors='strict')

    def test_compare_paragraph_counts_uses_ascii_safe_arrow(self):
        """The paragraph-count summary prints with an ASCII ``->`` arrow."""
        with tempfile.TemporaryDirectory() as tmp:
            validator = DOCXSchemaValidator(tmp)
            # Fix both counts so the expected message is fully determined.
            validator.count_paragraphs_in_original = lambda: 3
            validator.count_paragraphs_in_unpacked = lambda: 5

            sink, console = self._cp1252_stdout()
            with mock.patch('sys.stdout', console):
                validator.compare_paragraph_counts()
                console.flush()

            printed = sink.getvalue().decode('cp1252')
            self.assertIn('Paragraphs: 3 -> 5 (+2)', printed)

    def test_repair_durable_id_uses_ascii_safe_arrow(self):
        """The durableId repair message prints with an ASCII ``->`` arrow."""
        document_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"\n'
            ' xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid">\n'
            ' <w:body><w:p w16cid:durableId="FFFFFFFF"/></w:body>\n'
            '</w:document>\n'
        )
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            xml_file = tmp_path / 'word' / 'document.xml'
            xml_file.parent.mkdir(parents=True)
            xml_file.write_text(document_xml, encoding='utf-8')
            validator = DOCXSchemaValidator(tmp_path)

            sink, console = self._cp1252_stdout()
            with mock.patch('sys.stdout', console):
                # Pin the random source so the replacement ID is predictable.
                with mock.patch('validators.docx.random.randint', return_value=42):
                    repairs = validator.repair_durableId()
                    console.flush()

            printed = sink.getvalue().decode('cp1252')
            self.assertEqual(repairs, 1)
            self.assertIn(
                'Repaired: document.xml: durableId FFFFFFFF -> 0000002A',
                printed,
            )


if __name__ == '__main__':
    # Let the module be run directly as a script, not only via a test runner.
    unittest.main()