diff --git a/skills/docx/scripts/office/validators/base.py b/skills/docx/scripts/office/validators/base.py
index db4a06a22..c9d3fef94 100644
--- a/skills/docx/scripts/office/validators/base.py
+++ b/skills/docx/scripts/office/validators/base.py
@@ -760,7 +760,7 @@ def _validate_single_file_xsd(self, xml_file, base_path):
)
schema = lxml.etree.XMLSchema(xsd_doc)
- with open(xml_file, "r") as f:
+ with open(xml_file, "r", encoding="utf-8") as f:
xml_doc = lxml.etree.parse(f)
xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
diff --git a/skills/docx/scripts/office/validators/docx.py b/skills/docx/scripts/office/validators/docx.py
index fec405e69..6352afd5e 100644
--- a/skills/docx/scripts/office/validators/docx.py
+++ b/skills/docx/scripts/office/validators/docx.py
@@ -246,7 +246,7 @@ def compare_paragraph_counts(self):
diff = new_count - original_count
diff_str = f"+{diff}" if diff > 0 else str(diff)
- print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
+ print(f"\nParagraphs: {original_count} -> {new_count} ({diff_str})")
def _parse_id_value(self, val: str, base: int = 16) -> int:
return int(val, base)
@@ -428,7 +428,7 @@ def repair_durableId(self) -> int:
elem.setAttribute("w16cid:durableId", new_id)
print(
- f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}"
+ f" Repaired: {xml_file.name}: durableId {durable_id} -> {new_id}"
)
repairs += 1
modified = True
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_docx_windows_encoding.py b/tests/test_docx_windows_encoding.py
new file mode 100644
index 000000000..43a324eec
--- /dev/null
+++ b/tests/test_docx_windows_encoding.py
@@ -0,0 +1,102 @@
+import io
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from unittest import mock
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'skills' / 'docx' / 'scripts' / 'office'))
+
+from validators.base import BaseSchemaValidator
+from validators.docx import DOCXSchemaValidator
+
+
+class DummySchemaValidator(BaseSchemaValidator):
+    """Bare ``BaseSchemaValidator`` subclass whose ``validate`` always succeeds.
+
+    The tests in this module instantiate it to call
+    ``_validate_single_file_xsd`` directly.
+    """
+
+    def validate(self):
+        # Unconditionally report success; this method is not under test here.
+        return True
+
+
+class ValidateSingleFileXsdEncodingTests(unittest.TestCase):
+    """Check that XSD validation opens the XML part as UTF-8 text."""
+
+    def test_validate_single_file_xsd_opens_xml_as_utf8_text(self):
+        """``_validate_single_file_xsd`` must call ``open(xml_file, 'r', encoding='utf-8')``.
+
+        Every ``open`` call made during validation is recorded and the
+        expected ``(path, mode, encoding)`` triple is looked up afterwards.
+        """
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            xml_file = tmp_path / 'word' / 'document.xml'
+            xml_file.parent.mkdir(parents=True)
+            # The validator parses this file with lxml, so the fixture has to
+            # be well-formed XML; an empty file can never validate.
+            xml_file.write_text('<root/>', encoding='utf-8')
+            schema_path = tmp_path / 'schema.xsd'
+            # Minimal schema that accepts the <root/> document above.
+            schema_path.write_text(
+                '<?xml version="1.0"?>\n'
+                '<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">\n'
+                '  <xs:element name="root"/>\n'
+                '</xs:schema>\n',
+                encoding='utf-8',
+            )
+
+            validator = DummySchemaValidator(tmp_path)
+            # Replace the helpers with pass-through stubs so the outcome
+            # depends only on the fixture files written above.
+            validator._get_schema_path = lambda _: schema_path
+            validator._remove_template_tags_from_text_nodes = lambda xml_doc: (xml_doc, False)
+            validator._preprocess_for_mc_ignorable = lambda xml_doc: xml_doc
+            validator._clean_ignorable_namespaces = lambda xml_doc: xml_doc
+
+            open_calls = []
+            real_open = open  # captured before builtins.open is patched
+
+            def tracking_open(file, mode='r', *args, **kwargs):
+                # Record the call, then delegate to the genuine open().
+                open_calls.append((Path(file), mode, kwargs.get('encoding')))
+                return real_open(file, mode, *args, **kwargs)
+
+            with mock.patch('builtins.open', side_effect=tracking_open):
+                valid, errors = validator._validate_single_file_xsd(xml_file, tmp_path)
+
+            self.assertTrue(valid)
+            self.assertEqual(errors, set())
+            self.assertIn((xml_file, 'r', 'utf-8'), open_calls)
+
+
+class WindowsConsoleOutputTests(unittest.TestCase):
+    """Check that progress messages can be written to a cp1252 console.
+
+    ``sys.stdout`` is replaced with a strict cp1252 text stream, so printing
+    a character outside that code page (such as U+2192) raises
+    ``UnicodeEncodeError`` and fails the test.
+    """
+
+    def _cp1252_stdout(self):
+        """Return ``(buffer, stream)``: a byte buffer and a strict cp1252 writer over it."""
+        buffer = io.BytesIO()
+        stream = io.TextIOWrapper(buffer, encoding='cp1252', errors='strict')
+        return buffer, stream
+
+    def test_compare_paragraph_counts_uses_ascii_safe_arrow(self):
+        """The paragraph-count summary prints ``->`` and the signed difference."""
+        with tempfile.TemporaryDirectory() as tmp:
+            validator = DOCXSchemaValidator(tmp)
+            # Fix the two counts so the printed line is fully predictable.
+            validator.count_paragraphs_in_original = lambda: 3
+            validator.count_paragraphs_in_unpacked = lambda: 5
+
+            buffer, stream = self._cp1252_stdout()
+            with mock.patch('sys.stdout', stream):
+                validator.compare_paragraph_counts()
+                # Push the text through the encoder while the patch is active.
+                stream.flush()
+
+            output = buffer.getvalue().decode('cp1252')
+            self.assertIn('Paragraphs: 3 -> 5 (+2)', output)
+
+    def test_repair_durable_id_uses_ascii_safe_arrow(self):
+        """The durableId repair message prints ``->`` between old and new id."""
+        with tempfile.TemporaryDirectory() as tmp:
+            tmp_path = Path(tmp)
+            xml_file = tmp_path / 'word' / 'document.xml'
+            xml_file.parent.mkdir(parents=True)
+            # The fixture needs an element that actually carries
+            # w16cid:durableId="FFFFFFFF"; without it there is nothing to repair.
+            # NOTE(review): assumes repair_durableId rewrites any element with an
+            # out-of-range hex durableId - confirm against the validator.
+            xml_file.write_text(
+                '<?xml version="1.0"?>\n'
+                '<w:document'
+                ' xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"'
+                ' xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid">\n'
+                '  <w16cid:commentId w16cid:durableId="FFFFFFFF"/>\n'
+                '</w:document>\n',
+                encoding='utf-8',
+            )
+            validator = DOCXSchemaValidator(tmp_path)
+
+            buffer, stream = self._cp1252_stdout()
+            # Pin the random replacement id to 42, which the assertion below
+            # expects to be rendered as 0000002A.
+            with mock.patch('sys.stdout', stream), \
+                    mock.patch('validators.docx.random.randint', return_value=42):
+                repairs = validator.repair_durableId()
+                stream.flush()
+
+            output = buffer.getvalue().decode('cp1252')
+            self.assertEqual(repairs, 1)
+            self.assertIn('Repaired: document.xml: durableId FFFFFFFF -> 0000002A', output)
+
+
+# Allow running this test module directly as a script.
+if __name__ == '__main__':
+    unittest.main()