diff --git a/skills/docx/scripts/office/validators/base.py b/skills/docx/scripts/office/validators/base.py index db4a06a22..c9d3fef94 100644 --- a/skills/docx/scripts/office/validators/base.py +++ b/skills/docx/scripts/office/validators/base.py @@ -760,7 +760,7 @@ def _validate_single_file_xsd(self, xml_file, base_path): ) schema = lxml.etree.XMLSchema(xsd_doc) - with open(xml_file, "r") as f: + with open(xml_file, "r", encoding="utf-8") as f: xml_doc = lxml.etree.parse(f) xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) diff --git a/skills/docx/scripts/office/validators/docx.py b/skills/docx/scripts/office/validators/docx.py index fec405e69..6352afd5e 100644 --- a/skills/docx/scripts/office/validators/docx.py +++ b/skills/docx/scripts/office/validators/docx.py @@ -246,7 +246,7 @@ def compare_paragraph_counts(self): diff = new_count - original_count diff_str = f"+{diff}" if diff > 0 else str(diff) - print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + print(f"\nParagraphs: {original_count} -> {new_count} ({diff_str})") def _parse_id_value(self, val: str, base: int = 16) -> int: return int(val, base) @@ -428,7 +428,7 @@ def repair_durableId(self) -> int: elem.setAttribute("w16cid:durableId", new_id) print( - f" Repaired: {xml_file.name}: durableId {durable_id} → {new_id}" + f" Repaired: {xml_file.name}: durableId {durable_id} -> {new_id}" ) repairs += 1 modified = True diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_docx_windows_encoding.py b/tests/test_docx_windows_encoding.py new file mode 100644 index 000000000..43a324eec --- /dev/null +++ b/tests/test_docx_windows_encoding.py @@ -0,0 +1,102 @@ +import io +import sys +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / 'skills' / 'docx' / 'scripts' / 'office')) + +from validators.base import BaseSchemaValidator +from validators.docx import DOCXSchemaValidator + + +class DummySchemaValidator(BaseSchemaValidator): + def validate(self): + return True + + +class ValidateSingleFileXsdEncodingTests(unittest.TestCase): + def test_validate_single_file_xsd_opens_xml_as_utf8_text(self): + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + xml_file = tmp_path / 'word' / 'document.xml' + xml_file.parent.mkdir(parents=True) + xml_file.write_text('', encoding='utf-8') + schema_path = tmp_path / 'schema.xsd' + schema_path.write_text( + '\n' + '\n' + ' \n' + '\n', + encoding='utf-8', + ) + + validator = DummySchemaValidator(tmp_path) + validator._get_schema_path = lambda _: schema_path + validator._remove_template_tags_from_text_nodes = lambda xml_doc: (xml_doc, False) + validator._preprocess_for_mc_ignorable = lambda xml_doc: xml_doc + validator._clean_ignorable_namespaces = lambda xml_doc: xml_doc + + open_calls = [] + real_open = open + + def tracking_open(file, mode='r', *args, **kwargs): + open_calls.append((Path(file), mode, kwargs.get('encoding'))) + return real_open(file, mode, *args, **kwargs) + + with mock.patch('builtins.open', side_effect=tracking_open): + valid, errors = validator._validate_single_file_xsd(xml_file, tmp_path) + + self.assertTrue(valid) + self.assertEqual(errors, set()) + self.assertIn((xml_file, 'r', 'utf-8'), open_calls) + + +class WindowsConsoleOutputTests(unittest.TestCase): + def _cp1252_stdout(self): + buffer = io.BytesIO() + stream = io.TextIOWrapper(buffer, encoding='cp1252', errors='strict') + return buffer, stream + + def test_compare_paragraph_counts_uses_ascii_safe_arrow(self): + with tempfile.TemporaryDirectory() as tmp: + validator = DOCXSchemaValidator(tmp) + validator.count_paragraphs_in_original = lambda: 3 + validator.count_paragraphs_in_unpacked = lambda: 5 + + buffer, stream = self._cp1252_stdout() + with mock.patch('sys.stdout', stream): + validator.compare_paragraph_counts() + stream.flush() + + output = buffer.getvalue().decode('cp1252') + self.assertIn('Paragraphs: 3 -> 5 (+2)', output) + + def test_repair_durable_id_uses_ascii_safe_arrow(self): + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + xml_file = tmp_path / 'word' / 'document.xml' + xml_file.parent.mkdir(parents=True) + xml_file.write_text( + '\n' + '\n' + ' \n' + '\n', + encoding='utf-8', + ) + validator = DOCXSchemaValidator(tmp_path) + + buffer, stream = self._cp1252_stdout() + with mock.patch('sys.stdout', stream), mock.patch('validators.docx.random.randint', return_value=42): + repairs = validator.repair_durableId() + stream.flush() + + output = buffer.getvalue().decode('cp1252') + self.assertEqual(repairs, 1) + self.assertIn('Repaired: document.xml: durableId FFFFFFFF -> 0000002A', output) + + +if __name__ == '__main__': + unittest.main()