Skip to content

Commit 7f755e9

Browse files
h4x0rclaude
andcommitted
feat: auto-repair, warn-on-save, and backup-on-overwrite
Three layers of defense added to save(): 1. Backup-on-overwrite: creates .bak (or .bak2, .bak3, ...) before overwriting any existing file. Backup path returned in save response. 2. Auto-repair (_pre_save_repair): silently fixes known corruption patterns before writing — orphaned footnote/endnote definitions, duplicate paraIds, and broken internal relationship targets. Repair counts returned in save response. 3. Warn-on-save (_post_repair_warnings): lightweight validation for issues that can't be auto-repaired — heading level skips, unpaired bookmarks, inconsistent table columns, artifact markers (DRAFT, TODO, FIXME). Warnings array returned in save response. 344 tests, 100% coverage. Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent a910539 commit 7f755e9

2 files changed

Lines changed: 428 additions & 1 deletion

File tree

docx_mcp/document/base.py

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,15 @@ def save(self, output_path: str | None = None) -> dict:
144144

145145
output = Path(output_path) if output_path else self.source_path
146146

147+
# Backup existing file before overwriting
148+
backup_path = self._backup_if_exists(output)
149+
150+
# Auto-repair known corruption patterns
151+
repairs = self._pre_save_repair()
152+
153+
# Lightweight validation — warn about issues that can't be auto-repaired
154+
warnings = self._post_repair_warnings()
155+
147156
# Serialize modified trees
148157
for rel_path in self._modified:
149158
tree = self._trees.get(rel_path)
@@ -169,11 +178,154 @@ def save(self, output_path: str | None = None) -> dict:
169178

170179
modified = sorted(self._modified)
171180
self._modified.clear()
172-
return {
181+
result = {
173182
"path": str(output),
174183
"size_bytes": output.stat().st_size,
175184
"modified_parts": modified,
185+
"repairs": repairs,
186+
"warnings": warnings,
176187
}
188+
if backup_path:
189+
result["backup"] = str(backup_path)
190+
return result
191+
192+
@staticmethod
193+
def _backup_if_exists(output: Path) -> Path | None:
194+
"""Create a backup of output if it already exists.
195+
196+
Uses .bak, .bak2, .bak3, ... to avoid overwriting previous backups.
197+
"""
198+
if not output.exists():
199+
return None
200+
# Find next available backup name
201+
bak = output.with_suffix(output.suffix + ".bak")
202+
if not bak.exists():
203+
shutil.copy2(output, bak)
204+
return bak
205+
n = 2
206+
while True:
207+
bak = output.parent / (output.stem + output.suffix + f".bak{n}")
208+
if not bak.exists():
209+
break
210+
n += 1
211+
shutil.copy2(output, bak)
212+
return bak
213+
214+
# ── Pre-save auto-repair ─────────────────────────────────────────────────
215+
216+
def _pre_save_repair(self) -> dict:
217+
"""Auto-repair known corruption patterns before writing.
218+
219+
Returns a dict of repair counts (all zero if nothing was fixed).
220+
"""
221+
repairs: dict[str, int] = {
222+
"orphan_footnotes_removed": 0,
223+
"orphan_endnotes_removed": 0,
224+
"paraids_deduplicated": 0,
225+
"broken_rels_removed": 0,
226+
}
227+
228+
doc = self._tree("word/document.xml")
229+
if doc is None:
230+
return repairs
231+
232+
body = doc.find(f"{W}body")
233+
234+
# ── Orphan footnotes ──────────────────────────────────────────────
235+
fn_tree = self._tree("word/footnotes.xml")
236+
if fn_tree is not None and body is not None:
237+
ref_ids = {int(ref.get(f"{W}id", "0")) for ref in body.iter(f"{W}footnoteReference")}
238+
for fn in list(fn_tree.findall(f"{W}footnote")):
239+
fn_id = int(fn.get(f"{W}id", "0"))
240+
if fn_id >= 2 and fn_id not in ref_ids:
241+
fn_tree.remove(fn)
242+
repairs["orphan_footnotes_removed"] += 1
243+
if repairs["orphan_footnotes_removed"]:
244+
self._mark("word/footnotes.xml")
245+
246+
# ── Orphan endnotes ───────────────────────────────────────────────
247+
en_tree = self._tree("word/endnotes.xml")
248+
if en_tree is not None and body is not None:
249+
ref_ids = {int(ref.get(f"{W}id", "0")) for ref in body.iter(f"{W}endnoteReference")}
250+
for en in list(en_tree.findall(f"{W}endnote")):
251+
en_id_str = en.get(f"{W}id", "0")
252+
if en_id_str in ("0", "-1"):
253+
continue
254+
en_id = int(en_id_str)
255+
if en_id not in ref_ids:
256+
en_tree.remove(en)
257+
repairs["orphan_endnotes_removed"] += 1
258+
if repairs["orphan_endnotes_removed"]:
259+
self._mark("word/endnotes.xml")
260+
261+
# ── Duplicate paraIds ─────────────────────────────────────────────
262+
seen: dict[str, bool] = {}
263+
for rel_path, tree in self._trees.items():
264+
if not rel_path.endswith(".xml"):
265+
continue
266+
for elem in tree.iter():
267+
pid = elem.get(f"{W14}paraId")
268+
if pid is None:
269+
continue
270+
if pid in seen:
271+
elem.set(f"{W14}paraId", self._new_para_id())
272+
repairs["paraids_deduplicated"] += 1
273+
self._mark(rel_path)
274+
else:
275+
seen[pid] = True
276+
277+
# ── Broken internal relationships ─────────────────────────────────
278+
rels_tree = self._tree("word/_rels/document.xml.rels")
279+
if rels_tree is not None:
280+
for rel in list(rels_tree.findall(f"{RELS}Relationship")):
281+
if rel.get("TargetMode") == "External":
282+
continue
283+
target = rel.get("Target", "")
284+
if target and not (self.workdir / "word" / target).exists():
285+
rels_tree.remove(rel)
286+
repairs["broken_rels_removed"] += 1
287+
if repairs["broken_rels_removed"]:
288+
self._mark("word/_rels/document.xml.rels")
289+
290+
return repairs
291+
292+
def _post_repair_warnings(self) -> list[str]:
293+
"""Check for issues that can't be auto-repaired. Returns warning strings."""
294+
warnings: list[str] = []
295+
doc = self._tree("word/document.xml")
296+
if doc is None:
297+
return warnings
298+
299+
# Heading level skips
300+
headings = self._find_headings(doc)
301+
prev = 0
302+
for h in headings:
303+
if h["level"] > prev + 1 and prev > 0:
304+
warnings.append(
305+
f"Heading level skip: H{prev} -> H{h['level']} at '{h['text'][:50]}'"
306+
)
307+
prev = h["level"]
308+
309+
# Unpaired bookmarks
310+
starts = {e.get(f"{W}id") for e in doc.iter(f"{W}bookmarkStart") if e.get(f"{W}id")}
311+
ends = {e.get(f"{W}id") for e in doc.iter(f"{W}bookmarkEnd") if e.get(f"{W}id")}
312+
unpaired = len(starts - ends) + len(ends - starts)
313+
if unpaired:
314+
warnings.append(f"{unpaired} unpaired bookmark(s)")
315+
316+
# Inconsistent table columns
317+
for idx, tbl in enumerate(doc.iter(f"{W}tbl")):
318+
counts = [len(tr.findall(f"{W}tc")) for tr in tbl.findall(f"{W}tr")]
319+
if counts and len(set(counts)) > 1:
320+
warnings.append(f"Table {idx + 1} has inconsistent column counts: {counts}")
321+
322+
# Artifact markers (DRAFT, TODO, FIXME, XXX)
323+
for marker in ("DRAFT", "TODO", "FIXME", "XXX"):
324+
hits = self.search_text(marker)
325+
for hit in hits:
326+
warnings.append(f"{marker} marker in {hit['source']}: '{hit['text'][:60]}'")
327+
328+
return warnings
177329

178330
# ── Private helpers ─────────────────────────────────────────────────────
179331

0 commit comments

Comments
 (0)