@@ -144,6 +144,15 @@ def save(self, output_path: str | None = None) -> dict:
144144
145145 output = Path (output_path ) if output_path else self .source_path
146146
147+ # Backup existing file before overwriting
148+ backup_path = self ._backup_if_exists (output )
149+
150+ # Auto-repair known corruption patterns
151+ repairs = self ._pre_save_repair ()
152+
153+ # Lightweight validation — warn about issues that can't be auto-repaired
154+ warnings = self ._post_repair_warnings ()
155+
147156 # Serialize modified trees
148157 for rel_path in self ._modified :
149158 tree = self ._trees .get (rel_path )
@@ -169,11 +178,154 @@ def save(self, output_path: str | None = None) -> dict:
169178
170179 modified = sorted (self ._modified )
171180 self ._modified .clear ()
172- return {
181+ result = {
173182 "path" : str (output ),
174183 "size_bytes" : output .stat ().st_size ,
175184 "modified_parts" : modified ,
185+ "repairs" : repairs ,
186+ "warnings" : warnings ,
176187 }
188+ if backup_path :
189+ result ["backup" ] = str (backup_path )
190+ return result
191+
192+ @staticmethod
193+ def _backup_if_exists (output : Path ) -> Path | None :
194+ """Create a backup of output if it already exists.
195+
196+ Uses .bak, .bak2, .bak3, ... to avoid overwriting previous backups.
197+ """
198+ if not output .exists ():
199+ return None
200+ # Find next available backup name
201+ bak = output .with_suffix (output .suffix + ".bak" )
202+ if not bak .exists ():
203+ shutil .copy2 (output , bak )
204+ return bak
205+ n = 2
206+ while True :
207+ bak = output .parent / (output .stem + output .suffix + f".bak{ n } " )
208+ if not bak .exists ():
209+ break
210+ n += 1
211+ shutil .copy2 (output , bak )
212+ return bak
213+
214+ # ── Pre-save auto-repair ─────────────────────────────────────────────────
215+
216+ def _pre_save_repair (self ) -> dict :
217+ """Auto-repair known corruption patterns before writing.
218+
219+ Returns a dict of repair counts (all zero if nothing was fixed).
220+ """
221+ repairs : dict [str , int ] = {
222+ "orphan_footnotes_removed" : 0 ,
223+ "orphan_endnotes_removed" : 0 ,
224+ "paraids_deduplicated" : 0 ,
225+ "broken_rels_removed" : 0 ,
226+ }
227+
228+ doc = self ._tree ("word/document.xml" )
229+ if doc is None :
230+ return repairs
231+
232+ body = doc .find (f"{ W } body" )
233+
234+ # ── Orphan footnotes ──────────────────────────────────────────────
235+ fn_tree = self ._tree ("word/footnotes.xml" )
236+ if fn_tree is not None and body is not None :
237+ ref_ids = {int (ref .get (f"{ W } id" , "0" )) for ref in body .iter (f"{ W } footnoteReference" )}
238+ for fn in list (fn_tree .findall (f"{ W } footnote" )):
239+ fn_id = int (fn .get (f"{ W } id" , "0" ))
240+ if fn_id >= 2 and fn_id not in ref_ids :
241+ fn_tree .remove (fn )
242+ repairs ["orphan_footnotes_removed" ] += 1
243+ if repairs ["orphan_footnotes_removed" ]:
244+ self ._mark ("word/footnotes.xml" )
245+
246+ # ── Orphan endnotes ───────────────────────────────────────────────
247+ en_tree = self ._tree ("word/endnotes.xml" )
248+ if en_tree is not None and body is not None :
249+ ref_ids = {int (ref .get (f"{ W } id" , "0" )) for ref in body .iter (f"{ W } endnoteReference" )}
250+ for en in list (en_tree .findall (f"{ W } endnote" )):
251+ en_id_str = en .get (f"{ W } id" , "0" )
252+ if en_id_str in ("0" , "-1" ):
253+ continue
254+ en_id = int (en_id_str )
255+ if en_id not in ref_ids :
256+ en_tree .remove (en )
257+ repairs ["orphan_endnotes_removed" ] += 1
258+ if repairs ["orphan_endnotes_removed" ]:
259+ self ._mark ("word/endnotes.xml" )
260+
261+ # ── Duplicate paraIds ─────────────────────────────────────────────
262+ seen : dict [str , bool ] = {}
263+ for rel_path , tree in self ._trees .items ():
264+ if not rel_path .endswith (".xml" ):
265+ continue
266+ for elem in tree .iter ():
267+ pid = elem .get (f"{ W14 } paraId" )
268+ if pid is None :
269+ continue
270+ if pid in seen :
271+ elem .set (f"{ W14 } paraId" , self ._new_para_id ())
272+ repairs ["paraids_deduplicated" ] += 1
273+ self ._mark (rel_path )
274+ else :
275+ seen [pid ] = True
276+
277+ # ── Broken internal relationships ─────────────────────────────────
278+ rels_tree = self ._tree ("word/_rels/document.xml.rels" )
279+ if rels_tree is not None :
280+ for rel in list (rels_tree .findall (f"{ RELS } Relationship" )):
281+ if rel .get ("TargetMode" ) == "External" :
282+ continue
283+ target = rel .get ("Target" , "" )
284+ if target and not (self .workdir / "word" / target ).exists ():
285+ rels_tree .remove (rel )
286+ repairs ["broken_rels_removed" ] += 1
287+ if repairs ["broken_rels_removed" ]:
288+ self ._mark ("word/_rels/document.xml.rels" )
289+
290+ return repairs
291+
292+ def _post_repair_warnings (self ) -> list [str ]:
293+ """Check for issues that can't be auto-repaired. Returns warning strings."""
294+ warnings : list [str ] = []
295+ doc = self ._tree ("word/document.xml" )
296+ if doc is None :
297+ return warnings
298+
299+ # Heading level skips
300+ headings = self ._find_headings (doc )
301+ prev = 0
302+ for h in headings :
303+ if h ["level" ] > prev + 1 and prev > 0 :
304+ warnings .append (
305+ f"Heading level skip: H{ prev } -> H{ h ['level' ]} at '{ h ['text' ][:50 ]} '"
306+ )
307+ prev = h ["level" ]
308+
309+ # Unpaired bookmarks
310+ starts = {e .get (f"{ W } id" ) for e in doc .iter (f"{ W } bookmarkStart" ) if e .get (f"{ W } id" )}
311+ ends = {e .get (f"{ W } id" ) for e in doc .iter (f"{ W } bookmarkEnd" ) if e .get (f"{ W } id" )}
312+ unpaired = len (starts - ends ) + len (ends - starts )
313+ if unpaired :
314+ warnings .append (f"{ unpaired } unpaired bookmark(s)" )
315+
316+ # Inconsistent table columns
317+ for idx , tbl in enumerate (doc .iter (f"{ W } tbl" )):
318+ counts = [len (tr .findall (f"{ W } tc" )) for tr in tbl .findall (f"{ W } tr" )]
319+ if counts and len (set (counts )) > 1 :
320+ warnings .append (f"Table { idx + 1 } has inconsistent column counts: { counts } " )
321+
322+ # Artifact markers (DRAFT, TODO, FIXME, XXX)
323+ for marker in ("DRAFT" , "TODO" , "FIXME" , "XXX" ):
324+ hits = self .search_text (marker )
325+ for hit in hits :
326+ warnings .append (f"{ marker } marker in { hit ['source' ]} : '{ hit ['text' ][:60 ]} '" )
327+
328+ return warnings
177329
178330 # ── Private helpers ─────────────────────────────────────────────────────
179331
0 commit comments