Skip to content

Commit 3ac45f8

Browse files
feat(office): Add ooxmlsdk integration for Word/PowerPoint preservation
1 parent f839302 commit 3ac45f8

File tree

7 files changed

+957
-378
lines changed

7 files changed

+957
-378
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,9 @@ rust_xlsxwriter = "0.79"
211211
spreadsheet-ods = "1.0"
212212

213213
# Word/PowerPoint Support - MS Office 100% Compatibility
214+
# ooxmlsdk preserves: Full document structure at XML level (100% round-trip)
214215
docx-rs = "0.4"
215-
ooxmlsdk = { version = "0.3", features = ["docx", "pptx"] }
216+
ooxmlsdk = { version = "0.3", features = ["docx", "pptx", "parts", "office2021"] }
216217
# ppt-rs disabled due to version conflict - using ooxmlsdk for PPTX support instead
217218
# ppt-rs = { version = "0.2", default-features = false }
218219

src/docs/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
pub mod collaboration;
22
pub mod handlers;
3+
pub mod ooxml;
34
pub mod storage;
45
pub mod types;
56
pub mod utils;

src/docs/ooxml.rs

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
use std::io::Cursor;
2+
3+
/// A DOCX file loaded for editing: the untouched source bytes plus the text that was
/// extracted from its main document part.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct OoxmlDocument {
    /// The exact bytes the document was loaded from.
    pub original_bytes: Vec<u8>,
    /// Paragraphs that contained non-whitespace text, in document order.
    pub paragraphs: Vec<ParagraphInfo>,
}

/// The text of one paragraph together with its position in the document.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ParagraphInfo {
    /// Concatenated, XML-unescaped text of the paragraph's text runs.
    pub text: String,
    /// Zero-based position assigned while scanning the document XML; paragraphs that
    /// were skipped for having no text still consume an index.
    pub index: usize,
}
12+
13+
pub fn load_docx_preserving(bytes: &[u8]) -> Result<OoxmlDocument, String> {
14+
use ooxmlsdk::parts::wordprocessing_document::WordprocessingDocument;
15+
16+
let reader = Cursor::new(bytes);
17+
let docx = WordprocessingDocument::new(reader)
18+
.map_err(|e| format!("Failed to parse DOCX: {e}"))?;
19+
20+
let xml_str = docx
21+
.main_document_part
22+
.root_element
23+
.to_xml()
24+
.unwrap_or_default();
25+
26+
let paragraphs = extract_paragraphs(&xml_str);
27+
28+
Ok(OoxmlDocument {
29+
original_bytes: bytes.to_vec(),
30+
paragraphs,
31+
})
32+
}
33+
34+
fn extract_paragraphs(xml: &str) -> Vec<ParagraphInfo> {
35+
let mut paragraphs = Vec::new();
36+
let mut para_index = 0;
37+
38+
let mut pos = 0;
39+
while let Some(p_start) = xml[pos..].find("<w:p") {
40+
let abs_start = pos + p_start;
41+
42+
if let Some(p_end_rel) = xml[abs_start..].find("</w:p>") {
43+
let abs_end = abs_start + p_end_rel + 6;
44+
let para_content = &xml[abs_start..abs_end];
45+
46+
let text = extract_text_from_paragraph(para_content);
47+
if !text.trim().is_empty() {
48+
paragraphs.push(ParagraphInfo {
49+
text,
50+
index: para_index,
51+
});
52+
}
53+
para_index += 1;
54+
pos = abs_end;
55+
} else {
56+
break;
57+
}
58+
}
59+
60+
paragraphs
61+
}
62+
63+
fn extract_text_from_paragraph(para_xml: &str) -> String {
64+
let mut text = String::new();
65+
let mut pos = 0;
66+
67+
while let Some(t_start) = para_xml[pos..].find("<w:t") {
68+
let abs_start = pos + t_start;
69+
70+
if let Some(content_start_rel) = para_xml[abs_start..].find('>') {
71+
let abs_content_start = abs_start + content_start_rel + 1;
72+
73+
if let Some(t_end_rel) = para_xml[abs_content_start..].find("</w:t>") {
74+
let content = &para_xml[abs_content_start..abs_content_start + t_end_rel];
75+
text.push_str(content);
76+
pos = abs_content_start + t_end_rel + 6;
77+
} else {
78+
break;
79+
}
80+
} else {
81+
break;
82+
}
83+
}
84+
85+
unescape_xml(&text)
86+
}
87+
88+
/// Decodes XML character and entity references in `text`.
///
/// Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) and
/// numeric references (`&#65;`, `&#x41;`). The input is decoded in a single pass, so the
/// output of one replacement is never decoded again: `&amp;lt;` yields `&lt;`, not `<`.
/// Anything that is not a recognised reference is copied through unchanged.
fn unescape_xml(text: &str) -> String {
    // Longest reference we attempt to decode, in bytes, counting `&` and `;`. Bounding the
    // search keeps input with many stray `&` characters linear.
    const MAX_REFERENCE_LEN: usize = 16;

    /// Decodes the part of a reference between `&` and `;`.
    fn decode(body: &str) -> Option<char> {
        match body {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            _ => {
                let (digits, radix) = match body.strip_prefix("#x") {
                    Some(hex) => (hex, 16),
                    None => (body.strip_prefix('#')?, 10),
                };
                if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                    return None;
                }
                u32::from_str_radix(digits, radix)
                    .ok()
                    .and_then(char::from_u32)
            }
        }
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];

        let decoded = tail
            .bytes()
            .take(MAX_REFERENCE_LEN)
            .position(|b| b == b';')
            .and_then(|semi| decode(&tail[1..semi]).map(|ch| (ch, semi + 1)));

        match decoded {
            Some((ch, consumed)) => {
                out.push(ch);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }

    out.push_str(rest);
    out
}
95+
96+
/// Escapes `text` for use as XML character data or an attribute value.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by their predefined entities. Characters that
/// XML 1.0 does not allow in a document at all (control characters other than tab, line
/// feed and carriage return, plus U+FFFE and U+FFFF) are dropped, because writing them
/// would make the surrounding part unparseable.
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            '\t' | '\n' | '\r' => escaped.push(ch),
            // Not representable in XML 1.0, even as a character reference.
            c if c < '\u{20}' || c == '\u{FFFE}' || c == '\u{FFFF}' => {}
            c => escaped.push(c),
        }
    }

    escaped
}
103+
104+
pub fn save_docx_preserving(original_bytes: &[u8]) -> Result<Vec<u8>, String> {
105+
use ooxmlsdk::parts::wordprocessing_document::WordprocessingDocument;
106+
107+
let reader = Cursor::new(original_bytes);
108+
let docx = WordprocessingDocument::new(reader)
109+
.map_err(|e| format!("Failed to parse DOCX: {e}"))?;
110+
111+
let mut output = Cursor::new(Vec::new());
112+
docx.save(&mut output)
113+
.map_err(|e| format!("Failed to save DOCX: {e}"))?;
114+
115+
Ok(output.into_inner())
116+
}
117+
118+
pub fn update_docx_text(
119+
original_bytes: &[u8],
120+
new_paragraphs: &[String],
121+
) -> Result<Vec<u8>, String> {
122+
use std::io::{Read, Write};
123+
use zip::{write::SimpleFileOptions, ZipArchive, ZipWriter};
124+
125+
let reader = Cursor::new(original_bytes);
126+
let mut archive =
127+
ZipArchive::new(reader).map_err(|e| format!("Failed to open DOCX archive: {e}"))?;
128+
129+
let mut output_buf = Cursor::new(Vec::new());
130+
{
131+
let mut zip_writer = ZipWriter::new(&mut output_buf);
132+
let options =
133+
SimpleFileOptions::default().compression_method(zip::CompressionMethod::Deflated);
134+
135+
for i in 0..archive.len() {
136+
let mut file = archive
137+
.by_index(i)
138+
.map_err(|e| format!("Failed to read archive entry: {e}"))?;
139+
140+
let name = file.name().to_string();
141+
142+
if name == "word/document.xml" {
143+
let mut content = String::new();
144+
file.read_to_string(&mut content)
145+
.map_err(|e| format!("Failed to read document.xml: {e}"))?;
146+
147+
let modified_content = replace_paragraph_texts(&content, new_paragraphs);
148+
149+
zip_writer
150+
.start_file(&name, options)
151+
.map_err(|e| format!("Failed to start file in zip: {e}"))?;
152+
zip_writer
153+
.write_all(modified_content.as_bytes())
154+
.map_err(|e| format!("Failed to write document.xml: {e}"))?;
155+
} else {
156+
let mut buf = Vec::new();
157+
file.read_to_end(&mut buf)
158+
.map_err(|e| format!("Failed to read file: {e}"))?;
159+
160+
zip_writer
161+
.start_file(&name, options)
162+
.map_err(|e| format!("Failed to start file in zip: {e}"))?;
163+
zip_writer
164+
.write_all(&buf)
165+
.map_err(|e| format!("Failed to write file: {e}"))?;
166+
}
167+
}
168+
169+
zip_writer
170+
.finish()
171+
.map_err(|e| format!("Failed to finish zip: {e}"))?;
172+
}
173+
174+
Ok(output_buf.into_inner())
175+
}
176+
177+
fn replace_paragraph_texts(xml: &str, new_paragraphs: &[String]) -> String {
178+
let mut result = xml.to_string();
179+
let mut para_idx = 0;
180+
let mut search_pos = 0;
181+
182+
while let Some(p_start) = result[search_pos..]
183+
.find("<w:p ")
184+
.or_else(|| result[search_pos..].find("<w:p>"))
185+
{
186+
let abs_start = search_pos + p_start;
187+
188+
if let Some(p_end_rel) = result[abs_start..].find("</w:p>") {
189+
let abs_end = abs_start + p_end_rel + 6;
190+
let para_content = result[abs_start..abs_end].to_string();
191+
192+
if para_content.contains("<w:t") {
193+
if para_idx < new_paragraphs.len() {
194+
let new_para = replace_first_text_run(&para_content, &new_paragraphs[para_idx]);
195+
let new_len = new_para.len();
196+
result = format!("{}{}{}", &result[..abs_start], new_para, &result[abs_end..]);
197+
search_pos = abs_start + new_len;
198+
} else {
199+
search_pos = abs_end;
200+
}
201+
para_idx += 1;
202+
} else {
203+
search_pos = abs_end;
204+
}
205+
} else {
206+
break;
207+
}
208+
}
209+
210+
result
211+
}
212+
213+
fn replace_first_text_run(para_xml: &str, new_text: &str) -> String {
214+
let mut result = para_xml.to_string();
215+
let mut found_first = false;
216+
217+
let mut search_pos = 0;
218+
while let Some(t_start) = result[search_pos..].find("<w:t") {
219+
let abs_start = search_pos + t_start;
220+
221+
if let Some(tag_end_rel) = result[abs_start..].find('>') {
222+
let abs_content_start = abs_start + tag_end_rel + 1;
223+
224+
if let Some(t_end_rel) = result[abs_content_start..].find("</w:t>") {
225+
let abs_content_end = abs_content_start + t_end_rel;
226+
227+
if !found_first {
228+
let escaped = escape_xml(new_text);
229+
result = format!(
230+
"{}{}{}",
231+
&result[..abs_content_start],
232+
escaped,
233+
&result[abs_content_end..]
234+
);
235+
found_first = true;
236+
search_pos = abs_content_start + escaped.len() + 6;
237+
} else {
238+
result = format!("{}{}", &result[..abs_content_start], &result[abs_content_end..]);
239+
search_pos = abs_content_start;
240+
}
241+
} else {
242+
break;
243+
}
244+
} else {
245+
break;
246+
}
247+
}
248+
249+
result
250+
}

0 commit comments

Comments
 (0)