1313
1414#include " util/encoding.h"
1515#include " util/css_const.h"
16+ #include " util/unicode.h"
1617
1718namespace pdf2htmlEX {
1819
@@ -32,6 +33,7 @@ HTMLTextLine::HTMLTextLine (const HTMLLineState & line_state, const Param & para
3233 ,clip_x1(0 )
3334 ,clip_y1(0 )
3435 ,width(0 )
36+ ,last_output_unicode(0 )
3537{ }
3638
3739void HTMLTextLine::append_unicodes (const Unicode * u, int l, double width)
@@ -88,16 +90,25 @@ void HTMLTextLine::dump_char(std::ostream & out, int pos)
8890 int c = text[pos];
8991 if (c > 0 )
9092 {
91- Unicode u = c;
92- writeUnicodes (out, &u, 1 );
93+ dump_unicode (out, c);
9394 }
9495 else if (c < 0 )
9596 {
9697 auto dt = decomposed_text[- c - 1 ];
97- writeUnicodes (out, &dt.front (), dt.size ());
98+ for (auto it = dt.begin (), end = dt.end (); it != end; it++)
99+ dump_unicode (out, *it);
98100 }
99101}
100102
103+ void HTMLTextLine::dump_unicode (std::ostream & out, Unicode u)
104+ {
105+ // ZWSP following space can be optimized out.
106+ if (u == zero_width_space && last_output_unicode == ' ' )
107+ return ;
108+ writeUnicodes (out, &u, 1 );
109+ last_output_unicode = u;
110+ }
111+
101112void HTMLTextLine::dump_chars (ostream & out, int begin, int len)
102113{
103114 static const Color transparent (0 , 0 , 0 , true );
@@ -162,6 +173,7 @@ void HTMLTextLine::dump_text(ostream & out)
162173 << " " << CSS::BOTTOM_CN << all_manager.bottom .install (line_state.y - clip_y1)
163174 ;
164175 // it will be closed by the first state
176+ last_output_unicode = 0 ;
165177 }
166178
167179 std::vector<State*> stack;
@@ -249,8 +261,7 @@ void HTMLTextLine::dump_text(ostream & out)
249261 double space_off = state_iter1->single_space_offset ();
250262 if (std::abs (target - space_off) <= param.h_eps )
251263 {
252- Unicode u = ' ' ;
253- writeUnicodes (out, &u, 1 );
264+ dump_unicode (out, ' ' );
254265 actual_offset = space_off;
255266 done = true ;
256267 }
@@ -269,7 +280,10 @@ void HTMLTextLine::dump_text(ostream & out)
269280 double threshold = state_iter1->em_size () * (param.space_threshold );
270281
271282 out << " <span class=\" " << CSS::WHITESPACE_CN
272- << ' ' << CSS::WHITESPACE_CN << wid << " \" >" << (target > (threshold - EPS) ? " " : " " ) << " </span>" ;
283+ << ' ' << CSS::WHITESPACE_CN << wid << " \" >" ;
284+ if (target > (threshold - EPS))
285+ dump_unicode (out, ' ' );
286+ out << " </span>" ;
273287 }
274288 }
275289 }
0 commit comments