@@ -40,7 +40,7 @@ def convert_config(fairseq_cfg, vocab_size, cfg):
     cfg.MODEL.shared_embed = fairseq_cfg.share_all_embeddings
     cfg.MODEL.scale_embed = not fairseq_cfg.no_scale_embedding
     cfg.MODEL.tie_weights = fairseq_cfg.share_decoder_input_output_embed
-    cfg.MODEL.layernorm_embedding = fairseq_cfg.layernorm_embedding
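+    # the Gluon config names fairseq's layernorm_embedding option data_norm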
+    cfg.MODEL.data_norm = fairseq_cfg.layernorm_embedding
     cfg.MODEL.pooler_activation = fairseq_cfg.pooler_activation_fn
     cfg.MODEL.layer_norm_eps = 1E-5
     cfg.MODEL.dropout = fairseq_cfg.dropout
@@ -111,26 +111,6 @@ def convert_attention(num_layers,
             gl_qkv_bias.set_data(
                 np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))

-    def convert_embeddings(fairseq_prefix, gluon_prefix):
-        for k, v in [
-            ('.embed_tokens.weight', '_embed_layer.weight'),
-            ('.layernorm_embedding.weight', '_embed_ln.gamma'),
-            ('.layernorm_embedding.bias', '_embed_ln.beta'),
-        ]:
-            fs_name = fairseq_prefix + k
-            gl_name = gluon_prefix + v
-            all_keys.remove(gl_name)
-            gluon_params[gl_name].set_data(
-                fairseq_params[fs_name].cpu().numpy())
-
-        # position embed weight
-        padding_idx = fairseq_model.task.dictionary.pad_index
-        fs_pos_embed_name = fairseq_prefix + '.embed_positions.weight'
-        gl_pos_embed_name = gluon_prefix + '_pos_embed_layer._embed.weight'
-        all_keys.remove(gl_pos_embed_name)
-        gluon_params[gl_pos_embed_name].set_data(
-            fairseq_params[fs_pos_embed_name].cpu().numpy()[padding_idx + 1:, :])
-
     def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
         # convert feed forward layer in encoder
         for layer_id in range(num_layers):
@@ -150,11 +130,33 @@ def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
                 gluon_params[gl_name].set_data(
                     fairseq_params[fs_name].cpu().numpy())

+    print('converting embedding params')
+    padding_idx = fairseq_model.task.dictionary.pad_index
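+    # fairseq's learned position embeddings start at padding_idx + 1, so the leading rows are dropped below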
+    for fs_name, gl_name in [
+        ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
+        ('model.encoder.embed_positions.weight', 'src_pos_embed_layer._embed.weight'),
+        ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
+        ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
+        ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
+        ('model.decoder.embed_positions.weight', 'tgt_pos_embed_layer._embed.weight'),
+        ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
+        ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
+        # final projection in decoder
+        ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
+    ]:
+        all_keys.remove(gl_name)
+        if 'embed_positions' in fs_name:
+            # position embed weight
+            gluon_params[gl_name].set_data(
+                fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
+        else:
+            gluon_params[gl_name].set_data(
+                fairseq_params[fs_name].cpu().numpy())
+
     print('converting encoder params')
     encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
     convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
     convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
-    convert_embeddings('model.encoder', 'src')
     for layer_id in range(encoder_num_layers):
         for k, v in [
             ('self_attn.out_proj.weight', 'attention_proj.weight'),
@@ -170,6 +172,7 @@ def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
             gluon_params[gl_name].set_data(
                 fairseq_params[fs_name].cpu().numpy())

+    print('converting decoder params')
     decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
     convert_attention(decoder_num_layers, 'model.decoder', 'decoder',
                       gluon_attn_prefix='attn_in_qkv')
@@ -201,14 +204,6 @@ def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
             gluon_params[gl_name].set_data(
                 fairseq_params[fs_name].cpu().numpy())

-    convert_embeddings('model.decoder', 'tgt')
-    # final projection in decoder
-    for fs_name, gl_name in [
-        ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
-    ]:
-        all_keys.remove(gl_name)
-        gluon_params[gl_name].set_data(
-            fairseq_params[fs_name].cpu().numpy())
     assert len(all_keys) == 0, 'parameters missing from tensorflow checkpoint'

     # check parameters sharing if share_decoder_input_output_embed is true