
Commit 97708ad

[Dy2stat] Add word2vec as unittest (#24944)
[Dy2stat] Add word2vec as unittest
1 parent 0eb1b0b commit 97708ad
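This commit adds a skip-gram word2vec model as a Dy2stat unit test: the same model is trained once in imperative (dygraph) mode and once with @declarative transcribed to a static graph, and the test asserts the two loss curves match. Below is a minimal sketch of the toggle the test relies on; ProgramTranslator, declarative, and the fluid calls come from the file itself, while TinyLayer and train_once are illustrative names, not part of the commit.

# Minimal sketch (not part of the commit): how ProgramTranslator().enable()
# switches a @declarative forward between dygraph execution and
# static-graph transcription. TinyLayer/train_once are illustrative names.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import ProgramTranslator, declarative


class TinyLayer(fluid.dygraph.Layer):
    def __init__(self):
        super(TinyLayer, self).__init__()

    @declarative
    def forward(self, x):
        # Deterministic computation, so both modes give the same value.
        return fluid.layers.reduce_sum(fluid.layers.elementwise_mul(x, x))


def train_once(to_static):
    ProgramTranslator().enable(to_static)  # global dygraph/static switch
    with fluid.dygraph.guard(fluid.CPUPlace()):
        layer = TinyLayer()
        x = fluid.dygraph.to_variable(np.ones([2, 3]).astype("float32"))
        return layer(x).numpy()


assert np.allclose(train_once(to_static=False), train_once(to_static=True))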

File tree

1 file changed: +306, -0 lines changed

@@ -0,0 +1,306 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os
import sys
import requests
from collections import OrderedDict
import math
import random
import numpy as np
import paddle
import paddle.fluid as fluid
import unittest

from paddle.fluid.dygraph.nn import Embedding
from paddle.fluid.dygraph import ProgramTranslator
from paddle.fluid.dygraph import declarative

def fake_text():
    corpus = []
    for i in range(100):
        line = "i love paddlepaddle"
        corpus.append(line)
    return corpus


corpus = fake_text()


def data_preprocess(corpus):
    new_corpus = []
    for line in corpus:
        line = line.strip().lower()
        line = line.split(" ")
        new_corpus.append(line)

    return new_corpus


corpus = data_preprocess(corpus)

def build_dict(corpus, min_freq=3):
    word_freq_dict = dict()
    for line in corpus:
        for word in line:
            if word not in word_freq_dict:
                word_freq_dict[word] = 0
            word_freq_dict[word] += 1

    word_freq_dict = sorted(
        word_freq_dict.items(), key=lambda x: x[1], reverse=True)

    word2id_dict = dict()
    word2id_freq = dict()
    id2word_dict = dict()

    word2id_freq[0] = 1.
    word2id_dict['[oov]'] = 0
    id2word_dict[0] = '[oov]'

    for word, freq in word_freq_dict:

        if freq < min_freq:
            word2id_freq[0] += freq
            continue

        curr_id = len(word2id_dict)
        word2id_dict[word] = curr_id
        word2id_freq[word2id_dict[word]] = freq
        id2word_dict[curr_id] = word

    return word2id_freq, word2id_dict, id2word_dict

word2id_freq, word2id_dict, id2word_dict = build_dict(corpus)
vocab_size = len(word2id_freq)
print("there are totally %d different words in the corpus" % vocab_size)
for _, (word, word_id) in zip(range(50), word2id_dict.items()):
    print("word %s, its id %d, its word freq %d" %
          (word, word_id, word2id_freq[word_id]))

def convert_corpus_to_id(corpus, word2id_dict):
    new_corpus = []
    for line in corpus:
        new_line = [
            word2id_dict[word]
            if word in word2id_dict else word2id_dict['[oov]'] for word in line
        ]
        new_corpus.append(new_line)
    return new_corpus


corpus = convert_corpus_to_id(corpus, word2id_dict)

def subsampling(corpus, word2id_freq):
    def keep(word_id):
        return random.uniform(0, 1) < math.sqrt(1e-4 / word2id_freq[word_id] *
                                                len(corpus))

    new_corpus = []
    for line in corpus:
        new_line = [word for word in line if keep(word)]
        new_corpus.append(line)
    return new_corpus


corpus = subsampling(corpus, word2id_freq)

def build_data(corpus,
               word2id_dict,
               word2id_freq,
               max_window_size=3,
               negative_sample_num=10):

    dataset = []

    for line in corpus:
        for center_word_idx in range(len(line)):
            window_size = random.randint(1, max_window_size)
            center_word = line[center_word_idx]

            positive_word_range = (max(0, center_word_idx - window_size), min(
                len(line) - 1, center_word_idx + window_size))
            positive_word_candidates = [
                line[idx]
                for idx in range(positive_word_range[0], positive_word_range[1]
                                 + 1)
                if idx != center_word_idx and line[idx] != line[center_word_idx]
            ]

            if not positive_word_candidates:
                continue

            for positive_word in positive_word_candidates:
                dataset.append((center_word, positive_word, 1))

            i = 0
            while i < negative_sample_num:
                negative_word_candidate = random.randint(0, vocab_size - 1)

                if negative_word_candidate not in positive_word_candidates:
                    dataset.append((center_word, negative_word_candidate, 0))
                    i += 1

    return dataset

dataset = build_data(corpus, word2id_dict, word2id_freq)
for _, (center_word, target_word, label) in zip(range(50), dataset):
    print("center_word %s, target %s, label %d" %
          (id2word_dict[center_word], id2word_dict[target_word], label))

def build_batch(dataset, batch_size, epoch_num):

    center_word_batch = []
    target_word_batch = []
    label_batch = []
    eval_word_batch = []

    for epoch in range(epoch_num):
        for center_word, target_word, label in dataset:
            center_word_batch.append([center_word])
            target_word_batch.append([target_word])
            label_batch.append([label])

            if len(eval_word_batch) < 5:
                eval_word_batch.append([random.randint(0, 99)])
            elif len(eval_word_batch) < 10:
                eval_word_batch.append([random.randint(0, vocab_size - 1)])

            if len(center_word_batch) == batch_size:
                yield np.array(center_word_batch).astype("int64"), np.array(
                    target_word_batch).astype("int64"), np.array(
                        label_batch).astype("float32"), np.array(
                            eval_word_batch).astype("int64")
                center_word_batch = []
                target_word_batch = []
                label_batch = []
                eval_word_batch = []

    if len(center_word_batch) > 0:
        yield np.array(center_word_batch).astype("int64"), np.array(
            target_word_batch).astype("int64"), np.array(label_batch).astype(
                "float32"), np.array(eval_word_batch).astype("int64")

class SkipGram(fluid.dygraph.Layer):
    def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1):
        super(SkipGram, self).__init__(name_scope)
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.embedding = Embedding(
            size=[self.vocab_size, self.embedding_size],
            dtype='float32',
            param_attr=fluid.ParamAttr(
                name='embedding_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-0.5 / self.embedding_size,
                    high=0.5 / self.embedding_size)))

        self.embedding_out = Embedding(
            size=[self.vocab_size, self.embedding_size],
            dtype='float32',
            param_attr=fluid.ParamAttr(
                name='embedding_out_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-0.5 / self.embedding_size,
                    high=0.5 / self.embedding_size)))

    @declarative
    def forward(self, center_words, target_words, label):
        center_words_emb = self.embedding(center_words)
        target_words_emb = self.embedding_out(target_words)

        # center_words_emb = [batch_size, embedding_size]
        # target_words_emb = [batch_size, embedding_size]
        word_sim = fluid.layers.elementwise_mul(center_words_emb,
                                                target_words_emb)
        word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)

        pred = fluid.layers.sigmoid(word_sim)

        loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
        loss = fluid.layers.reduce_mean(loss)

        return pred, loss

batch_size = 512
epoch_num = 1
embedding_size = 200
learning_rate = 1e-3
total_steps = len(dataset) * epoch_num // batch_size

def train(to_static):
    program_translator = ProgramTranslator()
    program_translator.enable(to_static)

    random.seed(0)
    np.random.seed(0)

    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
    ) else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000

        skip_gram_model = SkipGram("skip_gram_model", vocab_size,
                                   embedding_size)
        adam = fluid.optimizer.AdamOptimizer(
            learning_rate=learning_rate,
            parameter_list=skip_gram_model.parameters())

        step = 0
        ret = []
        for center_words, target_words, label, eval_words in build_batch(
                dataset, batch_size, epoch_num):
            center_words_var = fluid.dygraph.to_variable(center_words)
            target_words_var = fluid.dygraph.to_variable(target_words)
            label_var = fluid.dygraph.to_variable(label)
            pred, loss = skip_gram_model(center_words_var, target_words_var,
                                         label_var)

            loss.backward()
            adam.minimize(loss)
            skip_gram_model.clear_gradients()

            step += 1
            mean_loss = np.mean(loss.numpy())
            print("step %d / %d, loss %f" % (step, total_steps, mean_loss))
            ret.append(mean_loss)
        return np.array(ret)

class TestWord2Vec(unittest.TestCase):
    def test_dygraph_static_same_loss(self):
        dygraph_loss = train(to_static=False)
        static_loss = train(to_static=True)
        self.assertTrue(
            np.allclose(dygraph_loss, static_loss),
            msg="dygraph_loss: {} \nstatic_loss: {}".format(dygraph_loss,
                                                            static_loss))


if __name__ == '__main__':
    unittest.main()
