Skip to content
This repository was archived by the owner on Jan 24, 2024. It is now read-only.

Commit 928d178

Browse files
authored
Chap 4 word2vec: polish chapter content (#541)
1 parent 1b3bb17 commit 928d178

File tree

3 files changed

+55
-42
lines changed

3 files changed

+55
-42
lines changed

04.word2vec/README.md

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,9 @@ def train(use_cuda, train_program, params_dirname):
329329
if event.step % 10 == 0:
330330
print "Step %d: Average Cost %f" % (event.step, avg_cost)
331331

332-
# If average cost is lower than 5.0, we consider the model good enough to stop.
332+
# If average cost is lower than 5.8, we consider the model good enough to stop.
333+
# Note 5.8 is a relatively high value. In order to get a better model, one should
334+
# aim for avg_cost lower than 3.5. But the training could take a longer time.
333335
if avg_cost < 5.8:
334336
trainer.save_params(params_dirname)
335337
trainer.stop()
@@ -383,16 +385,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
383385
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
384386
# one lod_level and there is only one sequence of one word on this level.
385387
# Note that lod info should be a list of lists.
386-
lod1 = [[211]] # 'among'
387-
lod2 = [[6]] # 'a'
388-
lod3 = [[96]] # 'group'
389-
lod4 = [[4]] # 'of'
390-
base_shape = [1]
391388

392-
first_word = fluid.create_lod_tensor(lod1, base_shape, place)
393-
second_word = fluid.create_lod_tensor(lod2, base_shape, place)
394-
third_word = fluid.create_lod_tensor(lod3, base_shape, place)
395-
fourth_word = fluid.create_lod_tensor(lod4, base_shape, place)
389+
data1 = [[211]] # 'among'
390+
data2 = [[6]] # 'a'
391+
data3 = [[96]] # 'group'
392+
data4 = [[4]] # 'of'
393+
lod = [[1]]
394+
395+
first_word = fluid.create_lod_tensor(data1, lod, place)
396+
second_word = fluid.create_lod_tensor(data2, lod, place)
397+
third_word = fluid.create_lod_tensor(data3, lod, place)
398+
fourth_word = fluid.create_lod_tensor(data4, lod, place)
396399

397400
result = inferencer.infer(
398401
{
@@ -406,16 +409,18 @@ def infer(use_cuda, inference_program, params_dirname=None):
406409
print(numpy.array(result[0]))
407410
most_possible_word_index = numpy.argmax(result[0])
408411
print(most_possible_word_index)
409-
print([key for key, value in word_dict.iteritems() if value == most_possible_word_index][0])
412+
print([
413+
key for key, value in word_dict.iteritems()
414+
if value == most_possible_word_index
415+
][0])
410416
```
411417

412-
When we spent 30 mins in training, the output is like below, which means the next word for `among a group of` is `unknown`. After several hours training, it gives a meaningful prediction as `workers`.
418+
After training for 3 minutes, the output looks like the one below, which means the next word for `among a group of` is `a`. If we train the model for a longer time, it will give a meaningful prediction of `workers`.
413419

414420
```text
415-
[[4.0056456e-02 5.4810006e-02 5.3107393e-05 ... 1.0061498e-04
416-
8.9233123e-05 1.5757295e-01]]
417-
2072
418-
<unk>
421+
[[0.00106646 0.0007907 0.00072041 ... 0.00049024 0.00041355 0.00084464]]
422+
6
423+
a
419424
```
420425

421426
The main entrance of the program is fairly simple:

04.word2vec/index.html

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,9 @@
371371
if event.step % 10 == 0:
372372
print "Step %d: Average Cost %f" % (event.step, avg_cost)
373373

374-
# If average cost is lower than 5.0, we consider the model good enough to stop.
374+
# If average cost is lower than 5.8, we consider the model good enough to stop.
375+
# Note 5.8 is a relatively high value. In order to get a better model, one should
376+
# aim for avg_cost lower than 3.5. But the training could take a longer time.
375377
if avg_cost < 5.8:
376378
trainer.save_params(params_dirname)
377379
trainer.stop()
@@ -425,16 +427,17 @@
425427
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
426428
# one lod_level and there is only one sequence of one word on this level.
427429
# Note that lod info should be a list of lists.
428-
lod1 = [[211]] # 'among'
429-
lod2 = [[6]] # 'a'
430-
lod3 = [[96]] # 'group'
431-
lod4 = [[4]] # 'of'
432-
base_shape = [1]
433430

434-
first_word = fluid.create_lod_tensor(lod1, base_shape, place)
435-
second_word = fluid.create_lod_tensor(lod2, base_shape, place)
436-
third_word = fluid.create_lod_tensor(lod3, base_shape, place)
437-
fourth_word = fluid.create_lod_tensor(lod4, base_shape, place)
431+
data1 = [[211]] # 'among'
432+
data2 = [[6]] # 'a'
433+
data3 = [[96]] # 'group'
434+
data4 = [[4]] # 'of'
435+
lod = [[1]]
436+
437+
first_word = fluid.create_lod_tensor(data1, lod, place)
438+
second_word = fluid.create_lod_tensor(data2, lod, place)
439+
third_word = fluid.create_lod_tensor(data3, lod, place)
440+
fourth_word = fluid.create_lod_tensor(data4, lod, place)
438441

439442
result = inferencer.infer(
440443
{
@@ -448,16 +451,18 @@
448451
print(numpy.array(result[0]))
449452
most_possible_word_index = numpy.argmax(result[0])
450453
print(most_possible_word_index)
451-
print([key for key, value in word_dict.iteritems() if value == most_possible_word_index][0])
454+
print([
455+
key for key, value in word_dict.iteritems()
456+
if value == most_possible_word_index
457+
][0])
452458
```
453459

454-
When we spent 30 mins in training, the output is like below, which means the next word for `among a group of` is `unknown`. After several hours training, it gives a meaningful prediction as `workers`.
460+
After training for 3 minutes, the output looks like the one below, which means the next word for `among a group of` is `board`. If we train the model for a longer time, it will give a meaningful prediction of `workers`.
455461

456462
```text
457-
[[4.0056456e-02 5.4810006e-02 5.3107393e-05 ... 1.0061498e-04
458-
8.9233123e-05 1.5757295e-01]]
459-
2072
460-
<unk>
463+
[[0.00144043 0.00073983 0.00042264 ... 0.00061815 0.00038701 0.00099838]]
464+
142
465+
board
461466
```
462467

463468
The main entrance of the program is fairly simple:

04.word2vec/train.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ def event_handler(event):
107107
if event.step % 10 == 0:
108108
print "Step %d: Average Cost %f" % (event.step, avg_cost)
109109

110+
# If average cost is lower than 5.8, we consider the model good enough to stop.
111+
# Note 5.8 is a relatively high value. In order to get a better model, one should
112+
# aim for avg_cost lower than 3.5. But the training could take a longer time.
110113
if avg_cost < 5.8:
111114
trainer.save_params(params_dirname)
112115
trainer.stop()
@@ -138,17 +141,17 @@ def infer(use_cuda, inference_program, params_dirname=None):
138141
# detail (lod) info of each LoDtensor should be [[1]] meaning there is only
139142
# one lod_level and there is only one sequence of one word on this level.
140143
# Note that lod info should be a list of lists.
144+
145+
data1 = [[211]] # 'among'
146+
data2 = [[6]] # 'a'
147+
data3 = [[96]] # 'group'
148+
data4 = [[4]] # 'of'
141149
lod = [[1]]
142-
base_shape = [1]
143-
# The range of random integers is [low, high]
144-
first_word = fluid.create_random_int_lodtensor(
145-
lod, base_shape, place, low=0, high=dict_size - 1)
146-
second_word = fluid.create_random_int_lodtensor(
147-
lod, base_shape, place, low=0, high=dict_size - 1)
148-
third_word = fluid.create_random_int_lodtensor(
149-
lod, base_shape, place, low=0, high=dict_size - 1)
150-
fourth_word = fluid.create_random_int_lodtensor(
151-
lod, base_shape, place, low=0, high=dict_size - 1)
150+
151+
first_word = fluid.create_lod_tensor(data1, lod, place)
152+
second_word = fluid.create_lod_tensor(data2, lod, place)
153+
third_word = fluid.create_lod_tensor(data3, lod, place)
154+
fourth_word = fluid.create_lod_tensor(data4, lod, place)
152155

153156
result = inferencer.infer(
154157
{

0 commit comments

Comments
 (0)