medallia · wangyum · Nov 20, 2015 · Nov 25, 2015 · Nov 25, 2015
diff --git a/src/main/java/com/medallia/word2vec/Word2VecModel.java b/src/main/java/com/medallia/word2vec/Word2VecModel.java
@@ -18,6 +18,7 @@
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
+import com.google.common.primitives.Bytes;
 import com.google.common.primitives.Doubles;
 import com.medallia.word2vec.thrift.Word2VecModelThrift;
 import com.medallia.word2vec.util.Common;
@@ -41,6 +42,8 @@ public class Word2VecModel {
 	final int layerSize;
 	final DoubleBuffer vectors;
 	private final static long ONE_GB = 1024 * 1024 * 1024;
+	private final static byte LINE_SEPARATOR = 10; // ASCII line feed; stands in for the '\n' char literal the read loops used to compare against
+	private final static byte COLUME_SEPARATOR = 32; // ASCII space; stands in for the ' ' char literal. NOTE(review): "COLUME" looks like a typo for "COLUMN"; renaming must also update every use site
 
 	Word2VecModel(Iterable<String> vocab, int layerSize, DoubleBuffer vectors) {
 		this.vocab = ImmutableList.copyOf(vocab);
@@ -137,14 +140,14 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder, Profilin
 			// every gigabyte. To calculate offsets correctly, we have to keep track how many gigabytes
 			// we've already skipped. That's what this is for.
 
-			StringBuilder sb = new StringBuilder();
-			char c = (char) buffer.get();
-			while (c != '\n') {
-				sb.append(c);
-				c = (char) buffer.get();
+			List<Byte> list = new ArrayList<Byte>();
+			byte c = buffer.get();
+			while (c != LINE_SEPARATOR) {
+				list.add(c);
+				c = buffer.get();
 			}
-			String firstLine = sb.toString();
-			int index = firstLine.indexOf(' ');
+			String firstLine = new String(Bytes.toArray(list));
+			int index = firstLine.indexOf(COLUME_SEPARATOR);
 			Preconditions.checkState(index != -1,
 					"Expected a space in the first line of file '%s': '%s'",
 					file.getAbsolutePath(), firstLine);
@@ -163,17 +166,17 @@ public static Word2VecModel fromBinFile(File file, ByteOrder byteOrder, Profilin
 			final float[] floats = new float[layerSize];
 			for (int lineno = 0; lineno < vocabSize; lineno++) {
 				// read vocab
-				sb.setLength(0);
-				c = (char) buffer.get();
-				while (c != ' ') {
+				list.clear();
+				c = buffer.get();
+				while (c != COLUME_SEPARATOR) {
 					// ignore newlines in front of words (some binary files have newline,
 					// some don't)
-					if (c != '\n') {
-						sb.append(c);
+					if (c != LINE_SEPARATOR) {
+						list.add(c);
 					}
-					c = (char) buffer.get();
+					c = buffer.get();
 				}
-				vocabs.add(sb.toString());
+				vocabs.add(new String(Bytes.toArray(list)));
 
 				// read vector
 				final FloatBuffer floatBuffer = buffer.asFloatBuffer();

diff --git a/src/test/resources/com/medallia/word2vec/chinese.bin b/src/test/resources/com/medallia/word2vec/chinese.bin