Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9559dff
add jieba
JesseyXujin Nov 19, 2019
48275cc
add readme and test.py
JesseyXujin Nov 19, 2019
6310972
fix bug
JesseyXujin Nov 19, 2019
722c98a
update for easy setup
xyzhou-puck Nov 19, 2019
8bdc465
Merge pull request #1 from xyzhou-puck/paddle_demo
JesseyXujin Nov 19, 2019
66b7f87
update path
JesseyXujin Nov 20, 2019
cef556d
add paddle pos seg
JesseyXujin Nov 21, 2019
0174b05
change readme
JesseyXujin Nov 21, 2019
28764cb
change readme
JesseyXujin Nov 21, 2019
8b668a7
modify readme
JesseyXujin Nov 21, 2019
9e0c9ef
modify posseg
JesseyXujin Nov 25, 2019
e769671
modify readme
JesseyXujin Nov 25, 2019
4994aa8
fix bug
JesseyXujin Nov 25, 2019
4c1c256
fix bug
JesseyXujin Nov 25, 2019
87cd3b6
fix word_lens problem
JesseyXujin Nov 25, 2019
78064dc
modify cpk
JesseyXujin Nov 26, 2019
72c7746
fix bugs
JesseyXujin Nov 26, 2019
b238975
fix unicode problem
JesseyXujin Nov 26, 2019
c226f67
add try catch import
JesseyXujin Nov 26, 2019
a6bd564
update install check
Dec 5, 2019
166812f
delete symbols modification
Dec 5, 2019
6d7cfa7
modify paddle test pipy
Dec 10, 2019
6d4510a
add init.py in posseg
Dec 10, 2019
6519433
set posseg paddle default False
Dec 10, 2019
c49cbeb
change model_baseline
Dec 10, 2019
ad9a2b7
support windows int64
Dec 11, 2019
2841841
modify paddle install command
Dec 11, 2019
64df8f2
modify readme
Dec 11, 2019
a49ff1a
modify warning
Dec 12, 2019
de53d3a
remove conf
Dec 12, 2019
cb2c8fb
remove test pypi, use formal pypi
Dec 23, 2019
a7aa781
modify details for support OOV
JesseyXujin Dec 23, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ jieba
# encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", use_paddle=True)
print("Paddle Mode: " + "/ ".join(seg_list)) # paddle模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式

Expand Down Expand Up @@ -196,7 +199,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py

```pycon
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> words = pseg.cut("我爱北京天安门") #jieba默认模式
>>> words = pseg.cut("我爱北京天安门",use_paddle=True) #paddle模式
>>> for word, flag in words:
... print('%s %s' % (word, flag))
...
Expand All @@ -206,6 +210,21 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
天安门 ns
```

paddle模式词性标注对应表如下:

词性和专名类别标签集合如下表,其中词性标签 24 个(小写字母),专名类别标签 4 个(大写字母)。

| 标签 | 含义 | 标签 | 含义 | 标签 | 含义 | 标签 | 含义 |
| ---- | -------- | ---- | -------- | ---- | -------- | ---- | -------- |
| n | 普通名词 | f | 方位名词 | s | 处所名词 | t | 时间 |
| nr | 人名 | ns | 地名 | nt | 机构名 | nw | 作品名 |
| nz | 其他专名 | v | 普通动词 | vd | 动副词 | vn | 名动词 |
| a | 形容词 | ad | 副形词 | an | 名形词 | d | 副词 |
| m | 数量词 | q | 量词 | r | 代词 | p | 介词 |
| c | 连词 | u | 助词 | xc | 其他虚词 | w | 标点符号 |
| PER | 人名 | LOC | 地名 | ORG | 机构名 | TIME | 时间 |


5. 并行分词
-----------
* 原理:将目标文本按行分隔后,把各行文本分配到多个 Python 进程并行分词,然后归并结果,从而获得分词速度的可观提升
Expand Down
21 changes: 17 additions & 4 deletions jieba/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, unicode_literals
__version__ = '0.39'
__license__ = 'MIT'
Expand All @@ -6,20 +8,22 @@
import os
import sys
import time
import imp
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg

import jieba.finalseg
if os.name == 'nt':
from shutil import move as _replace_file
else:
_replace_file = os.rename


_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
Expand Down Expand Up @@ -272,7 +276,7 @@ def __cut_DAG(self, sentence):
for elem in buf:
yield elem

def cut(self, sentence, cut_all=False, HMM=True):
def cut(self, sentence, cut_all = False, HMM = True,use_paddle = False):
'''
The main function that segments an entire sentence that contains
Chinese characters into separated words.
Expand All @@ -282,8 +286,17 @@ def cut(self, sentence, cut_all=False, HMM=True):
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
sentence = strdecode(sentence)

is_paddle_installed = False
if use_paddle == True:
is_paddle_installed = check_paddle_install()
sentence = strdecode(sentence)
if use_paddle == True and is_paddle_installed == True:
results = predict.get_sent(sentence)
for sent in results:
if sent is None:
continue
yield sent
return
if cut_all:
re_han = re_han_cut_all
re_skip = re_skip_cut_all
Expand Down
34 changes: 34 additions & 0 deletions jieba/_compat.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# -*- coding: utf-8 -*-
import os
import sys
import imp
import logging

# Module-level logger used for paddle install/fallback diagnostics.
# NOTE(review): log_console is created but never attached via
# default_logger.addHandler(log_console) in this hunk — confirm against the
# full file whether the handler is intentionally unused.
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)

def setLogLevel(log_level):
    """Set the verbosity of jieba's module-level logger.

    log_level: a logging level constant (e.g. logging.INFO) or its int value,
        forwarded to Logger.setLevel.
    """
    # The original body declared `global logger`, but no module-level name
    # `logger` exists — the logger is `default_logger`, and setLevel mutates
    # it in place, so no global declaration is needed at all.
    default_logger.setLevel(log_level)

try:
import pkg_resources
Expand All @@ -10,6 +20,14 @@
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')

# Optional dependency: paddle-based segmentation is only wired up when
# paddlepaddle 1.6.1 is installed. Any other version (or no paddle at all)
# leaves `fluid`/`predict` unbound, and callers fall back to basic cut.
try:
    import paddle
    if paddle.__version__ == '1.6.1':
        import paddle.fluid as fluid
        import jieba.lac_small.predict as predict
except ImportError:
    # Paddle is genuinely optional — silence the failure here; the
    # user-facing hint is emitted later by check_paddle_install().
    pass

PY2 = sys.version_info[0] == 2

default_encoding = sys.getfilesystemencoding()
Expand Down Expand Up @@ -44,3 +62,19 @@ def resolve_filename(f):
return f.name
except AttributeError:
return repr(f)


def check_paddle_install():
    """Return True iff paddlepaddle version 1.6.1 is importable.

    Any other outcome (paddle missing, or a different version installed)
    logs a debug hint telling the user how to install the supported
    version and returns False so callers fall back to jieba's basic cut.
    """
    is_paddle_installed = False
    try:
        # Import directly instead of probing with imp.find_module: the
        # original probed with imp and then read the module-level `paddle`
        # binding, which raises an uncaught NameError when the module is
        # discoverable but the earlier top-level `import paddle` failed.
        import paddle
        if paddle.__version__ == '1.6.1':
            is_paddle_installed = True
        else:
            # Wrong version installed — point the user at the supported one.
            default_logger.debug("Check the paddle version is not correct, subject\
 you to use command to install paddle: pip uninstall paddlepaddle(-gpu), \
pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
    except ImportError:
        default_logger.debug("Can not import paddle, back to jieba basic cut......")
    return is_paddle_installed
Empty file added jieba/lac_small/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions jieba/lac_small/creator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define the function to create lexical analysis model and model's data reader
"""
import sys
import os
import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
import jieba.lac_small.nets as nets


def create_model(vocab_size, num_labels, mode='train'):
    """Create the small LAC model graph.

    vocab_size: size of the word-embedding vocabulary.
    num_labels: number of CRF output tags.
    mode: only 'infer' is supported by this trimmed-down package; the
        default 'train' is kept for signature compatibility with the
        original LAC code base.

    Returns (for mode == 'infer') a dict containing the feed list, the
    word input variable and the CRF decoding output.

    Raises ValueError for any unsupported mode. (The original code fell
    through to `return ret` with `ret` never defined, which surfaced as a
    confusing NameError.)
    """

    # model's input data: variable-length (lod) sequences of int64 word ids.
    words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
    # `targets` is declared for graph compatibility with the training
    # version of LAC; it is not consumed on the inference path below.
    targets = fluid.data(
        name='targets', shape=[-1, 1], dtype='int64', lod_level=1)

    # for inference process
    if mode == 'infer':
        crf_decode = nets.lex_net(
            words, vocab_size, num_labels, for_infer=True, target=None)
        return {
            "feed_list": [words],
            "words": words,
            "crf_decode": crf_decode,
        }
    raise ValueError(
        "create_model only supports mode='infer' in jieba.lac_small, "
        "got %r" % (mode,))

Binary file added jieba/lac_small/model_baseline/crfw
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_0.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_0.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_1.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_1.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_2.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_2.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_3.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_3.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_4.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_4.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_0.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_0.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_1.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_1.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_2.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_2.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_3.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_3.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/word_emb
Binary file not shown.
122 changes: 122 additions & 0 deletions jieba/lac_small/nets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The function lex_net(args) define the lexical analysis network structure
"""
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word, vocab_size, num_labels, for_infer=True, target=None):
    """
    Define the lexical analysis (LAC) network structure:
    word embedding -> stacked bidirectional GRU -> fc emission -> CRF decode.

    word: variable holding the word-id input sequences of the model.
    vocab_size: size of the word-embedding vocabulary.
    num_labels: number of output tags for the CRF layer.
    for_infer: a boolean value, indicating if the model to be created is
        for training or predicting. NOTE(review): this trimmed-down version
        builds the same inference graph regardless — confirm against the
        full LAC code base.
    target: kept for signature compatibility; unused on this path.

    return:
        the CRF decoding prediction variable
    """

    # Fixed hyper-parameters of the small LAC model; they must match the
    # pretrained checkpoint shipped in model_baseline.
    word_emb_dim=128
    grnn_hidden_dim=128
    bigru_num=2          # number of stacked bidirectional GRU layers
    emb_lr = 1.0         # learning-rate multiplier for the embedding table
    crf_lr = 1.0         # NOTE(review): unused in this inference-only graph
    init_bound = 0.1     # uniform-initializer bound for trainable params
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        Define one bidirectional GRU layer over `input_feature` and return
        the per-timestep concatenation of forward and backward states.
        """
        # Forward direction: fc projects to 3*hidden (the GRU gate inputs),
        # then dynamic_gru consumes the projection.
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Backward direction: same structure with is_reverse=True.
        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Concatenate forward and backward hidden states along the feature
        # axis, giving a 2*grnn_hidden_dim feature per timestep.
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target=None):
        """
        Configure the network: embedding -> bigru_num stacked bi-GRU layers
        -> fc emission scores -> CRF decoding.
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # Stack the bidirectional GRU layers, feeding each layer's output
        # into the next.
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        # Per-token emission scores over the num_labels tags.
        emission = fluid.layers.fc(
            size=num_labels,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Create the CRF transition parameter under the fixed name 'crfw'
        # (shape [size + 2, size], the extra rows presumably hold the
        # start/end transitions — TODO confirm against paddle docs) so that
        # crf_decoding can look it up by name; the pretrained checkpoint
        # supplies its values.
        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))

        return crf_decode
    return _net_conf(word)
Loading