Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9559dff
add jieba
JesseyXujin Nov 19, 2019
48275cc
add readme and test.py
JesseyXujin Nov 19, 2019
6310972
fix bug
JesseyXujin Nov 19, 2019
722c98a
update for easy setup
xyzhou-puck Nov 19, 2019
8bdc465
Merge pull request #1 from xyzhou-puck/paddle_demo
JesseyXujin Nov 19, 2019
66b7f87
update path
JesseyXujin Nov 20, 2019
cef556d
add paddle pos seg
JesseyXujin Nov 21, 2019
0174b05
change readme
JesseyXujin Nov 21, 2019
28764cb
change readme
JesseyXujin Nov 21, 2019
8b668a7
modify readme
JesseyXujin Nov 21, 2019
9e0c9ef
modify posseg
JesseyXujin Nov 25, 2019
e769671
modify readme
JesseyXujin Nov 25, 2019
4994aa8
fix bug
JesseyXujin Nov 25, 2019
4c1c256
fix bug
JesseyXujin Nov 25, 2019
87cd3b6
fix word_lens problem
JesseyXujin Nov 25, 2019
78064dc
modify cpk
JesseyXujin Nov 26, 2019
72c7746
fix bugs
JesseyXujin Nov 26, 2019
b238975
fix unicode problem
JesseyXujin Nov 26, 2019
c226f67
add try catch import
JesseyXujin Nov 26, 2019
a6bd564
update install check
Dec 5, 2019
166812f
delete symbols modification
Dec 5, 2019
6d7cfa7
modify paddle test pipy
Dec 10, 2019
6d4510a
add init.py in posseg
Dec 10, 2019
6519433
set posseg paddle default False
Dec 10, 2019
c49cbeb
change model_baseline
Dec 10, 2019
ad9a2b7
support windows int64
Dec 11, 2019
2841841
modify paddle install command
Dec 11, 2019
64df8f2
modify readme
Dec 11, 2019
a49ff1a
modify warning
Dec 12, 2019
de53d3a
remove conf
Dec 12, 2019
cb2c8fb
remove test pypi, use formal pypi
Dec 23, 2019
a7aa781
modify details for support OOV
JesseyXujin Dec 23, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ jieba
# encoding=utf-8
import jieba

seg_list = jieba.cut("我来到北京清华大学", use_paddle=True)
print("Paddle Mode: " + "/ ".join(seg_list)) # paddle模式

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式

Expand Down Expand Up @@ -196,7 +199,8 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py

```pycon
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> words = pseg.cut("我爱北京天安门") #jieba默认模式
>>> words = pseg.cut("我爱北京天安门",use_paddle=True) #paddle模式
>>> for word, flag in words:
... print('%s %s' % (word, flag))
...
Expand All @@ -206,6 +210,21 @@ https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
天安门 ns
```

paddle模式词性标注对应表如下:

词性和专名类别标签集合如下表,其中词性标签 24 个(小写字母),专名类别标签 4 个(大写字母)。

| 标签 | 含义 | 标签 | 含义 | 标签 | 含义 | 标签 | 含义 |
| ---- | -------- | ---- | -------- | ---- | -------- | ---- | -------- |
| n | 普通名词 | f | 方位名词 | s | 处所名词 | t | 时间 |
| nr | 人名 | ns | 地名 | nt | 机构名 | nw | 作品名 |
| nz | 其他专名 | v | 普通动词 | vd | 动副词 | vn | 名动词 |
| a | 形容词 | ad | 副形词 | an | 名形词 | d | 副词 |
| m | 数量词 | q | 量词 | r | 代词 | p | 介词 |
| c | 连词 | u | 助词 | xc | 其他虚词 | w | 标点符号 |
| PER | 人名 | LOC | 地名 | ORG | 机构名 | TIME | 时间 |


5. 并行分词
-----------
* 原理:将目标文本按行分隔后,把各行文本分配到多个 Python 进程并行分词,然后归并结果,从而获得分词速度的可观提升
Expand Down
21 changes: 17 additions & 4 deletions jieba/__init__.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, unicode_literals
__version__ = '0.39'
__license__ = 'MIT'
Expand All @@ -6,20 +8,22 @@
import os
import sys
import time
import imp
import logging
import marshal
import tempfile
import threading
from math import log
from hashlib import md5
from ._compat import *
from . import finalseg

import jieba.finalseg
if os.name == 'nt':
from shutil import move as _replace_file
else:
_replace_file = os.rename


_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
Expand Down Expand Up @@ -272,7 +276,7 @@ def __cut_DAG(self, sentence):
for elem in buf:
yield elem

def cut(self, sentence, cut_all=False, HMM=True):
def cut(self, sentence, cut_all = False, HMM = True,use_paddle = False):
'''
The main function that segments an entire sentence that contains
Chinese characters into separated words.
Expand All @@ -282,8 +286,17 @@ def cut(self, sentence, cut_all=False, HMM=True):
- cut_all: Model type. True for full pattern, False for accurate pattern.
- HMM: Whether to use the Hidden Markov Model.
'''
sentence = strdecode(sentence)

is_paddle_installed = False
if use_paddle == True:
is_paddle_installed = check_paddle_install()
sentence = strdecode(sentence)
if use_paddle == True and is_paddle_installed == True:
results = predict.get_sent(sentence)
for sent in results:
if sent is None:
continue
yield sent
return
if cut_all:
re_han = re_han_cut_all
re_skip = re_skip_cut_all
Expand Down
34 changes: 34 additions & 0 deletions jieba/_compat.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# -*- coding: utf-8 -*-
import os
import sys
import imp
import logging

# Module-level logger used for paddle install/fallback diagnostics.
# NOTE(review): log_console is created but never attached via
# default_logger.addHandler(log_console) in this hunk — confirm against the
# full file whether the handler is intentionally unused.
log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)

def setLogLevel(log_level):
    """Set the verbosity of jieba's module-level logger.

    log_level: a logging level constant (e.g. logging.INFO) or its int value,
        forwarded to Logger.setLevel.
    """
    # The original body declared `global logger`, but no module-level name
    # `logger` exists — the logger is `default_logger`, and setLevel mutates
    # it in place, so no global declaration is needed at all.
    default_logger.setLevel(log_level)

try:
import pkg_resources
Expand All @@ -10,6 +20,14 @@
get_module_res = lambda *res: open(os.path.normpath(os.path.join(
os.getcwd(), os.path.dirname(__file__), *res)), 'rb')

# Optional dependency: paddle-based segmentation is only wired up when
# paddlepaddle 1.6.1 is installed. Any other version (or no paddle at all)
# leaves `fluid`/`predict` unbound, and callers fall back to basic cut.
try:
    import paddle
    if paddle.__version__ == '1.6.1':
        import paddle.fluid as fluid
        import jieba.lac_small.predict as predict
except ImportError:
    # Paddle is genuinely optional — silence the failure here; the
    # user-facing hint is emitted later by check_paddle_install().
    pass

PY2 = sys.version_info[0] == 2

default_encoding = sys.getfilesystemencoding()
Expand Down Expand Up @@ -44,3 +62,19 @@ def resolve_filename(f):
return f.name
except AttributeError:
return repr(f)


def check_paddle_install():
    """Return True iff paddlepaddle version 1.6.1 is importable.

    Any other outcome (paddle missing, or a different version installed)
    logs a debug hint telling the user how to install the supported
    version and returns False so callers fall back to jieba's basic cut.
    """
    is_paddle_installed = False
    try:
        # Import directly instead of probing with imp.find_module: the
        # original probed with imp and then read the module-level `paddle`
        # binding, which raises an uncaught NameError when the module is
        # discoverable but the earlier top-level `import paddle` failed.
        import paddle
        if paddle.__version__ == '1.6.1':
            is_paddle_installed = True
        else:
            # Wrong version installed — point the user at the supported one.
            default_logger.debug("Check the paddle version is not correct, subject\
 you to use command to install paddle: pip uninstall paddlepaddle(-gpu), \
pip install paddlepaddle-tiny==1.6.1. Now, back to jieba basic cut......")
    except ImportError:
        default_logger.debug("Can not import paddle, back to jieba basic cut......")
    return is_paddle_installed
Empty file added jieba/lac_small/__init__.py
Empty file.
46 changes: 46 additions & 0 deletions jieba/lac_small/creator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Define the function to create lexical analysis model and model's data reader
"""
import sys
import os
import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
import jieba.lac_small.nets as nets


def create_model(vocab_size, num_labels, mode='train'):
    """Create the small LAC model graph.

    vocab_size: size of the word-embedding vocabulary.
    num_labels: number of CRF output tags.
    mode: only 'infer' is supported by this trimmed-down package; the
        default 'train' is kept for signature compatibility with the
        original LAC code base.

    Returns (for mode == 'infer') a dict containing the feed list, the
    word input variable and the CRF decoding output.

    Raises ValueError for any unsupported mode. (The original code fell
    through to `return ret` with `ret` never defined, which surfaced as a
    confusing NameError.)
    """

    # model's input data: variable-length (lod) sequences of int64 word ids.
    words = fluid.data(name='words', shape=[-1, 1], dtype='int64', lod_level=1)
    # `targets` is declared for graph compatibility with the training
    # version of LAC; it is not consumed on the inference path below.
    targets = fluid.data(
        name='targets', shape=[-1, 1], dtype='int64', lod_level=1)

    # for inference process
    if mode == 'infer':
        crf_decode = nets.lex_net(
            words, vocab_size, num_labels, for_infer=True, target=None)
        return {
            "feed_list": [words],
            "words": words,
            "crf_decode": crf_decode,
        }
    raise ValueError(
        "create_model only supports mode='infer' in jieba.lac_small, "
        "got %r" % (mode,))

Binary file added jieba/lac_small/model_baseline/crfw
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_0.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_0.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_1.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_1.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_2.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_2.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_3.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_3.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_4.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/fc_4.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_0.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_0.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_1.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_1.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_2.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_2.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_3.b_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/gru_3.w_0
Binary file not shown.
Binary file added jieba/lac_small/model_baseline/word_emb
Binary file not shown.
122 changes: 122 additions & 0 deletions jieba/lac_small/nets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The function lex_net(args) define the lexical analysis network structure
"""
import sys
import os
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word, vocab_size, num_labels, for_infer=True, target=None):
    """
    Define the lexical analysis (LAC) network structure:
    word embedding -> stacked bidirectional GRU -> fc emission -> CRF decode.

    word: variable holding the word-id input sequences of the model.
    vocab_size: size of the word-embedding vocabulary.
    num_labels: number of output tags for the CRF layer.
    for_infer: a boolean value, indicating if the model to be created is
        for training or predicting. NOTE(review): this trimmed-down version
        builds the same inference graph regardless — confirm against the
        full LAC code base.
    target: kept for signature compatibility; unused on this path.

    return:
        the CRF decoding prediction variable
    """

    # Fixed hyper-parameters of the small LAC model; they must match the
    # pretrained checkpoint shipped in model_baseline.
    word_emb_dim=128
    grnn_hidden_dim=128
    bigru_num=2          # number of stacked bidirectional GRU layers
    emb_lr = 1.0         # learning-rate multiplier for the embedding table
    crf_lr = 1.0         # NOTE(review): unused in this inference-only graph
    init_bound = 0.1     # uniform-initializer bound for trainable params
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        Define one bidirectional GRU layer over `input_feature` and return
        the per-timestep concatenation of forward and backward states.
        """
        # Forward direction: fc projects to 3*hidden (the GRU gate inputs),
        # then dynamic_gru consumes the projection.
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Backward direction: same structure with is_reverse=True.
        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Concatenate forward and backward hidden states along the feature
        # axis, giving a 2*grnn_hidden_dim feature per timestep.
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target=None):
        """
        Configure the network: embedding -> bigru_num stacked bi-GRU layers
        -> fc emission scores -> CRF decoding.
        """
        word_embedding = fluid.embedding(
            input=word,
            size=[vocab_size, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # Stack the bidirectional GRU layers, feeding each layer's output
        # into the next.
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        # Per-token emission scores over the num_labels tags.
        emission = fluid.layers.fc(
            size=num_labels,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # Create the CRF transition parameter under the fixed name 'crfw'
        # (shape [size + 2, size], the extra rows presumably hold the
        # start/end transitions — TODO confirm against paddle docs) so that
        # crf_decoding can look it up by name; the pretrained checkpoint
        # supplies its values.
        size = emission.shape[1]
        fluid.layers.create_parameter(
            shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))

        return crf_decode
    return _net_conf(word)
Loading