From 33e584b7d6abc8817202d3e435c36bd9ccd649e2 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Wed, 28 Jul 2021 18:23:34 +0800 Subject: [PATCH 01/12] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop --- python/paddle/fluid/dataset.py | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 8d20dd994475f0..8da42c336518dd 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -277,6 +277,58 @@ def set_use_var(self, var_list): "Currently, fluid.dataset only supports dtype=float32, dtype=int32 and dtype=int64" ) + def check_use_var_with_data_generator(self, var_list, data_generator_class, + test_file): + f = open(test_file, "r") + var_len = len(var_list) + + while True: + line = f.readline() + if line: + line_iter = data_generator_class.generate_sample(line) + for user_parsed_line in line_iter(): + if user_parsed_line == None: + continue + + data_gen_len = len(user_parsed_line) + if var_len != data_gen_len: + raise ValueError( + "var length mismatch error: var_list = %s vs data_generator = %s" + % (var_len, data_gen_len)) + + for i, ele in enumerate(user_parsed_line): + # print(ele[0], ele[1]) + + if len(ele[1]) == 0: + raise ValueError( + "var length error: var %s's length in data_genrator is 0" + % ele[0]) + + if var_list[ + i].dtype == core.VarDesc.VarType.FP32 and not all( + isinstance(ele, float) for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-float value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." 
+ % (ele[0], "float", ele[1])) + + if (var_list[i].dtype == core.VarDesc.VarType.INT64 or + var_list[i].dtype == core.VarDesc.VarType.INT32 + ) and not all( + isinstance(ele, int) or isinstance(ele, long) + for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." + % (ele[0], "int", ele[1])) + + else: + break + + f.close() + def set_hdfs_config(self, fs_name, fs_ugi): """ Set hdfs config: fs name ad ugi From e05a7d61554dc1f1f070d78073359931bbca82ba Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Wed, 28 Jul 2021 18:24:18 +0800 Subject: [PATCH 02/12] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop --- python/paddle/fluid/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 8da42c336518dd..5d6022fe5281bc 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -301,7 +301,7 @@ def check_use_var_with_data_generator(self, var_list, data_generator_class, if len(ele[1]) == 0: raise ValueError( - "var length error: var %s's length in data_genrator is 0" + "var length error: var %s's length in data_generator is 0" % ele[0]) if var_list[ From 733c9dcdf00d79ea7b65c2fb2d6fc3af4a944803 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Wed, 28 Jul 2021 18:40:05 +0800 Subject: [PATCH 03/12] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop --- python/paddle/fluid/dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 5d6022fe5281bc..bfbc32dabefc5f 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py 
@@ -279,6 +279,24 @@ def set_use_var(self, var_list): def check_use_var_with_data_generator(self, var_list, data_generator_class, test_file): + """ + Var consistency insepection of use_var_list and data_generator data. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from dataset_generator_old import CTRDataset + dataset = fluid.DatasetFactory().create_dataset() + generator_class = CTRDataset() + dataset.check_use_var_with_data_generator([data, label], generator_class, "data/part-00000") + + Args: + var_list(list): variable list + data_generator_class(class): data_generator class + test_file(str): local test file path + """ + f = open(test_file, "r") var_len = len(var_list) From a8fb3ddaa63c63503d55b64849b9a9166b16e070 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Thu, 29 Jul 2021 19:37:56 +0800 Subject: [PATCH 04/12] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop --- python/paddle/fluid/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index bfbc32dabefc5f..530c5a533d6272 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -285,6 +285,7 @@ def check_use_var_with_data_generator(self, var_list, data_generator_class, Examples: .. 
code-block:: python + # required: skiptest import paddle.fluid as fluid from dataset_generator_old import CTRDataset dataset = fluid.DatasetFactory().create_dataset() From 9d222e79f9dbdcfb172dc66127d5316caeb12506 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Mon, 16 Aug 2021 12:04:40 +0800 Subject: [PATCH 05/12] [CPU-PSLIB] Add Unittest For: consistency inspection of use_var_list and data_generator --- python/paddle/fluid/dataset.py | 2 +- .../test_dataset_consistency_inspection.py | 330 ++++++++++++++++++ 2 files changed, 331 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 530c5a533d6272..ebe5b2e7440415 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -287,7 +287,7 @@ def check_use_var_with_data_generator(self, var_list, data_generator_class, # required: skiptest import paddle.fluid as fluid - from dataset_generator_old import CTRDataset + from dataset_generator import CTRDataset dataset = fluid.DatasetFactory().create_dataset() generator_class = CTRDataset() dataset.check_use_var_with_data_generator([data, label], generator_class, "data/part-00000") diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py new file mode 100644 index 00000000000000..5bda51022939c3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -0,0 +1,330 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TestCases for Dataset consistency insepection of use_var_list and data_generator. +""" + +from __future__ import print_function +import paddle +import paddle.fluid as fluid +import paddle.compat as cpt +import paddle.fluid.core as core +import numpy as np +import random +import math +import os +import shutil +import unittest +import paddle.fluid.incubate.data_generator as dg + +url_schema_len = 5 +query_schema = [ + 'Q_query_basic', 'Q_query_phrase', 'Q_quq', 'Q_timelevel', + 'Q_context_title_basic1', 'Q_context_title_basic2', + 'Q_context_title_basic3', 'Q_context_title_basic4', + 'Q_context_title_basic5', 'Q_context_title_phrase1', + 'Q_context_title_phrase2', 'Q_context_title_phrase3', + 'Q_context_title_phrase4', 'Q_context_title_phrase5', 'Q_context_site1', + 'Q_context_site2', 'Q_context_site3', 'Q_context_site4', 'Q_context_site5' +] + + +class CTRDataset(dg.MultiSlotDataGenerator): + def __init__(self, mode): + self.test = mode + + def generate_sample(self, line): + def reader(): + ins = line.strip().split(';') + label_pos_num = int(ins[1].split(' ')[0]) + label_neg_num = int(ins[1].split(' ')[1]) + + #query fea parse + bias = 2 + query_len = 0 + sparse_query_feature = [] + for index in range(len(query_schema)): + pos = index + bias + sparse_query_feature.append( + [int(x) for x in ins[pos].split(' ')]) + if index == 0: + query_len = len(ins[pos].split(' ')) + query_len = 1.0 / (1 + pow(2.7182818, 3 - 1.0 * query_len)) + + #positive url fea parse + bias = 2 + len(query_schema) + pos_url_feas = [] + pos_click_feas = [] + pos_context_feas = [] + 
for k in range(label_pos_num): + pos_url_fea = [] + pos = 0 + for index in range(url_schema_len - 1): + pos = bias + k * (url_schema_len) + index + pos_url_fea.append([int(x) for x in ins[pos].split(' ')]) + #click info + if (ins[pos + 1] == ''): + continue + item = ins[pos + 1].split(' ') + if len(item) != 17: + continue + stat_fea = [[max(float(item[i]), 0.0)] for i in range(len(item)) \ + if not (i == 5 or i == 9 or i == 13 or i == 14 or i ==15 or i ==16)] + pos_url_feas.append(pos_url_fea) + pos_click_feas.append(stat_fea) + + query_serach = float(item[5]) + if query_serach > 0.0: + query_serach = min(math.log(query_serach), 10.0) / 10.0 + pos_context_fea = [[query_serach], [query_len]] + pos_context_feas.append(pos_context_fea) + + #negative url fea parse + bias = 2 + len(query_schema) + label_pos_num * (url_schema_len) + neg_url_feas = [] + neg_click_feas = [] + neg_context_feas = [] + for k in range(label_neg_num): + neg_url_fea = [] + pos = 0 + for index in range(url_schema_len - 1): + pos = bias + k * (url_schema_len) + index + neg_url_fea.append([int(x) for x in ins[pos].split(' ')]) + if (ins[pos + 1] == ''): + continue + item = ins[pos + 1].split(' ') + #zdf_tmp + if len(item) != 17: + continue + #print ins[pos + 1] + stat_fea = [[max(float(item[i]), 0.0)] for i in range(len(item)) \ + if not (i == 5 or i == 9 or i == 13 or i == 14 or i == 15 or i == 16)] + neg_click_feas.append(stat_fea) + neg_url_feas.append(neg_url_fea) + + query_serach = float(item[5]) + if query_serach > 0.0: + query_serach = min(math.log(query_serach), 10.0) / 10.0 + neg_context_fea = [[query_serach], [query_len]] + neg_context_feas.append(neg_context_fea) + + #make train data + if self.test == 1: + for p in range(len(pos_url_feas)): + # feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + pos_url_fea = pos_url_feas[p] + pos_click_fea = 
pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + yield zip(feature_name, [[1]] + sparse_query_feature + + pos_url_fea + pos_click_fea + pos_context_fea + + pos_url_fea + pos_click_fea + pos_context_fea) + for n in range(len(neg_url_feas)): + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + yield zip(feature_name, [[0]] + sparse_query_feature + + neg_url_fea + neg_click_fea + neg_context_fea + + neg_url_fea + neg_click_fea + neg_context_fea) + elif self.test == 0: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield zip(feature_name, [[1]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea) + elif self.test == 2: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = 
pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield zip(feature_name, [[1], [2]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea) + elif self.test == 3: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield zip(feature_name, [[1], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea) + + return reader + + +class TestDataset(unittest.TestCase): + """ TestCases for Dataset. """ + + def setUp(self): + pass + # use_data_loader = False + # epoch_num = 10 + # drop_last = False + + def test_var_consistency_insepection(self): + """ + Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. 
+ """ + with open("test_run_with_dump_a.txt", "w") as f: + data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 
20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" + data += "2 1;1 11;20000025 20000404;20001923;20000002 20000157 20000028 20004205 20000500 20028809 20000571 20000007 20027523 20004940 20000651 20000043 20000051 20000520 
20015398 20000066 20004720 20000070 20001648;40000001;20000025 20000404 20000571 20004940 20000001 20000017;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;0;0;0;20001923 20011130 20000027;20001923 20000029 20000500 20001408 20000404 20000027;0;0;0;10000005;10000005;0;0;0;20003316 20000392 20001979 20000474 20000025 20000194 20000025 20000404 20000019 20000109;20016528 20024913 20004748 20001923 20000019 20000109;10000015;30000002;0.572 0.043 0.401 0.352 0.562 32859.000 0.005 0.060 0.362 -1.000 0.448 0.673 0.222 16316 991 89 0;20000025 20000404 20000571 20004940 20000001 20000017;20001923 20011130 20000027;10000005;30000001;0.495 0.024 0.344 0.285 0.379 32859.000 0.002 0.050 0.362 -1.000 0.423 0.764 0.254 19929 896 72 0;20000202 20000026 20001314 20004289 20000025 20000404 20000451 20000089 20000007;20000202 20000026 20014094 20001314 20004289 20001923 20000451 20000089 20000007;10000035;30000003;0.133 0.006 0.162 0.042 0.174 32859.000 0.003 0.037 0.362 -1.000 0.363 0.542 0.122 14763 664 53 0;20000202 20000026 20001314 20004289 20000025 20000404;20000202 20000026 20014094 20001314 20004289 20001923;10000021;30000001;0.058 0.004 0.133 0.017 0.120 32859.000 0.000 0.006 0.362 -1.000 0.168 0.437 0.041 -1 -1 -1 -1;20000025 20000404 20000018 20012461 20001699 20000446 20000174 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;20001923 20000018 20012461 20001699 20007717 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;10000002;30000001;0.017 0.000 0.099 0.004 0.072 32859.000 0.000 0.009 0.362 -1.000 0.058 0.393 0.025 -1 -1 -1 -1;20000025 20000404;20001923;10000133;30000005;0.004 0.000 0.122 0.000 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.413 0.020 0 444 35 0;20000025 20000404;20001923;10005297;30000004;0.028 0.000 0.138 0.002 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.343 0.024 0 600 48 0;20000025 
20000404;20001923;10000060;30000005;0.107 0.000 0.110 0.027 0.077 32859.000 0.000 0.005 0.362 -1.000 0.095 0.398 0.062 1338 491 39 0;20002960 20005534 20000043 20000025 20000404 20000025 20000007;20002960 20005534 20000043 20001923 20000025 20000007;10000020;30000003;0.041 0.000 0.122 0.012 0.101 32859.000 0.001 0.025 0.362 -1.000 0.302 0.541 0.065 9896 402 35 0;20000025 20000404 20000259 20000228 20000235 20000142;20001923 20000259 20000264 20000142;10000024;30000003;0.072 0.002 0.156 0.026 0.141 32859.000 0.002 0.032 0.362 -1.000 0.386 0.569 0.103 9896 364 35 0;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;20001923 20000029 20000500 20001408 20000404 20000027;10000005;30000001;0.328 0.006 0.179 0.125 0.181 32859.000 0.003 0.058 0.362 -1.000 0.300 0.445 0.141 9896 402 32 0;20000025 20000404;20001923;10012839;30000002;0.012 0.000 0.108 0.002 0.048 32859.000 0.000 0.000 0.362 -1.000 0.021 0.225 0.016 2207 120 12 0;\n" + f.write(data) + + slot_data = [] + label = fluid.layers.data( + name="click", + shape=[-1, 1], + dtype="int64", + lod_level=0, + append_batch_size=False) + slot_data.append(label) + + # sprase_query_feat_names + len_sparse_query = 19 + for feat_name in range(1, len_sparse_query + 1): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # sparse_url_feat_names + for feat_name in range(len_sparse_query + 1, len_sparse_query + 5): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # dense_feat_names + for feat_name in range(len_sparse_query + 5, len_sparse_query + 16): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # context_feat_namess + for feat_name in range(len_sparse_query + 16, len_sparse_query + 18): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # neg sparse_url_feat_names + for feat_name in 
range(len_sparse_query + 18, len_sparse_query + 22): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='int64', lod_level=1)) + + # neg dense_feat_names + for feat_name in range(len_sparse_query + 22, len_sparse_query + 33): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + # neg context_feat_namess + for feat_name in range(len_sparse_query + 33, len_sparse_query + 35): + slot_data.append( + fluid.layers.data( + name=str(feat_name), shape=[1], dtype='float32')) + + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + + print("========================================") + generator_class = CTRDataset(mode=0) + try: + dataset.check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + print("case 1: check passed!") + except Exception as e: + print("warning: catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=2) + try: + dataset.check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 2 catch expected error") + print(e) + print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=3) + try: + dataset.check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 3 catch expected error") + print(e) + print("========================================") + + os.remove("./test_run_with_dump_a.txt") + + +if __name__ == '__main__': + unittest.main() From b1123e9009d09f2b5764c0fe9a80ad1687f34985 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Mon, 16 Aug 2021 13:33:09 +0800 Subject: [PATCH 06/12] [CPU-PSLIB] Add Unitest For: consistency insepection of use_var_list 
and data_generator --- .../tests/unittests/test_dataset_consistency_inspection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 5bda51022939c3..d9f3ffd310e987 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -28,6 +28,8 @@ import unittest import paddle.fluid.incubate.data_generator as dg +paddle.enable_static() + url_schema_len = 5 query_schema = [ 'Q_query_basic', 'Q_query_phrase', 'Q_quq', 'Q_timelevel', From 6b8d16a915ae87ae9eb1077661505b6ca9c7270c Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Tue, 17 Aug 2021 13:12:42 +0800 Subject: [PATCH 07/12] [CPU-PSLIB] Add Unittest For: consistency inspection of use_var_list and data_generator data --- python/paddle/fluid/dataset.py | 3 -- .../test_dataset_consistency_inspection.py | 41 ++++++++++++++++++- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index ebe5b2e7440415..fc4c47724ea124 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -306,9 +306,6 @@ def check_use_var_with_data_generator(self, var_list, data_generator_class, if line: line_iter = data_generator_class.generate_sample(line) for user_parsed_line in line_iter(): - if user_parsed_line == None: - continue - data_gen_len = len(user_parsed_line) if var_len != data_gen_len: raise ValueError( diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index d9f3ffd310e987..ebf7dfde420b7f 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -28,7 
+28,8 @@ import unittest import paddle.fluid.incubate.data_generator as dg -paddle.enable_static() +# paddle.enable_static() +fluid.disable_dygraph() url_schema_len = 5 query_schema = [ @@ -216,6 +217,30 @@ def reader(): # yield zip(feature_name[:3], sparse_query_feature[:3]) yield zip(feature_name, [[1], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ neg_url_fea + neg_click_fea + neg_context_fea) + elif self.test == 4: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield zip(feature_name, [[], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea) return reader @@ -234,8 +259,11 @@ def test_var_consistency_insepection(self): Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. 
""" with open("test_run_with_dump_a.txt", "w") as f: + # data = "\n" + # data += "\n" data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 
20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" data += "2 1;1 11;20000025 20000404;20001923;20000002 20000157 20000028 20004205 20000500 20028809 20000571 20000007 20027523 20004940 20000651 
20000043 20000051 20000520 20015398 20000066 20004720 20000070 20001648;40000001;20000025 20000404 20000571 20004940 20000001 20000017;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;0;0;0;20001923 20011130 20000027;20001923 20000029 20000500 20001408 20000404 20000027;0;0;0;10000005;10000005;0;0;0;20003316 20000392 20001979 20000474 20000025 20000194 20000025 20000404 20000019 20000109;20016528 20024913 20004748 20001923 20000019 20000109;10000015;30000002;0.572 0.043 0.401 0.352 0.562 32859.000 0.005 0.060 0.362 -1.000 0.448 0.673 0.222 16316 991 89 0;20000025 20000404 20000571 20004940 20000001 20000017;20001923 20011130 20000027;10000005;30000001;0.495 0.024 0.344 0.285 0.379 32859.000 0.002 0.050 0.362 -1.000 0.423 0.764 0.254 19929 896 72 0;20000202 20000026 20001314 20004289 20000025 20000404 20000451 20000089 20000007;20000202 20000026 20014094 20001314 20004289 20001923 20000451 20000089 20000007;10000035;30000003;0.133 0.006 0.162 0.042 0.174 32859.000 0.003 0.037 0.362 -1.000 0.363 0.542 0.122 14763 664 53 0;20000202 20000026 20001314 20004289 20000025 20000404;20000202 20000026 20014094 20001314 20004289 20001923;10000021;30000001;0.058 0.004 0.133 0.017 0.120 32859.000 0.000 0.006 0.362 -1.000 0.168 0.437 0.041 -1 -1 -1 -1;20000025 20000404 20000018 20012461 20001699 20000446 20000174 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;20001923 20000018 20012461 20001699 20007717 20000062 20000133 20003172 20000240 20007877 20067375 20000111 20000164 20001410 20000204 20016958;10000002;30000001;0.017 0.000 0.099 0.004 0.072 32859.000 0.000 0.009 0.362 -1.000 0.058 0.393 0.025 -1 -1 -1 -1;20000025 20000404;20001923;10000133;30000005;0.004 0.000 0.122 0.000 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.413 0.020 0 444 35 0;20000025 20000404;20001923;10005297;30000004;0.028 0.000 0.138 0.002 0.000 32859.000 0.000 0.000 0.362 -1.000 0.000 0.343 0.024 0 600 48 0;20000025 
20000404;20001923;10000060;30000005;0.107 0.000 0.110 0.027 0.077 32859.000 0.000 0.005 0.362 -1.000 0.095 0.398 0.062 1338 491 39 0;20002960 20005534 20000043 20000025 20000404 20000025 20000007;20002960 20005534 20000043 20001923 20000025 20000007;10000020;30000003;0.041 0.000 0.122 0.012 0.101 32859.000 0.001 0.025 0.362 -1.000 0.302 0.541 0.065 9896 402 35 0;20000025 20000404 20000259 20000228 20000235 20000142;20001923 20000259 20000264 20000142;10000024;30000003;0.072 0.002 0.156 0.026 0.141 32859.000 0.002 0.032 0.362 -1.000 0.386 0.569 0.103 9896 364 35 0;20000025 20000404 20000029 20000500 20001408 20000404 20000001 20000017;20001923 20000029 20000500 20001408 20000404 20000027;10000005;30000001;0.328 0.006 0.179 0.125 0.181 32859.000 0.003 0.058 0.362 -1.000 0.300 0.445 0.141 9896 402 32 0;20000025 20000404;20001923;10012839;30000002;0.012 0.000 0.108 0.002 0.048 32859.000 0.000 0.000 0.362 -1.000 0.021 0.225 0.016 2207 120 12 0;\n" + # data += "" f.write(data) slot_data = [] @@ -324,6 +352,17 @@ def test_var_consistency_insepection(self): print("warning: case 3 catch expected error") print(e) print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=4) + try: + dataset.check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 4 catch expected error") + print(e) + print("========================================") os.remove("./test_run_with_dump_a.txt") From 9aafded4a5178512b70ef7c549efdb30ab8b8140 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Tue, 17 Aug 2021 13:40:01 +0800 Subject: [PATCH 08/12] [CPU-PSLIB] Add Unitest For: consistency insepection of use_var_list and data_generator data --- .../test_dataset_consistency_inspection.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index ebf7dfde420b7f..3e4a7de4c79f18 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -167,8 +167,8 @@ def reader(): #print("pos:", pos_url_fea) #print("neg:", neg_url_fea) # yield zip(feature_name[:3], sparse_query_feature[:3]) - yield zip(feature_name, [[1]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ - neg_url_fea + neg_click_fea + neg_context_fea) + yield list(zip(feature_name, [[1]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) elif self.test == 2: for p in range(len(pos_url_feas)): #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] @@ -191,8 +191,8 @@ def reader(): #print("pos:", pos_url_fea) #print("neg:", neg_url_fea) # yield zip(feature_name[:3], sparse_query_feature[:3]) - yield zip(feature_name, [[1], [2]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ - neg_url_fea + neg_click_fea + neg_context_fea) + yield list(zip(feature_name, [[1], [2]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) elif self.test == 3: for p in range(len(pos_url_feas)): #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] @@ -215,8 +215,8 @@ def reader(): #print("pos:", pos_url_fea) #print("neg:", neg_url_fea) # yield zip(feature_name[:3], sparse_query_feature[:3]) - yield zip(feature_name, [[1], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea 
+ \ - neg_url_fea + neg_click_fea + neg_context_fea) + yield list(zip(feature_name, [[1], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) elif self.test == 4: for p in range(len(pos_url_feas)): #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] @@ -239,8 +239,8 @@ def reader(): #print("pos:", pos_url_fea) #print("neg:", neg_url_fea) # yield zip(feature_name[:3], sparse_query_feature[:3]) - yield zip(feature_name, [[], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ - neg_url_fea + neg_click_fea + neg_context_fea) + yield list(zip(feature_name, [[], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) return reader From 9f966126f68c06ac3bf4da617a17fa460ef9fef3 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Tue, 17 Aug 2021 13:41:45 +0800 Subject: [PATCH 09/12] [CPU-PSLIB] Add Unitest For: consistency insepection of use_var_list and data_generator data --- .../tests/unittests/test_dataset_consistency_inspection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 3e4a7de4c79f18..8209fce03040f9 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -28,8 +28,8 @@ import unittest import paddle.fluid.incubate.data_generator as dg -# paddle.enable_static() -fluid.disable_dygraph() +paddle.enable_static() +# fluid.disable_dygraph() url_schema_len = 5 query_schema = [ From f84da1c3ffe253737fd979a88cf580b2c5f90619 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Tue, 17 Aug 
2021 15:28:47 +0800 Subject: [PATCH 10/12] [CPU-PSLIB] Add Unittest For: consistency inspection of use_var_list and data_generator data --- python/paddle/fluid/dataset.py | 3 +- .../test_dataset_consistency_inspection.py | 39 ++++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index fc4c47724ea124..ad55089385e31f 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -332,8 +332,7 @@ def check_use_var_with_data_generator(self, var_list, data_generator_class, if (var_list[i].dtype == core.VarDesc.VarType.INT64 or var_list[i].dtype == core.VarDesc.VarType.INT32 ) and not all( - isinstance(ele, int) or isinstance(ele, long) - for ele in ele[1]): + isinstance(ele, int) for ele in ele[1]): raise TypeError( "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n" "Please check if order of var_list and data_generator are aligned. 
\n" diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 8209fce03040f9..c83c67701356b8 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -28,9 +28,9 @@ import unittest import paddle.fluid.incubate.data_generator as dg -paddle.enable_static() +#paddle.enable_static() # fluid.disable_dygraph() - +fluid.disable_dygraph() url_schema_len = 5 query_schema = [ 'Q_query_basic', 'Q_query_phrase', 'Q_quq', 'Q_timelevel', @@ -241,6 +241,30 @@ def reader(): # yield zip(feature_name[:3], sparse_query_feature[:3]) yield list(zip(feature_name, [[], [2.0]] + sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ neg_url_fea + neg_click_fea + neg_context_fea)) + elif self.test == 5: + for p in range(len(pos_url_feas)): + #feature_name = ["click"] + query_schema + url_schema[:4] + click_info_schema[:11] + context_schema[:2] + url_schema[4:] + click_info_schema[11:] + context_schema[2:] + feature_name = ["click"] + for i in range(1, 54): + feature_name.append(str(i)) + #print("#######") + #print(feature_name) + #print("#######") + pos_url_fea = pos_url_feas[p] + pos_click_fea = pos_click_feas[p] + pos_context_fea = pos_context_feas[p] + for n in range(len(neg_url_feas)): + # prob = get_rand() + # if prob < sample_rate: + neg_url_fea = neg_url_feas[n] + neg_click_fea = neg_click_feas[n] + neg_context_fea = neg_context_feas[n] + #print("q:", query_feas) + #print("pos:", pos_url_fea) + #print("neg:", neg_url_fea) + # yield zip(feature_name[:3], sparse_query_feature[:3]) + yield list(zip(feature_name, sparse_query_feature + pos_url_fea + pos_click_fea + pos_context_fea + \ + neg_url_fea + neg_click_fea + neg_context_fea)) return reader @@ -363,6 +387,17 @@ def test_var_consistency_insepection(self): print("warning: case 4 catch 
expected error") print(e) print("========================================") + print("\n") + + print("========================================") + generator_class = CTRDataset(mode=5) + try: + dataset.check_use_var_with_data_generator( + slot_data, generator_class, "test_run_with_dump_a.txt") + except Exception as e: + print("warning: case 5 catch expected error") + print(e) + print("========================================") os.remove("./test_run_with_dump_a.txt") From a4713d60c2eb2059acf587ec199c727c50486a98 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Wed, 18 Aug 2021 12:02:15 +0800 Subject: [PATCH 11/12] [CPU-PSLIB] Add consistency inspection of use_var_list and data_generator data, test=develop --- .../distributed/fleet/dataset/dataset.py | 65 ++++++++++++++++++ python/paddle/fluid/dataset.py | 67 ------------------- 2 files changed, 65 insertions(+), 67 deletions(-) diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 8bc16dfbbae300..25a1d98cb11218 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -255,6 +255,71 @@ def _dynamic_adjust_before_train(self, thread_num): def _dynamic_adjust_after_train(self): pass + def _check_use_var_with_data_generator(self, var_list, data_generator_class, + test_file): + """ + Var consistency inspection of use_var_list and data_generator data. + + Examples: + .. 
code-block:: python + + # required: skiptest + import paddle + from dataset_generator import CTRDataset + dataset = paddle.distributed.fleet.DatasetBase() + generator_class = CTRDataset() + dataset._check_use_var_with_data_generator([data, label], generator_class, "data/part-00000") + + Args: + var_list(list): variable list + data_generator_class(class): data_generator class + test_file(str): local test file path + """ + + f = open(test_file, "r") + var_len = len(var_list) + + while True: + line = f.readline() + if line: + line_iter = data_generator_class.generate_sample(line) + for user_parsed_line in line_iter(): + data_gen_len = len(user_parsed_line) + if var_len != data_gen_len: + raise ValueError( + "var length mismatch error: var_list = %s vs data_generator = %s" + % (var_len, data_gen_len)) + + for i, ele in enumerate(user_parsed_line): + if len(ele[1]) == 0: + raise ValueError( + "var length error: var %s's length in data_generator is 0" + % ele[0]) + + if var_list[ + i].dtype == core.VarDesc.VarType.FP32 and not all( + isinstance(ele, float) for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-float value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." + % (ele[0], "float", ele[1])) + + if (var_list[i].dtype == core.VarDesc.VarType.INT64 or + var_list[i].dtype == core.VarDesc.VarType.INT32 + ) and not all( + isinstance(ele, int) for ele in ele[1]): + raise TypeError( + "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n" + "Please check if order of var_list and data_generator are aligned. \n" + "Please check if var's type in data_generator is correct." 
+ % (ele[0], "int", ele[1])) + + else: + break + + f.close() + class InMemoryDataset(DatasetBase): """ diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index ad55089385e31f..8d20dd994475f0 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -277,73 +277,6 @@ def set_use_var(self, var_list): "Currently, fluid.dataset only supports dtype=float32, dtype=int32 and dtype=int64" ) - def check_use_var_with_data_generator(self, var_list, data_generator_class, - test_file): - """ - Var consistency insepection of use_var_list and data_generator data. - - Examples: - .. code-block:: python - - # required: skiptest - import paddle.fluid as fluid - from dataset_generator import CTRDataset - dataset = fluid.DatasetFactory().create_dataset() - generator_class = CTRDataset() - dataset.check_use_var_with_data_generator([data, label], generator_class, "data/part-00000") - - Args: - var_list(list): variable list - data_generator_class(class): data_generator class - test_file(str): local test file path - """ - - f = open(test_file, "r") - var_len = len(var_list) - - while True: - line = f.readline() - if line: - line_iter = data_generator_class.generate_sample(line) - for user_parsed_line in line_iter(): - data_gen_len = len(user_parsed_line) - if var_len != data_gen_len: - raise ValueError( - "var length mismatch error: var_list = %s vs data_generator = %s" - % (var_len, data_gen_len)) - - for i, ele in enumerate(user_parsed_line): - # print(ele[0], ele[1]) - - if len(ele[1]) == 0: - raise ValueError( - "var length error: var %s's length in data_generator is 0" - % ele[0]) - - if var_list[ - i].dtype == core.VarDesc.VarType.FP32 and not all( - isinstance(ele, float) for ele in ele[1]): - raise TypeError( - "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-float value, which is %s \n" - "Please check if order of var_list and data_generator are aligned. 
\n" - "Please check if var's type in data_generator is correct." - % (ele[0], "float", ele[1])) - - if (var_list[i].dtype == core.VarDesc.VarType.INT64 or - var_list[i].dtype == core.VarDesc.VarType.INT32 - ) and not all( - isinstance(ele, int) for ele in ele[1]): - raise TypeError( - "var dtype mismatch error: var name = %s, var type in var_list = %s, while var in data_generator contains non-int value, which is %s \n" - "Please check if order of var_list and data_generator are aligned. \n" - "Please check if var's type in data_generator is correct." - % (ele[0], "int", ele[1])) - - else: - break - - f.close() - def set_hdfs_config(self, fs_name, fs_ugi): """ Set hdfs config: fs name ad ugi From a5da0d642b32bc219d62904724c34e02d67e5c27 Mon Sep 17 00:00:00 2001 From: WorgenZhang Date: Wed, 18 Aug 2021 12:17:02 +0800 Subject: [PATCH 12/12] [CPU-PSLIB] Add consistency insepection of use_var_list and data_generator data, test=develop --- .../unittests/test_dataset_consistency_inspection.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index c83c67701356b8..5911ada1817b60 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -342,12 +342,12 @@ def test_var_consistency_insepection(self): fluid.layers.data( name=str(feat_name), shape=[1], dtype='float32')) - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() print("========================================") generator_class = CTRDataset(mode=0) try: - dataset.check_use_var_with_data_generator( + dataset._check_use_var_with_data_generator( slot_data, generator_class, "test_run_with_dump_a.txt") print("case 1: check passed!") except Exception as e: @@ -359,7 
+359,7 @@ def test_var_consistency_insepection(self): print("========================================") generator_class = CTRDataset(mode=2) try: - dataset.check_use_var_with_data_generator( + dataset._check_use_var_with_data_generator( slot_data, generator_class, "test_run_with_dump_a.txt") except Exception as e: print("warning: case 2 catch expected error") @@ -370,7 +370,7 @@ def test_var_consistency_insepection(self): print("========================================") generator_class = CTRDataset(mode=3) try: - dataset.check_use_var_with_data_generator( + dataset._check_use_var_with_data_generator( slot_data, generator_class, "test_run_with_dump_a.txt") except Exception as e: print("warning: case 3 catch expected error") @@ -381,7 +381,7 @@ def test_var_consistency_insepection(self): print("========================================") generator_class = CTRDataset(mode=4) try: - dataset.check_use_var_with_data_generator( + dataset._check_use_var_with_data_generator( slot_data, generator_class, "test_run_with_dump_a.txt") except Exception as e: print("warning: case 4 catch expected error") @@ -392,7 +392,7 @@ def test_var_consistency_insepection(self): print("========================================") generator_class = CTRDataset(mode=5) try: - dataset.check_use_var_with_data_generator( + dataset._check_use_var_with_data_generator( slot_data, generator_class, "test_run_with_dump_a.txt") except Exception as e: print("warning: case 5 catch expected error")