Skip to content

Commit b1c458d

Browse files
authored
add so parser (#33969)
* add delta score, scale show * so parser * windows * windows
1 parent afddcb9 commit b1c458d

File tree

4 files changed

+181
-0
lines changed

4 files changed

+181
-0
lines changed

paddle/fluid/framework/data_feed.cc

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ USE_INT_STAT(STAT_total_feasign_num_in_mem);
3131
namespace paddle {
3232
namespace framework {
3333

34+
// Returns the DLManager shared by every caller in this process. The manager
// is a function-local static, so it is constructed on the first call.
DLManager& global_dlmanager_pool() {
  static DLManager instance;
  return instance;
}
38+
3439
void RecordCandidateList::ReSize(size_t length) {
3540
mutex_.lock();
3641
capacity_ = length;
@@ -366,6 +371,10 @@ void InMemoryDataFeed<T>::SetParseInsId(bool parse_ins_id) {
366371
template <typename T>
367372
void InMemoryDataFeed<T>::LoadIntoMemory() {
368373
#ifdef _LINUX
374+
if (!so_parser_name_.empty()) {
375+
LoadIntoMemoryFromSo();
376+
return;
377+
}
369378
VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_;
370379
std::string filename;
371380
while (this->PickOneFile(&filename)) {
@@ -408,6 +417,51 @@ void InMemoryDataFeed<T>::LoadIntoMemory() {
408417
#endif
409418
}
410419

420+
template <typename T>
421+
void InMemoryDataFeed<T>::LoadIntoMemoryFromSo() {
422+
#ifdef _LINUX
423+
VLOG(3) << "LoadIntoMemoryFromSo() begin, thread_id=" << thread_id_;
424+
425+
string::LineFileReader reader;
426+
paddle::framework::CustomParser* parser =
427+
global_dlmanager_pool().Load(so_parser_name_, slot_conf_);
428+
429+
std::string filename;
430+
while (this->PickOneFile(&filename)) {
431+
VLOG(3) << "PickOneFile, filename=" << filename
432+
<< ", thread_id=" << thread_id_;
433+
int err_no = 0;
434+
this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_);
435+
CHECK(this->fp_ != nullptr);
436+
__fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER);
437+
438+
paddle::framework::ChannelWriter<T> writer(input_channel_);
439+
T instance;
440+
platform::Timer timeline;
441+
timeline.Start();
442+
443+
while (1) {
444+
if (!reader.getline(&*(fp_.get()))) {
445+
break;
446+
} else {
447+
const char* str = reader.get();
448+
ParseOneInstanceFromSo(str, &instance, parser);
449+
}
450+
451+
writer << std::move(instance);
452+
instance = T();
453+
}
454+
455+
writer.Flush();
456+
timeline.Pause();
457+
VLOG(3) << "LoadIntoMemoryFromSo() read all lines, file=" << filename
458+
<< ", cost time=" << timeline.ElapsedSec()
459+
<< " seconds, thread_id=" << thread_id_;
460+
}
461+
VLOG(3) << "LoadIntoMemoryFromSo() end, thread_id=" << thread_id_;
462+
#endif
463+
}
464+
411465
// explicit instantiation
412466
template class InMemoryDataFeed<Record>;
413467

@@ -827,16 +881,23 @@ void MultiSlotInMemoryDataFeed::Init(
827881
inductive_shape_index_.resize(all_slot_num);
828882
use_slots_.clear();
829883
use_slots_is_dense_.clear();
884+
slot_conf_.resize(all_slot_num);
830885
for (size_t i = 0; i < all_slot_num; ++i) {
831886
const auto& slot = multi_slot_desc.slots(i);
832887
all_slots_[i] = slot.name();
833888
all_slots_type_[i] = slot.type();
834889
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
890+
891+
slot_conf_[i].name = slot.name();
892+
slot_conf_[i].type = slot.type();
893+
slot_conf_[i].use_slots_index = use_slots_index_[i];
894+
835895
total_dims_without_inductive_[i] = 1;
836896
inductive_shape_index_[i] = -1;
837897
if (slot.is_used()) {
838898
use_slots_.push_back(all_slots_[i]);
839899
use_slots_is_dense_.push_back(slot.is_dense());
900+
slot_conf_[i].use_slots_is_dense = slot.is_dense();
840901
std::vector<int> local_shape;
841902
if (slot.is_dense()) {
842903
for (int j = 0; j < slot.shape_size(); ++j) {
@@ -869,6 +930,7 @@ void MultiSlotInMemoryDataFeed::Init(
869930
}
870931
visit_.resize(all_slot_num, false);
871932
pipe_command_ = data_feed_desc.pipe_command();
933+
so_parser_name_ = data_feed_desc.so_parser_name();
872934
finish_init_ = true;
873935
input_type_ = data_feed_desc.input_type();
874936
}
@@ -887,6 +949,12 @@ void MultiSlotInMemoryDataFeed::GetMsgFromLogKey(const std::string& log_key,
887949
*rank = (uint32_t)strtoul(rank_str.c_str(), NULL, 16);
888950
}
889951

952+
void MultiSlotInMemoryDataFeed::ParseOneInstanceFromSo(const char* str,
953+
Record* instance,
954+
CustomParser* parser) {
955+
parser->ParseOneInstance(str, instance);
956+
}
957+
890958
bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
891959
#ifdef _LINUX
892960
thread_local string::LineFileReader reader;

paddle/fluid/framework/data_feed.h

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,94 @@ using PvInstance = PvInstanceObject*;
117117

118118
inline PvInstance make_pv_instance() { return new PvInstanceObject(); }
119119

120+
// Description of one slot, handed to CustomParser::Init() so that a parser
// loaded from a shared object knows the slot layout of the data feed.
struct SlotConf {
  // Slot name, copied from the slot description.
  std::string name;
  // Slot type string, copied from the slot description.
  std::string type;
  // Position of the slot among the used slots; MultiSlotInMemoryDataFeed::Init
  // stores -1 here for a slot that is not used.
  int use_slots_index = 0;
  // Non-zero when the slot is dense. Init only assigns this for used slots,
  // so it needs a defined default instead of an indeterminate value.
  int use_slots_is_dense = 0;
};
126+
127+
class CustomParser {
128+
public:
129+
CustomParser() {}
130+
virtual ~CustomParser() {}
131+
virtual void Init(const std::vector<SlotConf>& slots) = 0;
132+
virtual void ParseOneInstance(const char* str, Record* instance) = 0;
133+
};
134+
135+
typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)();
136+
137+
class DLManager {
138+
struct DLHandle {
139+
void* module;
140+
paddle::framework::CustomParser* parser;
141+
};
142+
143+
public:
144+
DLManager() {}
145+
146+
~DLManager() {
147+
#ifdef _LINUX
148+
std::lock_guard<std::mutex> lock(mutex_);
149+
for (auto it = handle_map_.begin(); it != handle_map_.end(); ++it) {
150+
delete it->second.parser;
151+
dlclose(it->second.module);
152+
}
153+
#endif
154+
}
155+
156+
bool Close(const std::string& name) {
157+
#ifdef _LINUX
158+
auto it = handle_map_.find(name);
159+
if (it == handle_map_.end()) {
160+
return true;
161+
}
162+
delete it->second.parser;
163+
dlclose(it->second.module);
164+
#endif
165+
VLOG(0) << "Not implement in windows";
166+
return false;
167+
}
168+
169+
paddle::framework::CustomParser* Load(const std::string& name,
170+
std::vector<SlotConf>& conf) {
171+
#ifdef _LINUX
172+
std::lock_guard<std::mutex> lock(mutex_);
173+
DLHandle handle;
174+
std::map<std::string, DLHandle>::iterator it = handle_map_.find(name);
175+
if (it != handle_map_.end()) {
176+
return it->second.parser;
177+
}
178+
179+
handle.module = dlopen(name.c_str(), RTLD_NOW);
180+
if (handle.module == nullptr) {
181+
VLOG(0) << "Create so of " << name << " fail";
182+
return nullptr;
183+
}
184+
185+
CreateParserObjectFunc create_parser_func =
186+
(CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject");
187+
handle.parser = create_parser_func();
188+
handle.parser->Init(conf);
189+
handle_map_.insert({name, handle});
190+
191+
return handle.parser;
192+
#endif
193+
VLOG(0) << "Not implement in windows";
194+
return nullptr;
195+
}
196+
197+
paddle::framework::CustomParser* ReLoad(const std::string& name,
198+
std::vector<SlotConf>& conf) {
199+
Close(name);
200+
return Load(name, conf);
201+
}
202+
203+
private:
204+
std::mutex mutex_;
205+
std::map<std::string, DLHandle> handle_map_;
206+
};
207+
120208
class DataFeed {
121209
public:
122210
DataFeed() {
@@ -252,6 +340,8 @@ class DataFeed {
252340
bool finish_set_filelist_;
253341
bool finish_start_;
254342
std::string pipe_command_;
343+
std::string so_parser_name_;
344+
std::vector<SlotConf> slot_conf_;
255345
std::vector<std::string> ins_id_vec_;
256346
std::vector<std::string> ins_content_vec_;
257347
platform::Place place_;
@@ -324,10 +414,13 @@ class InMemoryDataFeed : public DataFeed {
324414
virtual void SetEnablePvMerge(bool enable_pv_merge);
325415
virtual void SetCurrentPhase(int current_phase);
326416
virtual void LoadIntoMemory();
417+
virtual void LoadIntoMemoryFromSo();
327418

328419
protected:
329420
virtual bool ParseOneInstance(T* instance) = 0;
330421
virtual bool ParseOneInstanceFromPipe(T* instance) = 0;
422+
virtual void ParseOneInstanceFromSo(const char* str, T* instance,
423+
CustomParser* parser) {}
331424
virtual void PutToFeedVec(const std::vector<T>& ins_vec) = 0;
332425

333426
int thread_id_;
@@ -688,6 +781,8 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed<Record> {
688781
protected:
689782
virtual bool ParseOneInstance(Record* instance);
690783
virtual bool ParseOneInstanceFromPipe(Record* instance);
784+
virtual void ParseOneInstanceFromSo(const char* str, Record* instance,
785+
CustomParser* parser);
691786
virtual void PutToFeedVec(const std::vector<Record>& ins_vec);
692787
virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id,
693788
uint32_t* cmatch, uint32_t* rank);

paddle/fluid/framework/data_feed.proto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,5 @@ message DataFeedDesc {
3333
optional string rank_offset = 6;
3434
optional int32 pv_batch_size = 7 [ default = 32 ];
3535
optional int32 input_type = 8 [ default = 0 ];
36+
optional string so_parser_name = 9;
3637
}

python/paddle/fluid/dataset.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,23 @@ def set_pipe_command(self, pipe_command):
9595
"""
9696
self.proto_desc.pipe_command = pipe_command
9797

98+
def set_so_parser_name(self, so_parser_name):
    """
    Set so parser name of current dataset

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            dataset = fluid.DatasetFactory().create_dataset()
            dataset.set_so_parser_name("./abc.so")

    Args:
        so_parser_name(str): name (path) of the shared object that
            provides the custom parser, e.g. "./abc.so"

    """
    self.proto_desc.so_parser_name = so_parser_name
114+
98115
def set_rank_offset(self, rank_offset):
99116
"""
100117
Set rank_offset for merge_pv. It set the message of Pv.

0 commit comments

Comments
 (0)