Skip to content

Commit 059ea00

Browse files
authored
Merge pull request #7529 from hzhou/2507_pipeline
ch4/ofi: add new pipeline implementation Approved-by: Ken Raffenetti
2 parents 5108712 + 5e8794d commit 059ea00

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+2113
-1710
lines changed

doc/mpich/tuning_parameters.md

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1096,27 +1096,22 @@ GPU pipeline uses host buffer and pipelining technique to send internode
10961096
messages instead of GPU RDMA. To enable this mode, use the following
10971097
CVARs:
10981098

1099-
* `MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE`: This CVAR enables GPU pipeline
1100-
for inter-node pt2pt messages
1101-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD`: The threshold to start using
1102-
GPU pipelining. Default is 1MB.
1099+
* `MPIR_CVAR_CH4_OFI_EAGER_THRESHOLD`: This CVAR enables the RNDV
1100+
(rendezvous) path for large messages above the threshold. Recommended value
1101+
is 1MB.
11031102

1104-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ`: Specifies the chunk size
1105-
(in bytes) for GPU pipeline data transfer.
1103+
* `MPIR_CVAR_CH4_OFI_RNDV_PROTOCOL=pipeline`: Forces the RNDV algorithm to
1104+
use pipelining. The default is "auto", which will select the best algorithm
1105+
based on message attributes. Other protocols include "read" - RDMA
1106+
read, and "direct", which relies on underlying network library implementations.
11061107

1107-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK`: Specifies the
1108+
* `MPIR_CVAR_CH4_OFI_PIPELINE_CHUNK_SZ`: Specifies the chunk size
1109+
(in bytes) for pipeline data transfer.
1110+
1111+
* `MPIR_CVAR_CH4_OFI_PIPELINE_NUM_CHUNKS`: Specifies the
11081112
number of buffers for GPU pipeline data transfer in each block/chunk of
11091113
the pool.
11101114

1111-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS`: Specifies the maximum
1112-
total number of buffers MPICH buffer pool can allocate.
1113-
1114-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE`: Specify engine type
1115-
for copying from device to host (sender side), default 0
1116-
1117-
* `MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE`: Specify engine type
1118-
for copying from host to device (receiver side), default 0
1119-
11201115
To enable GPU Direct RDMA support for pt2pt communication, use the
11211116
following CVARs:
11221117
* `MPIR_CVAR_CH4_OFI_ENABLE_HMEM`: This CVAR with a value of `1` enables

src/include/mpiimpl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,6 @@ typedef struct MPIR_Stream MPIR_Stream;
155155
/******************* PART 3: DEVICE INDEPENDENT HEADERS **********************/
156156
/*****************************************************************************/
157157

158-
#include "mpir_misc.h"
159158
#include "mpir_dbg.h"
160159
#include "mpir_objects.h"
161160
#include "mpir_strerror.h"
@@ -166,6 +165,7 @@ typedef struct MPIR_Stream MPIR_Stream;
166165
#include "mpir_mem.h"
167166
#include "mpir_info.h"
168167
#include "mpir_errcodes.h"
168+
#include "mpir_misc.h"
169169
#include "mpir_errhandler.h"
170170
#include "mpir_attr_generic.h"
171171
#include "mpir_contextid.h"

src/include/mpir_misc.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,26 @@ typedef struct {
7474
MPIR_request_type_t type;
7575
} MPIR_gpu_req;
7676

77+
MPL_STATIC_INLINE_PREFIX void MPIR_async_test(MPIR_gpu_req * areq, int *is_done)
78+
{
79+
int err;
80+
switch (areq->type) {
81+
case MPIR_NULL_REQUEST:
82+
/* a dummy, immediately complete */
83+
*is_done = 1;
84+
break;
85+
case MPIR_TYPEREP_REQUEST:
86+
MPIR_Typerep_test(areq->u.y_req, is_done);
87+
break;
88+
case MPIR_GPU_REQUEST:
89+
err = MPL_gpu_test(&areq->u.gpu_req, is_done);
90+
MPIR_Assertp(err == MPL_SUCCESS);
91+
break;
92+
default:
93+
MPIR_Assert(0);
94+
}
95+
}
96+
7797
int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
7898
void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype);
7999
int MPIR_Ilocalcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,

src/include/mpir_typerep.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,6 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp
7878
int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Aint outcount,
7979
MPI_Datatype datatype, MPI_Aint outoffset, MPI_Aint * actual_unpack_bytes,
8080
MPIR_Typerep_req * typerep_req, uint32_t flags);
81-
int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req);
82-
int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed);
8381

8482
int MPIR_Typerep_size_external32(MPI_Datatype type);
8583
int MPIR_Typerep_pack_external(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype,

src/mpi/datatype/typerep/src/typerep_pre.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,7 @@ typedef struct {
2828
#define MPIR_TYPEREP_HANDLE_NULL NULL
2929
#endif
3030

31+
int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req);
32+
int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed);
33+
3134
#endif /* TYPEREP_PRE_H_INCLUDED */

src/mpi/misc/utils.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,12 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se
500500
do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype,
501501
recvoffset, LOCALCOPY_NONBLOCKING, &req->u.y_req);
502502
MPIR_ERR_CHECK(mpi_errno);
503-
req->type = MPIR_TYPEREP_REQUEST;
503+
504+
if (req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) {
505+
req->type = MPIR_NULL_REQUEST;
506+
} else {
507+
req->type = MPIR_TYPEREP_REQUEST;
508+
}
504509
#endif
505510

506511
fn_exit:

src/mpid/ch4/ch4_api.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ Non Native API:
7171
NM*: am_hdr_sz, data_sz, data, count, datatype, sreq
7272
SHM*: am_hdr_sz, data_sz, data, count, datatype, sreq
7373
am_can_do_tag: bool
74-
NM*: void
75-
SHM*: void
74+
NM*: rreq
75+
SHM*: rreq
7676
am_tag_send : int
7777
NM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq
7878
SHM*: rank, comm, handler_id, tag, buf, count, datatype, src_vci, dst_vci, sreq

src/mpid/ch4/netmod/ofi/Makefile.mk

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \
1818
src/mpid/ch4/netmod/ofi/ofi_part.c \
1919
src/mpid/ch4/netmod/ofi/ofi_events.c \
2020
src/mpid/ch4/netmod/ofi/ofi_rndv.c \
21-
src/mpid/ch4/netmod/ofi/ofi_huge.c \
21+
src/mpid/ch4/netmod/ofi/ofi_rndv_read.c \
22+
src/mpid/ch4/netmod/ofi/ofi_rndv_write.c \
23+
src/mpid/ch4/netmod/ofi/ofi_pipeline.c \
2224
src/mpid/ch4/netmod/ofi/ofi_progress.c \
2325
src/mpid/ch4/netmod/ofi/ofi_am_events.c \
2426
src/mpid/ch4/netmod/ofi/ofi_nic.c \

src/mpid/ch4/netmod/ofi/ofi_am.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,17 @@ MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_check_eager(MPI_Aint am_hdr_sz, MPI_Ai
215215
}
216216
}
217217

218-
MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(void)
218+
MPL_STATIC_INLINE_PREFIX bool MPIDI_NM_am_can_do_tag(MPIR_Request * rreq)
219219
{
220-
return MPIDI_OFI_ENABLE_TAGGED;
220+
if (MPIDI_OFI_ENABLE_TAGGED) {
221+
MPI_Aint data_sz;
222+
MPIR_Datatype_get_size_macro(MPIDIG_REQUEST(rreq, datatype), data_sz);
223+
data_sz *= MPIDIG_REQUEST(rreq, count);
224+
if (data_sz <= MPIDI_OFI_global.max_msg_size) {
225+
return true;
226+
}
227+
}
228+
return false;
221229
}
222230

223231
MPL_STATIC_INLINE_PREFIX MPIDIG_recv_data_copy_cb MPIDI_NM_am_get_data_copy_cb(uint32_t attr)

0 commit comments

Comments
 (0)