diff --git a/src/kernels/rotary_embedding_kernels.cpp b/src/kernels/rotary_embedding_kernels.cpp
index 4b8b817e..52142902 100644
--- a/src/kernels/rotary_embedding_kernels.cpp
+++ b/src/kernels/rotary_embedding_kernels.cpp
@@ -241,7 +241,7 @@ static inline void chatglm2ApplyRotaryPosEmbeding(T *query, T *key, int qStride,
     for (int head = 0; head < head_num; ++head) {
         for (int bs = 0; bs < batch_size; ++bs) {
             for (int seq = 0; seq < seq_len; ++seq) {
-                T *pF = query + seq * qStride + head * dim;
+                T *pF = query + bs * seq_len * qStride + seq * qStride + head * dim;
 
                 int pos = position_ids[seq];
                 float *pcos = emb_cos + pos * dim;
diff --git a/src/utils/shm_reduction.cpp b/src/utils/shm_reduction.cpp
index f1097530..8fc15c35 100644
--- a/src/utils/shm_reduction.cpp
+++ b/src/utils/shm_reduction.cpp
@@ -50,8 +50,9 @@ void ShmReduction::ShmResize(int rank, size_t size) {
     // shm_unlink(shmCtx_.name);
 
     // alloc and map new shm
-    total_size = total_size - shmCtx_.nbytes + size;
     shmCtx_.nbytes = size;
+    shmCtx_.nblocks = (size + SHM_BLOCK_SIZE - 1) / SHM_BLOCK_SIZE;
+    total_size = sizeof(int) * shmCtx_.nstates + shmCtx_.nbytes + shmCtx_.nblocks * shmCtx_.nstates;
     // Truncate the shared memory to the desired size
     if (rank == 0 && ftruncate(shmCtx_.fp, total_size) == -1) {
         perror("shm ftruncate failed.");