Skip to content

Bug with OpenMP device - repro and fix attached - causes no optimization on OpenMP kernels by default #791

@Notargets

Description

@Notargets

OCCA OpenMP Device Missing Default Optimization Flags

Bug Description

The OpenMP device in OCCA fails to apply the default -O3 optimization flag, causing a 7.5x performance degradation. Discovered in gocca's kernel_program_parallel_test.go.

Reproduction

make clean
make

Results

Matrix-vector multiplication benchmark (640K chunks × 56×56 matrix):

| Device | Flags        | Performance   |
|--------|--------------|---------------|
| OpenMP | default      | 24.06 GFLOPS  |
| OpenMP | explicit -O3 | 180.48 GFLOPS |
| Serial | default      | 5.72 GFLOPS   |
| Serial | explicit -O3 | 5.72 GFLOPS   |

Issue: OpenMP default is 7.5x slower than OpenMP with -O3. Serial correctly applies -O3 by default (identical performance).

Root Cause

In src/occa/internal/modes/openmp/device.cpp:95, the code uses += on potentially non-existent compiler_flags:

allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;

When compiler_flags has not been set, this creates a new flags string that contains only the OpenMP flag, so the default -O3 is never applied.

Fix

diff --git a/src/occa/internal/modes/openmp/device.cpp b/src/occa/internal/modes/openmp/device.cpp
index eb354a20..9dc8eeeb 100644
--- a/src/occa/internal/modes/openmp/device.cpp
+++ b/src/occa/internal/modes/openmp/device.cpp
@@ -101,33 +101,39 @@ namespace occa {
         compiler = "cl.exe";
 #endif
       }
 
       int vendor = allKernelProps["vendor"];
       // Check if we need to re-compute the vendor
       if (compiler.size()) {
         vendor = sys::compilerVendor(compiler);
       }
 
+
       if (compiler != lastCompiler) {
         lastCompiler = compiler;
         lastCompilerOpenMPFlag = openmp::compilerFlag(vendor, compiler);
 
         if (lastCompilerOpenMPFlag == openmp::notSupported) {
           io::stderr << "Compiler [" << (std::string) allKernelProps["compiler"]
                      << "] does not support OpenMP, defaulting to [Serial] mode\n";
         }
       }
 
       const bool usingOpenMP = (lastCompilerOpenMPFlag != openmp::notSupported);
       if (usingOpenMP) {
-        allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+		  if (!allKernelProps.has("compiler_flags") ||
+				  allKernelProps["compiler_flags"].toString().empty()) {
+			  allKernelProps["compiler_flags"] = "-O3";
+		  }
+		  allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+   //     allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
       }
 
       modeKernel_t *k = serial::device::buildKernel(filename,
                                                     kernelName,
                                                     kernelHash,
                                                     allKernelProps);
 
       if (k && usingOpenMP) {
         k->modeDevice->removeKernelRef(k);
         k->modeDevice = this;

Workaround

// Workaround: set the kernel "compiler_flags" property to "-O3" explicitly
// when creating the OpenMP device, rather than relying on a default value.
occa::device device({
  {"mode", "OpenMP"},
  {"kernel", {{"compiler_flags", "-O3"}}}
});

Repro

// openmp_bug_demo.cpp
#include <iostream>
#include <iomanip>
#include <chrono>
#include <vector>
#include <cmath>
#include <occa.hpp>

// OKL source for the `matvec` kernel; benchmarkMatvec passes this string to
// device.buildKernelFromString. For each of the N chunks, the kernel multiplies
// the np x np matrix (indexed row-major as matrix[i*np + j]) by that chunk's
// np-element slice of `input` and stores the np-element result in `output`.
// The @outer loop iterates over blocks of chunks_per_block chunks and the
// @inner loop over the chunks within a block; the `chunk < N` guard skips the
// out-of-range chunks in the final block when N is not a multiple of
// chunks_per_block.
const char* matvec_kernel_source = R"KERNEL(
@kernel void matvec(const int N,
                    const int np,
                    const int chunks_per_block,
                    const double* matrix,
                    const double* input,
                    double* output) {
  // N is total number of chunks, np is matrix size (56)
  // chunks_per_block determines how many chunks each @inner processes
  for (int block = 0; block < (N + chunks_per_block - 1) / chunks_per_block; ++block; @outer) {
    for (int chunk_in_block = 0; chunk_in_block < chunks_per_block; ++chunk_in_block; @inner) {
      int chunk = block * chunks_per_block + chunk_in_block;
      if (chunk < N) {
        for (int i = 0; i < np; ++i) {
          double sum = 0.0;
          for (int j = 0; j < np; ++j) {
            sum += matrix[i*np + j] * input[chunk*np + j];
          }
          output[chunk*np + i] = sum;
        }
      }
    }
  }
}
)KERNEL";

// Plain aggregate returned by benchmarkMatvec to report one benchmark run.
struct BenchmarkResult {
  double time_ms;  // presumably elapsed time in milliseconds — assignment not visible here, confirm
  double gflops;   // presumably achieved throughput in GFLOPS — assignment not visible here, confirm
};

BenchmarkResult benchmarkMatvec(occa::device& device,
                               int N, int np, int chunks_per_block, int iterations) {
  // N = number of chunks, np = 56 (matrix size)
  size_t matrix_size = np * np * sizeof(double);
  size_t vector_size = N * np * sizeof(double);

  occa::memory o_matrix = device.malloc(matrix_size);
  occa::memory o_input = device.malloc(vector_size);
  occa::memory o_output = device.malloc(vector_size);

  // Initialize host data
  std::vector<double> h_matrix(np * np);
  std::vector<double> h_input(N * np);

  // Initialize matrix with some pattern
  for (int i = 0; i < np * np; ++i) {
    h_matrix[i] = (double)(i % 100) / 100.0;
  }

  // Initialize input vector
  for (int i = 0; i < N * np; ++i) {
    h_input[i] = (double)(i % 100) / 100.0;
  }

  // Copy to device
  o_matrix.copyFrom(h_matrix.data());
  o_input.copyFrom(h_input.data());

  // Build kernel
  occa::kernel kernel = device.buildKernelFromString(
    matvec_kernel_source,
    "matvec"
  );

  // Warmup
  for (int i = 0; i < 5; ++i) {
    kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
  }
  device.finish();

  // Benchmark
  auto start = std::chrono::high_resolution_clock::now();

  for (int i = 0; i < iterations; ++i) {
    kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
  }

# Makefile for OCCA OpenMP Compiler Flag Bug Demonstration
#
# Targets:
#   all    (default) build the demo binary, then run it
#   run    run the demo binary (building it first if needed)
#   clean  remove the demo binary
#
# NOTE: every recipe line below must start with a literal TAB character;
# GNU make rejects space-indented recipes with "missing separator".

CXX = g++
CXXFLAGS = -std=c++11 -O3 -fopenmp
LDFLAGS = -locca -fopenmp

# OCCA include and library paths
# Adjust these if OCCA is installed in a non-standard location
OCCA_INCLUDE = -I/usr/local/include
OCCA_LIB = -L/usr/local/lib

TARGET = openmp_bug_demo
SOURCE = openmp_bug_demo.cpp

all: $(TARGET) run

$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) $(OCCA_INCLUDE) $(SOURCE) -o $(TARGET) $(OCCA_LIB) $(LDFLAGS)

run: $(TARGET)
	@echo "Running OCCA OpenMP bug demonstration..."
	@echo "========================================"
	@./$(TARGET)

clean:
	rm -f $(TARGET)

.PHONY: all run clean

Metadata

Metadata

Assignees

No one assigned

    Labels

    OpenMP, bug (Use this label when reporting bugs!)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions