from utils import construct_programatic_feedback
from KernelBenchInternal.src.eval import KernelExecResult

###########################
# Prompt Construction
#
# Template for the KernelExecResult each test feeds to
# construct_programatic_feedback; metadata is a dict whose
# contents vary by failure mode (see the tests below):
#
# sample_exec_result = KernelExecResult(
#     compiled=False,
#     correctness=False,
#     metadata={},
#     runtime=-1.0,
#     runtime_stats={}
# )
###########################
# Test Cases
# A representative set of sample execution results, one per feedback path.
###########################
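
# Hedged sketch: the tests below construct each KernelExecResult by hand for
# readability. A small factory like this (hypothetical, not used by the tests;
# the name and defaults are illustrative) could collapse the shared boilerplate:
def make_exec_result(metadata, compiled=False, correctness=False,
                     runtime=-1.0, runtime_stats=None):
    """Build a KernelExecResult with the defaults the failure-case tests share."""
    return KernelExecResult(
        compiled=compiled,
        correctness=correctness,
        metadata=metadata,
        runtime=runtime,
        runtime_stats=runtime_stats or {},
    )
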
def test_compiled_fail_feedback():
    """Feedback when the kernel extension fails to build (compilation error)."""
    sample_exec_result = KernelExecResult(
        compiled=False,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:0',
            'compilation_error': RuntimeError("Error building extension 'max_reduction'")
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_correctness_val_fail_feedback():
    """Feedback when the kernel compiles but its outputs mismatch the reference in all trials."""
    sample_exec_result = KernelExecResult(
        compiled=True,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:0',
            'max_difference': ['29.875006', '28.969961', '27.741943', '27.972809', '27.862772'],
            'avg_difference': ['2.578733', '2.575977', '2.574863', '2.581244', '2.582202'],
            'correctness_issue': 'Output mismatch',
            'correctness_trials': '(0 / 5)'
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_correctness_shape_fail_feedback():
    """Feedback when the kernel output has the wrong shape."""
    sample_exec_result = KernelExecResult(
        compiled=True,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:2',
            'correctness_issue': 'Output shape mismatch: Expected torch.Size([128, 64, 32, 32]), got torch.Size([128, 64, 64, 64])'
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_cuda_error_feedback():
    """Feedback when execution hits a CUDA error (illegal memory access)."""
    sample_exec_result = KernelExecResult(
        compiled=False,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:0',
            'cuda_error': 'CUDA Error: CUDA error: an illegal memory access was encountered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n'
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_function_name_exists_feedback():
    """Feedback for a miscellaneous build error (duplicate 'bias' attribute)."""
    sample_exec_result = KernelExecResult(
        compiled=False,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:0',
            'other_error': 'error: "attribute \'bias\' already exists"'
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_runtime_arg_fail_feedback():
    """Feedback when the compiled kernel is called with incompatible argument types."""
    sample_exec_result = KernelExecResult(
        compiled=True,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:0',
            'runtime_error': 'fused_conv_gelu_norm_cuda(): incompatible function arguments. The following argument types are supported:\n 1. (arg0: torch.Tensor, arg1: torch.Tensor, arg2: torch.Tensor, arg3: int, arg4: int, a...'
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_empty_feedback():
    """
    Feedback when the recorded runtime_error is empty.
    Log Path: /matx/u/simonguo/kernel_multi_turn/level2_reflection_all_prev_deepseek/run_deepseek_turns_3/problem_24/sample_0/log.json
    """
    sample_exec_result = KernelExecResult(
        compiled=True,
        correctness=False,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:4',
            'runtime_error': {}
        },
        runtime=-1.0,
        runtime_stats={}
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)

def test_correctness_success_feedback():
    """Feedback for a fully correct kernel, including its runtime statistics."""
    sample_exec_result = KernelExecResult(
        compiled=True,
        correctness=True,
        metadata={
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:3',
            'correctness_trials': '(5 / 5)'
        },
        runtime=2.45,
        runtime_stats={
            'mean': 2.45,
            'std': 0.00197,
            'min': 2.45,
            'max': 2.46,
            'num_trials': 100,
            'hardware': 'NVIDIA L40S',
            'device': 'cuda:3'
        }
    )
    feedback = construct_programatic_feedback(sample_exec_result)
    print(feedback)
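

# Minimal sketch of a manual runner, assuming this file is executed directly
# rather than discovered by pytest: invoke every test_* function defined above
# and print its feedback output in sequence.
if __name__ == "__main__":
    for _name, _fn in sorted(globals().items()):
        if _name.startswith("test_") and callable(_fn):
            print(f"=== {_name} ===")
            _fn()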