diff --git a/Manifest.toml b/Manifest.toml index 093be028..f5877ce8 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.8.5" manifest_format = "2.0" -project_hash = "27e7a553e90a68d1f177019877f2e4b06a23c81a" +project_hash = "7fba634d9208dc361f8caa397a5e8d2621e56bcf" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" @@ -48,15 +48,15 @@ version = "1.5.0" [[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "8695a49bfe05a2dc0feeefd06b4ca6361a018729" +git-tree-sha1 = "f7e39b1ecd9531475bbf3b25363027ba14c3e563" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "6.1.0" +version = "6.2.0" [[deps.LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "c35203c1e1002747da220ffc3c0762ce7754b08c" +git-tree-sha1 = "7ca6850ae880cc99b59b88517545f91a52020afa" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.23+0" +version = "0.0.25+0" [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] diff --git a/Project.toml b/Project.toml index e389ac11..728eead4 100644 --- a/Project.toml +++ b/Project.toml @@ -15,7 +15,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] ExprTools = "0.1" -LLVM = "6" +LLVM = "6.2" Scratch = "1" TimerOutputs = "0.5" julia = "1.8" diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 02301ade..ad79e4c4 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -14,6 +14,8 @@ using Scratch: @get_scratch! const CC = Core.Compiler using Core: MethodInstance, CodeInstance, CodeInfo +const use_newpm = LLVM.has_newpm() + include("utils.jl") # compiler interface and implementations diff --git a/src/driver.jl b/src/driver.jl index e7edfbce..9b3768b0 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -313,6 +313,7 @@ const __llvm_initialized = Ref(false) # global variables. this makes sure that the optimizer can, e.g., # rewrite function signatures. 
if toplevel + # TODO: there's no good API to use internalize with the new pass manager yet @dispose pm=ModulePassManager() begin exports = collect(values(jobs)) for gvar in globals(ir) @@ -340,20 +341,37 @@ const __llvm_initialized = Ref(false) # deferred codegen has some special optimization requirements, # which also need to happen _after_ regular optimization. # XXX: make these part of the optimizer pipeline? - has_deferred_jobs && @dispose pm=ModulePassManager() begin - # inline and optimize the call to e deferred code. in particular we want - # to remove unnecessary alloca's created by pass-by-ref semantics. - instruction_combining!(pm) - always_inliner!(pm) - scalar_repl_aggregates_ssa!(pm) - promote_memory_to_register!(pm) - gvn!(pm) - - # merge duplicate functions, since each compilation invocation emits everything - # XXX: ideally we want to avoid emitting these in the first place - merge_functions!(pm) - - run!(pm, ir) + if has_deferred_jobs + if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, InstCombinePass()) + end + add!(mpm, AlwaysInlinerPass()) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, SROAPass()) + add!(fpm, GVNPass()) + end + add!(mpm, MergeFunctionsPass()) + run!(mpm, ir) + end + else + @dispose pm=ModulePassManager() begin + # inline and optimize the call to the deferred code. in particular we want + # to remove unnecessary alloca's created by pass-by-ref semantics. 
+ instruction_combining!(pm) + always_inliner!(pm) + scalar_repl_aggregates_ssa!(pm) + promote_memory_to_register!(pm) + gvn!(pm) + + # merge duplicate functions, since each compilation invocation emits everything + # XXX: ideally we want to avoid emitting these in the first place + merge_functions!(pm) + + run!(pm, ir) + end + end end end @@ -363,18 +381,29 @@ const __llvm_initialized = Ref(false) if cleanup @timeit_debug to "clean-up" begin - # we can only clean-up now, as optimization may lower or introduce calls to - # functions from the GPU runtime (e.g. julia.gc_alloc_obj -> gpu_gc_pool_alloc) - @dispose pm=ModulePassManager() begin - # eliminate all unused internal functions - global_optimizer!(pm) - global_dce!(pm) - strip_dead_prototypes!(pm) - - # merge constants (such as exception messages) - constant_merge!(pm) - - run!(pm, ir) + if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, RecomputeGlobalsAAPass()) + add!(mpm, GlobalOptPass()) + add!(mpm, GlobalDCEPass()) + add!(mpm, StripDeadPrototypesPass()) + add!(mpm, ConstantMergePass()) + run!(mpm, ir) + end + else + # we can only clean-up now, as optimization may lower or introduce calls to + # functions from the GPU runtime (e.g. 
julia.gc_alloc_obj -> gpu_gc_pool_alloc) + @dispose pm=ModulePassManager() begin + # eliminate all unused internal functions + global_optimizer!(pm) + global_dce!(pm) + strip_dead_prototypes!(pm) + + # merge constants (such as exception messages) + constant_merge!(pm) + + run!(pm, ir) + end end end end diff --git a/src/gcn.jl b/src/gcn.jl index d0d11e7f..40d262b7 100644 --- a/src/gcn.jl +++ b/src/gcn.jl @@ -37,10 +37,7 @@ isintrinsic(::CompilerJob{GCNCompilerTarget}, fn::String) = in(fn, gcn_intrinsic function finish_module!(@nospecialize(job::CompilerJob{GCNCompilerTarget}), mod::LLVM.Module, entry::LLVM.Function) - @dispose pm=ModulePassManager() begin - add!(pm, ModulePass("LowerThrowExtra", lower_throw_extra!)) - run!(pm, mod) - end + lower_throw_extra!(mod) if job.config.kernel # calling convention diff --git a/src/irgen.jl b/src/irgen.jl index 558ea151..fe16c5c9 100644 --- a/src/irgen.jl +++ b/src/irgen.jl @@ -96,6 +96,7 @@ function irgen(@nospecialize(job::CompilerJob)) end end + # TODO: there's no good API to use internalize with the new pass manager yet @dispose pm=ModulePassManager() begin global current_job current_job = job diff --git a/src/mcgen.jl b/src/mcgen.jl index c61d84a9..b1ccdff0 100644 --- a/src/mcgen.jl +++ b/src/mcgen.jl @@ -3,18 +3,30 @@ # final preparations for the module to be compiled to machine code # these passes should not be run when e.g. compiling to write to disk. 
function prepare_execution!(@nospecialize(job::CompilerJob), mod::LLVM.Module) - @dispose pm=ModulePassManager() begin - global current_job - current_job = job + global current_job + current_job = job - global_optimizer!(pm) + if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, RecomputeGlobalsAAPass()) + add!(mpm, GlobalOptPass()) + resolve_cpu_references!(mod) + add!(legacy2newpm(resolve_cpu_references!), mpm) + add!(mpm, GlobalDCEPass()) + add!(mpm, StripDeadPrototypesPass()) + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + global_optimizer!(pm) - add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!)) + add!(pm, ModulePass("ResolveCPUReferences", resolve_cpu_references!)) - global_dce!(pm) - strip_dead_prototypes!(pm) + global_dce!(pm) + strip_dead_prototypes!(pm) - run!(pm, mod) + run!(pm, mod) + end end return diff --git a/src/metal.jl b/src/metal.jl index 743669d6..1dd942ce 100644 --- a/src/metal.jl +++ b/src/metal.jl @@ -87,10 +87,19 @@ function finish_module!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mo # add metadata to AIR intrinsics LLVM doesn't know about annotate_air_intrinsics!(job, mod) - @dispose pm=ModulePassManager() begin - # we emit properties (of the device and ptx isa) as private global constants, - # so run the optimizer so that they are inlined before the rest of the optimizer runs. - global_optimizer!(pm) + # we emit properties (of the air and metal version) as private global constants, + # so run the optimizer so that they are inlined before the rest of the optimizer runs. 
+ if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, RecomputeGlobalsAAPass()) + add!(mpm, GlobalOptPass()) + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + global_optimizer!(pm) + run!(pm, mod) + end end return functions(mod)[entry_fn] @@ -121,11 +130,22 @@ function finish_ir!(@nospecialize(job::CompilerJob{MetalCompilerTarget}), mod::L end if changed # lowering may have introduced additional functions marked `alwaysinline` - @dispose pm=ModulePassManager() begin - always_inliner!(pm) - cfgsimplification!(pm) - instruction_combining!(pm) - run!(pm, mod) + if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, AlwaysInlinerPass()) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, SimplifyCFGPass()) + add!(fpm, InstCombinePass()) + end + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + always_inliner!(pm) + cfgsimplification!(pm) + instruction_combining!(pm) + run!(pm, mod) + end end end @@ -158,11 +178,22 @@ end end if any_noreturn - @dispose pm=ModulePassManager() begin - always_inliner!(pm) - cfgsimplification!(pm) - instruction_combining!(pm) - run!(pm, mod) + if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, AlwaysInlinerPass()) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, SimplifyCFGPass()) + add!(fpm, InstCombinePass()) + end + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + always_inliner!(pm) + cfgsimplification!(pm) + instruction_combining!(pm) + run!(pm, mod) + end end end end @@ -294,13 +325,26 @@ function add_address_spaces!(@nospecialize(job::CompilerJob), mod::LLVM.Module, LLVM.name!(new_f, fn) # clean-up after this pass (which runs after optimization) - @dispose pm=ModulePassManager() begin - cfgsimplification!(pm) - scalar_repl_aggregates!(pm) - early_cse!(pm) - instruction_combining!(pm) + if use_newpm + @dispose pb=PassBuilder() 
mpm=NewPMModulePassManager(pb) begin + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, InstCombinePass()) + end - run!(pm, mod) + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + cfgsimplification!(pm) + scalar_repl_aggregates!(pm) + early_cse!(pm) + instruction_combining!(pm) + + run!(pm, mod) + end end return new_f diff --git a/src/optim.jl b/src/optim.jl index 4e36dad4..9cb69c87 100644 --- a/src/optim.jl +++ b/src/optim.jl @@ -1,5 +1,416 @@ # LLVM IR optimization +function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module) + if use_newpm + optimize_newpm!(job, mod) + else + optimize_legacypm!(job, mod) + end + return +end + + +## new pm + +function optimize_newpm!(@nospecialize(job::CompilerJob), mod::LLVM.Module) + triple = llvm_triple(job.config.target) + tm = llvm_machine(job.config.target) + + global current_job + current_job = job + + @dispose pb=PassBuilder(tm) begin + @dispose mpm=NewPMModulePassManager(pb) begin + buildNewPMPipeline!(mpm, job) + run!(mpm, mod, tm) + end + end + optimize_module!(job, mod) + run!(DeadArgumentEliminationPass(), mod, tm) + return +end + +function buildNewPMPipeline!(mpm, @nospecialize(job::CompilerJob), opt_level=2) + buildEarlySimplificationPipeline(mpm, job, opt_level) + add!(mpm, AlwaysInlinerPass()) + buildEarlyOptimizerPipeline(mpm, job, opt_level) + add!(mpm, LowerSIMDLoopPass()) + add!(mpm, NewPMFunctionPassManager) do fpm + buildLoopOptimizerPipeline(fpm, job, opt_level) + buildScalarOptimizerPipeline(fpm, job, opt_level) + if opt_level >= 2 + buildVectorPipeline(fpm, job, opt_level) + end + add!(fpm, WarnMissedTransformationsPass()) + end + buildIntrinsicLoweringPipeline(mpm, job, opt_level) + buildCleanupPipeline(mpm, job, opt_level) +end + +if use_newpm + const BasicSimplifyCFGOptions = SimplifyCFGPassOptions(true, true, true, true, false, false, 1) + const AggressiveSimplifyCFGOptions = 
SimplifyCFGPassOptions(true, true, true, true, true, false, 1) +end + +function buildEarlySimplificationPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + if should_verify() + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, GCInvariantVerifierPass()) + end + add!(mpm, VerifierPass()) + end + add!(mpm, ForceFunctionAttrsPass()) + # TODO invokePipelineStartCallbacks + add!(mpm, Annotation2MetadataPass()) + add!(mpm, ConstantMergePass()) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, LowerExpectIntrinsicPass()) + if opt_level >= 2 + add!(fpm, PropagateJuliaAddrspacesPass()) + end + add!(fpm, SimplifyCFGPass(BasicSimplifyCFGOptions)) + if opt_level >= 1 + add!(fpm, DCEPass()) + add!(fpm, SROAPass()) + end + end + # TODO invokeEarlySimplificationCallbacks +end + +function buildEarlyOptimizerPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + add!(mpm, NewPMCGSCCPassManager) do cgpm + # TODO invokeCGSCCCallbacks + add!(cgpm, NewPMFunctionPassManager) do fpm + add!(fpm, AllocOptPass()) + add!(fpm, Float2IntPass()) + add!(fpm, LowerConstantIntrinsicsPass()) + end + end + add!(legacy2newpm(cpu_features!), mpm) + if opt_level >= 1 + add!(mpm, NewPMFunctionPassManager) do fpm + if opt_level >= 2 + add!(fpm, SROAPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, ReassociatePass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, AllocOptPass()) + else + add!(fpm, InstCombinePass()) + add!(fpm, EarlyCSEPass()) + end + end + # TODO invokePeepholeCallbacks + end +end + +function buildLoopOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) + add!(fpm, NewPMLoopPassManager) do lpm + if opt_level >= 2 + add!(lpm, LoopRotatePass()) + end + # TODO invokeLateLoopOptimizationCallbacks + end + if opt_level >= 2 + add!(fpm, NewPMLoopPassManager, #=UseMemorySSA=#true) do lpm + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + add!(lpm, SimpleLoopUnswitchPass()) + 
add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + end + end + if opt_level >= 2 + add!(fpm, IRCEPass()) + end + add!(fpm, NewPMLoopPassManager) do lpm + if opt_level >= 2 + add!(lpm, LoopInstSimplifyPass()) + add!(lpm, LoopIdiomRecognizePass()) + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) + add!(lpm, LoopFullUnrollPass()) + end + # TODO invokeLoopOptimizerEndCallbacks + end +end + +function buildScalarOptimizerPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) + if opt_level >= 2 + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, InstSimplifyPass()) + add!(fpm, GVNPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, DCEPass()) + add!(fpm, IRCEPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JumpThreadingPass()) + end + if opt_level >= 3 + add!(fpm, GVNPass()) + end + if opt_level >= 2 + add!(fpm, DSEPass()) + # TODO invokePeepholeCallbacks + add!(fpm, SimplifyCFGPass(AggressiveSimplifyCFGOptions)) + add!(fpm, AllocOptPass()) + add!(fpm, NewPMLoopPassManager) do lpm + add!(lpm, LoopDeletionPass()) + add!(lpm, LoopInstSimplifyPass()) + end + add!(fpm, LoopDistributePass()) + end + # TODO invokeScalarOptimizerCallbacks +end + +function buildVectorPipeline(fpm, @nospecialize(job::CompilerJob), opt_level) + add!(fpm, InjectTLIMappings()) + add!(fpm, LoopVectorizePass()) + add!(fpm, LoopLoadEliminationPass()) + add!(fpm, InstCombinePass()) + add!(fpm, SimplifyCFGPass(AggressiveSimplifyCFGOptions)) + add!(fpm, SLPVectorizerPass()) + add!(fpm, VectorCombinePass()) + # TODO invokeVectorizerCallbacks + add!(fpm, ADCEPass()) + add!(fpm, LoopUnrollPass(LoopUnrollOptions(; opt_level))) +end + +function buildIntrinsicLoweringPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + # lower exception handling + if uses_julia_runtime(job) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, LowerExcHandlersPass()) + end + end + + add!(mpm, NewPMFunctionPassManager) 
do fpm + add!(fpm, GCInvariantVerifierPass()) + end + add!(mpm, RemoveNIPass()) + + # lower GC intrinsics + add!(mpm, NewPMFunctionPassManager) do fpm + if !uses_julia_runtime(job) + add!(legacy2newpm(lower_gc_frame!), fpm) + end + add!(fpm, LateLowerGCPass()) + if uses_julia_runtime(job) && VERSION >= v"1.11.0-DEV.208" + add!(fpm, FinalLowerGCPass()) + end + end + if uses_julia_runtime(job) && VERSION < v"1.11.0-DEV.208" + add!(mpm, FinalLowerGCPass()) + end + + if opt_level >= 2 + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, GVNPass()) + add!(fpm, SCCPPass()) + add!(fpm, DCEPass()) + end + end + + # lower kernel state intrinsics + # NOTE: we can only do so here, as GC lowering can introduce calls to the runtime, + # and thus additional uses of the kernel state intrinsics. + if job.config.kernel + # TODO: now that all kernel state-related passes are being run here, merge some? + add!(legacy2newpm(add_kernel_state!), mpm) + add!(mpm, NewPMFunctionPassManager) do fpm + add!(legacy2newpm(lower_kernel_state!), fpm) + end + add!(legacy2newpm(cleanup_kernel_state!), mpm) + end + + # lower PTLS intrinsics + if uses_julia_runtime(job) + add!(mpm, LowerPTLSPass()) + else + # remove dead uses of ptls + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, ADCEPass()) + end + add!(legacy2newpm(lower_ptls!), mpm) + end + + if opt_level >= 1 + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, SimplifyCFGPass(AggressiveSimplifyCFGOptions)) + end + end + + # remove Julia address spaces + add!(mpm, RemoveJuliaAddrspacesPass()) + + # Julia's operand bundles confuse the inliner, so repeat here now they are gone. 
+ # FIXME: we should fix the inliner so that inlined code gets optimized early-on + add!(mpm, AlwaysInlinerPass()) +end + +function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) + if opt_level >= 2 + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, CombineMulAddPass()) + add!(fpm, DivRemPairsPass()) + end + end + # TODO invokeOptimizerLastCallbacks + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, AnnotationRemarksPass()) + end + add!(mpm, NewPMFunctionPassManager) do fpm + add!(fpm, DemoteFloat16Pass()) + if opt_level >= 1 + add!(fpm, GVNPass()) + end + end +end + + +## legacy pm + +function optimize_legacypm!(@nospecialize(job::CompilerJob), mod::LLVM.Module) + triple = llvm_triple(job.config.target) + tm = llvm_machine(job.config.target) + + global current_job + current_job = job + + @dispose pm=ModulePassManager() begin + addTargetPasses!(pm, tm, triple) + addOptimizationPasses!(pm) + run!(pm, mod) + end + + # NOTE: we need to use multiple distinct pass managers to force pass ordering; + # intrinsics should never get lowered before Julia has optimized them. + # XXX: why doesn't the barrier noop pass work here? + + # lower intrinsics + @dispose pm=ModulePassManager() begin + addTargetPasses!(pm, tm, triple) + + if !uses_julia_runtime(job) + add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) + end + + if job.config.kernel + # GC lowering is the last pass that may introduce calls to the runtime library, + # and thus additional uses of the kernel state intrinsic. + # TODO: now that all kernel state-related passes are being run here, merge some? 
+ add!(pm, ModulePass("AddKernelState", add_kernel_state!)) + add!(pm, FunctionPass("LowerKernelState", lower_kernel_state!)) + add!(pm, ModulePass("CleanupKernelState", cleanup_kernel_state!)) + end + + if !uses_julia_runtime(job) + # remove dead uses of ptls + aggressive_dce!(pm) + add!(pm, ModulePass("LowerPTLS", lower_ptls!)) + end + + if uses_julia_runtime(job) + lower_exc_handlers!(pm) + end + # the Julia GC lowering pass also has some clean-up that is required + late_lower_gc_frame!(pm) + if uses_julia_runtime(job) + final_lower_gc!(pm) + end + + remove_ni!(pm) + remove_julia_addrspaces!(pm) + + if uses_julia_runtime(job) + # We need these two passes and the instcombine below + # after GC lowering to let LLVM do some constant propagation on the tags. + # and remove some unnecessary write barrier checks. + gvn!(pm) + sccp!(pm) + # Remove dead use of ptls + dce!(pm) + LLVM.Interop.lower_ptls!(pm, dump_native(job)) + instruction_combining!(pm) + # Clean up write barrier and ptls lowering + cfgsimplification!(pm) + end + + # Julia's operand bundles confuse the inliner, so repeat here now they are gone. 
+ # FIXME: we should fix the inliner so that inlined code gets optimized early-on + always_inliner!(pm) + + # some of Julia's optimization passes happen _after_ lowering intrinsics + combine_mul_add!(pm) + div_rem_pairs!(pm) + + if VERSION < v"1.10.0-DEV.1144" + # save function attributes to work around JuliaGPU/GPUCompiler#437 + current_attrs = Dict{String,Any}() + for f in functions(mod) + attrs = function_attributes(f) + length(attrs) == 0 && continue + current_attrs[LLVM.name(f)] = collect(attrs) + end + end + + run!(pm, mod) + + if VERSION < v"1.10.0-DEV.1144" + # restore function attributes + for (fn, attrs) in current_attrs + haskey(functions(mod), fn) || continue + f = functions(mod)[fn] + + for attr in attrs + # NOTE: there are no function attributes that contain a type, + # so we can just blindly add them back + push!(function_attributes(f), attr) + end + end + end + end + + # target-specific optimizations + optimize_module!(job, mod) + + # we compile a module containing the entire call graph, + # so perform some interprocedural optimizations. + # + # for some reason, these passes need to be distinct from the regular optimization chain, + # or certain values (such as the constant arrays used to populate llvm.compiler.used as + # part of the LateLowerGCFrame pass) aren't collected properly. + # + # these might not always be safe, as Julia's IR metadata isn't designed for IPO. 
+ @dispose pm=ModulePassManager() begin + addTargetPasses!(pm, tm, triple) + + # simplify function calls that don't use the returned value + dead_arg_elimination!(pm) + + run!(pm, mod) + end + + # compare to Clang by using the pass manager builder APIs: + #LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))") + #@dispose pm=ModulePassManager() begin + # addTargetPasses!(pm, tm, triple) + # PassManager@dispose pmb=IRBuilder() begin + # optlevel!(pmb, 2) + # populate!(pm, pmb) + # end + # run!(pm, mod) + #end + + return +end + function addTargetPasses!(pm, tm, triple) add_library_info!(pm, triple) add_transform_info!(pm, tm) @@ -171,143 +582,10 @@ function addOptimizationPasses!(pm, opt_level=2) aggressive_dce!(pm) end -function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module) - triple = llvm_triple(job.config.target) - tm = llvm_machine(job.config.target) - - global current_job - current_job = job - - @dispose pm=ModulePassManager() begin - addTargetPasses!(pm, tm, triple) - addOptimizationPasses!(pm) - run!(pm, mod) - end - - # NOTE: we need to use multiple distinct pass managers to force pass ordering; - # intrinsics should never get lowered before Julia has optimized them. - # XXX: why doesn't the barrier noop pass work here? - - # lower intrinsics - @dispose pm=ModulePassManager() begin - addTargetPasses!(pm, tm, triple) - - if !uses_julia_runtime(job) - add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!)) - end - - if job.config.kernel - # GC lowering is the last pass that may introduce calls to the runtime library, - # and thus additional uses of the kernel state intrinsic. - # TODO: now that all kernel state-related passes are being run here, merge some? 
- add!(pm, ModulePass("AddKernelState", add_kernel_state!)) - add!(pm, FunctionPass("LowerKernelState", lower_kernel_state!)) - add!(pm, ModulePass("CleanupKernelState", cleanup_kernel_state!)) - end - - if !uses_julia_runtime(job) - # remove dead uses of ptls - aggressive_dce!(pm) - add!(pm, ModulePass("LowerPTLS", lower_ptls!)) - end - - if uses_julia_runtime(job) - lower_exc_handlers!(pm) - end - # the Julia GC lowering pass also has some clean-up that is required - late_lower_gc_frame!(pm) - if uses_julia_runtime(job) - final_lower_gc!(pm) - end - - remove_ni!(pm) - remove_julia_addrspaces!(pm) - - if uses_julia_runtime(job) - # We need these two passes and the instcombine below - # after GC lowering to let LLVM do some constant propagation on the tags. - # and remove some unnecessary write barrier checks. - gvn!(pm) - sccp!(pm) - # Remove dead use of ptls - dce!(pm) - LLVM.Interop.lower_ptls!(pm, dump_native(job)) - instruction_combining!(pm) - # Clean up write barrier and ptls lowering - cfgsimplification!(pm) - end - - # Julia's operand bundles confuse the inliner, so repeat here now they are gone. 
- # FIXME: we should fix the inliner so that inlined code gets optimized early-on - always_inliner!(pm) - - # some of Julia's optimization passes happen _after_ lowering intrinsics - combine_mul_add!(pm) - div_rem_pairs!(pm) - - if VERSION < v"1.10.0-DEV.1144" - # save function attributes to work around JuliaGPU/GPUCompiler#437 - current_attrs = Dict{String,Any}() - for f in functions(mod) - attrs = function_attributes(f) - length(attrs) == 0 && continue - current_attrs[LLVM.name(f)] = collect(attrs) - end - end - - run!(pm, mod) - - if VERSION < v"1.10.0-DEV.1144" - # restore function attributes - for (fn, attrs) in current_attrs - haskey(functions(mod), fn) || continue - f = functions(mod)[fn] - - for attr in attrs - # NOTE: there's no function attributes that contain a type, - # so we can just blindly add them back - push!(function_attributes(f), attr) - end - end - end - end - - # target-specific optimizations - optimize_module!(job, mod) - - # we compile a module containing the entire call graph, - # so perform some interprocedural optimizations. - # - # for some reason, these passes need to be distinct from the regular optimization chain, - # or certain values (such as the constant arrays used to populare llvm.compiler.user ad - # part of the LateLowerGCFrame pass) aren't collected properly. - # - # these might not always be safe, as Julia's IR metadata isn't designed for IPO. 
- @dispose pm=ModulePassManager() begin - addTargetPasses!(pm, tm, triple) - - # simplify function calls that don't use the returned value - dead_arg_elimination!(pm) - - run!(pm, mod) - end - - # compare to Clang by using the pass manager builder APIs: - #LLVM.clopts("-print-after-all", "-filter-print-funcs=$(LLVM.name(entry))") - #@dispose pm=ModulePassManager() begin - # addTargetPasses!(pm, tm, triple) - # PassManager@dispose pmb=IRBuilder() begin - # optlevel!(pmb, 2) - # populate!(pm, pmb) - # end - # run!(pm, mod) - #end - - return -end +## custom passes -## lowering intrinsics +# lowering intrinsics cpu_features!(pm::PassManager) = add!(pm, ModulePass("LowerCPUFeatures", cpu_features!)) function cpu_features!(mod::LLVM.Module) job = current_job::CompilerJob diff --git a/src/ptx.jl b/src/ptx.jl index c215a15e..d14d72e0 100644 --- a/src/ptx.jl +++ b/src/ptx.jl @@ -131,12 +131,19 @@ function finish_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), entry = lower_byval(job, mod, entry) end - @dispose pm=ModulePassManager() begin - # we emit properties (of the device and ptx isa) as private global constants, - # so run the optimizer so that they are inlined before the rest of the optimizer runs. - global_optimizer!(pm) - - run!(pm, mod) + # we emit properties (of the device and ptx isa) as private global constants, + # so run the optimizer so that they are inlined before the rest of the optimizer runs. 
+ if use_newpm + @dispose pb=PassBuilder() mpm=NewPMModulePassManager(pb) begin + add!(mpm, RecomputeGlobalsAAPass()) + add!(mpm, GlobalOptPass()) + run!(mpm, mod) + end + else + @dispose pm=ModulePassManager() begin + global_optimizer!(pm) + run!(pm, mod) + end end return entry @@ -145,6 +152,7 @@ end function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), mod::LLVM.Module) tm = llvm_machine(job.config.target) + # TODO can't convert to newpm because speculative-execution doesn't have a parameter in the default PassBuilder parser @dispose pm=ModulePassManager() begin add_library_info!(pm, triple(mod)) add_transform_info!(pm, tm) @@ -177,12 +185,10 @@ function optimize_module!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), end function finish_ir!(@nospecialize(job::CompilerJob{PTXCompilerTarget}), - mod::LLVM.Module, entry::LLVM.Function) - @dispose pm=ModulePassManager() begin - add!(pm, ModulePass("LowerTrap", lower_trap!)) - add!(pm, FunctionPass("LowerUnreachable", lower_unreachable!)) - - run!(pm, mod) + mod::LLVM.Module, entry::LLVM.Function) + lower_trap!(mod) + for f in functions(mod) + lower_unreachable!(f) end if job.config.kernel diff --git a/src/rtlib.jl b/src/rtlib.jl index 7d0000ac..f0e8214f 100644 --- a/src/rtlib.jl +++ b/src/rtlib.jl @@ -64,9 +64,13 @@ function emit_function!(mod, config::CompilerConfig, f, method) end # recent Julia versions include prototypes for all runtime functions, even if unused - @dispose pm=ModulePassManager() begin - strip_dead_prototypes!(pm) - run!(pm, new_mod) + if use_newpm + run!(StripDeadPrototypesPass(), new_mod) + else + @dispose pm=ModulePassManager() begin + strip_dead_prototypes!(pm) + run!(pm, new_mod) + end end temp_name = LLVM.name(meta.entry) diff --git a/src/spirv.jl b/src/spirv.jl index 912776a9..0ba30dae 100644 --- a/src/spirv.jl +++ b/src/spirv.jl @@ -82,17 +82,13 @@ end # The SPIRV Tools don't handle Julia's debug info, rejecting DW_LANG_Julia... 
strip_debuginfo!(mod) - @dispose pm=ModulePassManager() begin - # SPIR-V does not support trap, and has no mechanism to abort compute kernels - # (OpKill is only available in fragment execution mode) - add!(pm, ModulePass("RemoveTrap", rm_trap!)) + # SPIR-V does not support trap, and has no mechanism to abort compute kernels + # (OpKill is only available in fragment execution mode) + rm_trap!(mod) - # the LLVM to SPIR-V translator does not support the freeze instruction - # (SPIRV-LLVM-Translator#1140) - add!(pm, ModulePass("RemoveFreeze", rm_freeze!)) - - run!(pm, mod) - end + # the LLVM to SPIR-V translator does not support the freeze instruction + # (SPIRV-LLVM-Translator#1140) + rm_freeze!(mod) # translate to SPIR-V