Don't allow reinterprets that would expose padding

Keno · Keno · commit 4d6e516844d9 · 2018-06-29T21:47:21.000-04:00
In #25908 it was noted that reinterpreting structures with paddings exposes undef LLVM values to user code. This is problematic, because an LLVM undef value is quite dangerous (it can have a different value at every use, e.g. for `a::Bool` undef, we can have `a || !a == true`. There are proposal in LLVM to create values that are merely arbitrary (but the same at every use), but that capability does not currently exist in LLVM. As such, we should try hard to prevent `undef` showing up in a user-visible way. There are several ways to fix this: 1. Wait until LLVM comes up with a safer `undef` and have the value merely be arbitrary, but not dangerous. 2. Always guarantee that padding bytes will be 0. 3. For contiguous-memory arrays, guarantee that we end up with the underlying bytes from that array. However, for now, I think don't think we should make a choice here. Issues like #21912, may play into the consideration, and I think we should be able to reserve making a choice until that point. So what this PR does is only allow reinterprets when they would not expose padding. This should hopefully cover the most common use cases of reinterpret: - Reinterpreting a vector or matrix of values to StaticVectors of the same element type. These should generally always have compatiable padding (if not, reinterpret was likely the wrong API to use). - Reinterpreting from a Vector{UInt8} to a vector of structs (that may have padding). This PR allows this for reading (but not for writing). Both cases are generally better served by the IO APIs, but hopefully this should still allow the common cases. Fixes #25908
diff --git a/base/atomics.jl b/base/atomics.jl
@@ -337,7 +337,7 @@ inttype(::Type{Float32}) = Int32
 inttype(::Type{Float64}) = Int64
 
 
-alignment(::Type{T}) where {T} = ccall(:jl_alignment, Cint, (Csize_t,), sizeof(T))
+gc_alignment(::Type{T}) where {T} = ccall(:jl_alignment, Cint, (Csize_t,), sizeof(T))
 
 # All atomic operations have acquire and/or release semantics, depending on
 # whether the load or store values. Most of the time, this is what one wants
@@ -350,13 +350,13 @@ for typ in atomictypes
     @eval getindex(x::Atomic{$typ}) =
         llvmcall($"""
                  %ptr = inttoptr i$WORD_SIZE %0 to $lt*
-                 %rv = load atomic $rt %ptr acquire, align $(alignment(typ))
+                 %rv = load atomic $rt %ptr acquire, align $(gc_alignment(typ))
                  ret $lt %rv
                  """, $typ, Tuple{Ptr{$typ}}, unsafe_convert(Ptr{$typ}, x))
     @eval setindex!(x::Atomic{$typ}, v::$typ) =
         llvmcall($"""
                  %ptr = inttoptr i$WORD_SIZE %0 to $lt*
-                 store atomic $lt %1, $lt* %ptr release, align $(alignment(typ))
+                 store atomic $lt %1, $lt* %ptr release, align $(gc_alignment(typ))
                  ret void
                  """, Cvoid, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v)
 
diff --git a/base/iterators.jl b/base/iterators.jl
@@ -1030,6 +1030,9 @@ mutable struct Stateful{T, VS}
     # A bit awkward right now, but adapted to the new iteration protocol
     nextvalstate::Union{VS, Nothing}
     taken::Int
+    @inline function Stateful{<:Any, Any}(itr::T) where {T}
+        new{T, Any}(itr, iterate(itr), 0)
+    end
     @inline function Stateful(itr::T) where {T}
         VS = approx_iter_type(T)
         new{T, VS}(itr, iterate(itr)::VS, 0)
diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl
@@ -7,6 +7,8 @@ the first dimension.
 """
 struct ReinterpretArray{T,N,S,A<:AbstractArray{S, N}} <: AbstractArray{T, N}
     parent::A
+    readable::Bool
+    writable::Bool
     global reinterpret
     function reinterpret(::Type{T}, a::A) where {T,N,S,A<:AbstractArray{S, N}}
         function throwbits(::Type{S}, ::Type{T}, ::Type{U}) where {S,T,U}
@@ -31,10 +33,32 @@ struct ReinterpretArray{T,N,S,A<:AbstractArray{S, N}} <: AbstractArray{T, N}
             dim = size(a)[1]
             rem(dim*sizeof(S),sizeof(T)) == 0 || thrownonint(S, T, dim)
         end
-        new{T, N, S, A}(a)
+        readable = array_subpadding(T, S)
+        writable = array_subpadding(S, T)
+        new{T, N, S, A}(a, readable, writable)
     end
 end
 
+function check_readable(a::ReinterpretArray{T, N, S} where N) where {T,S}
+    # See comment in check_writable
+    if !a.readable && !array_subpadding(T, S)
+        throw(PaddingError(T, S))
+    end
+end
+
+function check_writable(a::ReinterpretArray{T, N, S} where N) where {T,S}
+    # `array_subpadding` is relatively expensive (compared to a simple arrayref),
+    # so it is cached in the array. However, it is computable at compile time if,
+    # inference has the types available. By using this form of the check, we can
+    # get the best of both worlds for the success case. If the types were not
+    # available to inference, we simply need to check the field (relatively cheap)
+    # and if they were we should be able to fold this check away entirely.
+    if !a.writable && !array_subpadding(S, T)
+        throw(PaddingError(T, S))
+    end
+end
+
+
 parent(a::ReinterpretArray) = a.parent
 dataids(a::ReinterpretArray) = dataids(a.parent)
 
@@ -51,6 +75,7 @@ unsafe_convert(::Type{Ptr{T}}, a::ReinterpretArray{T,N,S} where N) where {T,S} =
 @inline @propagate_inbounds getindex(a::ReinterpretArray) = a[1]
 
 @inline @propagate_inbounds function getindex(a::ReinterpretArray{T,N,S}, inds::Vararg{Int, N}) where {T,N,S}
+    check_readable(a)
     # Make sure to match the scalar reinterpret if that is applicable
     if sizeof(T) == sizeof(S) && (fieldcount(T) + fieldcount(S)) == 0
         return reinterpret(T, a.parent[inds...])
@@ -85,6 +110,7 @@ end
 @inline @propagate_inbounds setindex!(a::ReinterpretArray, v) = (a[1] = v)
 
 @inline @propagate_inbounds function setindex!(a::ReinterpretArray{T,N,S}, v, inds::Vararg{Int, N}) where {T,N,S}
+    check_writable(a)
     v = convert(T, v)::T
     # Make sure to match the scalar reinterpret if that is applicable
     if sizeof(T) == sizeof(S) && (fieldcount(T) + fieldcount(S)) == 0
@@ -136,3 +162,97 @@ end
     end
     return a
 end
+
+# Padding
+struct Padding
+    offset::Int
+    size::Int
+end
+function intersect(p1::Padding, p2::Padding)
+    start = max(p1.offset, p2.offset)
+    stop = min(p1.offset + p1.size, p2.offset + p2.size)
+    Padding(start, max(0, stop-start))
+end
+
+struct PaddingError
+    S::Type
+    T::Type
+end
+
+function showerror(io::IO, p::PaddingError)
+    print(io, "Padding of type $(p.S) is not compatible with type $(p.T).")
+end
+
+"""
+    CyclePadding(padding, total_size)
+
+Cylces an iterator of `Padding` structs, restarting the padding at `total_size`.
+E.g. if `padding` is all the padding in a struct and `total_size` is the total
+aligned size of that array, `CyclePadding` will correspond to the padding in an
+infinite vector of such structs.
+"""
+struct CyclePadding{P}
+    padding::P
+    total_size::Int
+end
+eltype(::Type{<:CyclePadding}) = Padding
+IteratorSize(::Type{<:CyclePadding}) = IsInfinite()
+isempty(cp::CyclePadding) = isempty(cp.padding)
+function iterate(cp::CyclePadding)
+    y = iterate(cp.padding)
+    y === nothing && return nothing
+    y[1], (0, y[2])
+end
+function iterate(cp::CyclePadding, state::Tuple)
+    y = iterate(cp.padding, tail(state)...)
+    y === nothing && return iterate(cp, (state[1]+cp.total_size,))
+    Padding(y[1].offset+state[1], y[1].size), (state[1], tail(y)...)
+end
+
+"""
+    Compute the location of padding in a type.
+"""
+function padding(T)
+    padding = Padding[]
+    last_end::Int = 0
+    for i = 1:fieldcount(T)
+        offset = fieldoffset(T, i)
+        fT = fieldtype(T, i)
+        if offset != last_end
+            push!(padding, Padding(offset, offset-last_end))
+        end
+        last_end = offset + sizeof(fT)
+    end
+    padding
+end
+
+function CyclePadding(T::DataType)
+    a, s = datatype_alignment(T), sizeof(T)
+    as = s + (a - (s % a)) % a
+    pad = padding(T)
+    s != as && push!(pad, Padding(s, as - s))
+    CyclePadding(pad, as)
+end
+
+using .Iterators: Stateful
+@pure function array_subpadding(S, T)
+    checked_size = 0
+    lcm_size = lcm(sizeof(S), sizeof(T))
+    s, t = Stateful{<:Any, Any}(CyclePadding(S)),
+           Stateful{<:Any, Any}(CyclePadding(T))
+    isempty(t) && return true
+    isempty(s) && return false
+    while checked_size < lcm_size
+        # Take padding in T
+        pad = popfirst!(t)
+        # See if there's corresponding padding in S
+        while true
+            ps = peek(s)
+            ps.offset > pad.offset && return false
+            intersect(ps, pad) == pad && break
+            popfirst!(s)
+        end
+        checked_size = pad.offset + pad.size
+    end
+    return true
+end
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -137,8 +137,6 @@ include("array.jl")
 include("abstractarray.jl")
 include("subarray.jl")
 include("views.jl")
-include("reinterpretarray.jl")
-
 
 # ## dims-type-converting Array constructors for convenience
 # type and dimensionality specified, accepting dims as series of Integers
@@ -205,6 +203,7 @@ include("reduce.jl")
 
 ## core structures
 include("reshapedarray.jl")
+include("reinterpretarray.jl")
 include("bitarray.jl")
 include("bitset.jl")
 
diff --git a/test/reinterpretarray.jl b/test/reinterpretarray.jl
@@ -49,3 +49,22 @@ let A = collect(reshape(1:20, 5, 4))
     @test view(R, :, :) isa StridedArray
     @test reshape(R, :) isa StridedArray
 end
+
+# Error on reinterprets that would expose padding
+struct S1
+    a::Int8
+    b::Int64
+end
+
+struct S2
+    a::Int16
+    b::Int64
+end
+
+A1 = S1[S1(0, 0)]
+A2 = S2[S2(0, 0)]
+@test reinterpret(S1, A2)[1] == S1(0, 0)
+@test_throws Base.PaddingError (reinterpret(S1, A2)[1] = S2(1, 2))
+@test_throws Base.PaddingError reinterpret(S2, A1)[1]
+reinterpret(S2, A1)[1] = S2(1, 2)
+@test A1[1] == S1(1, 2)