diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml index ba942b23..bb6f585a 100644 --- a/.JuliaFormatter.toml +++ b/.JuliaFormatter.toml @@ -1,2 +1,2 @@ style="blue" -format_markdown=true \ No newline at end of file +format_markdown=true diff --git a/HISTORY.md b/HISTORY.md index 1697cd36..6f9a641c 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,3 +1,9 @@ +# 0.15.13 + +Exports extra functionality that should probably have been exported, namely `ordered`, `isinvertible`, and `columnwise`, from Bijectors.jl + +The docs have been thoroughly restructured. + # 0.15.12 Improved implementation of the Enzyme rule for `Bijectors.find_alpha`. diff --git a/Project.toml b/Project.toml index cb5d3a99..3eba34fa 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Bijectors" uuid = "76274a88-744f-5084-9051-94815aaf08c4" -version = "0.15.12" +version = "0.15.13" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" diff --git a/README.md b/README.md index 7155a11e..f6fea731 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,35 @@ # Bijectors.jl -[![Docs - Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://turinglang.github.io/Bijectors.jl/stable) -[![Docs - Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://turinglang.github.io/Bijectors.jl/dev) -[![Interface tests](https://github.com/TuringLang/Bijectors.jl/workflows/Interface%20tests/badge.svg?branch=main)](https://github.com/TuringLang/Bijectors.jl/actions?query=workflow%3A%22Interface+tests%22+branch%3Amain) -[![AD tests](https://github.com/TuringLang/Bijectors.jl/workflows/AD%20tests/badge.svg?branch=main)](https://github.com/TuringLang/Bijectors.jl/actions?query=workflow%3A%22AD+tests%22+branch%3Amain) +[![Documentation for latest stable release](https://img.shields.io/badge/docs-stable-blue.svg)](https://turinglang.github.io/Bijectors.jl) +[![Documentation for development 
version](https://img.shields.io/badge/docs-dev-blue.svg)](https://turinglang.github.io/Bijectors.jl/dev) +[![CI](https://github.com/TuringLang/Bijectors.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/TuringLang/Bijectors.jl/actions/workflows/CI.yml) -*A package for transforming distributions, used by [Turing.jl](https://github.com/TuringLang/Turing.jl).* +Bijectors.jl implements functions for transforming random variables and probability distributions. -Bijectors.jl implements both an interface for transforming distributions from Distributions.jl and many transformations needed in this context. -This package is used heavily in the probabilistic programming language Turing.jl. +A quick overview of some of the key functionality is provided below: -See the [documentation](https://turinglang.github.io/Bijectors.jl) for more. +```julia +julia> using Bijectors; + dist = LogNormal(); +LogNormal{Float64}(μ=0.0, σ=1.0) -## Do you want to contribute? +julia> x = rand(dist) # Constrained to (0, ∞) +0.6471106974390148 -If you feel you have some relevant skills and are interested in contributing, please get in touch! -You can find us in the #turing channel on the [Julia Slack](https://julialang.org/slack/) or [Discourse](https://discourse.julialang.org). -If you're having any problems, please open a Github issue, even if the problem seems small (like help figuring out an error message). -Every issue you open helps us to improve the library! +julia> b = bijector(dist) # This maps from (0, ∞) to ℝ +(::Base.Fix1{typeof(broadcast), typeof(log)}) (generic function with 1 method) + +julia> y = b(x) # Unconstrained value in ℝ +-0.43523790570180304 + +julia> # Log-absolute determinant of the Jacobian at x. + with_logabsdet_jacobian(b, x) +(-0.43523790570180304, 0.43523790570180304) +``` + +Please see the [documentation](https://turinglang.github.io/Bijectors.jl) for more information. 
+ +## Get in touch + +If you have any questions, please feel free to [post on Julia Slack](https://julialang.slack.com/archives/CCYDC34A0) or [Discourse](https://discourse.julialang.org/). +We also very much welcome GitHub issues or pull requests! diff --git a/docs/Project.toml b/docs/Project.toml index 63604fda..3b0979c9 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,6 +1,7 @@ [deps] Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" diff --git a/docs/make.jl b/docs/make.jl index 80ad43bf..d71d1031 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,10 +9,13 @@ makedocs(; format=Documenter.HTML(), modules=[Bijectors], pages=[ - "Home" => "index.md", - "Transforms" => "transforms.md", - "Distributions.jl integration" => "distributions.md", - "Examples" => "examples.md", + "index.md", + "interface.md", + "defining.md", + "distributions.md", + "types.md", + "advi.md", + "flows.md", ], checkdocs=:exports, doctest=false, diff --git a/docs/src/advi.md b/docs/src/advi.md new file mode 100644 index 00000000..60e305a8 --- /dev/null +++ b/docs/src/advi.md @@ -0,0 +1,94 @@ +# Example: Variational inference + +The real utility of `TransformedDistribution` becomes more apparent when using `transformed(dist, b)` for any bijector `b`. +To get the transformed distribution corresponding to the `Beta(2, 2)`, we called `transformed(dist)` before. +This is an alias for `transformed(dist, bijector(dist))`. +Remember `bijector(dist)` returns the constrained-to-unconstrained bijector for that particular `Distribution`. +But we can of course construct a `TransformedDistribution` using different bijectors with the same `dist`. + +This is particularly useful in _Automatic Differentiation Variational Inference (ADVI)_. 
+ +## Univariate ADVI + +An important part of ADVI is to approximate a constrained distribution, e.g. `Beta`, as follows: + + 1. Sample `x` from a `Normal` with parameters `μ` and `σ`, i.e. `x ~ Normal(μ, σ)`. + 2. Transform `x` to `y` s.t. `y ∈ support(Beta)`, with the transform being a differentiable bijection with a differentiable inverse (a "bijector"). + +This then defines a probability density with the same _support_ as `Beta`! +Of course, it's unlikely that it will be the same density, but it's an _approximation_. + +Creating such a distribution can be done with `Bijector` and `TransformedDistribution`: + +```@example advi +using Bijectors +using StableRNGs: StableRNG +rng = StableRNG(42) + +dist = Beta(2, 2) +b = bijector(dist) # (0, 1) → ℝ +b⁻¹ = inverse(b) # ℝ → (0, 1) +td = transformed(Normal(), b⁻¹) # x ∼ 𝓝(0, 1) then b(x) ∈ (0, 1) +x = rand(rng, td) # ∈ (0, 1) +``` + +It's worth noting that `support(Beta)` is the _closed_ interval `[0, 1]`, while the constrained-to-unconstrained bijection, `Logit` in this case, is only well-defined as a map `(0, 1) → ℝ` for the _open_ interval `(0, 1)`. +This is of course not an implementation detail. +`ℝ` is itself open, thus no continuous bijection exists from a _closed_ interval to `ℝ`. +But since the boundaries of a closed interval have what's known as measure zero, this doesn't end up affecting the resulting density with support on the entire real line. +In practice, this means that + +```@example advi +td = transformed(Beta()) +inverse(td.transform)(rand(rng, td)) +``` + +will never result in `0` or `1` though any sample arbitrarily close to either `0` or `1` is possible. +_Disclaimer: numerical accuracy is limited, so you might still see `0` and `1` if you're 'lucky'._ + +## Multivariate ADVI example + +We can also do _multivariate_ ADVI using the `Stacked` bijector. +`Stacked` gives us a way to combine univariate and/or multivariate bijectors into a single multivariate bijector. 
+Say you have a vector `x` of length 2 and you want to transform the first entry using `Exp` and the second entry using `Log`. +`Stacked` gives you an easy and efficient way of representing such a bijector. + +```@example advi +using Bijectors: SimplexBijector + +# Original distributions +dists = (Beta(), InverseGamma(), Dirichlet(2, 3)) + +# Construct the corresponding ranges +function make_ranges(dists) + ranges = [] + idx = 1 + for i in 1:length(dists) + d = dists[i] + push!(ranges, idx:(idx + length(d) - 1)) + idx += length(d) + end + return ranges +end + +ranges = make_ranges(dists) +ranges +``` + +```@example advi +# Base distribution; mean-field normal +num_params = ranges[end][end] + +d = MvNormal(zeros(num_params), ones(num_params)); + +# Construct the transform +bs = bijector.(dists) # constrained-to-unconstrained bijectors for dists +ibs = inverse.(bs) # invert, so we get unconstrained-to-constrained +sb = Stacked(ibs, ranges) # => Stacked <: Bijector + +# Mean-field normal with unconstrained-to-constrained stacked bijector +td = transformed(d, sb) +y = rand(td) +``` + +As can be seen from this, we now have a `y` for which `0.0 ≤ y[1] ≤ 1.0`, `0.0 < y[2]`, and `sum(y[3:4]) ≈ 1.0`. diff --git a/docs/src/defining.md b/docs/src/defining.md new file mode 100644 index 00000000..cd020dff --- /dev/null +++ b/docs/src/defining.md @@ -0,0 +1,94 @@ +# Defining a bijector + +This page describes the minimum expected interface to implement a bijector. + +In general, there are two pieces of information needed to define a bijector: + + 1. The transformation itself, i.e., the map $b: \mathbb{R}^d \to \mathbb{R}^d$. + + 2. The log-absolute determinant of the Jacobian of that transformation. 
+ For a transformation $b: \mathbb{R}^d \to \mathbb{R}^d$, the Jacobian at point $x \in \mathbb{R}^d$ is defined as: + + $$J_{b}(x) = \begin{bmatrix} + \partial y_1/\partial x_1 & \partial y_1/\partial x_2 & \cdots & \partial y_1/\partial x_d \\ + \partial y_2/\partial x_1 & \partial y_2/\partial x_2 & \cdots & \partial y_2/\partial x_d \\ + \vdots & \vdots & \ddots & \vdots \\ + \partial y_d/\partial x_1 & \partial y_d/\partial x_2 & \cdots & \partial y_d/\partial x_d + \end{bmatrix}$$ + + where $y = b(x)$. + +## The transform itself + +The most efficient way to implement a bijector is to provide an implementation of: + +```@docs; canonical=false +Bijectors.with_logabsdet_jacobian +``` + +If you define `with_logabsdet_jacobian(b, x)`, then you will automatically get default implementations of both `transform(b, x)` and `logabsdetjac(b, x)`, which respectively return the first and second value of that tuple. +So, in fact, you can implement a bijector by defining only `with_logabsdet_jacobian`. + +If you prefer, you can implement `transform` and `logabsdetjac` separately, as described below. +Having manual implementations of these may also be useful if you expect either to be used heavily without the other. + +### Transformation + +```@docs; canonical=false +transform +``` + +If `transform(b, x)` is defined, then you will automatically get a default implementation of `b(x)` which calls that. + +### Log-absolute determinant of the Jacobian + +```@docs; canonical=false +Bijectors.logabsdetjac +``` + +## Inverse + +Often you will want to define an inverse bijector as well. +To do so, you will have to implement: + +```@docs; canonical=false +Bijectors.inverse +``` + +If `b` is a bijector, then `inverse(b)` should return the inverse bijector $b^{-1}$. + +If your bijector subtypes `Bijectors.Bijector`, then you will get a default implementation of `inverse` which constructs `Bijectors.Inverse(b)`. +This may be easier than creating a second type for the inverse bijector. 
+Note that you will also need to implement the methods for `with_logabsdet_jacobian` (and/or `transform` and `logabsdetjac`) for the inverse bijector type. + +If your bijector is not invertible, you can specify this here: + +```@docs; canonical=false +Bijectors.isinvertible +``` + +## Distributions + +If your bijector is intended for use with a distribution, i.e., it transforms random variables drawn from that distribution to Euclidean space, then you should also implement: + +```@docs; canonical=false +Bijectors.bijector +``` + +which should return your bijector. + +On top of that, you should also implement a method for `Bijectors.output_size(b, dist::Distribution)`: + +```@docs; canonical=false +Bijectors.output_size +``` + +## Closed-form + +If your bijector does _not_ have a closed-form expression (e.g. if it uses an iterative procedure), then this should be set to false: + +```@docs; canonical=false +Bijectors.isclosedform +``` + +The default is `true` so you only need to set this if your bijector is not closed-form. diff --git a/docs/src/distributions.md b/docs/src/distributions.md index a1135516..57a62b6a 100644 --- a/docs/src/distributions.md +++ b/docs/src/distributions.md @@ -1,55 +1,78 @@ -## Basic usage +# Usage with distributions -Other than the `logpdf_with_trans` methods, the package also provides a more composable interface through the `Bijector` types. Consider for example the one from above with `Beta(2, 2)`. +Bijectors provides many utilities for working with probability distributions. -```julia -julia> using Random; - Random.seed!(42); +```@example distributions +using Bijectors + +dist = LogNormal() +x = rand(dist) +b = bijector(dist) # bijection (0, ∞) → ℝ + +y = b(x) +``` -julia> using Bijectors; - using Bijectors: Logit; +Here, `bijector(d::Distribution)` returns the corresponding constrained-to-unconstrained bijection for `LogNormal`, which is a log function. 
+The resulting bijector can be called, just like any other function, to transform samples from the distribution to the unconstrained space. -julia> dist = Beta(2, 2) -Beta{Float64}(α=2.0, β=2.0) +The function [`link`](@ref) provides a short way of doing the above: -julia> x = rand(dist) -0.36888689965963756 +```@example distributions +link(dist, x) ≈ b(x) +``` + +See [the Turing.jl docs](https://turinglang.org/docs/developers/transforms/distributions/) for more information about how this is used in probabilistic programming. + +## Transforming distributions -julia> b = bijector(dist) # bijection (0, 1) → ℝ -Logit{Float64}(0.0, 1.0) +We can also couple a distribution together with its bijector to create a _transformed_ `Distribution`, i.e. a `Distribution` defined by sampling from a given `Distribution` and then transforming using a given transformation: -julia> y = b(x) --0.5369949942509267 +```@example distributions +dist = LogNormal() # support on (0, ∞) +tdist = transformed(dist) # support on ℝ ``` -In this case we see that `bijector(d::Distribution)` returns the corresponding constrained-to-unconstrained bijection for `Beta`, which indeed is a `Logit` with `a = 0.0` and `b = 1.0`. The resulting `Logit <: Bijector` has a method `(b::Logit)(x)` defined, allowing us to call it just like any other function. Comparing with the above example, `b(x) ≈ link(dist, x)`. Just to convince ourselves: +We can then sample from, and compute the `logpdf` for, the resulting distribution: + +```@example distributions +y = rand(tdist) +``` + +```@example distributions +logpdf(tdist, y) +``` + +We should expect here that ```julia -julia> b(x) ≈ link(dist, x) -true +logpdf(tdist, y) ≈ logpdf(dist, x) - logabsdetjac(b, x) ``` -## Transforming distributions +where `b = bijector(dist)` and `y = b(x)`. 
-```@setup transformed-dist-simple -using Bijectors +To verify this, we can calculate the value of `x` using the inverse bijector: + +```@example distributions +b = bijector(dist) +binv = inverse(b) + +x = binv(y) ``` -We can create a _transformed_ `Distribution`, i.e. a `Distribution` defined by sampling from a given `Distribution` and then transforming using a given transformation: +(Because `b` is just a log function, `binv` is an exponential function, i.e. `x = exp(y)`.) -```@repl transformed-dist-simple -dist = Beta(2, 2) # support on (0, 1) -tdist = transformed(dist) # support on ℝ +Then we can check the equality: -tdist isa UnivariateDistribution +```@example distributions +logpdf(tdist, y) ≈ logpdf(dist, x) - logabsdetjac(b, x) ``` -We can the then compute the `logpdf` for the resulting distribution: +You can also use [`Bijectors.logpdf_with_trans`](@ref) with the original distribution: -```@repl transformed-dist-simple -# Some example values -x = rand(dist) -y = tdist.transform(x) +```@example distributions +logpdf_with_trans(dist, x, false) ≈ logpdf(dist, x) +``` -logpdf(tdist, y) +```@example distributions +logpdf_with_trans(dist, x, true) ≈ logpdf(tdist, y) ``` diff --git a/docs/src/examples.md b/docs/src/examples.md deleted file mode 100644 index 368d4bed..00000000 --- a/docs/src/examples.md +++ /dev/null @@ -1,163 +0,0 @@ -```@setup advi -using Bijectors -``` - -## Univariate ADVI example - -But the real utility of `TransformedDistribution` becomes more apparent when using `transformed(dist, b)` for any bijector `b`. To get the transformed distribution corresponding to the `Beta(2, 2)`, we called `transformed(dist)` before. This is simply an alias for `transformed(dist, bijector(dist))`. Remember `bijector(dist)` returns the constrained-to-constrained bijector for that particular `Distribution`. But we can of course construct a `TransformedDistribution` using different bijectors with the same `dist`. 
This is particularly useful in something called _Automatic Differentiation Variational Inference (ADVI)_.[2] An important part of ADVI is to approximate a constrained distribution, e.g. `Beta`, as follows: - - 1. Sample `x` from a `Normal` with parameters `μ` and `σ`, i.e. `x ~ Normal(μ, σ)`. - 2. Transform `x` to `y` s.t. `y ∈ support(Beta)`, with the transform being a differentiable bijection with a differentiable inverse (a "bijector") - -This then defines a probability density with same _support_ as `Beta`! Of course, it's unlikely that it will be the same density, but it's an _approximation_. Creating such a distribution becomes trivial with `Bijector` and `TransformedDistribution`: - -```@repl advi -using StableRNGs: StableRNG -rng = StableRNG(42); -dist = Beta(2, 2) -b = bijector(dist) # (0, 1) → ℝ -b⁻¹ = inverse(b) # ℝ → (0, 1) -td = transformed(Normal(), b⁻¹) # x ∼ 𝓝(0, 1) then b(x) ∈ (0, 1) -x = rand(rng, td) # ∈ (0, 1) -``` - -It's worth noting that `support(Beta)` is the _closed_ interval `[0, 1]`, while the constrained-to-unconstrained bijection, `Logit` in this case, is only well-defined as a map `(0, 1) → ℝ` for the _open_ interval `(0, 1)`. This is of course not an implementation detail. `ℝ` is itself open, thus no continuous bijection exists from a _closed_ interval to `ℝ`. But since the boundaries of a closed interval has what's known as measure zero, this doesn't end up affecting the resulting density with support on the entire real line. In practice, this means that - -```@repl advi -td = transformed(Beta()) -inverse(td.transform)(rand(rng, td)) -``` - -will never result in `0` or `1` though any sample arbitrarily close to either `0` or `1` is possible. _Disclaimer: numerical accuracy is limited, so you might still see `0` and `1` if you're lucky._ - -## Multivariate ADVI example - -We can also do _multivariate_ ADVI using the `Stacked` bijector. 
`Stacked` gives us a way to combine univariate and/or multivariate bijectors into a singe multivariate bijector. Say you have a vector `x` of length 2 and you want to transform the first entry using `Exp` and the second entry using `Log`. `Stacked` gives you an easy and efficient way of representing such a bijector. - -```@repl advi -using Bijectors: SimplexBijector - -# Original distributions -dists = (Beta(), InverseGamma(), Dirichlet(2, 3)); - -# Construct the corresponding ranges -ranges = []; -idx = 1; - -for i in 1:length(dists) - d = dists[i] - push!(ranges, idx:(idx + length(d) - 1)) - - global idx - idx += length(d) -end; - -ranges - -# Base distribution; mean-field normal -num_params = ranges[end][end] - -d = MvNormal(zeros(num_params), ones(num_params)); - -# Construct the transform -bs = bijector.(dists); # constrained-to-unconstrained bijectors for dists -ibs = inverse.(bs); # invert, so we get unconstrained-to-constrained -sb = Stacked(ibs, ranges) # => Stacked <: Bijector - -# Mean-field normal with unconstrained-to-constrained stacked bijector -td = transformed(d, sb); -y = rand(td) -0.0 ≤ y[1] ≤ 1.0 -0.0 < y[2] -sum(y[3:4]) ≈ 1.0 -``` - -## Normalizing flows - -A very interesting application is that of _normalizing flows_.[1] Usually this is done by sampling from a multivariate normal distribution, and then transforming this to a target distribution using invertible neural networks. Currently there are two such transforms available in Bijectors.jl: `PlanarLayer` and `RadialLayer`. Let's create a flow with a single `PlanarLayer`: - -```@setup normalizing-flows -using Bijectors -using StableRNGs: StableRNG -rng = StableRNG(42); -``` - -```@repl normalizing-flows -d = MvNormal(zeros(2), ones(2)); -b = PlanarLayer(2) -flow = transformed(d, b) -flow isa MultivariateDistribution -``` - -That's it. Now we can sample from it using `rand` and compute the `logpdf`, like any other `Distribution`. 
- -```@repl normalizing-flows -y = rand(rng, flow) -logpdf(flow, y) # uses inverse of `b` -``` - -Similarily to the multivariate ADVI example, we could use `Stacked` to get a _bounded_ flow: - -```@repl normalizing-flows -d = MvNormal(zeros(2), ones(2)); -ibs = inverse.(bijector.((InverseGamma(2, 3), Beta()))); -sb = Stacked(ibs) # == Stacked(ibs, [i:i for i = 1:length(ibs)] -b = sb ∘ PlanarLayer(2) -td = transformed(d, b); -y = rand(rng, td) -0 < y[1] -0 ≤ y[2] ≤ 1 -``` - -Want to fit the flow? - -```@repl normalizing-flows -using ForwardDiff - -# Construct the flow. -b = PlanarLayer(2) - -# Convenient for extracting parameters and reconstructing the flow. -using Functors -θs, reconstruct = Functors.functor(b); - -# Make the objective a `struct` to avoid capturing global variables. -struct NLLObjective{R,D,T} - reconstruct::R - basedist::D - data::T -end - -function (obj::NLLObjective)(θs) - transformed_dist = transformed(obj.basedist, obj.reconstruct(θs)) - return -sum(Base.Fix1(logpdf, transformed_dist), eachcol(obj.data)) -end - -# Some random data to estimate the density of. -xs = randn(2, 1000); - -# Construct the objective. -f = NLLObjective(reconstruct, MvNormal(2, 1), xs); - -# Initial loss. -@info "Initial loss: $(f(θs))" - -# Train using gradient descent. -ε = 1e-3; -for i in 1:100 - ∇s = ForwardDiff.gradient(θ -> f(θ), θs) - θs = fmap(θs, ∇s) do θ, ∇ - θ - ε .* ∇ - end -end - -# Final loss -@info "Final loss: $(f(θs))" - -# Very simple check to see if we learned something useful. -samples = rand(transformed(f.basedist, f.reconstruct(θs)), 1000); -mean(eachcol(samples)) # ≈ [0, 0] -cov(samples; dims=2) # ≈ I -``` - -We can easily create more complex flows by simply doing `PlanarLayer(10) ∘ PlanarLayer(10) ∘ RadialLayer(10)` and so on. 
diff --git a/docs/src/flows.md b/docs/src/flows.md new file mode 100644 index 00000000..73d37a5e --- /dev/null +++ b/docs/src/flows.md @@ -0,0 +1,115 @@ +# Example: Normalizing flows + +A very interesting application of bijectors is in _normalizing flows_. +Usually this is done by sampling from a multivariate normal distribution, and then transforming this to a target distribution using invertible neural networks. +Currently there are two such transforms available in Bijectors.jl: `PlanarLayer` and `RadialLayer`. +Let's create a flow with a single `PlanarLayer`: + +```@example normalizing-flows +using Bijectors +using StableRNGs: StableRNG +rng = StableRNG(42) + +d = MvNormal(zeros(2), ones(2)) +b = PlanarLayer(2) +flow = transformed(d, b) +``` + +`flow` is itself a multivariate distribution, so we can sample from it using `rand` and compute the `logpdf`, like any other `Distribution`. + +```@example normalizing-flows +y = rand(rng, flow) +logpdf(flow, y) # uses inverse of `b` +``` + +Similarly to the multivariate ADVI example, we could use `Stacked` to get a _bounded_ flow: + +```@example normalizing-flows +d = MvNormal(zeros(2), ones(2)); +ibs = inverse.(bijector.((InverseGamma(2, 3), Beta()))); +sb = Stacked(ibs) # == Stacked(ibs, [i:i for i = 1:length(ibs)]) +b = sb ∘ PlanarLayer(2) +td = transformed(d, b); +y = rand(rng, td) +``` + +(As required, we have that `0 < y[1]` and `0 ≤ y[2] ≤ 1`.) + +To fit the flow, we can define an objective function that computes the negative log-likelihood of some data. +We will need to use automatic differentiation to compute gradients of the objective with respect to the parameters. +Since most AD packages require vectorised inputs, this means we also need a way to convert between the vectorised parameters and the `PlanarLayer` struct. + +```@example normalizing-flows +using ForwardDiff + +# Construct the flow. +b = PlanarLayer(2) + +# Obtain a vectorised version of the parameters. 
+xs_init = vcat(b.w, b.u, b.b) + +# Function to reconstruct the `PlanarLayer` from vectorised parameters. +function reconstruct_planarlayer(xs::AbstractVector) + dim = 2 + w = xs[1:dim] + u = xs[(dim + 1):(2 * dim)] + b = xs[end:end] + return PlanarLayer(w, u, b) +end + +# Check that the reconstruction does work... +reconstruct_planarlayer(xs_init) == b +``` + +Here is the objective function: + +```@example normalizing-flows +# Make the objective a `struct` to avoid capturing global variables. +struct NLLObjective{R,D,T} + reconstruct::R + basedist::D + data::T +end + +function (obj::NLLObjective)(xs::AbstractVector) + transformed_dist = transformed(obj.basedist, obj.reconstruct(xs)) + return -sum(Base.Fix1(logpdf, transformed_dist), eachcol(obj.data)) +end + +# Some random data to estimate the density of. +xs = randn(2, 1000) + +# Construct the objective. +f = NLLObjective(reconstruct_planarlayer, MvNormal(2, 1), xs) + +println("Initial loss = $(f(xs_init)) at xs_init = $(xs_init)") +``` + +Now we can train the flow using gradient descent: + +```@example normalizing-flows +using ForwardDiff: ForwardDiff + +function train(xs_init, niters; stepsize=1e-3) + xs = xs_init + for i in 1:niters + grad = ForwardDiff.gradient(f, xs) + @. xs = xs - (stepsize * grad) + end + return xs +end +xs_trained = train(xs_init, 1000) + +println("Final loss = $(f(xs_trained)) at xs_trained = $(xs_trained)") +``` + +Finally, we can sample from the trained flow and check that the samples have approximately zero mean and identity covariance (as expected given that our data was sampled using `randn`): + +```@example normalizing-flows +samples = rand(transformed(f.basedist, f.reconstruct(xs_trained)), 1000); + +# mean ≈ [0, 0], cov ≈ I +mean(eachcol(samples)), cov(samples; dims=2) +``` + +More complex flows can be created by composing multiple layers, e.g. `PlanarLayer(10) ∘ PlanarLayer(10) ∘ RadialLayer(10)`. 
diff --git a/docs/src/index.md b/docs/src/index.md index 78b78331..a6e8fc1b 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,42 +1,33 @@ # Bijectors.jl -This package implements a set of functions for transforming constrained random variables (e.g. simplexes, intervals) to Euclidean space. -The 3 main functions implemented in this package are the `link`, `invlink` and `logpdf_with_trans` for a number of distributions. +This package implements functionality for transforming random variables to Euclidean space (and back). -```@docs -Bijectors.link -Bijectors.invlink -Bijectors.logpdf_with_trans +For example, consider a random variable $X \sim \mathrm{Beta}(2, 2)$, which has support on $(0, 1)$: + +```@example main +using Bijectors + +x = rand(Beta(2, 2)) +``` + +In this case, the [logit function](https://en.wikipedia.org/wiki/Logit) is used as the transformation: + +$Y = \mathrm{logit}(X) = \log\left(\frac{X}{1 - X}\right).$ + +We can construct this function + +```@example main +b = bijector(Beta(2, 2)) +``` + +and apply it to `x`: + +```@example main +y = b(x) ``` -The distributions supported are: - - 1. `RealDistribution`: `Union{Cauchy, Gumbel, Laplace, Logistic, NoncentralT, Normal, NormalCanon, TDist}`, - 2. `PositiveDistribution`: `Union{BetaPrime, Chi, Chisq, Erlang, Exponential, FDist, Frechet, Gamma, InverseGamma, InverseGaussian, Kolmogorov, LogNormal, NoncentralChisq, NoncentralF, Rayleigh, Weibull}`, - 3. `UnitDistribution`: `Union{Beta, KSOneSided, NoncentralBeta}`, - 4. `SimplexDistribution`: `Union{Dirichlet}`, - 5. `PDMatDistribution`: `Union{InverseWishart, Wishart}`, and - 6. `TransformDistribution`: `Union{T, Truncated{T}} where T<:ContinuousUnivariateDistribution`. - -All exported names from the [Distributions.jl](https://juliastats.org/Distributions.jl/stable/) package are reexported from `Bijectors`. - -Bijectors.jl also provides a nice interface for working with these maps: composition, inversion, etc. 
-The following table lists mathematical operations for a bijector and the corresponding code in Bijectors.jl. - -| Operation | Method | Automatic | -|:-------------------------------------------:|:-------------------------------:|:---------:| -| `b ↦ b⁻¹` | `inverse(b)` | ✓ | -| `(b₁, b₂) ↦ (b₁ ∘ b₂)` | `b₁ ∘ b₂` | ✓ | -| `(b₁, b₂) ↦ [b₁, b₂]` | `Stacked(b₁, b₂)` | ✓ | -| `x ↦ b(x)` | `b(x)` | × | -| `y ↦ b⁻¹(y)` | `inverse(b)(y)` | × | -| `x ↦ log|det J(b, x)|` | `logabsdetjac(b, x)` | AD | -| `x ↦ b(x), log|det J(b, x)|` | `with_logabsdet_jacobian(b, x)` | ✓ | -| `p ↦ q := b_* p` | `q = transformed(p, b)` | ✓ | -| `y ∼ q` | `y = rand(q)` | ✓ | -| `p ↦ b` such that `support(b_* p) = ℝᵈ` | `bijector(p)` | ✓ | -| `(x ∼ p, b(x), log|det J(b, x)|, log q(y))` | `forward(q)` | ✓ | - -In this table, `b` denotes a `Bijector`, `J(b, x)` denotes the Jacobian of `b` evaluated at `x`, `b_*` denotes the [push-forward](https://www.wikiwand.com/en/Pushforward_measure) of `p` by `b`, and `x ∼ p` denotes `x` sampled from the distribution with density `p`. - -The "Automatic" column in the table refers to whether or not you are required to implement the feature for a custom `Bijector`. "AD" refers to the fact that it can be implemented "automatically" using automatic differentiation. +You can also obtain the log absolute determinant of the Jacobian of the transformation: + +```@example main +y, ladj = with_logabsdet_jacobian(b, x) +``` diff --git a/docs/src/interface.md b/docs/src/interface.md new file mode 100644 index 00000000..b36688ea --- /dev/null +++ b/docs/src/interface.md @@ -0,0 +1,67 @@ +# Interface + +This page describes the user-facing interface of Bijectors.jl. +You should be able to use all the functions documented here with any bijector defined in Bijectors.jl. + +## Transformation + +```@docs +transform +transform! +``` + +Bijectors are also callable objects, so `b(x)` is equivalent to `transform(b, x)`. 
+ +## Inverses + +```@docs +inverse +``` + +## Log-absolute determinant of the Jacobian + +```@docs +logabsdetjac +logabsdetjac! +logabsdetjacinv +with_logabsdet_jacobian +with_logabsdet_jacobian! +``` + +## Transform wrappers + +### Elementwise transformation + +Some transformations are well-defined for different types of inputs, e.g. `exp` can also act elementwise on an `N`-dimensional `Array{<:Real,N}`. +To specify that a transformation should act elementwise, we can wrap it in the `elementwise` wrapper: + +```@docs +Bijectors.elementwise +``` + +### Columnwise transformation + +Likewise: + +```@docs +Bijectors.columnwise +``` + +## Working with distributions + +```@docs +Bijectors.bijector +Bijectors.link +Bijectors.invlink +Bijectors.logpdf_with_trans +Bijectors.output_size +Bijectors.transformed(d::Distribution, b::Bijector) +Bijectors.ordered +``` + +## Utilities + +```@docs +Bijectors.isinvertible +Bijectors.isclosedform(t::Bijectors.Transform) +``` diff --git a/docs/src/transforms.md b/docs/src/transforms.md deleted file mode 100644 index bf09ed18..00000000 --- a/docs/src/transforms.md +++ /dev/null @@ -1,106 +0,0 @@ -## Usage - -A very simple example of a "bijector"/diffeomorphism, i.e. a differentiable transformation with a differentiable inverse, is the `exp` function: - - - The inverse of `exp` is `log`. - - The derivative of `exp` at an input `x` is simply `exp(x)`, hence `logabsdetjac` is simply `x`. - -```@repl usage -using Bijectors -transform(exp, 1.0) -logabsdetjac(exp, 1.0) -with_logabsdet_jacobian(exp, 1.0) -``` - -Some transformations are well-defined for different types of inputs, e.g. `exp` can also act elementwise on an `N`-dimensional `Array{<:Real,N}`. 
-To specify that a transformation should act elementwise, we use the [`elementwise`](@ref) method: - -```@repl usage -x = ones(2, 2) -transform(elementwise(exp), x) -logabsdetjac(elementwise(exp), x) -with_logabsdet_jacobian(elementwise(exp), x) -``` - -These methods also work nicely for compositions of transformations: - -```@repl usage -transform(elementwise(log ∘ exp), x) -``` - -Unlike `exp`, some transformations have parameters affecting the resulting transformation they represent, e.g. `Logit` has two parameters `a` and `b` representing the lower- and upper-bound, respectively, of its domain: - -```@repl usage -using Bijectors: Logit - -f = Logit(0.0, 1.0) -f(rand()) # takes us from `(0, 1)` to `(-∞, ∞)` -``` - -## User-facing methods - -Without mutation: - -```@docs -transform -logabsdetjac -logabsdetjacinv -inverse -``` - -```julia -with_logabsdet_jacobian -``` - -With mutation: - -```@docs -transform! -logabsdetjac! -with_logabsdet_jacobian! -``` - -## Implementing a transformation - -Any callable can be made into a bijector by providing an implementation of `ChangeOfVariables.with_logabsdet_jacobian(b, x)`. - -You can also optionally implement [`transform`](@ref) and [`logabsdetjac`](@ref) to avoid redundant computations. This is usually only worth it if you expect `transform` or `logabsdetjac` to be used heavily without the other. - -Similarly with the mutable versions [`with_logabsdet_jacobian!`](@ref), [`transform!`](@ref), and [`logabsdetjac!`](@ref). 
- -## Working with Distributions.jl - -```@docs -Bijectors.bijector -Bijectors.transformed(d::Distribution, b::Bijector) -``` - -## Utilities - -```@docs -Bijectors.output_size -Bijectors.elementwise -Bijectors.isinvertible -Bijectors.isclosedform(t::Bijectors.Transform) -``` - -## API - -```@docs -Bijectors.Transform -Bijectors.Bijector -Bijectors.Inverse -``` - -## Bijectors - -```@docs -Bijectors.CorrBijector -Bijectors.LeakyReLU -Bijectors.Stacked -Bijectors.RationalQuadraticSpline -Bijectors.Coupling -Bijectors.OrderedBijector -Bijectors.NamedTransform -Bijectors.NamedCoupling -``` diff --git a/docs/src/types.md b/docs/src/types.md new file mode 100644 index 00000000..eeb14108 --- /dev/null +++ b/docs/src/types.md @@ -0,0 +1,24 @@ +# Types API + +This page includes docstrings for some types defined in Bijectors. + +## General types + +```@docs +Bijectors.Transform +Bijectors.Bijector +Bijectors.Inverse +``` + +## Specific bijectors + +```@docs +Bijectors.CorrBijector +Bijectors.LeakyReLU +Bijectors.Stacked +Bijectors.RationalQuadraticSpline +Bijectors.Coupling +Bijectors.OrderedBijector +Bijectors.NamedTransform +Bijectors.NamedCoupling +``` diff --git a/src/Bijectors.jl b/src/Bijectors.jl index c98df676..e277affb 100644 --- a/src/Bijectors.jl +++ b/src/Bijectors.jl @@ -52,9 +52,11 @@ export TransformDistribution, UnitDistribution, SimplexDistribution, PDMatDistribution, + # basic interface link, invlink, logpdf_with_trans, + isinvertible, isclosedform, transform, transform!, @@ -64,19 +66,25 @@ export TransformDistribution, logabsdetjac, logabsdetjac!, logabsdetjacinv, - Bijector, - Inverse, - Stacked, bijector, transformed, + # types and specific bijectors + Bijector, UnivariateTransformed, MultivariateTransformed, + Inverse, + # specific bijectors + Stacked, PlanarLayer, RadialLayer, Coupling, InvertibleBatchNorm, + # transform wrappers elementwise, - output_size + columnwise, + # utilities + output_size, + ordered const DEBUG = Bool(parse(Int, get(ENV, 
"DEBUG_BIJECTORS", "0"))) _debug(str) = @debug str diff --git a/src/interface.jl b/src/interface.jl index 3531e0e0..620ec071 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -12,20 +12,61 @@ Alias for `Base.Fix1(broadcast, f)`. In the case where `f::ComposedFunction`, the result is `Base.Fix1(broadcast, f.outer) ∘ Base.Fix1(broadcast, f.inner)` rather than `Base.Fix1(broadcast, f)`. + +# Examples + +```jldoctest; setup = :(using Bijectors) +julia> x = [1.0, 2.0, 3.0]; + +julia> f = elementwise(exp); + +julia> f(x) +3-element Vector{Float64}: + 2.718281828459045 + 7.38905609893065 + 20.085536923187668 + +julia> with_logabsdet_jacobian(f, x) +([2.718281828459045, 7.38905609893065, 20.085536923187668], 6.0) +``` """ elementwise(f) = Base.Fix1(broadcast, f) -elementwise(f::typeof(identity)) = identity +elementwise(::typeof(identity)) = identity # TODO: This is makes dispatching quite a bit easier, but uncertain if this is really # the way to go. function elementwise(f::ComposedFunction) return ComposedFunction(elementwise(f.outer), elementwise(f.inner)) end + const Columnwise{F} = Base.Fix1{typeof(eachcolmaphcat),F} """ + columnwise(f) Alias for `Base.Fix1(eachcolmaphcat, f)`. Represents a function `f` which is applied to each column of an input. + +# Examples + +```jldoctest; setup = :(using Bijectors) +julia> x = [4.0 5.0 6.0; 1.0 2.0 3.0]; + +julia> my_reverse(v) = reverse(v); # To avoid type piracy. + +julia> f = columnwise(my_reverse); + +julia> f(x) +2×3 Matrix{Float64}: + 1.0 2.0 3.0 + 4.0 5.0 6.0 + +julia> # We can't use `with_logabsdet_jacobian` on `f` until we define it + # for `my_reverse`, since we need to sum over columns. 
+ Bijectors.with_logabsdet_jacobian(::typeof(my_reverse), xs) = my_reverse(xs), 0.0; + +julia> with_logabsdet_jacobian(f, x) +([1.0 2.0 3.0; 4.0 5.0 6.0], 0.0) +``` """ columnwise(f) = Base.Fix1(eachcolmaphcat, f) inverse(f::Columnwise) = columnwise(inverse(f.x)) @@ -39,19 +80,19 @@ with_logabsdet_jacobian(f::Columnwise, x::AbstractMatrix) = (f(x), logabsdetjac( """ output_size(f, sz) -Returns the output size of `f` given the input size `sz`. +Returns the size of `f(x)` when given an input `x` of size `sz`. """ output_size(f, sz) = sz output_size(f::ComposedFunction, sz) = output_size(f.outer, output_size(f.inner, sz)) """ output_size(f, dist::Distribution) -Returns the output size of `f` given the input distribution `dist`. This is useful when -Base.size(dist) is not defined, e.g. for `ProductNamedTupleDistribution` and in particular -is used by DynamicPPL when generating new random values for transformed distributions. +Returns the output size of `f` given an input drawn from the distribution `dist`. By default this just calls `output_size(f, size(dist))`, but this can be overloaded for -specific distributions. +specific distributions. This is useful when `Base.size(dist)` is not defined, e.g. for +`ProductNamedTupleDistribution` and in particular is used by DynamicPPL when generating new +random values for transformed distributions. """ output_size(f, dist::Distribution) = output_size(f, size(dist)) output_size(f::ComposedFunction, dist::Distribution) = output_size(f, size(dist)) @@ -59,7 +100,7 @@ output_size(f::ComposedFunction, dist::Distribution) = output_size(f, size(dist) """ output_length(f, len::Int) -Returns the output length of `f` given the input length `len`. +Returns the length of `f(x)` given a vector input `x` of length `len`. 
""" output_length(f, len::Int) = only(output_size(f, (len,))) @@ -93,12 +134,24 @@ abstract type Transform end (t::Transform)(x) = transform(t, x) +""" + with_logabsdet_jacobian(t::Transform, x) + +Semantically, this must return a tuple of `(y, logabsdetjac)`, where `y = transform(t, x)` +and `logabsdetjac = logabsdetjac(t, x)`. However, you can implement this function to exploit +shared computation between the two quantities. +""" +function with_logabsdet_jacobian end + Broadcast.broadcastable(b::Transform) = Ref(b) """ transform(b, x) -Transform `x` using `b`, treating `x` as a single input. +Transform `x` using `b`. + +If `with_logabsdet_jacobian` is already implemented for `b`, the default implementation of +`transform` will call `first(with_logabsdet_jacobian(b, x))`. """ transform(f::F, x) where {F<:Function} = f(x) function transform(t::Transform, x)